Class: Mindee::Input::Source::LocalInputSource

Inherits:
Object
  • Object
show all
Defined in:
lib/mindee/input/sources/local_input_source.rb

Overview

Base class for loading documents.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(io_stream, filename, repair_pdf: false) ⇒ LocalInputSource

Returns a new instance of LocalInputSource.

Parameters:

  • io_stream (StringIO, File)
  • filename (String)
  • repair_pdf (bool) (defaults to: false)

Raises:

  • (Errors::MindeeMimeTypeError) if the detected MIME type is not in ALLOWED_MIME_TYPES.

36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/mindee/input/sources/local_input_source.rb', line 36

# Wraps an already opened stream and determines its MIME type with Marcel.
#
# When +repair_pdf+ is set, detection relies on the stream contents only (the
# filename is not handed to Marcel). If the detected type is not allowed and
# the filename carries a +.pdf+ extension (any letter case), rescue_broken_pdf
# is called on the stream and detection is run a second time.
#
# @param io_stream [StringIO, File]
# @param filename [String]
# @param repair_pdf [bool] Whether to attempt a repair of a rejected PDF.
# @raise [Errors::MindeeMimeTypeError] if the final MIME type is not in ALLOWED_MIME_TYPES.
def initialize(io_stream, filename, repair_pdf: false)
  @io_stream = io_stream
  @filename = filename
  @file_mimetype = if repair_pdf
                     Marcel::MimeType.for @io_stream
                   else
                     Marcel::MimeType.for @io_stream, name: @filename
                   end
  if ALLOWED_MIME_TYPES.include? @file_mimetype
    logger.debug("Loaded new input #{@filename} from #{self.class}")
    return
  end

  if repair_pdf && @filename.downcase.end_with?('.pdf')
    rescue_broken_pdf(@io_stream)
    @file_mimetype = Marcel::MimeType.for @io_stream

    # Only report a successful load once the repaired stream has been accepted.
    if ALLOWED_MIME_TYPES.include? @file_mimetype
      logger.debug("Loaded new input #{@filename} from #{self.class}")
      return
    end
  end

  raise Errors::MindeeMimeTypeError, @file_mimetype.to_s
end

Instance Attribute Details

#file_mimetype ⇒ String (readonly)

Returns:

  • (String)


29
30
31
# File 'lib/mindee/input/sources/local_input_source.rb', line 29

# Reader for the MIME type stored by the constructor (the result of
# Marcel::MimeType.for on the stream).
# @return [String]
def file_mimetype
  @file_mimetype
end

#filename ⇒ String (readonly)

Returns:

  • (String)


27
28
29
# File 'lib/mindee/input/sources/local_input_source.rb', line 27

# Reader for the file name given to the constructor.
# @return [String]
def filename
  @filename
end

#io_stream ⇒ StringIO (readonly)

Returns:

  • (StringIO)


31
32
33
# File 'lib/mindee/input/sources/local_input_source.rb', line 31

# Reader for the stream currently backing this source.
# The instance variable is reassigned by compress!, process_pdf and
# rescue_broken_pdf, so the object returned can change over time.
# @return [StringIO]
def io_stream
  @io_stream
end

Instance Method Details

#compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true) ⇒ Object

Compresses the file, according to the provided info.

Parameters:

  • quality (Integer) (defaults to: 85)

    Quality of the output file.

  • max_width (Integer, nil) (defaults to: nil)

    Maximum width (Ignored for PDFs).

  • max_height (Integer, nil) (defaults to: nil)

    Maximum height (Ignored for PDFs).

  • force_source_text (bool) (defaults to: false)

    Whether to force the operation on PDFs with source text. This will attempt to re-render PDF text over the rasterized original. If disabled, the operation is ignored. WARNING: this operation is strongly discouraged.

  • disable_source_text (bool) (defaults to: true)

    If the PDF has source text, whether to re-apply it to the original or not. Needs force_source_text to work.



142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/mindee/input/sources/local_input_source.rb', line 142

# Compresses the underlying file and replaces the current stream with the result.
#
# PDFs go through Mindee::PDF::PDFCompressor, everything else through
# Mindee::Image::ImageCompressor. The new stream is rewound before returning.
#
# @param quality [Integer] Quality of the output file.
# @param max_width [Integer, nil] Maximum width (only passed to the image compressor).
# @param max_height [Integer, nil] Maximum height (only passed to the image compressor).
# @param force_source_text [bool] Passed to the PDF compressor as force_source_text_compression.
# @param disable_source_text [bool] Passed as-is to the PDF compressor.
def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true)
  if pdf?
    pdf_settings = {
      quality: quality,
      force_source_text_compression: force_source_text,
      disable_source_text: disable_source_text,
    }
    compressed = Mindee::PDF::PDFCompressor.compress_pdf(@io_stream, **pdf_settings)
  else
    image_settings = { quality: quality, max_width: max_width, max_height: max_height }
    compressed = Mindee::Image::ImageCompressor.compress_image(@io_stream, **image_settings)
  end

  # The stream is only swapped once compression has succeeded.
  @io_stream = compressed
  @io_stream.rewind
end

#count_pages ⇒ Integer

Returns the page count for a document. Defaults to one for images.

Returns:

  • (Integer)


125
126
127
128
129
130
131
# File 'lib/mindee/input/sources/local_input_source.rb', line 125

# Gives the number of pages of the document.
# Anything that is not a PDF counts as a single page; PDFs are opened with
# Mindee::PDF::PDFProcessor from the start of the stream.
# @return [Integer]
def count_pages
  if pdf?
    @io_stream.seek(0)
    Mindee::PDF::PDFProcessor.open_pdf(@io_stream).pages.size
  else
    1
  end
end

#pdf? ⇒ Boolean

Shorthand for pdf mimetype validation.

Returns:

  • (Boolean)


77
78
79
# File 'lib/mindee/input/sources/local_input_source.rb', line 77

# Tells whether the MIME type stored for this source is the PDF one.
# @return [Boolean]
def pdf?
  detected_type = @file_mimetype.to_s
  detected_type == 'application/pdf'
end

#process_pdf(options) ⇒ Object

Parses a PDF file according to provided options.

Parameters:

  • options (PageOptions, nil)

    Page cutting/merge options:

    • :page_indexes Zero-based list of page indexes.

    • :operation Operation to apply on the document, given the `page_indexes` specified:

      • :KEEP_ONLY - keep only the specified pages, and remove all others.

      • :REMOVE - remove the specified pages, and keep all others.

    • :on_min_pages Apply the operation only if document has at least this many pages.



89
90
91
92
# File 'lib/mindee/input/sources/local_input_source.rb', line 89

# Runs the current stream through PDF::PDFProcessor.parse and keeps the result
# as the new stream.
# @param options [PageOptions, nil] Page cutting/merge options, passed through to the processor.
def process_pdf(options)
  source = @io_stream
  # The processor is always given the stream positioned at its first byte.
  source.seek(0)
  @io_stream = PDF::PDFProcessor.parse(source, options)
end

#read_contents(close: true) ⇒ Array<String, [String, aBinaryString ], [Hash, nil] >

Reads a document.

Parameters:

  • close (bool) (defaults to: true)

Returns:

  • (Array<String, [String, aBinaryString ], [Hash, nil] >)


97
98
99
100
101
102
103
104
105
# File 'lib/mindee/input/sources/local_input_source.rb', line 97

# Reads the whole stream from its beginning.
# @param close [bool] Whether to close the stream once it has been read.
# @return [Array] The literal 'document', the data read, and a hash holding the
#   filename after Mindee::Input::Source.convert_to_unicode_escape.
def read_contents(close: true)
  logger.debug("Reading data from: #{@filename}")
  @io_stream.seek(0)
  # Avoids needlessly re-packing some files
  payload = @io_stream.read
  @io_stream.rewind
  @io_stream.close if close

  escaped_name = Mindee::Input::Source.convert_to_unicode_escape(@filename)
  ['document', payload, { filename: escaped_name }]
end

#rescue_broken_pdf(stream) ⇒ Object

Attempts to fix PDF files if the mimetype is rejected. “Broken PDFs” are often the result of a third party injecting invalid headers. This attempts to remove them and send the file.

Parameters:

  • stream (StringIO)

Raises:

  • (Errors::MindeePDFError) if no usable '%PDF-' marker is found near the start of the stream.

64
65
66
67
68
69
70
71
72
73
74
# File 'lib/mindee/input/sources/local_input_source.rb', line 64

# Attempts to fix a PDF whose real content is preceded by extra data.
#
# Looks for the '%PDF-' marker from the start of +stream+, then replaces the
# current stream with a fresh StringIO holding everything from the marker onwards.
# The previous stream is closed.
#
# @param stream [StringIO]
# @return [StringIO] The rebuilt stream, positioned at its first byte.
# @raise [Errors::MindeePDFError] if the stream is exhausted once the marker has been
#   searched for, or if the marker ends more than 500 bytes into the stream.
def rescue_broken_pdf(stream)
  # The 500-byte limit below is an offset from the start, so always search from there,
  # whatever has been read from the stream beforehand.
  stream.rewind
  stream.gets('%PDF-')
  raise Errors::MindeePDFError if stream.eof? || stream.pos > 500

  # Step back over the 5-byte marker so that it is part of the kept data.
  stream.pos = stream.pos - 5
  data = stream.read
  @io_stream.close

  @io_stream = StringIO.new
  @io_stream << data
  # Writing leaves the position at the end; hand back a stream that is ready to be read.
  @io_stream.rewind
  @io_stream
end

#source_text? ⇒ bool

Checks whether the file has source text if it is a PDF. False otherwise.

Returns:

  • (bool)

    True if the file is a PDF and has source text.



164
165
166
# File 'lib/mindee/input/sources/local_input_source.rb', line 164

# Checks whether the file is a PDF that has source text.
#
# Non-PDF inputs answer false without being handed to the PDF tooling; PDFs are
# delegated to Mindee::PDF::PDFTools.source_text?.
#
# @return [bool] True if the file is a PDF and has source text.
def source_text?
  return false unless pdf?

  Mindee::PDF::PDFTools.source_text?(@io_stream)
end

#write_to_file(path) ⇒ Object

Writes the file to a given path. If the path is an existing directory or ends with '/', the initial file name is appended to it.

Parameters:

  • path (String)

    Path to write the file to.



109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/mindee/input/sources/local_input_source.rb', line 109

# Dumps the stream contents, in binary mode, to the given location.
#
# When +path+ is an existing directory or ends with '/', the file is written
# inside it under the source's filename; otherwise +path+ is used as the file
# path. Missing parent directories are created. The stream is rewound both
# before reading and after writing.
#
# @param path [String] Path to write the file to.
def write_to_file(path)
  targets_directory = File.directory?(path) || path.end_with?('/')
  destination = targets_directory ? File.join(path, @filename) : path

  FileUtils.mkdir_p(File.dirname(destination))
  @io_stream.rewind
  File.binwrite(destination, @io_stream.read)
  logger.debug("Wrote file successfully to #{destination}")
  @io_stream.rewind
end