Class: Mindee::Input::Source::LocalInputSource

Inherits:
Object
  • Object
show all
Defined in:
lib/mindee/input/sources.rb

Overview

Base class for loading documents.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(io_stream, filename, fix_pdf: false) ⇒ LocalInputSource

Returns a new instance of LocalInputSource.

Parameters:

  • io_stream (StringIO)
  • filename (String)
  • fix_pdf (Boolean) (defaults to: false)

Raises:

  • (InvalidMimeTypeError)

58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/mindee/input/sources.rb', line 58

def initialize(io_stream, filename, fix_pdf: false)
  @io_stream = io_stream
  @filename = filename
  @file_mimetype = if fix_pdf
                     Marcel::MimeType.for @io_stream
                   else
                     Marcel::MimeType.for @io_stream, name: @filename
                   end
  return if ALLOWED_MIME_TYPES.include? @file_mimetype

  if filename.end_with?('.pdf') && fix_pdf
    rescue_broken_pdf(@io_stream)
    @file_mimetype = Marcel::MimeType.for @io_stream

    return if ALLOWED_MIME_TYPES.include? @file_mimetype
  end

  raise InvalidMimeTypeError, @file_mimetype.to_s
end

Instance Attribute Details

#file_mimetype ⇒ String (readonly)

Returns:

  • (String)


51
52
53
# File 'lib/mindee/input/sources.rb', line 51

# Reader for the MIME type stored by the constructor after detection through Marcel.
#
# @return [String]
def file_mimetype
  @file_mimetype
end

#filename ⇒ String (readonly)

Returns:

  • (String)


49
50
51
# File 'lib/mindee/input/sources.rb', line 49

# Reader for the filename that was handed to the constructor.
#
# @return [String]
def filename
  @filename
end

#io_stream ⇒ StringIO (readonly)

Returns:

  • (StringIO)


53
54
55
# File 'lib/mindee/input/sources.rb', line 53

# Reader for the stream currently backing the document. The instance variable is
# reassigned by compress!, process_pdf and rescue_broken_pdf, so the returned
# object may differ from the one passed to the constructor.
#
# @return [StringIO]
def io_stream
  @io_stream
end

Instance Method Details

#compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true) ⇒ Object

Compresses the file, according to the provided info.

Parameters:

  • quality (Integer) (defaults to: 85)

    Quality of the output file.

  • max_width (Integer, nil) (defaults to: nil)

    Maximum width (Ignored for PDFs).

  • max_height (Integer, nil) (defaults to: nil)

    Maximum height (Ignored for PDFs).

  • force_source_text (Boolean) (defaults to: false)

    Whether to force the operation on PDFs with source text. This will attempt to re-render PDF text over the rasterized original. If disabled, the operation is ignored. WARNING: this operation is strongly discouraged.

  • disable_source_text (Boolean) (defaults to: true)

    If the PDF has source text, whether to re-apply it to the original or not. Needs force_source_text to work.



140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/mindee/input/sources.rb', line 140

# Replaces the current stream with a compressed version of the file, then rewinds it.
#
# PDFs go through Mindee::PDF::PDFCompressor; every other input goes through
# Mindee::Image::ImageCompressor.
#
# @param quality [Integer] Quality setting forwarded to the compressor.
# @param max_width [Integer, nil] Maximum width; only forwarded for non-PDF inputs.
# @param max_height [Integer, nil] Maximum height; only forwarded for non-PDF inputs.
# @param force_source_text [Boolean] Forwarded to the PDF compressor as
#   +force_source_text_compression+; not used for non-PDF inputs.
# @param disable_source_text [Boolean] Forwarded to the PDF compressor; not used for non-PDF inputs.
# @return [Object] the result of rewinding the new stream.
def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true)
  compressed =
    if pdf?
      pdf_settings = {
        quality: quality,
        force_source_text_compression: force_source_text,
        disable_source_text: disable_source_text,
      }
      Mindee::PDF::PDFCompressor.compress_pdf(@io_stream, **pdf_settings)
    else
      image_settings = { quality: quality, max_width: max_width, max_height: max_height }
      Mindee::Image::ImageCompressor.compress_image(@io_stream, **image_settings)
    end

  # The previous stream is only swapped out once compression has succeeded.
  @io_stream = compressed
  @io_stream.rewind
end

#count_pdf_pages ⇒ Object



123
124
125
126
127
128
129
# File 'lib/mindee/input/sources.rb', line 123

# Counts the pages of the document.
#
# @return [Integer] 1 for any non-PDF input; otherwise the size of the page
#   collection exposed by the object that Mindee::PDF::PdfProcessor.open_pdf returns.
def count_pdf_pages
  if pdf?
    # The stream is read from its start by the PDF processor.
    @io_stream.seek(0)
    Mindee::PDF::PdfProcessor.open_pdf(@io_stream).pages.size
  else
    1
  end
end

#pdf? ⇒ Boolean

Shorthand for pdf mimetype validation.

Returns:

  • (Boolean)


95
96
97
# File 'lib/mindee/input/sources.rb', line 95

# Shorthand for PDF MIME type validation.
#
# @return [Boolean] true when the stored MIME type, converted to a String, is "application/pdf".
def pdf?
  detected_type = @file_mimetype.to_s
  detected_type == 'application/pdf'
end

#process_pdf(options) ⇒ Object

Parses a PDF file according to provided options.

Parameters:

  • options (Hash, nil)

    Page cutting/merge options:

    • :page_indexes Zero-based list of page indexes.

    • :operation Operation to apply on the document, given the `page_indexes` specified:

      • :KEEP_ONLY - keep only the specified pages, and remove all others.

      • :REMOVE - remove the specified pages, and keep all others.

    • :on_min_pages Apply the operation only if document has at least this many pages.



107
108
109
110
# File 'lib/mindee/input/sources.rb', line 107

# Parses a PDF file according to the provided options and replaces the current
# stream with the result.
#
# @param options [Hash, nil] Page cutting/merge options, passed through untouched:
#   * `:page_indexes` Zero-based list of page indexes.
#   * `:operation` Operation to apply on the document, given the `page_indexes` specified
#     (`:KEEP_ONLY` or `:REMOVE`).
#   * `:on_min_pages` Apply the operation only if the document has at least this many pages.
# @return [Object] the stream returned by the PDF processor, now stored as the current stream.
def process_pdf(options)
  @io_stream.seek(0)
  # Fully qualified so the lookup does not depend on which namespaces enclose this method.
  @io_stream = Mindee::PDF::PdfProcessor.parse(@io_stream, options)
end

#read_document(close: true) ⇒ Array<String, [String, aBinaryString ], [Hash, nil] >

Reads a document.

Parameters:

  • close (Boolean) (defaults to: true)

Returns:

  • (Array<String, [String, aBinaryString ], [Hash, nil] >)


115
116
117
118
119
120
121
# File 'lib/mindee/input/sources.rb', line 115

# Reads the whole document from the start of the stream.
#
# @param close [Boolean] Whether to close the stream once its content has been read.
# @return [Array] a three-element array: the literal 'document', the data read from
#   the stream, and a Hash whose :filename is the stored filename passed through
#   Mindee::Input::Source.convert_to_unicode_escape.
def read_document(close: true)
  @io_stream.seek(0)
  contents = @io_stream.read
  @io_stream.close if close

  escaped_name = Mindee::Input::Source.convert_to_unicode_escape(@filename)
  ['document', contents, { filename: escaped_name }]
end

#rescue_broken_pdf(stream) ⇒ Object

Attempts to fix PDF files if the mimetype is rejected. “Broken PDFs” are often the result of a third party injecting invalid headers. This attempts to remove them and send the file.

Parameters:

  • stream (StringIO)

Raises:

  • (UnfixablePDFError)

82
83
84
85
86
87
88
89
90
91
92
# File 'lib/mindee/input/sources.rb', line 82

# Attempts to repair a PDF whose "%PDF-" marker is preceded by unwanted bytes.
#
# Reads the given stream up to and including the first "%PDF-" marker, drops
# everything before it, and copies the remainder into a fresh StringIO that
# replaces the current stream. The previous stream is closed.
#
# @param stream [StringIO] Stream to scan, starting from its current position.
# @return [StringIO] the rebuilt stream, positioned at its start.
# @raise [UnfixablePDFError] if the stream is exhausted after searching for the
#   marker, or if the marker ends more than 500 bytes into the stream.
def rescue_broken_pdf(stream)
  stream.gets('%PDF-')
  raise UnfixablePDFError if stream.eof? || stream.pos > 500

  # Step back over the five-byte marker so the copy begins with "%PDF-".
  stream.pos = stream.pos - 5
  data = stream.read
  @io_stream.close

  @io_stream = StringIO.new
  @io_stream << data
  # Writing leaves the cursor at the end of the data; reset it so the next
  # reader starts at the header instead of seeing an exhausted stream.
  @io_stream.rewind
  @io_stream
end

#source_text? ⇒ Boolean

Checks whether the file has source text if it is a PDF. Returns false otherwise.

Returns:

  • (Boolean)

    True if the file is a PDF and has source text.



162
163
164
# File 'lib/mindee/input/sources.rb', line 162

# Checks whether the file has source text if it is a PDF.
#
# Non-PDF inputs are answered directly with false, so they are never handed to
# the PDF tooling.
#
# @return [Boolean] false for non-PDF inputs; otherwise the answer given by
#   Mindee::PDF::PDFTools.source_text? for the current stream.
def source_text?
  return false unless pdf?

  Mindee::PDF::PDFTools.source_text?(@io_stream)
end