Class: Mindee::Input::Source::LocalInputSource
- Inherits:
-
Object
- Object
- Mindee::Input::Source::LocalInputSource
- Defined in:
- lib/mindee/input/sources/local_input_source.rb
Overview
Base class for loading documents.
Direct Known Subclasses
Base64InputSource, BytesInputSource, FileInputSource, PathInputSource
Instance Attribute Summary collapse
- #file_mimetype ⇒ String readonly
- #filename ⇒ String readonly
- #io_stream ⇒ StringIO | File readonly
Class Method Summary collapse
-
.fix_pdf(stream, maximum_offset: 500) ⇒ StringIO
Attempt to fix the PDF data in the given stream.
Instance Method Summary collapse
-
#apply_page_options(options) ⇒ Object
Cuts a PDF file according to provided options.
-
#compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true) ⇒ Object
Compresses the file, according to the provided info.
-
#fix_pdf!(maximum_offset: 500) ⇒ void
Attempts to fix the PDF data in the file.
-
#initialize(io_stream, filename, repair_pdf: false) ⇒ LocalInputSource
constructor
A new instance of LocalInputSource.
-
#page_count ⇒ Integer
Returns the page count for a document.
-
#pdf? ⇒ Boolean
Shorthand for PDF mimetype validation.
-
#process_pdf(options) ⇒ Object
deprecated
Deprecated.
Use #apply_page_options instead.
-
#read_contents(close: true) ⇒ Array<(String, Hash)>
Reads a document.
-
#rescue_broken_pdf(_) ⇒ Object
deprecated
Deprecated.
See #fix_pdf! or .fix_pdf instead.
-
#source_text? ⇒ bool
Checks whether the file has source text if it is a pdf.
-
#write_to_file(path) ⇒ Object
Write the file to a given path.
Constructor Details
#initialize(io_stream, filename, repair_pdf: false) ⇒ LocalInputSource
Returns a new instance of LocalInputSource.
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 37

# Builds an input source from an already opened stream and detects its MIME type.
#
# @param io_stream [StringIO, File] Stream holding the document data.
# @param filename [String] Name of the document. Passed to Marcel as a hint
#   for MIME detection unless +repair_pdf+ is set.
# @param repair_pdf [Boolean] When true, the MIME type is detected from the
#   stream only, and a rejected file whose name ends with '.pdf' goes through
#   #fix_pdf! before being checked a second time.
# @raise [Error::MindeeMimeTypeError] if the MIME type is not part of
#   ALLOWED_MIME_TYPES, even after the optional repair attempt.
def initialize(io_stream, filename, repair_pdf: false)
  @io_stream = io_stream
  @filename = filename
  @file_mimetype = if repair_pdf
                     Marcel::MimeType.for @io_stream
                   else
                     Marcel::MimeType.for @io_stream, name: @filename
                   end

  unless ALLOWED_MIME_TYPES.include?(@file_mimetype)
    # Second chance for broken PDFs: fix_pdf! swaps the stream and re-detects the MIME type.
    fix_pdf! if filename.end_with?('.pdf') && repair_pdf
    raise Error::MindeeMimeTypeError, @file_mimetype.to_s unless ALLOWED_MIME_TYPES.include?(@file_mimetype)
  end

  # Logged only once the MIME type is known to be accepted.
  logger.debug("Loaded new input #{@filename} from #{self.class}")
end
Instance Attribute Details
#file_mimetype ⇒ String (readonly)
30 31 32 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 30

# Read-only accessor.
# @return [String] MIME type currently stored for the document.
def file_mimetype
  @file_mimetype
end
#filename ⇒ String (readonly)
28 29 30 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 28

# Read-only accessor.
# @return [String] Name of the document, as given to the constructor.
def filename
  @filename
end
#io_stream ⇒ StringIO | File (readonly)
32 33 34 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 32

# Read-only accessor.
# @return [StringIO, File] Stream currently holding the document data.
def io_stream
  @io_stream
end
Class Method Details
.fix_pdf(stream, maximum_offset: 500) ⇒ StringIO
Attempt to fix the PDF data in the given stream.
85 86 87 88 89 90 91 92 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 85

# Attempt to fix the PDF data in the given stream.
#
# Looks for the first '%PDF-' marker and copies everything from that marker
# onwards into a new stream, dropping whatever bytes came before it.
#
# @param stream [StringIO, File] Stream to read from. It is rewound first and
#   left positioned at its end.
# @param maximum_offset [Integer] Highest accepted stream position once the
#   marker has been read (i.e. the position right after '%PDF-').
# @return [StringIO] New stream holding the data from the marker onwards,
#   positioned after the written data.
# @raise [Error::MindeePDFError] if the marker is missing, is the very last
#   thing in the stream, or ends beyond +maximum_offset+.
def self.fix_pdf(stream, maximum_offset: 500)
  out_stream = StringIO.new
  # maximum_offset is checked against an absolute position, so start from the first byte.
  stream.rewind
  stream.gets('%PDF-')
  raise Error::MindeePDFError if stream.eof? || stream.pos > maximum_offset

  # Step back over the 5-byte marker so it is part of the output.
  stream.pos = stream.pos - 5
  out_stream << stream.read
end
Instance Method Details
#apply_page_options(options) ⇒ Object
Cuts a PDF file according to provided options.
102 103 104 105 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 102

# Cuts a PDF file according to provided options.
#
# Rewinds the current stream, hands it to PDF::PDFProcessor.parse together
# with the options, and keeps the result as the new stream.
#
# @param options [Object] Page options, forwarded untouched to
#   PDF::PDFProcessor.parse (expected type not visible here).
# @return [Object] The value returned by PDF::PDFProcessor.parse, which is
#   also stored as the new stream.
def apply_page_options(options)
  @io_stream.seek(0)
  @io_stream = PDF::PDFProcessor.parse(@io_stream, options)
end
#compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true) ⇒ Object
Compresses the file, according to the provided info.
165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 165

# Compresses the file, according to the provided info.
#
# PDFs go through Mindee::PDF::PDFCompressor.compress_pdf, everything else
# through Mindee::Image::ImageCompressor.compress_image. The compressed buffer
# replaces the current stream and is rewound.
#
# @param quality [Integer] Forwarded to both compressors.
# @param max_width [Integer, nil] Forwarded to the image compressor only.
# @param max_height [Integer, nil] Forwarded to the image compressor only.
# @param force_source_text [Boolean] Forwarded to the PDF compressor as
#   +force_source_text_compression+.
# @param disable_source_text [Boolean] Forwarded to the PDF compressor only.
# @raise [NotImplementedError] if Mindee::Dependency.all_deps_available? is false.
def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true)
  unless Mindee::Dependency.all_deps_available?
    raise NotImplementedError, Mindee::Dependency::MINDEE_DEPENDENCIES_LOAD_ERROR
  end

  buffer = if pdf?
             Mindee::PDF::PDFCompressor.compress_pdf(
               @io_stream,
               quality: quality,
               force_source_text_compression: force_source_text,
               disable_source_text: disable_source_text
             )
           else
             Mindee::Image::ImageCompressor.compress_image(
               @io_stream,
               quality: quality,
               max_width: max_width,
               max_height: max_height
             )
           end
  @io_stream = buffer
  @io_stream.rewind
end
#fix_pdf!(maximum_offset: 500) ⇒ void
This method returns an undefined value.
Attempts to fix the PDF data in the file.
74 75 76 77 78 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 74

# Attempts to fix the PDF data in the file.
#
# Replaces the current stream with the output of LocalInputSource.fix_pdf,
# rewinds it, and detects the MIME type again from the new stream content.
#
# @param maximum_offset [Integer] Forwarded to LocalInputSource.fix_pdf.
# @return [void]
def fix_pdf!(maximum_offset: 500)
  @io_stream = LocalInputSource.fix_pdf(@io_stream, maximum_offset: maximum_offset)
  @io_stream.rewind
  @file_mimetype = Marcel::MimeType.for @io_stream
end
#page_count ⇒ Integer
Returns the page count for a document. Defaults to one for images.
145 146 147 148 149 150 151 152 153 154 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 145

# Returns the page count for a document. Defaults to one for images.
#
# @return [Integer] 1 when the document is not a PDF, otherwise the number of
#   pages reported by Mindee::PDF::PDFProcessor.open_pdf.
# @raise [NotImplementedError] if Mindee::Dependency.all_deps_available? is
#   false (checked before the PDF test, so it applies to images as well).
def page_count
  unless Mindee::Dependency.all_deps_available?
    raise NotImplementedError, Mindee::Dependency::MINDEE_DEPENDENCIES_LOAD_ERROR
  end
  return 1 unless pdf?

  @io_stream.seek(0)
  pdf_processor = Mindee::PDF::PDFProcessor.open_pdf(@io_stream)
  pdf_processor.pages.size
end
#pdf? ⇒ Boolean
Shorthand for PDF mimetype validation.
66 67 68 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 66

# Shorthand for PDF mimetype validation.
#
# @return [Boolean] true when the stored MIME type, converted to a string,
#   is exactly 'application/pdf'.
def pdf?
  @file_mimetype.to_s == 'application/pdf'
end
#process_pdf(options) ⇒ Object
Use #apply_page_options instead.
109 110 111 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 109

# @deprecated Use #apply_page_options instead.
#
# @param options [Object] Forwarded untouched to #apply_page_options.
# @return [Object] Whatever #apply_page_options returns.
def process_pdf(options)
  apply_page_options(options)
end
#read_contents(close: true) ⇒ Array<(String, Hash)>
Reads a document.
116 117 118 119 120 121 122 123 124 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 116

# Reads a document.
#
# @param close [Boolean] Whether to close the stream once it has been read.
# @return [Array<(String, Hash)>] The stream content, followed by a hash whose
#   +:filename+ entry is the file name passed through
#   Mindee::Input::Source.convert_to_unicode_escape.
def read_contents(close: true)
  logger.debug("Reading data from: #{@filename}")
  @io_stream.seek(0)
  # Avoids needlessly re-packing some files
  data = @io_stream.read
  @io_stream.rewind
  @io_stream.close if close
  [data, { filename: Mindee::Input::Source.convert_to_unicode_escape(@filename) }]
end
#rescue_broken_pdf(_) ⇒ Object
See #fix_pdf! or Mindee::Input::Source::LocalInputSource.fix_pdf instead.
61 62 63 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 61

# @deprecated See #fix_pdf! or LocalInputSource.fix_pdf instead.
#
# @param _ [Object] Ignored.
# @return [Object] Whatever #fix_pdf! returns.
def rescue_broken_pdf(_)
  fix_pdf!
end
#source_text? ⇒ bool
Checks whether the file has source text if it is a PDF; returns false otherwise.
191 192 193 194 195 196 197 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 191

# Checks whether the file has source text if it is a pdf.
#
# The decision is fully delegated to Mindee::PDF::PDFTools.source_text?,
# which receives the current stream.
#
# @return [Boolean] Result of Mindee::PDF::PDFTools.source_text?.
# @raise [NotImplementedError] if Mindee::Dependency.all_deps_available? is false.
def source_text?
  unless Mindee::Dependency.all_deps_available?
    raise NotImplementedError, Mindee::Dependency::MINDEE_DEPENDENCIES_LOAD_ERROR
  end

  Mindee::PDF::PDFTools.source_text?(@io_stream)
end
#write_to_file(path) ⇒ Object
Write the file to a given path. Uses the initial file name by default.
128 129 130 131 132 133 134 135 136 137 138 139 140 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 128

# Write the file to a given path.
#
# When +path+ is an existing directory or ends with '/', the file is written
# inside it under the document's file name; otherwise +path+ is used as the
# file path. Missing parent directories are created. The stream is rewound
# before and after the write.
#
# @param path [String] Destination file or directory.
def write_to_file(path)
  t_path = if File.directory?(path || '') || path.to_s.end_with?('/')
             File.join(path || '', @filename)
           else
             path
           end
  full_path = File.expand_path(t_path || '')
  FileUtils.mkdir_p(File.dirname(full_path))
  @io_stream.rewind
  File.binwrite(full_path, @io_stream.read || '')
  logger.debug("Wrote file successfully to #{full_path}")
  @io_stream.rewind
end