Class: Mindee::Extraction::PdfExtractor::PdfExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/mindee/extraction/pdf_extractor/pdf_extractor.rb

Overview

Pdf extraction class.

Instance Method Summary collapse

Constructor Details

#initialize(local_input) ⇒ PdfExtractor

Returns a new instance of PdfExtractor.

Parameters:



11
12
13
14
15
16
17
18
19
20
21
22
# File 'lib/mindee/extraction/pdf_extractor/pdf_extractor.rb', line 11

def initialize(local_input)
  @filename = local_input.filename
  if local_input.pdf?
    @source_pdf = local_input.io_stream
  else
    pdf_image = Extraction::ImageExtractor.attach_image_as_new_file(local_input.io_stream)
    io_buffer = StringIO.new
    pdf_image.save(io_buffer)

    @source_pdf = io_buffer
  end
end

Instance Method Details

#cut_pages(page_indexes) ⇒ StreamIO

Creates a new Pdf from the given pages and saves it into a buffer.

Parameters:

  • page_indexes (Array<Integer>)

    List of page numbers to use for merging in the original Pdf.

Returns:

  • (StreamIO)

    The buffer containing the new Pdf.



33
34
35
36
37
38
39
# File 'lib/mindee/extraction/pdf_extractor/pdf_extractor.rb', line 33

# Creates a new Pdf from the requested pages of the source document.
#
# Delegates to Mindee::PDF::PdfProcessor.parse, passing the source Pdf and an
# options hash whose only key is +:page_indexes+.
#
# @param page_indexes [Array<Integer>] List of page numbers to keep from the original Pdf.
# @return [StreamIO] The buffer containing the new Pdf (the value returned by PdfProcessor.parse).
def cut_pages(page_indexes)
  # Explicit braces keep the options a positional hash, exactly as before.
  Mindee::PDF::PdfProcessor.parse(@source_pdf, { page_indexes: page_indexes })
end

#extract_invoices(page_indexes, strict: false) ⇒ Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>

Extracts invoices as complete PDFs from the document.

Parameters:

  • page_indexes (Array<Array<Integer>, InvoiceSplitterV1PageGroup>)
  • strict (Boolean) (defaults to: false)

Returns:



73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/mindee/extraction/pdf_extractor/pdf_extractor.rb', line 73

# Extracts invoices as complete PDFs from the document.
#
# Three input shapes are handled:
# * plain lists of page indexes are forwarded to #extract_sub_documents untouched;
# * page groups with +strict+ disabled are forwarded as their +page_indexes+ lists;
# * page groups with +strict+ enabled are regrouped using each group's confidence
#   (threshold 0.5) before extraction.
#
# In strict mode a confident group always starts a new sub-document. A
# low-confidence group in last position is appended to the sub-document still
# being built; a low-confidence group anywhere else is emitted on its own.
# NOTE(review): merging only the *last* low-confidence group mirrors the previous
# behaviour; confirm whether mid-document low-confidence groups should merge too.
#
# The caller's page groups and their arrays are never mutated.
#
# @param page_indexes [Array<Array<Integer>>, Array<InvoiceSplitterV1PageGroup>]
#   Page index lists, or page groups exposing +page_indexes+ and +confidence+.
# @param strict [Boolean] Whether to regroup pages according to confidence.
# @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>]
# @raise [RuntimeError] if +page_indexes+ is empty.
def extract_invoices(page_indexes, strict: false)
  raise 'No indexes provided.' if page_indexes.empty?
  unless page_indexes[0].is_a?(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1PageGroup)
    return extract_sub_documents(page_indexes)
  end
  return extract_sub_documents(page_indexes.map(&:page_indexes)) unless strict

  correct_page_indexes = []
  pending = nil # sub-document currently being built; nil when nothing is pending
  last_position = page_indexes.length - 1
  page_indexes.each_with_index do |page_group, position|
    pages = page_group.page_indexes

    if page_group.confidence >= 0.5
      # A confident split closes whatever was pending and opens a new sub-document.
      correct_page_indexes << pending if pending
      pending = pages
    elsif position == last_position
      # Array#+ builds a new array, so the caller's lists are left untouched.
      pending = (pending || []) + pages
    else
      # Flush first so the pending list is emitted once and never as an empty list.
      correct_page_indexes << pending if pending
      correct_page_indexes << pages
      pending = nil
    end
  end
  # Without this final flush a trailing (or single) confident group would be lost.
  correct_page_indexes << pending if pending
  extract_sub_documents(correct_page_indexes)
end

#extract_sub_documents(page_indexes) ⇒ Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>

Extract the sub-documents from the main pdf, based on the given list of page indexes.

Parameters:

  • page_indexes (Array<Array<Integer>>)

    List of page numbers to use for merging in the original Pdf.

Returns:



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/mindee/extraction/pdf_extractor/pdf_extractor.rb', line 44

# Extract the sub-documents from the main pdf, based on the given list of page indexes.
#
# Each list of indexes becomes one extracted Pdf, named after the source file:
# "<basename>_<first>-<last><extension>", where first/last are the 1-based,
# zero-padded page numbers of the first and last entries of the list.
#
# @param page_indexes [Array<Array<Integer>>] Lists of 0-based page indexes, one list per sub-document.
# @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>]
# @raise [RuntimeError] if a list is nil or empty, or if an index is negative or
#   not strictly below the page count.
def extract_sub_documents(page_indexes)
  extension = File.extname(@filename)
  basename = File.basename(@filename, extension)
  total_pages = nil
  page_indexes.map do |page_index_list|
    # nil must be tested first: calling empty? on nil raises NoMethodError.
    if page_index_list.nil? || page_index_list.empty?
      raise "Empty indexes aren't allowed for extraction #{page_index_list}"
    end

    # page_count re-opens the Pdf, so read it once, and only when there is work to do.
    total_pages ||= page_count
    page_index_list.each do |page_index|
      # Indexes are 0-based, so an index equal to the page count is already out of range.
      raise "Index #{page_index} is out of range." if page_index >= total_pages || page_index.negative?
    end

    first_page = format('%03d', page_index_list.first + 1)
    last_page = format('%03d', page_index_list.last + 1)
    field_filename = "#{basename}_#{first_page}-#{last_page}#{extension}"
    Mindee::Extraction::PdfExtractor::ExtractedPdf.new(cut_pages(page_index_list), field_filename)
  end
end

#page_count ⇒ Integer

Retrieves the page count for the Pdf object.

Returns:

  • (Integer)


26
27
28
# File 'lib/mindee/extraction/pdf_extractor/pdf_extractor.rb', line 26

# Retrieves the page count for the Pdf object.
#
# Opens the source Pdf through Mindee::PDF::PdfProcessor.open_pdf on every call
# and returns the size of its +pages+ collection.
#
# @return [Integer]
def page_count
  opened_pdf = Mindee::PDF::PdfProcessor.open_pdf(@source_pdf)
  opened_pdf.pages.size
end