Module: Mindee::PDF::PDFProcessor

Defined in:
lib/mindee/pdf/pdf_processor.rb

Overview

PDF document processing

Class Method Summary

Class Method Details

.get_page(pdf_doc, page_id) ⇒ StringIO

Retrieves a PDF document’s page.

Parameters:

  • pdf_doc (Origami::PDF)

    Origami PDF handle.

  • page_id (Integer)

    Page ID.

Returns:

  • (StringIO)


74
75
76
77
78
79
80
81
82
83
# File 'lib/mindee/pdf/pdf_processor.rb', line 74

# Extracts one page of a PDF document by round-tripping it through {parse}.
#
# The document is first serialized into an in-memory stream, which is then handed to
# {parse} together with page options that reference a single page index.
#
# @param pdf_doc [Origami::PDF] Origami PDF handle.
# @param page_id [Integer] Page ID; it is decremented by one to obtain the page index.
# @return [StringIO]
def self.get_page(pdf_doc, page_id)
  buffer = StringIO.new
  pdf_doc.save(buffer)

  # The page index handed to PageOptions is one less than the page ID.
  target_index = page_id - 1
  single_page_options = PageOptions.new(params: { page_indexes: [target_index] })

  parse(buffer, single_page_options)
end

.indexes_from_keep(page_indexes, all_pages) ⇒ Object

Parameters:

  • page_indexes (Array)
  • all_pages (Array)


36
37
38
39
40
41
42
43
44
45
46
# File 'lib/mindee/pdf/pdf_processor.rb', line 36

# Computes which pages have to be deleted so that only the requested ones remain.
#
# Negative indexes count from the end of +all_pages+ (-1 is the last entry, -2 the one
# before it, and so on). Indexes that do not resolve to an entry are ignored.
#
# @param page_indexes [Array<Integer>] Indexes of the pages to keep.
# @param all_pages [Array] Every page of the document, in order.
# @return [Set] Entries of +all_pages+ that were not selected by +page_indexes+.
def self.indexes_from_keep(page_indexes, all_pages)
  # Array#[] resolves negative indexes from the end and yields nil when out of range,
  # so no manual index arithmetic is needed.
  pages_to_keep = page_indexes.map { |idx| all_pages[idx] }.compact.to_set
  all_pages.to_set - pages_to_keep
end

.indexes_from_remove(page_indexes, all_pages) ⇒ Object

Parameters:

  • page_indexes (Array[Integer])
  • all_pages (Array)


50
51
52
53
54
55
56
57
58
59
# File 'lib/mindee/pdf/pdf_processor.rb', line 50

# Resolves the requested indexes into the set of pages that have to be deleted.
#
# Negative indexes count from the end of +all_pages+ (-1 is the last entry, -2 the one
# before it, and so on). Indexes that do not resolve to an entry are ignored, and
# duplicates collapse into a single entry.
#
# @param page_indexes [Array<Integer>] Indexes of the pages to remove.
# @param all_pages [Array] Every page of the document, in order.
# @return [Set] Entries of +all_pages+ selected by +page_indexes+.
def self.indexes_from_remove(page_indexes, all_pages)
  # Return the resolved set itself: ending the method on `each` would hand back the raw,
  # unvalidated +page_indexes+ instead.
  # Array#[] resolves negative indexes from the end and yields nil when out of range.
  page_indexes.map { |idx| all_pages[idx] }.compact.to_set
end

.open_pdf(io_stream) ⇒ Origami::PDF

Parameters:

  • io_stream (StringIO)

Returns:

  • (Origami::PDF)


63
64
65
66
67
# File 'lib/mindee/pdf/pdf_processor.rb', line 63

# Parses a stream into an Origami PDF object using a quiet linear parser.
#
# The stream is repositioned to offset 0 before being parsed.
#
# @param io_stream [StringIO]
# @return [Origami::PDF]
def self.open_pdf(io_stream)
  parser_settings = { verbosity: Origami::Parser::VERBOSE_QUIET }
  quiet_parser = Origami::PDF::LinearParser.new(parser_settings)

  io_stream.seek(0)
  quiet_parser.parse(io_stream)
end

.parse(io_stream, options) ⇒ StringIO

Parameters:

  • io_stream (StringIO)
  • options (PageOptions)

Returns:

  • (StringIO)


15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/mindee/pdf/pdf_processor.rb', line 15

# Opens a PDF from a stream, removes pages according to +options+ and returns the result.
#
# When +options.on_min_pages+ is greater than the number of pages in the document, no
# page is removed. Otherwise the pages to delete are derived from +options.page_indexes+:
# with +:KEEP_ONLY+ every page not listed is deleted, with +:REMOVE+ the listed pages are.
# If the pages to delete cover the whole document, the deletion is skipped.
#
# @param io_stream [StringIO] Stream holding the PDF; it is handed to {open_pdf}.
# @param options [#on_min_pages, #operation, #page_indexes] Page options.
# @return [StringIO]
# @raise [ArgumentError] if +options.operation+ is neither +:KEEP_ONLY+ nor +:REMOVE+.
def self.parse(io_stream, options)
  current_pdf = open_pdf(io_stream)
  pages_count = current_pdf.pages.size
  return current_pdf.to_io_stream if options.on_min_pages.to_i > pages_count

  all_pages = (0...pages_count).to_a
  operation = options.operation

  pages_to_remove =
    case operation
    when :KEEP_ONLY then indexes_from_keep(options.page_indexes, all_pages)
    when :REMOVE then indexes_from_remove(options.page_indexes, all_pages)
    else
      raise ArgumentError, "operation must be one of :KEEP_ONLY or :REMOVE, sent '#{operation}'"
    end

  # Compare as sets so the "would delete every page" guard does not depend on the order
  # (or repetition) in which the indexes were supplied.
  current_pdf.delete_pages_at(pages_to_remove) if pages_to_remove.to_set != all_pages.to_set
  current_pdf.to_io_stream
end