Module: Mindee::Extraction::ImageExtractor

Defined in:
lib/mindee/extraction/common/image_extractor.rb

Overview

Image extraction wrapper module.

Class Method Summary collapse

Class Method Details

.attach_image_as_new_file(input_buffer, format: 'jpg') ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/mindee/extraction/common/image_extractor.rb', line 15

# Converts an image into a PDF document (the image becomes a page of a new PDF).
#
# @param input_buffer [StringIO] Input buffer holding the image. Documented as supporting JPEG only.
# @param format [String] Format the image is converted to before the PDF conversion.
# @return [Origami::PDF] The document Origami parses from the converted image.
def self.attach_image_as_new_file(input_buffer, format: 'jpg')
  image = MiniMagick::Image.read(input_buffer)
  # NOTE: without this conversion, some jpeg images get rendered as three different versions of themselves
  # per output.
  image.format(format)

  # The density is reduced for the PDF conversion, otherwise the resulting image shrinks.
  # NOTE(review): 4.166666 is close to 300 / 72 -- the origin of the constant is undocumented, confirm.
  horizontal_density = image.resolution.first
  pdf_density = horizontal_density.to_f / 4.166666
  image.format('pdf', 0, { density: pdf_density.to_s })

  pdf_bytes = image.to_blob
  Origami::PDF.read(StringIO.new(pdf_bytes))
end

.create_extracted_image(buffer, file_name, page_id, element_id) ⇒ Object

Generates an ExtractedImage.

Parameters:

  • buffer (StringIO)

    Buffer containing the image.

  • file_name (String)

    Name for the file.

  • page_id (Object)

    ID of the page the file was generated from.

  • element_id (Object)

    ID of the element of a given page.



153
154
155
156
157
158
159
160
# File 'lib/mindee/extraction/common/image_extractor.rb', line 153

# Generates an ExtractedImage from the full content of a buffer.
#
# @param buffer [StringIO] Buffer containing the image. It is rewound before being read.
# @param file_name [String] Name for the file.
# @param page_id [Object] ID of the page the file was generated from.
# @param element_id [Object] ID of the element of a given page.
# @return [ExtractedImage] Image wrapping a bytes input source built from the buffer content.
def self.create_extracted_image(buffer, file_name, page_id, element_id)
  buffer.rewind
  image_bytes = buffer.read
  bytes_source = Mindee::Input::Source::BytesInputSource.new(image_bytes, file_name)

  ExtractedImage.new(bytes_source, page_id, element_id)
end

.crop_image(image, min_max_x, min_max_y) ⇒ Object

Crops a MiniMagick Image to the given bounding box.

Parameters:



115
116
117
118
119
120
121
122
123
124
125
# File 'lib/mindee/extraction/common/image_extractor.rb', line 115

# Crops a MiniMagick image to a bounding box.
#
# The image is converted to 'jpg' first, then cropped with a `WxH+X+Y` geometry obtained by
# multiplying the bounds by the image dimensions.
#
# @param image [MiniMagick::Image] Image to crop; must answer `[:width]`, `[:height]`, `format` and `crop`.
# @param min_max_x [#min, #max] Horizontal bounds. They are scaled by the image width, so they are
#   presumably relative (0..1) coordinates -- TODO confirm against the caller.
# @param min_max_y [#min, #max] Vertical bounds, scaled by the image height in the same way.
# @return [Object] The same image object that was passed in.
def self.crop_image(image, min_max_x, min_max_y)
  page_width = image[:width].to_i
  page_height = image[:height].to_i
  image.format('jpg')

  crop_width = (min_max_x.max - min_max_x.min) * page_width
  crop_height = (min_max_y.max - min_max_y.min) * page_height
  x_offset = min_max_x.min * page_width
  y_offset = min_max_y.min * page_height

  image.crop("#{crop_width}x#{crop_height}+#{x_offset}+#{y_offset}")
  image
end

.determine_file_extension(input_source) ⇒ String

Retrieves the file extension from the main file to apply it to the extracted images. Note: coerces pdf as jpg.

Parameters:

Returns:

  • (String)

    A valid file extension.



139
140
141
142
143
144
145
# File 'lib/mindee/extraction/common/image_extractor.rb', line 139

# Retrieves the file extension from the main file to apply it to the extracted images.
#
# PDF inputs (detected through `pdf?` or a file name ending in 'pdf') are coerced to 'jpg'.
# A file name without any extension also yields 'jpg' instead of `nil`, so that the result
# can always be used as an image format and as a file name suffix.
#
# @param input_source [#pdf?, #filename] Source the images are extracted from.
# @return [String] A lower-cased file extension, without the leading dot.
def self.determine_file_extension(input_source)
  filename = input_source.filename
  return 'jpg' if input_source.pdf? || filename.downcase.end_with?('pdf')

  extension = File.extname(filename).strip.downcase.delete_prefix('.')
  # File.extname returns '' for names such as 'scan' or '.hidden'.
  extension.empty? ? 'jpg' : extension
end

.extract_images_from_polygons(input_source, pdf_stream, page_id, polygons) ⇒ Array<Mindee::Extraction::ExtractedImage>

Extracts images from their positions on a file (as polygons).

Parameters:

Returns:



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/mindee/extraction/common/image_extractor.rb', line 53

# Extracts images from their positions on a file (as polygons).
#
# For every polygon, the page is loaded, cropped to the polygon's min/max x and y bounds,
# converted to the output format and wrapped into an extracted image.
#
# @param input_source [#filename] Source the page comes from; its file name prefixes the output names.
# @param pdf_stream [Object] Stream holding the page, handed to `read_page_content`.
# @param page_id [Object] ID of the page, used in the output file names.
# @param polygons [Enumerable] Polygons to extract; the index of each one becomes its element ID.
# @return [Array<Mindee::Extraction::ExtractedImage>] One extracted image per polygon, in order.
def self.extract_images_from_polygons(input_source, pdf_stream, page_id, polygons)
  polygons.each_with_index.map do |polygon, element_id|
    bounding_box = normalize_polygon(polygon)
    # Loaded again for each polygon: crop_image works on (and returns) the object it receives.
    page_image = read_page_content(pdf_stream)

    corners = [
      bounding_box.top_left,
      bounding_box.bottom_right,
      bounding_box.top_right,
      bounding_box.bottom_left,
    ]
    x_bounds = Geometry.get_min_max_x(corners)
    y_bounds = Geometry.get_min_max_y(corners)

    extension = determine_file_extension(input_source)
    crop = crop_image(page_image, x_bounds, y_bounds)
    # A 'pdf' extension is never used as the image format; only the file name keeps it.
    crop.format(extension == 'pdf' ? 'jpg' : extension)

    output = StringIO.new
    write_image_to_buffer(crop, output)
    file_name = "#{input_source.filename}_page#{page_id}-#{element_id}.#{extension}"

    create_extracted_image(output, file_name, page_id, element_id)
  end
end

.extract_multiple_images_from_source(input_source, page_id, polygons) ⇒ Array<Mindee::Extraction::ExtractedImage>

Extracts multiple images from a given local input source.

Each polygon marks a region of the page to extract.

Parameters:

Returns:



39
40
41
42
43
44
# File 'lib/mindee/extraction/common/image_extractor.rb', line 39

# Extracts multiple images from a given local input source.
#
# Loads the requested page, moves the resulting stream back to its start and delegates
# the extraction to `extract_images_from_polygons`.
#
# @param input_source [Object] Local input source to extract from.
# @param page_id [Object] ID of the page to load.
# @param polygons [Enumerable] Polygons to extract from the page.
# @return [Array<Mindee::Extraction::ExtractedImage>] Whatever `extract_images_from_polygons` returns.
def self.extract_multiple_images_from_source(input_source, page_id, polygons)
  page_stream = load_input_source_pdf_page_as_image(input_source, page_id).tap { |stream| stream.seek(0) }

  extract_images_from_polygons(input_source, page_stream, page_id, polygons)
end

.load_input_source_pdf_page_as_image(input_file, page_id) ⇒ MiniMagick::Image

Loads a single page from an image file or a PDF document.

Parameters:

  • input_file (LocalInputSource)

    Local input.

  • page_id (Integer)

    Page ID.

Returns:

  • (MiniMagick::Image)

    The requested page: the result of PdfProcessor.get_page for a PDF input, otherwise the input's own rewound stream.



167
168
169
170
171
172
173
174
# File 'lib/mindee/extraction/common/image_extractor.rb', line 167

# Loads a single page from an image file or a PDF document.
#
# The input stream is rewound first. Non-PDF inputs are returned as that same stream;
# PDF inputs are parsed with Origami and the requested page is fetched through PdfProcessor.
#
# @param input_file [LocalInputSource] Local input.
# @param page_id [Integer] Page ID.
# @return [Object] The input stream itself, or the result of `PdfProcessor.get_page` for a PDF.
def self.load_input_source_pdf_page_as_image(input_file, page_id)
  stream = input_file.io_stream
  stream.rewind
  return stream unless input_file.pdf?

  document = Origami::PDF.read(stream)
  Mindee::PDF::PdfProcessor.get_page(document, page_id)
end

.normalize_polygon(polygon) ⇒ Object

Retrieves the bounding box of a polygon.

Parameters:



93
94
95
96
97
98
99
# File 'lib/mindee/extraction/common/image_extractor.rb', line 93

# Retrieves the bounding box of a polygon.
#
# @param polygon [Mindee::Geometry::Polygon, Object] Polygon to normalize.
# @return [Object] The result of `Mindee::Geometry.get_bounding_box` for a Polygon instance;
#   any other object is returned untouched.
def self.normalize_polygon(polygon)
  return polygon unless polygon.is_a?(Mindee::Geometry::Polygon)

  Mindee::Geometry.get_bounding_box(polygon)
end

.read_page_content(pdf_stream) ⇒ MiniMagick::Image

Loads a buffer into a MiniMagick Image.

Parameters:

  • pdf_stream (StringIO)

    Buffer containing the PDF.

Returns:

  • (MiniMagick::Image)

    a valid MiniMagick image handle.



105
106
107
108
# File 'lib/mindee/extraction/common/image_extractor.rb', line 105

# Loads a buffer into a MiniMagick Image.
#
# @param pdf_stream [StringIO] Buffer containing the PDF. It is rewound before being read.
# @return [MiniMagick::Image] The image returned by `MiniMagick::Image.read`.
def self.read_page_content(pdf_stream)
  MiniMagick::Image.read(pdf_stream.tap(&:rewind))
end

.write_image_to_buffer(image, buffer) ⇒ Object

Writes a MiniMagick::Image to a buffer.

Parameters:

  • image (MiniMagick::Image)

    a valid MiniMagick image.

  • buffer (StringIO)


131
132
133
# File 'lib/mindee/extraction/common/image_extractor.rb', line 131

# Writes a MiniMagick::Image to a buffer.
#
# @param image [MiniMagick::Image] a valid MiniMagick image.
# @param buffer [StringIO] Destination passed as-is to the image's `write` method.
# @return [Object] Whatever the image's `write` call returns.
def self.write_image_to_buffer(image, buffer)
  image.write(buffer)
end