Source code for doctr.models.zoo

# Copyright (C) 2021-2023, Mindee.

# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

from typing import Any

from .detection.zoo import detection_predictor
from .kie_predictor import KIEPredictor
from .predictor import OCRPredictor
from .recognition.zoo import recognition_predictor

__all__ = ["ocr_predictor", "kie_predictor"]


def _predictor(
    det_arch: Any,
    reco_arch: Any,
    pretrained: bool,
    pretrained_backbone: bool = True,
    assume_straight_pages: bool = True,
    preserve_aspect_ratio: bool = True,
    symmetric_pad: bool = True,
    det_bs: int = 2,
    reco_bs: int = 128,
    detect_orientation: bool = False,
    detect_language: bool = False,
    **kwargs,
) -> OCRPredictor:
    """Assemble an `OCRPredictor` from a detection and a recognition predictor.

    Args:
        det_arch: forwarded as first argument to `detection_predictor`
        reco_arch: forwarded as first argument to `recognition_predictor`
        pretrained: passed to both the detection and the recognition builders
        pretrained_backbone: passed to both the detection and the recognition builders
        assume_straight_pages: passed to the detection builder and to `OCRPredictor`
        preserve_aspect_ratio: passed to the detection builder and to `OCRPredictor`
        symmetric_pad: passed to the detection builder and to `OCRPredictor`
        det_bs: batch size handed to the detection builder
        reco_bs: batch size handed to the recognition builder
        detect_orientation: passed to `OCRPredictor`
        detect_language: passed to `OCRPredictor`
        kwargs: any remaining keyword arguments, forwarded to `OCRPredictor`

    Returns:
        the assembled `OCRPredictor`
    """
    # Weight-loading flags are identical for both sub-models.
    weight_opts = {
        "pretrained": pretrained,
        "pretrained_backbone": pretrained_backbone,
    }
    # Page-handling flags are needed by the detection stage and again by the end-to-end wrapper.
    page_opts = {
        "assume_straight_pages": assume_straight_pages,
        "preserve_aspect_ratio": preserve_aspect_ratio,
        "symmetric_pad": symmetric_pad,
    }

    text_detector = detection_predictor(det_arch, **weight_opts, batch_size=det_bs, **page_opts)
    text_recognizer = recognition_predictor(reco_arch, **weight_opts, batch_size=reco_bs)

    return OCRPredictor(
        text_detector,
        text_recognizer,
        **page_opts,
        detect_orientation=detect_orientation,
        detect_language=detect_language,
        **kwargs,
    )


def ocr_predictor(
    det_arch: Any = "db_resnet50",
    reco_arch: Any = "crnn_vgg16_bn",
    pretrained: bool = False,
    pretrained_backbone: bool = True,
    assume_straight_pages: bool = True,
    preserve_aspect_ratio: bool = True,
    symmetric_pad: bool = True,
    export_as_straight_boxes: bool = False,
    detect_orientation: bool = False,
    detect_language: bool = False,
    **kwargs: Any,
) -> OCRPredictor:
    """End-to-end OCR architecture using one model for localization, and another for text recognition.

    >>> import numpy as np
    >>> from doctr.models import ocr_predictor
    >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
    >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
    >>> out = model([input_page])

    Args:
        det_arch: name of the detection architecture or the model itself to use
            (e.g. 'db_resnet50', 'db_mobilenet_v3_large')
        reco_arch: name of the recognition architecture or the model itself to use
            (e.g. 'crnn_vgg16_bn', 'sar_resnet31')
        pretrained: If True, returns a model pre-trained on our OCR dataset
        pretrained_backbone: If True, returns a model with a pretrained backbone
        assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages
            without rotated textual elements.
        preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before
            running the detection model on it.
        symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right.
        export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions
            (potentially rotated) as straight bounding boxes.
        detect_orientation: if True, the estimated general page orientation will be added to the predictions
            for each page. Doing so will slightly deteriorate the overall latency.
        detect_language: if True, the language prediction will be added to the predictions for each page.
            Doing so will slightly deteriorate the overall latency.
        kwargs: extra keyword arguments. `det_bs` and `reco_bs` (detection / recognition batch sizes,
            defaulting to 2 and 128) are consumed by `_predictor`; everything else is forwarded to
            `OCRPredictor`.

    Returns:
        OCR predictor
    """
    # `export_as_straight_boxes` is not a named parameter of `_predictor`: it travels through its
    # **kwargs and ends up in the `OCRPredictor` constructor.
    return _predictor(
        det_arch,
        reco_arch,
        pretrained,
        pretrained_backbone=pretrained_backbone,
        assume_straight_pages=assume_straight_pages,
        preserve_aspect_ratio=preserve_aspect_ratio,
        symmetric_pad=symmetric_pad,
        export_as_straight_boxes=export_as_straight_boxes,
        detect_orientation=detect_orientation,
        detect_language=detect_language,
        **kwargs,
    )


def _kie_predictor(
    det_arch: Any,
    reco_arch: Any,
    pretrained: bool,
    pretrained_backbone: bool = True,
    assume_straight_pages: bool = True,
    preserve_aspect_ratio: bool = True,
    symmetric_pad: bool = True,
    det_bs: int = 2,
    reco_bs: int = 128,
    detect_orientation: bool = False,
    detect_language: bool = False,
    **kwargs,
) -> KIEPredictor:
    """Assemble a `KIEPredictor` from a detection and a recognition predictor.

    Args:
        det_arch: forwarded as first argument to `detection_predictor`
        reco_arch: forwarded as first argument to `recognition_predictor`
        pretrained: passed to both the detection and the recognition builders
        pretrained_backbone: passed to both the detection and the recognition builders
        assume_straight_pages: passed to the detection builder and to `KIEPredictor`
        preserve_aspect_ratio: passed to the detection builder and to `KIEPredictor`
        symmetric_pad: passed to the detection builder and to `KIEPredictor`
        det_bs: batch size handed to the detection builder
        reco_bs: batch size handed to the recognition builder
        detect_orientation: passed to `KIEPredictor`
        detect_language: passed to `KIEPredictor`
        kwargs: any remaining keyword arguments, forwarded to `KIEPredictor`

    Returns:
        the assembled `KIEPredictor`
    """
    # Weight-loading flags are identical for both sub-models.
    weight_opts = {
        "pretrained": pretrained,
        "pretrained_backbone": pretrained_backbone,
    }
    # Page-handling flags are needed by the detection stage and again by the end-to-end wrapper.
    page_opts = {
        "assume_straight_pages": assume_straight_pages,
        "preserve_aspect_ratio": preserve_aspect_ratio,
        "symmetric_pad": symmetric_pad,
    }

    text_detector = detection_predictor(det_arch, **weight_opts, batch_size=det_bs, **page_opts)
    text_recognizer = recognition_predictor(reco_arch, **weight_opts, batch_size=reco_bs)

    return KIEPredictor(
        text_detector,
        text_recognizer,
        **page_opts,
        detect_orientation=detect_orientation,
        detect_language=detect_language,
        **kwargs,
    )


def kie_predictor(
    det_arch: Any = "db_resnet50",
    reco_arch: Any = "crnn_vgg16_bn",
    pretrained: bool = False,
    pretrained_backbone: bool = True,
    assume_straight_pages: bool = True,
    preserve_aspect_ratio: bool = True,
    symmetric_pad: bool = True,
    export_as_straight_boxes: bool = False,
    detect_orientation: bool = False,
    detect_language: bool = False,
    **kwargs: Any,
) -> KIEPredictor:
    """End-to-end KIE architecture using one model for localization, and another for text recognition.

    >>> import numpy as np
    >>> from doctr.models import kie_predictor
    >>> model = kie_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
    >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
    >>> out = model([input_page])

    Args:
        det_arch: name of the detection architecture or the model itself to use
            (e.g. 'db_resnet50', 'db_mobilenet_v3_large')
        reco_arch: name of the recognition architecture or the model itself to use
            (e.g. 'crnn_vgg16_bn', 'sar_resnet31')
        pretrained: If True, returns a model pre-trained on our OCR dataset
        pretrained_backbone: If True, returns a model with a pretrained backbone
        assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages
            without rotated textual elements.
        preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before
            running the detection model on it.
        symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right.
        export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions
            (potentially rotated) as straight bounding boxes.
        detect_orientation: if True, the estimated general page orientation will be added to the predictions
            for each page. Doing so will slightly deteriorate the overall latency.
        detect_language: if True, the language prediction will be added to the predictions for each page.
            Doing so will slightly deteriorate the overall latency.
        kwargs: extra keyword arguments. `det_bs` and `reco_bs` (detection / recognition batch sizes,
            defaulting to 2 and 128) are consumed by `_kie_predictor`; everything else is forwarded to
            `KIEPredictor`.

    Returns:
        KIE predictor
    """
    # `export_as_straight_boxes` is not a named parameter of `_kie_predictor`: it travels through its
    # **kwargs and ends up in the `KIEPredictor` constructor.
    return _kie_predictor(
        det_arch,
        reco_arch,
        pretrained,
        pretrained_backbone=pretrained_backbone,
        assume_straight_pages=assume_straight_pages,
        preserve_aspect_ratio=preserve_aspect_ratio,
        symmetric_pad=symmetric_pad,
        export_as_straight_boxes=export_as_straight_boxes,
        detect_orientation=detect_orientation,
        detect_language=detect_language,
        **kwargs,
    )