Source code for doctr.datasets.svhn

# Copyright (C) 2021-2025, Mindee.

# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

import os
from typing import Any

import h5py
import numpy as np
from tqdm import tqdm

from .datasets import VisionDataset
from .utils import convert_target_to_relative, crop_bboxes_from_image

__all__ = ["SVHN"]


[docs] class SVHN(VisionDataset): """SVHN dataset from `"The Street View House Numbers (SVHN) Dataset" <http://ufldl.stanford.edu/housenumbers/>`_. .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/svhn-grid.png&src=0 :align: center >>> from doctr.datasets import SVHN >>> train_set = SVHN(train=True, download=True) >>> img, target = train_set[0] Args: train: whether the subset should be the training one use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) recognition_task: whether the dataset should be used for recognition task detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ TRAIN = ( "http://ufldl.stanford.edu/housenumbers/train.tar.gz", "4b17bb33b6cd8f963493168f80143da956f28ec406cc12f8e5745a9f91a51898", "svhn_train.tar", ) TEST = ( "http://ufldl.stanford.edu/housenumbers/test.tar.gz", "57ac9ceb530e4aa85b55d991be8fc49c695b3d71c6f6a88afea86549efde7fb5", "svhn_test.tar", ) def __init__( self, train: bool = True, use_polygons: bool = False, recognition_task: bool = False, detection_task: bool = False, **kwargs: Any, ) -> None: url, sha256, name = self.TRAIN if train else self.TEST super().__init__( url, file_name=name, file_hash=sha256, extract_archive=True, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs, ) if recognition_task and detection_task: raise ValueError( "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + "To get the whole dataset with boxes and labels leave both parameters to False." ) self.train = train self.data: list[tuple[str | np.ndarray, str | dict[str, Any] | np.ndarray]] = [] np_dtype = np.float32 tmp_root = os.path.join(self.root, "train" if train else "test") # Load mat data (matlab v7.3 - can not be loaded with scipy) with h5py.File(os.path.join(tmp_root, "digitStruct.mat"), "r") as f: img_refs = f["digitStruct/name"] box_refs = f["digitStruct/bbox"] for img_ref, box_ref in tqdm( iterable=zip(img_refs, box_refs), desc="Preparing and Loading SVHN", total=len(img_refs) ): # convert ascii matrix to string img_name = "".join(map(chr, f[img_ref[0]][()].flatten())) # File existence check if not os.path.exists(os.path.join(tmp_root, img_name)): raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_name)}") # Unpack the information box = f[box_ref[0]] if box["left"].shape[0] == 1: box_dict = {k: [int(vals[0][0])] for k, vals in box.items()} else: box_dict = {k: [int(f[v[0]][()].item()) for v in vals] for k, vals in box.items()} # Convert it to the right format coords: np.ndarray = np.array( [box_dict["left"], box_dict["top"], box_dict["width"], box_dict["height"]], dtype=np_dtype ).transpose() label_targets = list(map(str, box_dict["label"])) if use_polygons: # (x, y) coordinates of top left, top right, bottom right, bottom left corners box_targets: np.ndarray = np.stack( [ np.stack([coords[:, 0], coords[:, 1]], axis=-1), np.stack([coords[:, 0] + coords[:, 2], coords[:, 1]], axis=-1), np.stack([coords[:, 0] + coords[:, 2], coords[:, 1] + coords[:, 3]], axis=-1), np.stack([coords[:, 0], coords[:, 1] + coords[:, 3]], axis=-1), ], axis=1, ) else: # x, y, width, height -> xmin, ymin, xmax, ymax box_targets = np.stack( [ coords[:, 0], coords[:, 1], coords[:, 0] + coords[:, 2], coords[:, 1] + coords[:, 3], ], axis=-1, ) if recognition_task: crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_name), geoms=box_targets) for crop, label in zip(crops, label_targets): if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0 and " " not in label: self.data.append((crop, label)) elif detection_task: self.data.append((img_name, box_targets)) else: self.data.append((img_name, dict(boxes=box_targets, labels=label_targets))) self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"