# Copyright (C) 2021-2025, Mindee.

# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

import glob
import os
import shutil
from typing import Any

import numpy as np
from PIL import Image
from scipy import io as sio
from tqdm import tqdm

from .datasets import VisionDataset
from .utils import convert_target_to_relative, crop_bboxes_from_image

__all__ = ["SynthText"]
class SynthText(VisionDataset):
    """SynthText dataset from `"Synthetic Data for Text Localisation in Natural Images"
    <https://arxiv.org/abs/1604.06646>`_ | `"repository" <https://github.com/ankush-me/SynthText>`_ |
    `"website" <https://www.robots.ox.ac.uk/~vgg/data/scenetext/>`_.

    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/svt-grid.png&src=0
        :align: center

    >>> from doctr.datasets import SynthText
    >>> train_set = SynthText(train=True, download=True)
    >>> img, target = train_set[0]

    The first 90% of the entries of ``gt.mat`` form the training subset, the remaining ones the test subset.
    For the recognition task, word crops and their labels are written once to a dedicated folder next to the
    extracted data and read back from that folder on later instantiations.

    Args:
        train: whether the subset should be the training one
        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
        recognition_task: whether the dataset should be used for recognition task
        detection_task: whether the dataset should be used for detection task
        **kwargs: keyword arguments from `VisionDataset`.

    Raises:
        ValueError: if both `recognition_task` and `detection_task` are True.
        FileNotFoundError: if an image referenced by the ground truth cannot be found on disk.
    """

    URL = "https://thor.robots.ox.ac.uk/~vgg/data/scenetext/SynthText.zip"
    SHA256 = "28ab030485ec8df3ed612c568dd71fb2793b9afbfa3a9d9c6e792aef33265bf1"

    # filter corrupted or missing images
    # NOTE(review): this tuple is not referenced anywhere in this class; confirm whether the
    # loading loop is expected to skip these prefixes.
    BLACKLIST = (
        "67/fruits_129_",
        "194/window_19_",
    )

    def __init__(
        self,
        train: bool = True,
        use_polygons: bool = False,
        recognition_task: bool = False,
        detection_task: bool = False,
        **kwargs: Any,
    ) -> None:
        super().__init__(
            self.URL,
            None,
            file_hash=None,
            extract_archive=True,
            pre_transforms=convert_target_to_relative if not recognition_task else None,
            **kwargs,
        )
        if recognition_task and detection_task:
            raise ValueError(
                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
                + "To get the whole dataset with boxes and labels leave both parameters to False."
            )

        self.train = train
        self.data: list[tuple[str | np.ndarray, str | dict[str, Any] | np.ndarray]] = []

        # Folder holding `gt.mat` and the images
        tmp_root = os.path.join(self.root, "SynthText") if self.SHA256 else self.root

        # define folder to write SynthText recognition dataset
        reco_folder_name = "SynthText_recognition_train" if self.train else "SynthText_recognition_test"
        if use_polygons:
            reco_folder_name = "Poly_" + reco_folder_name
        reco_folder_path = os.path.join(tmp_root, reco_folder_name)

        if recognition_task:
            if os.path.isdir(reco_folder_path):
                # Crops were already generated by a previous instantiation: reuse them
                self._read_from_folder(reco_folder_path)
                return
            os.makedirs(reco_folder_path, exist_ok=False)

        try:
            self._load_from_ground_truth(
                tmp_root, reco_folder_path, use_polygons, recognition_task, detection_task
            )
        except BaseException:
            # The mere existence of the recognition folder is what marks the crops as ready
            # (see the `isdir` check above). Remove the folder created by this call when the
            # preparation fails or is interrupted, so that a partial (or empty) set of crops
            # is never picked up as the full dataset by a later instantiation.
            if recognition_task:
                shutil.rmtree(reco_folder_path, ignore_errors=True)
            raise

        if recognition_task:
            self._read_from_folder(reco_folder_path)

        self.root = tmp_root

    def _load_from_ground_truth(
        self,
        tmp_root: str,
        reco_folder_path: str,
        use_polygons: bool,
        recognition_task: bool,
        detection_task: bool,
    ) -> None:
        """Parse ``gt.mat`` and either fill ``self.data`` or write recognition crops to disk.

        Args:
            tmp_root: folder containing ``gt.mat`` and the images it references
            reco_folder_path: existing folder where crops and labels are written (recognition task only)
            use_polygons: keep the 4 corner points of each word instead of straight boxes
            recognition_task: write word crops (``<idx>.png``) and labels (``<idx>.txt``) to `reco_folder_path`
            detection_task: store only the boxes as target (ignored when `recognition_task` is True)

        Raises:
            FileNotFoundError: if an image referenced by the ground truth cannot be found on disk.
        """
        np_dtype = np.float32

        # Load mat data
        mat_data = sio.loadmat(os.path.join(tmp_root, "gt.mat"))
        train_samples = int(len(mat_data["imnames"][0]) * 0.9)
        set_slice = slice(train_samples) if self.train else slice(train_samples, None)
        paths = mat_data["imnames"][0][set_slice]
        boxes = mat_data["wordBB"][0][set_slice]
        texts = mat_data["txt"][0][set_slice]
        # Only the sliced arrays are needed from here on
        del mat_data

        reco_images_counter = 0
        for img_path, word_boxes, txt in tqdm(
            iterable=zip(paths, boxes, texts), desc="Preparing and Loading SynthText", total=len(paths)
        ):
            img_file = os.path.join(tmp_root, img_path[0])
            # File existence check
            if not os.path.exists(img_file):
                raise FileNotFoundError(f"unable to locate {img_file}")

            # Each entry of `txt` may hold several whitespace-separated words: flatten to one label per word
            words = [elt for word in txt.tolist() for elt in word.split()]

            # (x, y) coordinates of top left, top right, bottom right, bottom left corners
            # A 2D array (single word) gets a leading axis so both cases share the same layout
            word_boxes = (
                word_boxes.transpose(2, 1, 0)
                if word_boxes.ndim == 3
                else np.expand_dims(word_boxes.transpose(1, 0), axis=0)
            )

            if not use_polygons:
                # xmin, ymin, xmax, ymax
                word_boxes = np.concatenate((word_boxes.min(axis=1), word_boxes.max(axis=1)), axis=1)

            if recognition_task:
                crops = crop_bboxes_from_image(img_path=img_file, geoms=word_boxes)
                for crop, label in zip(crops, words):
                    # Skip empty crops as well as empty, overly long or space-containing labels
                    if crop.shape[0] > 0 and crop.shape[1] > 0 and 0 < len(label) < 30 and " " not in label:
                        # write data to disk
                        # Explicit encoding so that the label files do not depend on the platform locale
                        label_file = os.path.join(reco_folder_path, f"{reco_images_counter}.txt")
                        with open(label_file, "w", encoding="utf-8") as f:
                            f.write(label)
                        tmp_img = Image.fromarray(crop)
                        tmp_img.save(os.path.join(reco_folder_path, f"{reco_images_counter}.png"))
                        reco_images_counter += 1
            elif detection_task:
                self.data.append((img_path[0], np.asarray(word_boxes, dtype=np_dtype)))
            else:
                self.data.append((
                    img_path[0],
                    dict(boxes=np.asarray(word_boxes, dtype=np_dtype), labels=words),
                ))

    def extra_repr(self) -> str:
        """Return the extra information displayed in the representation of the dataset."""
        return f"train={self.train}"

    def _read_from_folder(self, path: str) -> None:
        """Append one ``(image path, label)`` sample to ``self.data`` per ``.png`` file found in `path`.

        The label of ``<name>.png`` is the full content of ``<name>.txt`` located in the same folder.

        Args:
            path: folder containing the crops and their label files
        """
        # `glob` yields paths in arbitrary order: sort them so that sample indices are reproducible
        img_paths = sorted(glob.glob(os.path.join(path, "*.png")))
        for img_path in tqdm(iterable=img_paths, desc="Preparing and Loading SynthText", total=len(img_paths)):
            stem = os.path.splitext(os.path.basename(img_path))[0]
            with open(os.path.join(path, f"{stem}.txt"), encoding="utf-8") as f:
                self.data.append((img_path, f.read()))