# Copyright (C) 2021-2025, Mindee.

# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

import json
import os
from pathlib import Path
from typing import Any

import numpy as np
from tqdm import tqdm

from .datasets import AbstractDataset
from .utils import convert_target_to_relative, crop_bboxes_from_image

__all__ = ["COCOTEXT"]
class COCOTEXT(AbstractDataset):
    """COCO-Text dataset from `"COCO-Text: Dataset and Benchmark for Text Detection and Recognition in Natural
    Images" <https://arxiv.org/pdf/1601.07140v2>`_ |
    `"homepage" <https://bgshih.github.io/cocotext/>`_.

    >>> # NOTE: You need to download the dataset first.
    >>> from doctr.datasets import COCOTEXT
    >>> train_set = COCOTEXT(train=True, img_folder="/path/to/coco_text/train2014/",
    >>>                      label_path="/path/to/coco_text/cocotext.v2.json")
    >>> img, target = train_set[0]
    >>> test_set = COCOTEXT(train=False, img_folder="/path/to/coco_text/train2014/",
    >>>                     label_path = "/path/to/coco_text/cocotext.v2.json")
    >>> img, target = test_set[0]

    Args:
        img_folder: folder with all the images of the dataset
        label_path: path to the annotations file of the dataset
        train: whether the subset should be the training one
        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
        recognition_task: whether the dataset should be used for recognition task
        detection_task: whether the dataset should be used for detection task
        **kwargs: keyword arguments from `AbstractDataset`.
    """

    def __init__(
        self,
        img_folder: str,
        label_path: str,
        train: bool = True,
        use_polygons: bool = False,
        recognition_task: bool = False,
        detection_task: bool = False,
        **kwargs: Any,
    ) -> None:
        # Targets stay absolute for recognition (crops are extracted in pixel
        # coordinates); otherwise they are converted to relative coordinates.
        super().__init__(
            img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
        )
        # Task check: the two single-task modes are mutually exclusive
        if recognition_task and detection_task:
            raise ValueError(
                " 'recognition' and 'detection task' cannot be set to True simultaneously. "
                + " To get the whole dataset with boxes and labels leave both parameters to False "
            )

        # File existence check
        if not os.path.exists(label_path) or not os.path.exists(img_folder):
            raise FileNotFoundError(f"unable to find {label_path if not os.path.exists(label_path) else img_folder}")

        tmp_root = img_folder
        self.train = train
        np_dtype = np.float32
        self.data: list[tuple[str | Path | np.ndarray, str | dict[str, Any] | np.ndarray]] = []

        with open(label_path, "r") as file:
            data = json.load(file)

        # Filter images based on the set (train / val split stored per image)
        img_items = [img for img in data["imgs"].items() if (img[1]["set"] == "train") == train]

        # Group legible annotations by image id once. Scanning the whole
        # "anns" table for every image would be O(images x annotations),
        # which is accidentally quadratic on this dataset's size.
        anns_by_img: dict[int, list[dict[str, Any]]] = {}
        for ann in data["anns"].values():
            if ann["legibility"] == "legible":
                anns_by_img.setdefault(ann["image_id"], []).append(ann)

        box: list[float] | np.ndarray

        for img_id, img_info in tqdm(img_items, desc="Preparing and Loading COCOTEXT", total=len(img_items)):
            img_path = os.path.join(img_folder, img_info["file_name"])
            # File existence check
            if not os.path.exists(img_path):  # pragma: no cover
                raise FileNotFoundError(f"Unable to locate {img_path}")

            # Get annotations for the current image (only legible text);
            # image ids are stored as str keys in "imgs" but as ints in "anns"
            annotations = anns_by_img.get(int(img_id), [])
            # Some images have no annotations with readable text
            if not annotations:  # pragma: no cover
                continue

            _targets = []
            for annotation in annotations:
                x, y, w, h = annotation["bbox"]
                if use_polygons:
                    # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                    box = np.array(
                        [
                            [x, y],
                            [x + w, y],
                            [x + w, y + h],
                            [x, y + h],
                        ],
                        dtype=np_dtype,
                    )
                else:
                    # (xmin, ymin, xmax, ymax) coordinates
                    box = [x, y, x + w, y + h]
                _targets.append((annotation["utf8_string"], box))
            text_targets, box_targets = zip(*_targets)

            if recognition_task:
                # NOTE: `img_path` already contains `img_folder`; the original
                # re-joined it with `tmp_root`, doubling the folder component
                # whenever `img_folder` is a relative path.
                crops = crop_bboxes_from_image(
                    img_path=img_path, geoms=np.asarray(box_targets, dtype=int).clip(min=0)
                )
                # Keep only non-empty, single-word labels for recognition crops
                for crop, label in zip(crops, list(text_targets)):
                    if label and " " not in label:
                        self.data.append((crop, label))
            elif detection_task:
                self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0)))
            else:
                self.data.append((
                    img_path,
                    dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)),
                ))

        self.root = tmp_root

    def extra_repr(self) -> str:
        """Return the extra repr fragment shown by the dataset's __repr__."""
        return f"train={self.train}"