# Copyright (C) 2021-2025, Mindee.# This program is licensed under the Apache License 2.0.# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.importjsonimportosfrompathlibimportPathfromtypingimportAnyimportnumpyasnpfromtqdmimporttqdmfrom.datasetsimportVisionDatasetfrom.utilsimportconvert_target_to_relative,crop_bboxes_from_image__all__=["FUNSD"]
class FUNSD(VisionDataset):
    """FUNSD dataset from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents"
    <https://arxiv.org/pdf/1905.13538.pdf>`_.

    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/funsd-grid.png&src=0
        :align: center

    >>> from doctr.datasets import FUNSD
    >>> train_set = FUNSD(train=True, download=True)
    >>> img, target = train_set[0]

    Each entry of ``self.data`` is one of:

    * ``(crop, label)`` when ``recognition_task`` is True (one entry per kept word),
    * ``(image file name, boxes array)`` when ``detection_task`` is True,
    * ``(image file name, {"boxes": array, "labels": list of str})`` otherwise.

    Args:
        train: whether the subset should be the training one
        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
        recognition_task: whether the dataset should be used for recognition task
        detection_task: whether the dataset should be used for detection task
        **kwargs: keyword arguments from `VisionDataset`.

    Raises:
        ValueError: if both `recognition_task` and `detection_task` are True.
        FileNotFoundError: if an image listed in the images folder cannot be located.
    """

    URL = "https://guillaumejaume.github.io/FUNSD/dataset.zip"
    SHA256 = "c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f"
    FILE_NAME = "funsd.zip"

    def __init__(
        self,
        train: bool = True,
        use_polygons: bool = False,
        recognition_task: bool = False,
        detection_task: bool = False,
        **kwargs: Any,
    ) -> None:
        # Validate the flags first so an invalid call fails before the base class
        # does any work (presumably fetching/extracting the archive - confirm
        # against `VisionDataset`).
        if recognition_task and detection_task:
            raise ValueError(
                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
                + "To get the whole dataset with boxes and labels leave both parameters to False."
            )

        super().__init__(
            self.URL,
            self.FILE_NAME,
            self.SHA256,
            True,  # NOTE(review): looks like the "extract archive" flag - confirm against `VisionDataset`
            # Recognition samples are crops paired with text, so no box conversion is applied to them
            pre_transforms=convert_target_to_relative if not recognition_task else None,
            **kwargs,
        )

        self.train = train
        np_dtype = np.float32

        # Use the subset
        subfolder = os.path.join("dataset", "training_data" if train else "testing_data")

        # Images and annotations live in sibling folders of the selected subset
        tmp_root = os.path.join(self.root, subfolder, "images")
        annotation_root = os.path.join(self.root, subfolder, "annotations")

        # Shape of the geometry array for an image without any usable word
        empty_shape = (0, 4, 2) if use_polygons else (0, 4)
        # Labels containing any of these characters are dropped in recognition mode
        unknown_chars = ("☑", "☐", "\u03bf", "\uf703", "\uf702", " ")

        self.data: list[tuple[str | np.ndarray, str | dict[str, Any] | np.ndarray]] = []

        # List the folder once and reuse the result for both the iterable and the progress total
        img_names = os.listdir(tmp_root)
        for img_path in tqdm(iterable=img_names, desc="Preparing and Loading FUNSD", total=len(img_names)):
            img_file = os.path.join(tmp_root, img_path)
            # File existence check
            if not os.path.exists(img_file):
                raise FileNotFoundError(f"unable to locate {img_file}")

            # The annotation file shares the image's stem
            stem = Path(img_path).stem
            with open(os.path.join(annotation_root, f"{stem}.json"), "rb") as f:
                annotation = json.load(f)

            # Keep every word with a non-empty transcription, together with its box
            words = [
                (word["text"], word["box"])
                for block in annotation["form"]
                for word in block["words"]
                if word["text"]
            ]
            # `zip(*[])` yields nothing to unpack, so the empty case is handled explicitly
            text_targets, box_targets = zip(*words) if words else ((), ())

            if use_polygons:
                # xmin, ymin, xmax, ymax -> (x, y) coordinates of top left, top right, bottom right, bottom left corners
                box_targets = [  # type: ignore[assignment]
                    [
                        [box[0], box[1]],
                        [box[2], box[1]],
                        [box[2], box[3]],
                        [box[0], box[3]],
                    ]
                    for box in box_targets
                ]

            if words:
                geoms = np.asarray(box_targets, dtype=np_dtype)
            else:
                # Keep a well-formed (zero-row) array instead of a shapeless 1-D empty one
                geoms = np.zeros(empty_shape, dtype=np_dtype)

            if recognition_task:
                if not words:
                    # No box means nothing to crop for this image
                    continue
                crops = crop_bboxes_from_image(img_path=img_file, geoms=geoms)
                for crop, label in zip(crops, text_targets):
                    # filter labels with unknown characters
                    if not any(char in label for char in unknown_chars):
                        # Normalise the en dash to a plain hyphen
                        self.data.append((crop, label.replace("–", "-")))
            elif detection_task:
                self.data.append((img_path, geoms))
            else:
                self.data.append((img_path, {"boxes": geoms, "labels": list(text_targets)}))

        # Stored image entries are bare file names from the images folder, so the
        # dataset root is re-pointed at that folder.
        self.root = tmp_root

    def extra_repr(self) -> str:
        """Return the extra information shown in the dataset representation (the selected subset)."""
        return f"train={self.train}"