@@ -56,6 +56,34 @@ class VideoRawData:
5656 attributeName : str = None
5757
5858
class DocumentRawData:
    """Flat record describing one row of the aggregated document DataFrame.

    Every field defaults to ``None`` at class level. Because this is a plain
    class, an instance's ``__dict__`` only contains the fields that were
    explicitly assigned on that instance; unset fields still read as ``None``
    through normal attribute lookup.
    """

    # NOTE(review): no ``@dataclass`` decorator is applied here, so code that
    # serialises rows via ``__dict__`` only sees assigned fields. Confirm
    # whether that is intended before changing it.

    # document-level metadata
    docName: str = None
    folderName: str = None
    docStatus: str = None
    docUrl: str = None
    docAnnotator: str = None
    docQA: str = None
    # tag
    tagId: int = None
    tag: str = None
    # instance
    instanceId: int = None
    instanceStart: int = None
    instanceEnd: int = None
    type: str = None
    className: str = None
    createdAt: str = None
    createdBy: str = None
    creatorRole: str = None
    updatedAt: str = None
    updatedBy: str = None
    updatorRole: str = None
    # attribute
    attributeId: int = None
    attributeGroupName: str = None
    attributeName: str = None
85+
86+
5987class DataAggregator :
6088 def __init__ (
6189 self ,
@@ -124,6 +152,8 @@ def aggregate_annotations_as_df(self):
124152 return self .aggregate_image_annotations_as_df (annotation_paths )
125153 elif self .project_type == constances .ProjectType .VIDEO .name :
126154 return self .aggregate_video_annotations_as_df (annotation_paths )
155+ elif self .project_type == constances .ProjectType .DOCUMENT .name :
156+ return self .aggregate_document_annotations_as_df (annotation_paths )
127157
128158 def aggregate_video_annotations_as_df (self , annotation_paths : List [str ]):
129159 raws = []
@@ -205,7 +235,61 @@ def aggregate_video_annotations_as_df(self, annotation_paths: List[str]):
205235 raws .append (instance_raw )
206236 if not instances :
207237 raws .append (raw_data )
208- return pd .DataFrame ([raw .__dict__ for raw in raws ], dtype = object )
238+ df = pd .DataFrame ([raw .__dict__ for raw in raws ], dtype = object )
239+ return df .where (pd .notnull (df ), None )
240+
def aggregate_document_annotations_as_df(self, annotation_paths: List[str]):
    """Flatten document annotation JSON files into a single DataFrame.

    For every annotation file the following rows are produced, each carrying
    the document-level metadata:

    * one row per entry in ``tags``;
    * one row per attribute of each instance, or a single row for an
      instance that has no attributes;
    * one metadata-only row when the file has no instances.

    Args:
        annotation_paths: Paths of the annotation JSON files to read.

    Returns:
        A ``pandas.DataFrame`` of object dtype built from the rows'
        ``__dict__`` contents, with missing values replaced by ``None``.

    Raises:
        KeyError: If a file lacks ``metadata`` or ``metadata["name"]``.
    """
    raws = []
    for annotation_path in annotation_paths:
        annotation_path = Path(annotation_path)
        # Use a context manager so the file handle is always closed,
        # including when JSON parsing fails.
        with open(annotation_path) as annotation_file:
            annotation_data = json.load(annotation_file)
        metadata = annotation_data["metadata"]

        raw_data = DocumentRawData()
        # metadata
        raw_data.docName = metadata["name"]
        # Files sitting directly in the project root belong to no folder.
        raw_data.folderName = (
            annotation_path.parent.name
            if annotation_path.parent != self.project_root
            else None
        )
        raw_data.docStatus = metadata.get("status")
        raw_data.docUrl = metadata.get("url")
        raw_data.docAnnotator = metadata.get("annotatorEmail")
        raw_data.docQA = metadata.get("qaEmail")

        # append tags (``or []`` also tolerates an explicit JSON null)
        for tag_id, tag in enumerate(annotation_data.get("tags") or []):
            tag_row = copy.copy(raw_data)
            tag_row.tagId = tag_id
            tag_row.tag = tag
            raws.append(tag_row)

        # append instances
        instances = annotation_data.get("instances") or []
        for instance_id, instance in enumerate(instances):
            # ``or {}`` guards against "createdBy": null / "updatedBy": null,
            # which a plain ``.get(key, {})`` default would not cover.
            created_by = instance.get("createdBy") or {}
            updated_by = instance.get("updatedBy") or {}

            instance_raw = copy.copy(raw_data)
            instance_raw.instanceId = instance_id
            instance_raw.instanceStart = instance.get("start")
            instance_raw.instanceEnd = instance.get("end")
            instance_raw.type = instance.get("type")
            instance_raw.className = instance.get("className")
            instance_raw.createdAt = instance.get("createdAt")
            instance_raw.createdBy = created_by.get("email")
            instance_raw.creatorRole = created_by.get("role")
            instance_raw.updatedAt = instance.get("updatedAt")
            instance_raw.updatedBy = updated_by.get("email")
            instance_raw.updatorRole = updated_by.get("role")

            # append attributes
            attributes = instance.get("attributes") or []
            for attribute_id, attribute in enumerate(attributes):
                attribute_raw = copy.copy(instance_raw)
                attribute_raw.attributeId = attribute_id
                attribute_raw.attributeGroupName = attribute.get("groupName")
                attribute_raw.attributeName = attribute.get("name")
                raws.append(attribute_raw)
            if not attributes:
                raws.append(instance_raw)

        if not instances:
            raws.append(raw_data)

    df = pd.DataFrame([raw.__dict__ for raw in raws], dtype=object)
    # Rows expose different key sets, so absent cells come back as NaN;
    # normalise them to None.
    return df.where(pd.notnull(df), None)
209293
210294 def aggregate_image_annotations_as_df (self , annotations_paths : List [str ]):
211295 annotation_data = {
0 commit comments