Fix attach URLs for video, text, and image projects; add tqdm progress bar

dshabin · dshabin · commit 7e3c9a93089a · 2021-09-16T15:37:37.000+04:00
diff --git a/src/superannotate/lib/app/helpers.py b/src/superannotate/lib/app/helpers.py
@@ -1,3 +1,4 @@
+import uuid
 from ast import literal_eval
 from pathlib import Path
 from typing import List
@@ -120,3 +121,33 @@ def metric_is_plottable(key):
     if key == "total_loss" or "mIoU" in key or "mAP" in key or key == "iteration":
         return True
     return False
+
+
+def get_paths_and_duplicated_from_csv(csv_path):
+    """Read a name/url CSV; return (unique records, duplicated names).
+
+    Rows without a url are dropped and blank or missing names become UUIDs.
+    Each record is {"name", "path"}; names are compared after stripping, so
+    the first occurrence is kept and repeats are reported as originally written.
+    """
+    image_data = pd.read_csv(csv_path, dtype=str)
+    image_data = image_data[~image_data["url"].isnull()]
+    if "name" in image_data.columns:
+        image_data["name"] = (
+            image_data["name"]
+            .fillna("")
+            .apply(lambda cell: cell if str(cell).strip() else str(uuid.uuid4()))
+        )
+    else:
+        image_data["name"] = [str(uuid.uuid4()) for _ in range(len(image_data.index))]
+    records = image_data.rename(columns={"url": "path"})[["name", "path"]]
+    images_to_upload, duplicate_images = [], []
+    seen = set()  # set membership keeps de-duplication linear in the row count
+    for record in records.to_dict(orient="records"):
+        raw_name, name = record["name"], record["name"].strip()
+        if name in seen:
+            duplicate_images.append(raw_name)
+            continue
+        seen.add(name)
+        images_to_upload.append({**record, "name": name})
+    return images_to_upload, duplicate_images
diff --git a/src/superannotate/lib/app/interface/cli_interface.py b/src/superannotate/lib/app/interface/cli_interface.py
@@ -3,12 +3,10 @@
 import os
 import sys
 import tempfile
-import uuid
 from typing import Any
 from typing import Optional
 
 import lib.core as constances
-import pandas as pd
 from lib import __file__ as lib_path
 from lib.app.helpers import split_project_path
 from lib.app.input_converters.conversion import import_annotation
@@ -22,7 +20,6 @@
 from lib.app.interface.sdk_interface import upload_images_from_folder_to_project
 from lib.app.interface.sdk_interface import upload_preannotations_from_folder_to_project
 from lib.app.interface.sdk_interface import upload_videos_from_folder_to_project
-from lib.app.serializers import ImageSerializer
 from lib.core.entities import ConfigEntity
 from lib.infrastructure.controller import Controller
 from lib.infrastructure.repositories import ConfigRepository
@@ -263,43 +260,6 @@ def attach_document_urls(
         )
         sys.exit(0)
 
-    def _attach_urls(
-        self, project: str, attachments: str, annotation_status: Optional[Any] = None
-    ):
-        project_name, folder_name = split_project_path(project)
-
-        image_data = pd.read_csv(attachments, dtype=str)
-        image_data = image_data[~image_data["url"].isnull()]
-        for ind, _ in image_data[image_data["name"].isnull()].iterrows():
-            image_data.at[ind, "name"] = str(uuid.uuid4())
-
-        image_data = pd.DataFrame(image_data, columns=["name", "url"])
-        img_names_urls = image_data.rename(columns={"url": "path"}).to_dict(
-            orient="records"
-        )
-        list_of_not_uploaded = []
-        duplicate_images = []
-        for i in range(0, len(img_names_urls), 500):
-            response = self.controller.attach_urls(
-                project_name=project_name,
-                folder_name=folder_name,
-                files=ImageSerializer.deserialize(
-                    img_names_urls[i : i + 500]  # noqa: E203
-                ),
-                annotation_status=annotation_status,
-            )
-            if response.errors:
-                list_of_not_uploaded.append(response.data[0])
-                duplicate_images.append(response.data[1])
-
-        list_of_uploaded = [
-            image["name"]
-            for image in img_names_urls
-            if image["name"] not in list_of_not_uploaded
-        ]
-
-        return list_of_uploaded, list_of_not_uploaded, duplicate_images
-
     def upload_videos(
         self,
         project,
diff --git a/src/superannotate/lib/app/interface/sdk_interface.py b/src/superannotate/lib/app/interface/sdk_interface.py
@@ -5,7 +5,6 @@
 import os
 import tempfile
 import time
-import uuid
 from collections import Counter
 from collections import namedtuple
 from io import BytesIO
@@ -18,7 +17,6 @@
 
 import boto3
 import lib.core as constances
-import pandas as pd
 import plotly.graph_objects as go
 from lib.app.annotation_helpers import add_annotation_bbox_to_json
 from lib.app.annotation_helpers import add_annotation_comment_to_json
@@ -31,6 +29,7 @@
 from lib.app.exceptions import EmptyOutputError
 from lib.app.helpers import extract_project_folder
 from lib.app.helpers import get_annotation_paths
+from lib.app.helpers import get_paths_and_duplicated_from_csv
 from lib.app.helpers import reformat_metrics_json
 from lib.app.interface.types import AnnotationType
 from lib.app.interface.types import NotEmptyStr
@@ -2287,45 +2286,26 @@ def attach_image_urls_to_project(
     :rtype: tuple
     """
     project_name, folder_name = extract_project_folder(project)
-    project = controller.get_project_metadata(project_name).data
-    if project["project"].project_type == constances.ProjectType.VIDEO.value:
-        raise AppException(
-            "The function does not support projects containing videos attached with URLs"
-        )
-
-    image_data = pd.read_csv(attachments, dtype=str)
-    image_data = image_data[~image_data["url"].isnull()]
-    if "name" in image_data.columns:
-        image_data["name"] = (
-            image_data["name"]
-            .fillna("")
-            .apply(lambda cell: cell if str(cell).strip() else str(uuid.uuid4()))
-        )
-    else:
-        image_data["name"] = [str(uuid.uuid4()) for _ in range(len(image_data.index))]
-
-    image_data = pd.DataFrame(image_data, columns=["name", "url"])
-    img_names_urls = image_data.rename(columns={"url": "path"}).to_dict(
-        orient="records"
-    )
+    images_to_upload, duplicate_images = get_paths_and_duplicated_from_csv(attachments)
     list_of_not_uploaded = []
-    duplicate_images = []
-    for i in range(0, len(img_names_urls), 500):
-        response = controller.attach_urls(
-            project_name=project_name,
-            folder_name=folder_name,
-            files=ImageSerializer.deserialize(
-                img_names_urls[i : i + 500]  # noqa: E203
-            ),
-            annotation_status=annotation_status,
-        )
-        if response.errors:
-            list_of_not_uploaded.append(response.data[0])
-            duplicate_images.append(response.data[1])
 
+    with tqdm(total=len(images_to_upload), desc="Attaching urls") as progress_bar:
+        for i in range(0, len(images_to_upload), 500):
+            response = controller.attach_urls(
+                project_name=project_name,
+                folder_name=folder_name,
+                files=ImageSerializer.deserialize(
+                    images_to_upload[i : i + 500]  # noqa: E203
+                ),
+                annotation_status=annotation_status,
+            )
+            if response.errors:
+                list_of_not_uploaded.append(response.data[0])
+                duplicate_images.append(response.data[1])
+            progress_bar.update(len(images_to_upload[i : i + 500]))
     list_of_uploaded = [
         image["name"]
-        for image in img_names_urls
+        for image in images_to_upload
         if image["name"] not in list_of_not_uploaded
     ]
 
@@ -2349,43 +2329,26 @@ def attach_video_urls_to_project(
     :rtype: (list, list, list)
     """
     project_name, folder_name = extract_project_folder(project)
-    project = controller.get_project_metadata(project_name).data
-    if project["project"].project_type != constances.ProjectType.VIDEO.value:
-        raise AppException("The function does not support")
-
-    image_data = pd.read_csv(attachments, dtype=str)
-    image_data = image_data[~image_data["url"].isnull()]
-    if "name" in image_data.columns:
-        image_data["name"] = (
-            image_data["name"]
-            .fillna("")
-            .apply(lambda cell: cell if str(cell).strip() else str(uuid.uuid4()))
-        )
-    else:
-        image_data["name"] = [str(uuid.uuid4()) for _ in range(len(image_data.index))]
-
-    image_data = pd.DataFrame(image_data, columns=["name", "url"])
-    img_names_urls = image_data.rename(columns={"url": "path"}).to_dict(
-        orient="records"
-    )
+    images_to_upload, duplicate_images = get_paths_and_duplicated_from_csv(attachments)
     list_of_not_uploaded = []
-    duplicate_images = []
-    for i in range(0, len(img_names_urls), 500):
-        response = controller.attach_urls(
-            project_name=project_name,
-            folder_name=folder_name,
-            files=ImageSerializer.deserialize(
-                img_names_urls[i : i + 500]  # noqa: E203
-            ),
-            annotation_status=annotation_status,
-        )
-        if response.errors:
-            list_of_not_uploaded.append(response.data[0])
-            duplicate_images.append(response.data[1])
 
+    with tqdm(total=len(images_to_upload), desc="Attaching urls") as progress_bar:
+        for i in range(0, len(images_to_upload), 500):
+            response = controller.attach_urls(
+                project_name=project_name,
+                folder_name=folder_name,
+                files=ImageSerializer.deserialize(
+                    images_to_upload[i : i + 500]  # noqa: E203
+                ),
+                annotation_status=annotation_status,
+            )
+            if response.errors:
+                list_of_not_uploaded.append(response.data[0])
+                duplicate_images.append(response.data[1])
+            progress_bar.update(len(images_to_upload[i : i + 500]))
     list_of_uploaded = [
         image["name"]
-        for image in img_names_urls
+        for image in images_to_upload
         if image["name"] not in list_of_not_uploaded
     ]
 
@@ -3642,40 +3605,26 @@ def attach_document_urls_to_project(
     :rtype: tuple
     """
     project_name, folder_name = extract_project_folder(project)
-
-    image_data = pd.read_csv(attachments, dtype=str)
-    image_data = image_data[~image_data["url"].isnull()]
-    if "name" in image_data.columns:
-        image_data["name"] = (
-            image_data["name"]
-            .fillna("")
-            .apply(lambda cell: cell if str(cell).strip() else str(uuid.uuid4()))
-        )
-    else:
-        image_data["name"] = [str(uuid.uuid4()) for _ in range(len(image_data.index))]
-
-    image_data = pd.DataFrame(image_data, columns=["name", "url"])
-    img_names_urls = image_data.rename(columns={"url": "path"}).to_dict(
-        orient="records"
-    )
+    images_to_upload, duplicate_images = get_paths_and_duplicated_from_csv(attachments)
     list_of_not_uploaded = []
-    duplicate_images = []
-    for i in range(0, len(img_names_urls), 500):
-        response = controller.attach_urls(
-            project_name=project_name,
-            folder_name=folder_name,
-            files=ImageSerializer.deserialize(
-                img_names_urls[i : i + 500]  # noqa: E203
-            ),
-            annotation_status=annotation_status,
-        )
-        if response.errors:
-            list_of_not_uploaded.append(response.data[0])
-            duplicate_images.append(response.data[1])
 
+    with tqdm(total=len(images_to_upload), desc="Attaching urls") as progress_bar:
+        for i in range(0, len(images_to_upload), 500):
+            response = controller.attach_urls(
+                project_name=project_name,
+                folder_name=folder_name,
+                files=ImageSerializer.deserialize(
+                    images_to_upload[i : i + 500]  # noqa: E203
+                ),
+                annotation_status=annotation_status,
+            )
+            if response.errors:
+                list_of_not_uploaded.append(response.data[0])
+                duplicate_images.append(response.data[1])
+            progress_bar.update(len(images_to_upload[i : i + 500]))
     list_of_uploaded = [
         image["name"]
-        for image in img_names_urls
+        for image in images_to_upload
         if image["name"] not in list_of_not_uploaded
     ]
 
diff --git a/src/superannotate/lib/core/usecases.py b/src/superannotate/lib/core/usecases.py
@@ -3524,7 +3524,9 @@ def execute(self):
                             failed_annotations.append(annotation)
                         yield
 
-                uploaded_annotations = [annotation.path for annotation in uploaded_annotations]
+                uploaded_annotations = [
+                    annotation.path for annotation in uploaded_annotations
+                ]
                 missing_annotations.extend(
                     [annotation.path for annotation in self._missing_annotations]
                 )
diff --git a/tests/data_set/csv_files/text_urls.csv b/tests/data_set/csv_files/text_urls.csv
@@ -9,5 +9,5 @@ https://sa-public-text-files.s3.us-west-2.amazonaws.com/grmnda/text1_%D0%9B%D0%B
 https://sa-public-text-files.s3.us-west-2.amazonaws.com/grmnda/%D5%B6%D5%A1%D6%80%20%D5%A4%D5%B8%D5%BD.txt, textՆարԴոս
 https://sa-public-text-files.s3.us-west-2.amazonaws.com/grmnda/small%20gods_%20tortoise.txt,
 https://sa-public-text-files.s3.us-west-2.amazonaws.com/grmnda/small%20gods_%20tortoise.txt,
-https://sa-public-text-files.s3.us-west-2.amazonaws.com/grmnda/small%20gods_%20tortoise.txt,
-https://sa-public-text-files.s3.us-west-2.amazonaws.com/grmnda/small%20gods_%20tortoise.txt,
+https://sa-public-text-files.s3.us-west-2.amazonaws.com/grmnda/small%20gods_%20tortoise.txt,same_name
+https://sa-public-text-files.s3.us-west-2.amazonaws.com/grmnda/small%20gods_%20tortoise.txt,same_name
diff --git a/tests/integration/test_attach_document_urls.py b/tests/integration/test_attach_document_urls.py
@@ -5,7 +5,7 @@
 from tests.integration.base import BaseTestCase
 
 
-class TestImageUrls(BaseTestCase):
+class TestDocumentUrls(BaseTestCase):
     PROJECT_NAME = "test attach document urls"
     PATH_TO_URLS = "data_set/csv_files/text_urls.csv"
     PROJECT_DESCRIPTION = "desc"
@@ -16,7 +16,36 @@ def test_attach_image_urls(self):
             self.PROJECT_NAME,
             os.path.join(dirname(dirname(__file__)), self.PATH_TO_URLS),
         )
-        self.assertEqual(len(uploaded), 12)
+        self.assertEqual(len(uploaded), 11)
         self.assertEqual(len(could_not_upload), 0)
-        self.assertEqual(len(existing_images), 0)
+        self.assertEqual(len(existing_images), 1)
+
+class TestImageUrls(BaseTestCase):
+    """Attach image URLs from the shared CSV to a Vector project."""
+    PROJECT_NAME = "test attach image urls"
+    PATH_TO_URLS = "data_set/csv_files/text_urls.csv"
+    PROJECT_DESCRIPTION = "desc"
+    PROJECT_TYPE = "Vector"
 
+    def test_attach_image_urls(self):
+        csv_path = os.path.join(dirname(dirname(__file__)), self.PATH_TO_URLS)
+        outcome = sa.attach_image_urls_to_project(self.PROJECT_NAME, csv_path)
+        uploaded, could_not_upload, existing_images = outcome
+        self.assertEqual(len(uploaded), 11)
+        self.assertEqual(len(could_not_upload), 0)
+        self.assertEqual(len(existing_images), 1)
+
+class TestVideoUrls(BaseTestCase):
+    """Attach video URLs from the shared CSV to a Video project."""
+    PROJECT_NAME = "test attach video urls"
+    PATH_TO_URLS = "data_set/csv_files/text_urls.csv"
+    PROJECT_DESCRIPTION = "desc"
+    PROJECT_TYPE = "Video"
+
+    def test_attach_image_urls(self):
+        csv_path = os.path.join(dirname(dirname(__file__)), self.PATH_TO_URLS)
+        outcome = sa.attach_video_urls_to_project(self.PROJECT_NAME, csv_path)
+        uploaded, could_not_upload, existing_images = outcome
+        self.assertEqual(len(uploaded), 11)
+        self.assertEqual(len(could_not_upload), 0)
+        self.assertEqual(len(existing_images), 1)