Skip to content

Commit 7e3c9a9

Browse files
committed
Fix attach urls for video-text-image- add tqdm
1 parent 1efb59e commit 7e3c9a9

File tree

6 files changed

+117
-146
lines changed

6 files changed

+117
-146
lines changed

src/superannotate/lib/app/helpers.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import uuid
12
from ast import literal_eval
23
from pathlib import Path
34
from typing import List
@@ -120,3 +121,33 @@ def metric_is_plottable(key):
120121
if key == "total_loss" or "mIoU" in key or "mAP" in key or key == "iteration":
121122
return True
122123
return False
124+
125+
126+
def get_paths_and_duplicated_from_csv(csv_path):
    """Read attachment rows from a CSV and split them into unique and duplicated names.

    The CSV must contain a ``url`` column and may contain a ``name`` column.
    Rows without a URL are dropped. Rows whose name is missing or blank get a
    freshly generated UUID4 string as their name.

    :param csv_path: path (or file-like object) accepted by ``pandas.read_csv``
    :return: tuple ``(images_to_upload, duplicate_images)`` where
        ``images_to_upload`` is a list of ``{"name": ..., "path": ...}`` dicts
        with whitespace-stripped, unique names (first occurrence wins), and
        ``duplicate_images`` is a list of the original (unstripped) names of
        the rows that repeated an already seen name
    :rtype: tuple
    """
    image_data = pd.read_csv(csv_path, dtype=str)
    # Take an explicit copy: assigning a column to a mask-filtered frame
    # otherwise triggers pandas' SettingWithCopyWarning.
    image_data = image_data.loc[image_data["url"].notna()].copy()

    if "name" in image_data.columns:
        image_data["name"] = (
            image_data["name"]
            .fillna("")
            .apply(lambda cell: cell if str(cell).strip() else str(uuid.uuid4()))
        )
    else:
        image_data["name"] = [str(uuid.uuid4()) for _ in range(len(image_data))]

    records = (
        image_data[["name", "url"]]
        .rename(columns={"url": "path"})
        .to_dict(orient="records")
    )

    images_to_upload = []
    duplicate_images = []
    seen = set()  # set membership keeps duplicate detection linear in row count
    for record in records:
        raw_name = record["name"]
        name = raw_name.strip()
        if name in seen:
            # Report the name exactly as it was written in the CSV.
            duplicate_images.append(raw_name)
            continue
        seen.add(name)
        record["name"] = name
        images_to_upload.append(record)
    return images_to_upload, duplicate_images

src/superannotate/lib/app/interface/cli_interface.py

Lines changed: 0 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,10 @@
33
import os
44
import sys
55
import tempfile
6-
import uuid
76
from typing import Any
87
from typing import Optional
98

109
import lib.core as constances
11-
import pandas as pd
1210
from lib import __file__ as lib_path
1311
from lib.app.helpers import split_project_path
1412
from lib.app.input_converters.conversion import import_annotation
@@ -22,7 +20,6 @@
2220
from lib.app.interface.sdk_interface import upload_images_from_folder_to_project
2321
from lib.app.interface.sdk_interface import upload_preannotations_from_folder_to_project
2422
from lib.app.interface.sdk_interface import upload_videos_from_folder_to_project
25-
from lib.app.serializers import ImageSerializer
2623
from lib.core.entities import ConfigEntity
2724
from lib.infrastructure.controller import Controller
2825
from lib.infrastructure.repositories import ConfigRepository
@@ -263,43 +260,6 @@ def attach_document_urls(
263260
)
264261
sys.exit(0)
265262

266-
def _attach_urls(
267-
self, project: str, attachments: str, annotation_status: Optional[Any] = None
268-
):
269-
project_name, folder_name = split_project_path(project)
270-
271-
image_data = pd.read_csv(attachments, dtype=str)
272-
image_data = image_data[~image_data["url"].isnull()]
273-
for ind, _ in image_data[image_data["name"].isnull()].iterrows():
274-
image_data.at[ind, "name"] = str(uuid.uuid4())
275-
276-
image_data = pd.DataFrame(image_data, columns=["name", "url"])
277-
img_names_urls = image_data.rename(columns={"url": "path"}).to_dict(
278-
orient="records"
279-
)
280-
list_of_not_uploaded = []
281-
duplicate_images = []
282-
for i in range(0, len(img_names_urls), 500):
283-
response = self.controller.attach_urls(
284-
project_name=project_name,
285-
folder_name=folder_name,
286-
files=ImageSerializer.deserialize(
287-
img_names_urls[i : i + 500] # noqa: E203
288-
),
289-
annotation_status=annotation_status,
290-
)
291-
if response.errors:
292-
list_of_not_uploaded.append(response.data[0])
293-
duplicate_images.append(response.data[1])
294-
295-
list_of_uploaded = [
296-
image["name"]
297-
for image in img_names_urls
298-
if image["name"] not in list_of_not_uploaded
299-
]
300-
301-
return list_of_uploaded, list_of_not_uploaded, duplicate_images
302-
303263
def upload_videos(
304264
self,
305265
project,

src/superannotate/lib/app/interface/sdk_interface.py

Lines changed: 49 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import os
66
import tempfile
77
import time
8-
import uuid
98
from collections import Counter
109
from collections import namedtuple
1110
from io import BytesIO
@@ -18,7 +17,6 @@
1817

1918
import boto3
2019
import lib.core as constances
21-
import pandas as pd
2220
import plotly.graph_objects as go
2321
from lib.app.annotation_helpers import add_annotation_bbox_to_json
2422
from lib.app.annotation_helpers import add_annotation_comment_to_json
@@ -31,6 +29,7 @@
3129
from lib.app.exceptions import EmptyOutputError
3230
from lib.app.helpers import extract_project_folder
3331
from lib.app.helpers import get_annotation_paths
32+
from lib.app.helpers import get_paths_and_duplicated_from_csv
3433
from lib.app.helpers import reformat_metrics_json
3534
from lib.app.interface.types import AnnotationType
3635
from lib.app.interface.types import NotEmptyStr
@@ -2287,45 +2286,26 @@ def attach_image_urls_to_project(
22872286
:rtype: tuple
22882287
"""
22892288
project_name, folder_name = extract_project_folder(project)
2290-
project = controller.get_project_metadata(project_name).data
2291-
if project["project"].project_type == constances.ProjectType.VIDEO.value:
2292-
raise AppException(
2293-
"The function does not support projects containing videos attached with URLs"
2294-
)
2295-
2296-
image_data = pd.read_csv(attachments, dtype=str)
2297-
image_data = image_data[~image_data["url"].isnull()]
2298-
if "name" in image_data.columns:
2299-
image_data["name"] = (
2300-
image_data["name"]
2301-
.fillna("")
2302-
.apply(lambda cell: cell if str(cell).strip() else str(uuid.uuid4()))
2303-
)
2304-
else:
2305-
image_data["name"] = [str(uuid.uuid4()) for _ in range(len(image_data.index))]
2306-
2307-
image_data = pd.DataFrame(image_data, columns=["name", "url"])
2308-
img_names_urls = image_data.rename(columns={"url": "path"}).to_dict(
2309-
orient="records"
2310-
)
2289+
images_to_upload, duplicate_images = get_paths_and_duplicated_from_csv(attachments)
23112290
list_of_not_uploaded = []
2312-
duplicate_images = []
2313-
for i in range(0, len(img_names_urls), 500):
2314-
response = controller.attach_urls(
2315-
project_name=project_name,
2316-
folder_name=folder_name,
2317-
files=ImageSerializer.deserialize(
2318-
img_names_urls[i : i + 500] # noqa: E203
2319-
),
2320-
annotation_status=annotation_status,
2321-
)
2322-
if response.errors:
2323-
list_of_not_uploaded.append(response.data[0])
2324-
duplicate_images.append(response.data[1])
23252291

2292+
with tqdm(total=len(images_to_upload), desc="Attaching urls") as progress_bar:
2293+
for i in range(0, len(images_to_upload), 500):
2294+
response = controller.attach_urls(
2295+
project_name=project_name,
2296+
folder_name=folder_name,
2297+
files=ImageSerializer.deserialize(
2298+
images_to_upload[i : i + 500] # noqa: E203
2299+
),
2300+
annotation_status=annotation_status,
2301+
)
2302+
if response.errors:
2303+
list_of_not_uploaded.append(response.data[0])
2304+
duplicate_images.append(response.data[1])
2305+
progress_bar.update(len(images_to_upload[i : i + 500]))
23262306
list_of_uploaded = [
23272307
image["name"]
2328-
for image in img_names_urls
2308+
for image in images_to_upload
23292309
if image["name"] not in list_of_not_uploaded
23302310
]
23312311

@@ -2349,43 +2329,26 @@ def attach_video_urls_to_project(
23492329
:rtype: (list, list, list)
23502330
"""
23512331
project_name, folder_name = extract_project_folder(project)
2352-
project = controller.get_project_metadata(project_name).data
2353-
if project["project"].project_type != constances.ProjectType.VIDEO.value:
2354-
raise AppException("The function does not support")
2355-
2356-
image_data = pd.read_csv(attachments, dtype=str)
2357-
image_data = image_data[~image_data["url"].isnull()]
2358-
if "name" in image_data.columns:
2359-
image_data["name"] = (
2360-
image_data["name"]
2361-
.fillna("")
2362-
.apply(lambda cell: cell if str(cell).strip() else str(uuid.uuid4()))
2363-
)
2364-
else:
2365-
image_data["name"] = [str(uuid.uuid4()) for _ in range(len(image_data.index))]
2366-
2367-
image_data = pd.DataFrame(image_data, columns=["name", "url"])
2368-
img_names_urls = image_data.rename(columns={"url": "path"}).to_dict(
2369-
orient="records"
2370-
)
2332+
images_to_upload, duplicate_images = get_paths_and_duplicated_from_csv(attachments)
23712333
list_of_not_uploaded = []
2372-
duplicate_images = []
2373-
for i in range(0, len(img_names_urls), 500):
2374-
response = controller.attach_urls(
2375-
project_name=project_name,
2376-
folder_name=folder_name,
2377-
files=ImageSerializer.deserialize(
2378-
img_names_urls[i : i + 500] # noqa: E203
2379-
),
2380-
annotation_status=annotation_status,
2381-
)
2382-
if response.errors:
2383-
list_of_not_uploaded.append(response.data[0])
2384-
duplicate_images.append(response.data[1])
23852334

2335+
with tqdm(total=len(images_to_upload), desc="Attaching urls") as progress_bar:
2336+
for i in range(0, len(images_to_upload), 500):
2337+
response = controller.attach_urls(
2338+
project_name=project_name,
2339+
folder_name=folder_name,
2340+
files=ImageSerializer.deserialize(
2341+
images_to_upload[i : i + 500] # noqa: E203
2342+
),
2343+
annotation_status=annotation_status,
2344+
)
2345+
if response.errors:
2346+
list_of_not_uploaded.append(response.data[0])
2347+
duplicate_images.append(response.data[1])
2348+
progress_bar.update(len(images_to_upload[i : i + 500]))
23862349
list_of_uploaded = [
23872350
image["name"]
2388-
for image in img_names_urls
2351+
for image in images_to_upload
23892352
if image["name"] not in list_of_not_uploaded
23902353
]
23912354

@@ -3642,40 +3605,26 @@ def attach_document_urls_to_project(
36423605
:rtype: tuple
36433606
"""
36443607
project_name, folder_name = extract_project_folder(project)
3645-
3646-
image_data = pd.read_csv(attachments, dtype=str)
3647-
image_data = image_data[~image_data["url"].isnull()]
3648-
if "name" in image_data.columns:
3649-
image_data["name"] = (
3650-
image_data["name"]
3651-
.fillna("")
3652-
.apply(lambda cell: cell if str(cell).strip() else str(uuid.uuid4()))
3653-
)
3654-
else:
3655-
image_data["name"] = [str(uuid.uuid4()) for _ in range(len(image_data.index))]
3656-
3657-
image_data = pd.DataFrame(image_data, columns=["name", "url"])
3658-
img_names_urls = image_data.rename(columns={"url": "path"}).to_dict(
3659-
orient="records"
3660-
)
3608+
images_to_upload, duplicate_images = get_paths_and_duplicated_from_csv(attachments)
36613609
list_of_not_uploaded = []
3662-
duplicate_images = []
3663-
for i in range(0, len(img_names_urls), 500):
3664-
response = controller.attach_urls(
3665-
project_name=project_name,
3666-
folder_name=folder_name,
3667-
files=ImageSerializer.deserialize(
3668-
img_names_urls[i : i + 500] # noqa: E203
3669-
),
3670-
annotation_status=annotation_status,
3671-
)
3672-
if response.errors:
3673-
list_of_not_uploaded.append(response.data[0])
3674-
duplicate_images.append(response.data[1])
36753610

3611+
with tqdm(total=len(images_to_upload), desc="Attaching urls") as progress_bar:
3612+
for i in range(0, len(images_to_upload), 500):
3613+
response = controller.attach_urls(
3614+
project_name=project_name,
3615+
folder_name=folder_name,
3616+
files=ImageSerializer.deserialize(
3617+
images_to_upload[i : i + 500] # noqa: E203
3618+
),
3619+
annotation_status=annotation_status,
3620+
)
3621+
if response.errors:
3622+
list_of_not_uploaded.append(response.data[0])
3623+
duplicate_images.append(response.data[1])
3624+
progress_bar.update(len(images_to_upload[i : i + 500]))
36763625
list_of_uploaded = [
36773626
image["name"]
3678-
for image in img_names_urls
3627+
for image in images_to_upload
36793628
if image["name"] not in list_of_not_uploaded
36803629
]
36813630

src/superannotate/lib/core/usecases.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3524,7 +3524,9 @@ def execute(self):
35243524
failed_annotations.append(annotation)
35253525
yield
35263526

3527-
uploaded_annotations = [annotation.path for annotation in uploaded_annotations]
3527+
uploaded_annotations = [
3528+
annotation.path for annotation in uploaded_annotations
3529+
]
35283530
missing_annotations.extend(
35293531
[annotation.path for annotation in self._missing_annotations]
35303532
)

tests/data_set/csv_files/text_urls.csv

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,5 @@ https://sa-public-text-files.s3.us-west-2.amazonaws.com/grmnda/text1_%D0%9B%D0%B
99
https://sa-public-text-files.s3.us-west-2.amazonaws.com/grmnda/%D5%B6%D5%A1%D6%80%20%D5%A4%D5%B8%D5%BD.txt, textՆարԴոս
1010
https://sa-public-text-files.s3.us-west-2.amazonaws.com/grmnda/small%20gods_%20tortoise.txt,
1111
https://sa-public-text-files.s3.us-west-2.amazonaws.com/grmnda/small%20gods_%20tortoise.txt,
12-
https://sa-public-text-files.s3.us-west-2.amazonaws.com/grmnda/small%20gods_%20tortoise.txt,
13-
https://sa-public-text-files.s3.us-west-2.amazonaws.com/grmnda/small%20gods_%20tortoise.txt,
12+
https://sa-public-text-files.s3.us-west-2.amazonaws.com/grmnda/small%20gods_%20tortoise.txt,same_name
13+
https://sa-public-text-files.s3.us-west-2.amazonaws.com/grmnda/small%20gods_%20tortoise.txt,same_name

tests/integration/test_attach_document_urls.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from tests.integration.base import BaseTestCase
66

77

8-
class TestImageUrls(BaseTestCase):
8+
class TestDocumentUrls(BaseTestCase):
99
PROJECT_NAME = "test attach document urls"
1010
PATH_TO_URLS = "data_set/csv_files/text_urls.csv"
1111
PROJECT_DESCRIPTION = "desc"
@@ -16,7 +16,36 @@ def test_attach_image_urls(self):
1616
self.PROJECT_NAME,
1717
os.path.join(dirname(dirname(__file__)), self.PATH_TO_URLS),
1818
)
19-
self.assertEqual(len(uploaded), 12)
19+
self.assertEqual(len(uploaded), 11)
2020
self.assertEqual(len(could_not_upload), 0)
21-
self.assertEqual(len(existing_images), 0)
21+
self.assertEqual(len(existing_images), 1)
22+
23+
class TestImageUrls(BaseTestCase):
    """Attach the URL fixture CSV to a Vector project and check the returned counts."""

    PROJECT_NAME = "test attach image urls"
    PATH_TO_URLS = "data_set/csv_files/text_urls.csv"
    PROJECT_DESCRIPTION = "desc"
    PROJECT_TYPE = "Vector"

    def test_attach_image_urls(self):
        # The fixture path is relative to the directory two levels above this file.
        tests_root = dirname(dirname(__file__))
        csv_path = os.path.join(tests_root, self.PATH_TO_URLS)

        result = sa.attach_image_urls_to_project(self.PROJECT_NAME, csv_path)
        uploaded, could_not_upload, existing_images = result

        # Expected: 11 attached, none failed, one name reported as duplicated.
        self.assertEqual(len(uploaded), 11)
        self.assertEqual(len(could_not_upload), 0)
        self.assertEqual(len(existing_images), 1)
37+
38+
class TestVideoUrls(BaseTestCase):
    """Attach the URL fixture CSV to a Video project and check the returned counts."""

    PROJECT_NAME = "test attach video urls"
    PATH_TO_URLS = "data_set/csv_files/text_urls.csv"
    PROJECT_DESCRIPTION = "desc"
    PROJECT_TYPE = "Video"

    def test_attach_image_urls(self):
        # The fixture path is relative to the directory two levels above this file.
        tests_root = dirname(dirname(__file__))
        csv_path = os.path.join(tests_root, self.PATH_TO_URLS)

        result = sa.attach_video_urls_to_project(self.PROJECT_NAME, csv_path)
        uploaded, could_not_upload, existing_images = result

        # Expected: 11 attached, none failed, one name reported as duplicated.
        self.assertEqual(len(uploaded), 11)
        self.assertEqual(len(could_not_upload), 0)
        self.assertEqual(len(existing_images), 1)

0 commit comments

Comments
 (0)