Skip to content

Commit e9f48ff

Browse files
committed
fix(s3): Skip empty files when downloading dbt projects from S3
Other users and I have noticed that when uploading files to S3 using the AWS web UI an unnamed file may sometimes appear. Not entirely sure if this is a bug on AWS' side or what, but it's causing us issues when downloading dbt projects from S3 as we attempt to create a file with the name of the parent directory (which usually already exists, or is created and later downloads fail when they are nested inside that file/directory). So this deals with the issue by ignoring files like this. Also added a unit test to cover this case, although it's kind of hard to test the behaviour as this file apparently only appears when using the web UI.
1 parent 33c4803 commit e9f48ff

File tree

2 files changed

+69
-7
lines changed

2 files changed

+69
-7
lines changed

airflow_dbt_python/hooks/s3.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,16 @@ def download_one_s3_object(self, target: Path, s3_object):
5252
"""Download a single s3 object."""
5353
self.log.info("Saving %s file to: %s", s3_object, target)
5454

55-
with open(target, "wb+") as f:
56-
s3_object.download_fileobj(f)
55+
try:
56+
with open(target, "wb+") as f:
57+
s3_object.download_fileobj(f)
58+
59+
except IsADirectoryError:
60+
# Uploading files manually via the AWS UI to S3 can cause files
61+
# with empty names to appear. When we attempt to download it, we build
62+
# a relative path that is equal to the parent directory that already
63+
# exists.
64+
self.log.warning("A file with no name was found in S3 at %s", s3_object)
5765

5866
def get_dbt_project(
5967
self, s3_project_url: str, project_dir: Optional[str] = None
@@ -112,6 +120,13 @@ def download_many_s3_keys(
112120
for s3_object_key in s3_keys:
113121
s3_object = self.get_key(key=s3_object_key, bucket_name=bucket_name)
114122
path_file = Path(s3_object_key).relative_to(prefix)
123+
124+
if path_file.suffix == "" and s3_object.key.endswith("/"):
125+
# Empty S3 files may also be confused with unwanted directories.
126+
# See the comment in download_one_s3_object.
127+
self.log.warning("A file with no name was found in S3 at %s", s3_object)
128+
continue
129+
115130
local_project_file = target_dir / path_file
116131
local_project_file.parent.mkdir(parents=True, exist_ok=True)
117132

tests/hooks/s3/test_dbt_s3_hook.py

Lines changed: 52 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -168,18 +168,27 @@ def test_get_dbt_project_no_trailing_slash(s3_bucket, tmpdir, dbt_project_file):
168168

169169

170170
@pytest.fixture
171-
def test_files():
172-
f1 = Path("seeds/a_seed.csv")
171+
def test_files(tmp_path_factory):
172+
"""Create test files to upload to S3."""
173+
d = tmp_path_factory.mktemp("test_s3")
174+
seed_dir = d / "seeds"
175+
seed_dir.mkdir(exist_ok=True)
176+
f1 = seed_dir / "a_seed.csv"
177+
173178
with open(f1, "w+") as f:
174179
f.write("col1,col2\n1,2")
175180

176-
f2 = Path("models/a_model.sql")
181+
models_dir = d / "models"
182+
models_dir.mkdir(exist_ok=True)
183+
f2 = models_dir / "a_model.sql"
177184
with open(f2, "w+") as f:
178185
f.write("SELECT 1")
179-
f3 = Path("models/another_model.sql")
186+
f3 = models_dir / "another_model.sql"
180187
with open(f3, "w+") as f:
181188
f.write("SELECT 2")
189+
182190
yield [f1, f2, f3]
191+
183192
f1.unlink()
184193
f2.unlink()
185194
f3.unlink()
@@ -190,11 +199,14 @@ def test_get_dbt_project_from_zip_file(s3_bucket, tmpdir, dbt_project_file, test
190199
with open(dbt_project_file) as pf:
191200
project_content = pf.read()
192201

202+
# Prepare a zip file to upload to S3
193203
zip_buffer = io.BytesIO()
194204
with ZipFile(zip_buffer, "a") as zf:
195205
zf.write(dbt_project_file, "dbt_project.yml")
196206
for f in test_files:
197-
zf.write(f)
207+
# Since files are in a different temporary directory, we need to zip them
208+
# with their direct parent, e.g. models/a_model.sql
209+
zf.write(f, arcname="/".join([f.parts[-2], f.parts[-1]]))
198210

199211
hook = DbtS3Hook()
200212
bucket = hook.get_bucket(s3_bucket)
@@ -230,3 +242,38 @@ def test_get_dbt_project_from_zip_file(s3_bucket, tmpdir, dbt_project_file, test
230242
with open(project_path / "seeds" / "a_seed.csv") as f:
231243
result = f.read()
232244
assert result == "col1,col2\n1,2"
245+
246+
247+
def test_get_dbt_project_with_empty_file(s3_bucket, tmpdir, dbt_project_file):
248+
"""Test whether an S3 path without a trailing slash pulls a dbt project."""
249+
hook = DbtS3Hook()
250+
bucket = hook.get_bucket(s3_bucket)
251+
252+
with open(dbt_project_file) as pf:
253+
project_content = pf.read()
254+
bucket.put_object(Key="project/dbt_project.yml", Body=project_content.encode())
255+
bucket.put_object(Key="project/models/a_model.sql", Body=b"SELECT 1")
256+
bucket.put_object(Key="project/data/a_seed.csv", Body=b"col1,col2\n1,2")
257+
bucket.put_object(Key="project/data//", Body=b"")
258+
259+
project_path = hook.get_dbt_project(
260+
f"s3://{s3_bucket}/project",
261+
project_dir=str(tmpdir),
262+
)
263+
264+
assert project_path.exists()
265+
266+
dir_contents = [f for f in project_path.iterdir()]
267+
assert sorted(str(f.name) for f in dir_contents) == [
268+
"data",
269+
"dbt_project.yml",
270+
"models",
271+
]
272+
273+
with open(project_path / "dbt_project.yml") as f:
274+
result = f.read()
275+
assert result == project_content
276+
277+
with open(project_path / "models" / "a_model.sql") as f:
278+
result = f.read()
279+
assert result == "SELECT 1"

0 commit comments

Comments
 (0)