Merge branch 'master' into oracle-via-oracledb

dlawin · web-flow · commit d0d0e2fdec5e · 2023-08-22T15:35:36.000-06:00
diff --git a/data_diff/__main__.py b/data_diff/__main__.py
@@ -253,8 +253,8 @@ def write_usage(self, prog: str, args: str = "", prefix: Optional[str] = None) -
     "--select",
     "-s",
     default=None,
-    metavar="PATH",
-    help="select dbt resources to compare using dbt selection syntax.",
+    metavar="SELECTION or MODEL_NAME",
+    help="--select dbt resources to compare using dbt selection syntax in dbt versions >= 1.5.\nIn versions < 1.5, it will naively search for a model with MODEL_NAME as the name.",
 )
 @click.option(
     "--state",
diff --git a/data_diff/dbt_parser.py b/data_diff/dbt_parser.py
@@ -20,8 +20,8 @@
     DataDiffDbtRunResultsVersionError,
     DataDiffDbtSelectNoMatchingModelsError,
     DataDiffDbtSelectUnexpectedError,
-    DataDiffDbtSelectVersionTooLowError,
     DataDiffDbtSnowflakeSetConnectionError,
+    DataDiffSimpleSelectNotFound,
 )
 
 from .utils import getLogger, get_from_dict_with_raise
@@ -167,9 +167,11 @@ def get_models(self, dbt_selection: Optional[str] = None):
                         "data-diff is using a dbt-core version < 1.5, update the environment's dbt-core version via pip install 'dbt-core>=1.5' in order to use `--select`"
                     )
             else:
-                raise DataDiffDbtSelectVersionTooLowError(
-                    f"The `--select` feature requires dbt >= 1.5, but your project's manifest.json is from dbt v{dbt_version}. Please follow these steps to use the `--select` feature: \n 1. Update your dbt-core version via pip install 'dbt-core>=1.5'. Details: https://docs.getdbt.com/docs/core/pip-install#change-dbt-core-versions \n 2. Execute any `dbt` command (`run`, `compile`, `build`) to create a new manifest.json."
+                # Naively get node named <dbt_selection>
+                logger.warning(
+                    f"Full `--select` support requires dbt >= 1.5. Naively searching for a single model with name: '{dbt_selection}'."
                 )
+                return self.get_simple_model_selection(dbt_selection)
         else:
             return self.get_run_results_models()
 
@@ -209,6 +211,25 @@ def get_dbt_selection_models(self, dbt_selection: str) -> List[str]:
         logger.debug(str(results))
         raise DataDiffDbtSelectUnexpectedError("Encountered an unexpected error while finding `--select` models")
 
+    def get_simple_model_selection(self, dbt_selection: str) -> list:
+        """Return a one-element list holding the manifest model node named `dbt_selection`.
+
+        Fallback for dbt < 1.5: names are compared for exact equality, selector syntax is not interpreted.
+        Raises DataDiffSimpleSelectNotFound when no "model." node in the dev manifest has that name.
+        """
+        nodes = self.dev_manifest_obj.nodes
+        matches = {k: v for k, v in nodes.items() if k.startswith("model.") and v.name == dbt_selection}
+        if not matches:
+            raise DataDiffSimpleSelectNotFound(
+                f"Did not find a model node with name '{dbt_selection}' in the manifest."
+            )
+        # name *should* always be unique, but just in case:
+        if len(matches) > 1:
+            logger.warning(
+                f"Found more than one model with name '{dbt_selection}' {list(matches)}, using the first one."
+            )
+        return [next(iter(matches.values()))]
+
     def get_run_results_models(self):
         with open(self.project_dir / RUN_RESULTS_PATH) as run_results:
             logger.info(f"Parsing file {RUN_RESULTS_PATH}")
diff --git a/data_diff/errors.py b/data_diff/errors.py
@@ -42,10 +42,6 @@ class DataDiffDbtCoreNoRunnerError(Exception):
     "Raised when the manifest version >= 1.5, but the dbt-core package is < 1.5. This is an edge case most likely to occur in development."
 
 
-class DataDiffDbtSelectVersionTooLowError(Exception):
-    "Raised when attempting to use `--select` with a dbt-core version < 1.5."
-
-
 class DataDiffCustomSchemaNoConfigError(Exception):
     "Raised when a model has a custom schema, but there is no prod_custom_schema config. (And not using --state)."
 
@@ -68,3 +64,7 @@ class DataDiffCloudDiffFailed(Exception):
 
 class DataDiffCloudDiffTimedOut(Exception):
     "Raised when using --cloud and the diff did not return finish before the timeout value."
+
+
+class DataDiffSimpleSelectNotFound(Exception):
+    "Raised when using `--select` with dbt < 1.5 and no model node with the given name is found in the manifest."
diff --git a/data_diff/sqeleton/databases/_connect.py b/data_diff/sqeleton/databases/_connect.py
@@ -106,7 +106,7 @@ def load_mixins(self, *abstract_mixins: AbstractMixin) -> Self:
         database_by_scheme = {k: db.load_mixins(*abstract_mixins) for k, db in self.database_by_scheme.items()}
         return type(self)(database_by_scheme)
 
-    def connect_to_uri(self, db_uri: str, thread_count: Optional[int] = 1) -> Database:
+    def connect_to_uri(self, db_uri: str, thread_count: Optional[int] = 1, **kwargs) -> Database:
         """Connect to the given database uri
 
         thread_count determines the max number of worker threads per database,
@@ -149,7 +149,7 @@ def connect_to_uri(self, db_uri: str, thread_count: Optional[int] = 1) -> Databa
                 conn_dict = config["database"][database]
             except KeyError:
                 raise ValueError(f"Cannot find database config named '{database}'.")
-            return self.connect_with_dict(conn_dict, thread_count)
+            return self.connect_with_dict(conn_dict, thread_count, **kwargs)
 
         try:
             matcher = self.match_uri_path[scheme]
@@ -174,7 +174,7 @@ def connect_to_uri(self, db_uri: str, thread_count: Optional[int] = 1) -> Databa
 
             if scheme == "bigquery":
                 kw["project"] = dsn.host
-                return cls(**kw)
+                return cls(**kw, **kwargs)
 
             if scheme == "snowflake":
                 kw["account"] = dsn.host
@@ -194,13 +194,13 @@ def connect_to_uri(self, db_uri: str, thread_count: Optional[int] = 1) -> Databa
         kw = {k: v for k, v in kw.items() if v is not None}
 
         if issubclass(cls, ThreadedDatabase):
-            db = cls(thread_count=thread_count, **kw)
+            db = cls(thread_count=thread_count, **kw, **kwargs)
         else:
-            db = cls(**kw)
+            db = cls(**kw, **kwargs)
 
         return self._connection_created(db)
 
-    def connect_with_dict(self, d, thread_count):
+    def connect_with_dict(self, d, thread_count, **kwargs):
         d = dict(d)
         driver = d.pop("driver")
         try:
@@ -210,17 +210,19 @@ def connect_with_dict(self, d, thread_count):
 
         cls = matcher.database_cls
         if issubclass(cls, ThreadedDatabase):
-            db = cls(thread_count=thread_count, **d)
+            db = cls(thread_count=thread_count, **d, **kwargs)
         else:
-            db = cls(**d)
+            db = cls(**d, **kwargs)
 
         return self._connection_created(db)
 
     def _connection_created(self, db):
         "Nop function to be overridden by subclasses."
         return db
 
-    def __call__(self, db_conf: Union[str, dict], thread_count: Optional[int] = 1, shared: bool = True) -> Database:
+    def __call__(
+        self, db_conf: Union[str, dict], thread_count: Optional[int] = 1, shared: bool = True, **kwargs
+    ) -> Database:
         """Connect to a database using the given database configuration.
 
         Configuration can be given either as a URI string, or as a dict of {option: value}.
@@ -234,6 +236,8 @@ def __call__(self, db_conf: Union[str, dict], thread_count: Optional[int] = 1, s
             db_conf (str | dict): The configuration for the database to connect. URI or dict.
             thread_count (int, optional): Size of the threadpool. Ignored by cloud databases. (default: 1)
             shared (bool): Whether to cache and return the same connection for the same db_conf. (default: True)
+            bigquery_credentials (google.oauth2.credentials.Credentials): Custom Google oAuth2 credential for BigQuery.
+            (default: None)
 
         Note: For non-cloud databases, a low thread-pool size may be a performance bottleneck.
 
@@ -263,9 +267,9 @@ def __call__(self, db_conf: Union[str, dict], thread_count: Optional[int] = 1, s
                     return conn
 
         if isinstance(db_conf, str):
-            conn = self.connect_to_uri(db_conf, thread_count)
+            conn = self.connect_to_uri(db_conf, thread_count, **kwargs)
         elif isinstance(db_conf, dict):
-            conn = self.connect_with_dict(db_conf, thread_count)
+            conn = self.connect_with_dict(db_conf, thread_count, **kwargs)
         else:
             raise TypeError(f"db configuration must be a URI string or a dictionary. Instead got '{db_conf}'.")
 
diff --git a/data_diff/sqeleton/databases/bigquery.py b/data_diff/sqeleton/databases/bigquery.py
@@ -210,8 +210,8 @@ class BigQuery(Database):
     CONNECT_URI_PARAMS = ["dataset"]
     dialect = Dialect()
 
-    def __init__(self, project, *, dataset, **kw):
-        credentials = None
+    def __init__(self, project, *, dataset, bigquery_credentials=None, **kw):
+        credentials = bigquery_credentials
         bigquery = import_bigquery()
 
         keyfile = kw.pop("keyfile", None)
diff --git a/data_diff/version.py b/data_diff/version.py
@@ -1 +1 @@
-__version__ = "0.8.3"
+__version__ = "0.8.4"
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "data-diff"
-version = "0.8.3"
+version = "0.8.4"
 description = "Command-line tool and Python library to efficiently diff rows across two different databases."
 authors = ["Datafold <data-diff@datafold.com>"]
 license = "MIT"
@@ -37,7 +37,7 @@ trino = {version="^0.314.0", optional=true}
 presto-python-client = {version="*", optional=true}
 clickhouse-driver = {version="*", optional=true}
 duckdb = {version="*", optional=true}
-dbt-artifacts-parser = {version="^0.4.0"}
+dbt-artifacts-parser = {version="^0.4.2"}
 dbt-core = {version="^1.0.0"}
 keyring = "*"
 tabulate = "^0.9.0"
@@ -59,7 +59,7 @@ presto-python-client = "*"
 clickhouse-driver = "*"
 vertica-python = "*"
 duckdb = "^0.7.0"
-dbt-artifacts-parser = "^0.4.0"
+dbt-artifacts-parser = "^0.4.2"
 dbt-core = "^1.0.0"
 # google-cloud-bigquery = "*"
 # databricks-sql-connector = "*"
diff --git a/tests/test_dbt_parser.py b/tests/test_dbt_parser.py
@@ -10,7 +10,6 @@
     DataDiffDbtProfileNotFoundError,
     DataDiffDbtRedshiftPasswordOnlyError,
     DataDiffDbtRunResultsVersionError,
-    DataDiffDbtSelectVersionTooLowError,
     DataDiffDbtSnowflakeSetConnectionError,
 )
 
@@ -56,17 +55,18 @@ def test_get_models(self):
         mock_self.get_dbt_selection_models.assert_called_once_with(selection)
         self.assertEqual(models, mock_return_value)
 
-    def test_get_models_unsupported_manifest_version(self):
+    def test_get_models_simple_select(self):
         mock_self = Mock()
         mock_self.project_dir = Path()
         mock_self.dbt_version = "1.4.0"
         selection = "model+"
         mock_return_value = Mock()
-        mock_self.get_dbt_selection_models.return_value = mock_return_value
+        mock_self.get_simple_model_selection.return_value = mock_return_value
 
-        with self.assertRaises(DataDiffDbtSelectVersionTooLowError):
-            _ = DbtParser.get_models(mock_self, selection)
+        models = DbtParser.get_models(mock_self, selection)
         mock_self.get_dbt_selection_models.assert_not_called()
+        mock_self.get_simple_model_selection.assert_called_with(selection)
+        self.assertEqual(models, mock_return_value)
 
     def test_get_models_no_runner(self):
         mock_self = Mock()

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1 +1 @@` |
| `1` |  | `-__version__ = "0.8.3"` |
|  | `1` | `+__version__ = "0.8.4"` |