Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 2b08d01

Browse files
authored
Merge pull request #213 from datafold/tracking2
Added optional tracking
2 parents 35ae1be + a2903bc commit 2b08d01

File tree

9 files changed

+279
-54
lines changed

9 files changed

+279
-54
lines changed

README.md

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ better than MySQL.
4646
- [How to use from Python](#how-to-use-from-python)
4747
- [Technical Explanation](#technical-explanation)
4848
- [Performance Considerations](#performance-considerations)
49+
- [Anonymous Tracking](#anonymous-tracking)
4950
- [Development Setup](#development-setup)
5051
- [License](#license)
5152

@@ -182,12 +183,12 @@ _<sup>*</sup> Some drivers have dependencies that cannot be installed using `pip
182183

183184
### Install Psycopg2
184185

185-
In order to run Postgresql, you'll need `psycopg2`. This Python package requires some additional dependencies described in their [documentation](https://www.psycopg.org/docs/install.html#build-prerequisites).
186+
In order to run Postgresql, you'll need `psycopg2`. This Python package requires some additional dependencies described in their [documentation](https://www.psycopg.org/docs/install.html#build-prerequisites).
186187
An easy solution is to install [psycopg2-binary](https://www.psycopg.org/docs/install.html#quick-install) by running:
187188

188189
```pip install psycopg2-binary```
189190

190-
Which comes with a pre-compiled binary and does not require additional prerequisites. However, note that for production use it is advised to use `psycopg2`.
191+
Which comes with a pre-compiled binary and does not require additional prerequisites. However, note that for production use it is advised to use `psycopg2`.
191192

192193

193194
# How to use
@@ -225,6 +226,8 @@ Options:
225226
- `-j` or `--threads` - Number of worker threads to use per database. Default=1.
226227
- `-w`, `--where` - An additional 'where' expression to restrict the search space.
227228
- `--conf`, `--run` - Specify the run and configuration from a TOML file. (see below)
229+
- `--no-tracking` - data-diff sends home anonymous usage data. Use this to disable it.
230+
228231

229232
### How to use with a configuration file
230233

@@ -469,6 +472,38 @@ If you pass `--stats` you'll see e.g. what % of rows were different.
469472
gaps), and improvements to bypass Python/driver performance limitations when
470473
comparing huge amounts of rows locally (i.e. for very high `bisection_threshold` values).
471474

475+
# Usage Analytics
476+
477+
data-diff collects anonymous usage data to help our team improve the tool and to apply development efforts to where our users need them most.
478+
479+
We capture two events, one when the data-diff run starts and one when it finishes. No user data or potentially sensitive information is or ever will be collected. The captured data is limited to:
480+
481+
- Operating System and Python version
482+
483+
- Types of databases used (postgresql, mysql, etc.)
484+
485+
- Sizes of tables diffed, run time, and diff row count (numbers only)
486+
487+
- Error message, if any, truncated to the first 20 characters.
488+
489+
- A persistent UUID to identify the session, stored in `~/.datadiff.toml`
490+
491+
If you do not wish to participate, the tracking can be easily disabled with one of the following methods:
492+
493+
* In the CLI, use the `--no-tracking` flag.
494+
495+
* In the config file, set `no_tracking = true` (for example, under `[run.default]`)
496+
497+
* If you're using the Python API:
498+
499+
```python
500+
import data_diff
501+
data_diff.disable_tracking() # Call this first, before making any API calls
502+
503+
# Connect and diff your tables without any tracking
504+
```
505+
506+
472507
# Development Setup
473508

474509
The development setup centers around using `docker-compose` to boot up various

data_diff/__init__.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from typing import Tuple, Iterator, Optional, Union
22

3+
from .tracking import disable_tracking
34
from .databases.connect import connect
45
from .diff_tables import (
56
TableSegment,
@@ -73,15 +74,19 @@ def diff_tables(
7374
7475
"""
7576
tables = [table1, table2]
76-
override_attrs = {k:v for k,v in dict(
77+
override_attrs = {
78+
k: v
79+
for k, v in dict(
7780
key_column=key_column,
7881
update_column=update_column,
7982
extra_columns=extra_columns,
8083
min_key=min_key,
8184
max_key=max_key,
8285
min_update=min_update,
8386
max_update=max_update,
84-
).items() if v is not None}
87+
).items()
88+
if v is not None
89+
}
8590

8691
segments = [t.new(**override_attrs) for t in tables] if override_attrs else tables
8792

data_diff/__main__.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import logging
66
from itertools import islice
77

8+
from data_diff.tracking import disable_tracking
9+
810
from .utils import remove_password_from_url, safezip, match_like
911

1012
from .diff_tables import (
@@ -81,6 +83,7 @@ def _get_schema(pair):
8183
@click.option("--json", "json_output", is_flag=True, help="Print JSONL output for machine readability")
8284
@click.option("-v", "--verbose", is_flag=True, help="Print extra info")
8385
@click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug")
86+
@click.option("--no-tracking", is_flag=True, help="data-diff sends home anonymous usage data. Use this to disable it.")
8487
@click.option(
8588
"--case-sensitive",
8689
is_flag=True,
@@ -128,6 +131,7 @@ def _main(
128131
debug,
129132
verbose,
130133
interactive,
134+
no_tracking,
131135
threads,
132136
case_sensitive,
133137
json_output,
@@ -137,6 +141,9 @@ def _main(
137141
__conf__=None,
138142
):
139143

144+
if no_tracking:
145+
disable_tracking()
146+
140147
if interactive:
141148
debug = True
142149

@@ -182,7 +189,7 @@ def _main(
182189
for db in dbs:
183190
db.enable_interactive()
184191

185-
start = time.time()
192+
start = time.monotonic()
186193

187194
try:
188195
options = dict(
@@ -274,7 +281,7 @@ def _main(
274281

275282
sys.stdout.flush()
276283

277-
end = time.time()
284+
end = time.monotonic()
278285

279286
logging.info(f"Duration: {end-start:.2f} seconds.")
280287

data_diff/diff_tables.py

Lines changed: 84 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,17 @@
1212

1313
from runtype import dataclass
1414

15+
from .tracking import create_end_event_json, create_start_event_json, send_event_json, is_tracking_enabled
1516
from .sql import Select, Checksum, Compare, Count, TableName, Time, Value
16-
from .utils import CaseAwareMapping, CaseInsensitiveDict, safezip, split_space, CaseSensitiveDict, ArithString
17+
from .utils import (
18+
CaseAwareMapping,
19+
CaseInsensitiveDict,
20+
safezip,
21+
split_space,
22+
CaseSensitiveDict,
23+
ArithString,
24+
run_as_daemon,
25+
)
1726
from .databases.base import Database
1827
from .databases.database_types import (
1928
DbPath,
@@ -225,11 +234,11 @@ def count(self) -> Tuple[int, int]:
225234

226235
def count_and_checksum(self) -> Tuple[int, int]:
227236
"""Count and checksum the rows in the segment, in one pass."""
228-
start = time.time()
237+
start = time.monotonic()
229238
count, checksum = self.database.query(
230239
self._make_select(columns=[Count(), Checksum(self._relevant_columns_repr)]), tuple
231240
)
232-
duration = time.time() - start
241+
duration = time.monotonic() - start
233242
if duration > RECOMMENDED_CHECKSUM_DURATION:
234243
logger.warning(
235244
f"Checksum is taking longer than expected ({duration:.2f}s). "
@@ -260,6 +269,11 @@ def query_key_range(self) -> Tuple[int, int]:
260269
def is_bounded(self):
261270
return self.min_key is not None and self.max_key is not None
262271

272+
def approximate_size(self):
273+
if not self.is_bounded:
274+
raise RuntimeError("Cannot approximate the size of an unbounded segment. Must have min_key and max_key.")
275+
return self.max_key - self.min_key
276+
263277

264278
def diff_sets(a: set, b: set) -> Iterator:
265279
s1 = set(a)
@@ -325,45 +339,79 @@ def diff_tables(self, table1: TableSegment, table2: TableSegment) -> DiffResult:
325339
if self.bisection_factor < 2:
326340
raise ValueError("Must have at least two segments per iteration (i.e. bisection_factor >= 2)")
327341

328-
# Query and validate schema
329-
table1, table2 = self._threaded_call("with_schema", [table1, table2])
330-
self._validate_and_adjust_columns(table1, table2)
331-
332-
key_type = table1._schema[table1.key_column]
333-
key_type2 = table2._schema[table2.key_column]
334-
if not isinstance(key_type, IKey):
335-
raise NotImplementedError(f"Cannot use column of type {key_type} as a key")
336-
if not isinstance(key_type2, IKey):
337-
raise NotImplementedError(f"Cannot use column of type {key_type2} as a key")
338-
assert key_type.python_type is key_type2.python_type
342+
if is_tracking_enabled():
343+
options = dict(self)
344+
event_json = create_start_event_json(options)
345+
run_as_daemon(send_event_json, event_json)
339346

340-
# Query min/max values
341-
key_ranges = self._threaded_call_as_completed("query_key_range", [table1, table2])
347+
self.stats["diff_count"] = 0
348+
start = time.monotonic()
349+
try:
342350

343-
# Start with the first completed value, so we don't waste time waiting
344-
min_key1, max_key1 = self._parse_key_range_result(key_type, next(key_ranges))
351+
# Query and validate schema
352+
table1, table2 = self._threaded_call("with_schema", [table1, table2])
353+
self._validate_and_adjust_columns(table1, table2)
345354

346-
table1, table2 = [t.new(min_key=min_key1, max_key=max_key1) for t in (table1, table2)]
355+
key_type = table1._schema[table1.key_column]
356+
key_type2 = table2._schema[table2.key_column]
357+
if not isinstance(key_type, IKey):
358+
raise NotImplementedError(f"Cannot use column of type {key_type} as a key")
359+
if not isinstance(key_type2, IKey):
360+
raise NotImplementedError(f"Cannot use column of type {key_type2} as a key")
361+
assert key_type.python_type is key_type2.python_type
347362

348-
logger.info(
349-
f"Diffing tables | segments: {self.bisection_factor}, bisection threshold: {self.bisection_threshold}. "
350-
f"key-range: {table1.min_key}..{table2.max_key}, "
351-
f"size: {table2.max_key-table1.min_key}"
352-
)
363+
# Query min/max values
364+
key_ranges = self._threaded_call_as_completed("query_key_range", [table1, table2])
353365

354-
# Bisect (split) the table into segments, and diff them recursively.
355-
yield from self._bisect_and_diff_tables(table1, table2)
366+
# Start with the first completed value, so we don't waste time waiting
367+
min_key1, max_key1 = self._parse_key_range_result(key_type, next(key_ranges))
356368

357-
# Now we check for the second min-max, to diff the portions we "missed".
358-
min_key2, max_key2 = self._parse_key_range_result(key_type, next(key_ranges))
369+
table1, table2 = [t.new(min_key=min_key1, max_key=max_key1) for t in (table1, table2)]
359370

360-
if min_key2 < min_key1:
361-
pre_tables = [t.new(min_key=min_key2, max_key=min_key1) for t in (table1, table2)]
362-
yield from self._bisect_and_diff_tables(*pre_tables)
371+
logger.info(
372+
f"Diffing tables | segments: {self.bisection_factor}, bisection threshold: {self.bisection_threshold}. "
373+
f"key-range: {table1.min_key}..{table2.max_key}, "
374+
f"size: {table1.approximate_size()}"
375+
)
363376

364-
if max_key2 > max_key1:
365-
post_tables = [t.new(min_key=max_key1, max_key=max_key2) for t in (table1, table2)]
366-
yield from self._bisect_and_diff_tables(*post_tables)
377+
# Bisect (split) the table into segments, and diff them recursively.
378+
yield from self._bisect_and_diff_tables(table1, table2)
379+
380+
# Now we check for the second min-max, to diff the portions we "missed".
381+
min_key2, max_key2 = self._parse_key_range_result(key_type, next(key_ranges))
382+
383+
if min_key2 < min_key1:
384+
pre_tables = [t.new(min_key=min_key2, max_key=min_key1) for t in (table1, table2)]
385+
yield from self._bisect_and_diff_tables(*pre_tables)
386+
387+
if max_key2 > max_key1:
388+
post_tables = [t.new(min_key=max_key1, max_key=max_key2) for t in (table1, table2)]
389+
yield from self._bisect_and_diff_tables(*post_tables)
390+
391+
error = None
392+
except BaseException as e: # Catch KeyboardInterrupt too
393+
error = e
394+
finally:
395+
if is_tracking_enabled():
396+
runtime = time.monotonic() - start
397+
table1_count = self.stats.get("table1_count")
398+
table2_count = self.stats.get("table2_count")
399+
diff_count = self.stats.get("diff_count")
400+
err_message = str(error)[:20] # Truncate possibly sensitive information.
401+
event_json = create_end_event_json(
402+
error is None,
403+
runtime,
404+
table1.database.name,
405+
table2.database.name,
406+
table1_count,
407+
table2_count,
408+
diff_count,
409+
err_message,
410+
)
411+
send_event_json(event_json)
412+
413+
if error:
414+
raise error
367415

368416
def _parse_key_range_result(self, key_type, key_range):
369417
mn, mx = key_range
@@ -441,6 +489,8 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
441489
self.stats["table1_count"] = len(rows1)
442490
self.stats["table2_count"] = len(rows2)
443491

492+
self.stats["diff_count"] += len(diff)
493+
444494
logger.info(". " * level + f"Diff found {len(diff)} different rows.")
445495
self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
446496
yield from diff

0 commit comments

Comments
 (0)