Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 2b08d01

Browse files
authored
Merge pull request #213 from datafold/tracking2
Added optional tracking
2 parents 35ae1be + a2903bc commit 2b08d01

File tree

9 files changed

+279
-54
lines changed

9 files changed

+279
-54
lines changed

README.md

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ better than MySQL.
4646
- [How to use from Python](#how-to-use-from-python)
4747
- [Technical Explanation](#technical-explanation)
4848
- [Performance Considerations](#performance-considerations)
49+
- [Anonymous Tracking](#anonymous-tracking)
4950
- [Development Setup](#development-setup)
5051
- [License](#license)
5152

@@ -182,12 +183,12 @@ _<sup>*</sup> Some drivers have dependencies that cannot be installed using `pip
182183

183184
### Install Psycopg2
184185

185-
In order to run Postgresql, you'll need `psycopg2`. This Python package requires some additional dependencies described in their [documentation](https://www.psycopg.org/docs/install.html#build-prerequisites).
186+
In order to run Postgresql, you'll need `psycopg2`. This Python package requires some additional dependencies described in their [documentation](https://www.psycopg.org/docs/install.html#build-prerequisites).
186187
An easy solution is to install [psycopg2-binary](https://www.psycopg.org/docs/install.html#quick-install) by running:
187188

188189
```pip install psycopg2-binary```
189190

190-
Which comes with a pre-compiled binary and does not require additional prerequisites. However, note that for production use it is advised to use `psycopg2`.
191+
Which comes with a pre-compiled binary and does not require additional prerequisites. However, note that for production use it is advised to use `psycopg2`.
191192

192193

193194
# How to use
@@ -225,6 +226,8 @@ Options:
225226
- `-j` or `--threads` - Number of worker threads to use per database. Default=1.
226227
- `-w`, `--where` - An additional 'where' expression to restrict the search space.
227228
- `--conf`, `--run` - Specify the run and configuration from a TOML file. (see below)
229+
- `--no-tracking` - data-diff sends home anonymous usage data. Use this to disable it.
230+
228231

229232
### How to use with a configuration file
230233

@@ -469,6 +472,38 @@ If you pass `--stats` you'll see e.g. what % of rows were different.
469472
gaps), and improvements to bypass Python/driver performance limitations when
470473
comparing huge amounts of rows locally (i.e. for very high `bisection_threshold` values).
471474

475+
# Usage Analytics
476+
477+
data-diff collects anonymous usage data to help our team improve the tool and to apply development efforts to where our users need them most.
478+
479+
We capture two events, one when the data-diff run starts and one when it finishes. No user data or potentially sensitive information is or ever will be collected. The captured data is limited to:
480+
481+
- Operating System and Python version
482+
483+
- Types of databases used (postgresql, mysql, etc.)
484+
485+
- Sizes of tables diffed, run time, and diff row count (numbers only)
486+
487+
- Error message, if any, truncated to the first 20 characters.
488+
489+
- A persistent UUID to identify the session, stored in `~/.datadiff.toml`
490+
491+
If you do not wish to participate, the tracking can be easily disabled with one of the following methods:
492+
493+
* In the CLI, use the `--no-tracking` flag.
494+
495+
* In the config file, set `no_tracking = true` (for example, under `[run.default]`)
496+
497+
* If you're using the Python API:
498+
499+
```python
500+
import data_diff
501+
data_diff.disable_tracking() # Call this first, before making any API calls
502+
503+
# Connect and diff your tables without any tracking
504+
```
505+
506+
472507
# Development Setup
473508

474509
The development setup centers around using `docker-compose` to boot up various

data_diff/__init__.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from typing import Tuple, Iterator, Optional, Union
22

3+
from .tracking import disable_tracking
34
from .databases.connect import connect
45
from .diff_tables import (
56
TableSegment,
@@ -73,15 +74,19 @@ def diff_tables(
7374
7475
"""
7576
tables = [table1, table2]
76-
override_attrs = {k:v for k,v in dict(
77+
override_attrs = {
78+
k: v
79+
for k, v in dict(
7780
key_column=key_column,
7881
update_column=update_column,
7982
extra_columns=extra_columns,
8083
min_key=min_key,
8184
max_key=max_key,
8285
min_update=min_update,
8386
max_update=max_update,
84-
).items() if v is not None}
87+
).items()
88+
if v is not None
89+
}
8590

8691
segments = [t.new(**override_attrs) for t in tables] if override_attrs else tables
8792

data_diff/__main__.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import logging
66
from itertools import islice
77

8+
from data_diff.tracking import disable_tracking
9+
810
from .utils import remove_password_from_url, safezip, match_like
911

1012
from .diff_tables import (
@@ -81,6 +83,7 @@ def _get_schema(pair):
8183
@click.option("--json", "json_output", is_flag=True, help="Print JSONL output for machine readability")
8284
@click.option("-v", "--verbose", is_flag=True, help="Print extra info")
8385
@click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug")
86+
@click.option("--no-tracking", is_flag=True, help="data-diff sends home anonymous usage data. Use this to disable it.")
8487
@click.option(
8588
"--case-sensitive",
8689
is_flag=True,
@@ -128,6 +131,7 @@ def _main(
128131
debug,
129132
verbose,
130133
interactive,
134+
no_tracking,
131135
threads,
132136
case_sensitive,
133137
json_output,
@@ -137,6 +141,9 @@ def _main(
137141
__conf__=None,
138142
):
139143

144+
if no_tracking:
145+
disable_tracking()
146+
140147
if interactive:
141148
debug = True
142149

@@ -182,7 +189,7 @@ def _main(
182189
for db in dbs:
183190
db.enable_interactive()
184191

185-
start = time.time()
192+
start = time.monotonic()
186193

187194
try:
188195
options = dict(
@@ -274,7 +281,7 @@ def _main(
274281

275282
sys.stdout.flush()
276283

277-
end = time.time()
284+
end = time.monotonic()
278285

279286
logging.info(f"Duration: {end-start:.2f} seconds.")
280287

data_diff/diff_tables.py

Lines changed: 84 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,17 @@
1212

1313
from runtype import dataclass
1414

15+
from .tracking import create_end_event_json, create_start_event_json, send_event_json, is_tracking_enabled
1516
from .sql import Select, Checksum, Compare, Count, TableName, Time, Value
16-
from .utils import CaseAwareMapping, CaseInsensitiveDict, safezip, split_space, CaseSensitiveDict, ArithString
17+
from .utils import (
18+
CaseAwareMapping,
19+
CaseInsensitiveDict,
20+
safezip,
21+
split_space,
22+
CaseSensitiveDict,
23+
ArithString,
24+
run_as_daemon,
25+
)
1726
from .databases.base import Database
1827
from .databases.database_types import (
1928
DbPath,
@@ -225,11 +234,11 @@ def count(self) -> Tuple[int, int]:
225234

226235
def count_and_checksum(self) -> Tuple[int, int]:
227236
"""Count and checksum the rows in the segment, in one pass."""
228-
start = time.time()
237+
start = time.monotonic()
229238
count, checksum = self.database.query(
230239
self._make_select(columns=[Count(), Checksum(self._relevant_columns_repr)]), tuple
231240
)
232-
duration = time.time() - start
241+
duration = time.monotonic() - start
233242
if duration > RECOMMENDED_CHECKSUM_DURATION:
234243
logger.warning(
235244
f"Checksum is taking longer than expected ({duration:.2f}s). "
@@ -260,6 +269,11 @@ def query_key_range(self) -> Tuple[int, int]:
260269
def is_bounded(self):
261270
return self.min_key is not None and self.max_key is not None
262271

272+
def approximate_size(self):
273+
if not self.is_bounded:
274+
raise RuntimeError("Cannot approximate the size of an unbounded segment. Must have min_key and max_key.")
275+
return self.max_key - self.min_key
276+
263277

264278
def diff_sets(a: set, b: set) -> Iterator:
265279
s1 = set(a)
@@ -325,45 +339,79 @@ def diff_tables(self, table1: TableSegment, table2: TableSegment) -> DiffResult:
325339
if self.bisection_factor < 2:
326340
raise ValueError("Must have at least two segments per iteration (i.e. bisection_factor >= 2)")
327341

328-
# Query and validate schema
329-
table1, table2 = self._threaded_call("with_schema", [table1, table2])
330-
self._validate_and_adjust_columns(table1, table2)
331-
332-
key_type = table1._schema[table1.key_column]
333-
key_type2 = table2._schema[table2.key_column]
334-
if not isinstance(key_type, IKey):
335-
raise NotImplementedError(f"Cannot use column of type {key_type} as a key")
336-
if not isinstance(key_type2, IKey):
337-
raise NotImplementedError(f"Cannot use column of type {key_type2} as a key")
338-
assert key_type.python_type is key_type2.python_type
342+
if is_tracking_enabled():
343+
options = dict(self)
344+
event_json = create_start_event_json(options)
345+
run_as_daemon(send_event_json, event_json)
339346

340-
# Query min/max values
341-
key_ranges = self._threaded_call_as_completed("query_key_range", [table1, table2])
347+
self.stats["diff_count"] = 0
348+
start = time.monotonic()
349+
try:
342350

343-
# Start with the first completed value, so we don't waste time waiting
344-
min_key1, max_key1 = self._parse_key_range_result(key_type, next(key_ranges))
351+
# Query and validate schema
352+
table1, table2 = self._threaded_call("with_schema", [table1, table2])
353+
self._validate_and_adjust_columns(table1, table2)
345354

346-
table1, table2 = [t.new(min_key=min_key1, max_key=max_key1) for t in (table1, table2)]
355+
key_type = table1._schema[table1.key_column]
356+
key_type2 = table2._schema[table2.key_column]
357+
if not isinstance(key_type, IKey):
358+
raise NotImplementedError(f"Cannot use column of type {key_type} as a key")
359+
if not isinstance(key_type2, IKey):
360+
raise NotImplementedError(f"Cannot use column of type {key_type2} as a key")
361+
assert key_type.python_type is key_type2.python_type
347362

348-
logger.info(
349-
f"Diffing tables | segments: {self.bisection_factor}, bisection threshold: {self.bisection_threshold}. "
350-
f"key-range: {table1.min_key}..{table2.max_key}, "
351-
f"size: {table2.max_key-table1.min_key}"
352-
)
363+
# Query min/max values
364+
key_ranges = self._threaded_call_as_completed("query_key_range", [table1, table2])
353365

354-
# Bisect (split) the table into segments, and diff them recursively.
355-
yield from self._bisect_and_diff_tables(table1, table2)
366+
# Start with the first completed value, so we don't waste time waiting
367+
min_key1, max_key1 = self._parse_key_range_result(key_type, next(key_ranges))
356368

357-
# Now we check for the second min-max, to diff the portions we "missed".
358-
min_key2, max_key2 = self._parse_key_range_result(key_type, next(key_ranges))
369+
table1, table2 = [t.new(min_key=min_key1, max_key=max_key1) for t in (table1, table2)]
359370

360-
if min_key2 < min_key1:
361-
pre_tables = [t.new(min_key=min_key2, max_key=min_key1) for t in (table1, table2)]
362-
yield from self._bisect_and_diff_tables(*pre_tables)
371+
logger.info(
372+
f"Diffing tables | segments: {self.bisection_factor}, bisection threshold: {self.bisection_threshold}. "
373+
f"key-range: {table1.min_key}..{table2.max_key}, "
374+
f"size: {table1.approximate_size()}"
375+
)
363376

364-
if max_key2 > max_key1:
365-
post_tables = [t.new(min_key=max_key1, max_key=max_key2) for t in (table1, table2)]
366-
yield from self._bisect_and_diff_tables(*post_tables)
377+
# Bisect (split) the table into segments, and diff them recursively.
378+
yield from self._bisect_and_diff_tables(table1, table2)
379+
380+
# Now we check for the second min-max, to diff the portions we "missed".
381+
min_key2, max_key2 = self._parse_key_range_result(key_type, next(key_ranges))
382+
383+
if min_key2 < min_key1:
384+
pre_tables = [t.new(min_key=min_key2, max_key=min_key1) for t in (table1, table2)]
385+
yield from self._bisect_and_diff_tables(*pre_tables)
386+
387+
if max_key2 > max_key1:
388+
post_tables = [t.new(min_key=max_key1, max_key=max_key2) for t in (table1, table2)]
389+
yield from self._bisect_and_diff_tables(*post_tables)
390+
391+
error = None
392+
except BaseException as e: # Catch KeyboardInterrupt too
393+
error = e
394+
finally:
395+
if is_tracking_enabled():
396+
runtime = time.monotonic() - start
397+
table1_count = self.stats.get("table1_count")
398+
table2_count = self.stats.get("table2_count")
399+
diff_count = self.stats.get("diff_count")
400+
err_message = str(error)[:20] # Truncate possibly sensitive information.
401+
event_json = create_end_event_json(
402+
error is None,
403+
runtime,
404+
table1.database.name,
405+
table2.database.name,
406+
table1_count,
407+
table2_count,
408+
diff_count,
409+
err_message,
410+
)
411+
send_event_json(event_json)
412+
413+
if error:
414+
raise error
367415

368416
def _parse_key_range_result(self, key_type, key_range):
369417
mn, mx = key_range
@@ -441,6 +489,8 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
441489
self.stats["table1_count"] = len(rows1)
442490
self.stats["table2_count"] = len(rows2)
443491

492+
self.stats["diff_count"] += len(diff)
493+
444494
logger.info(". " * level + f"Diff found {len(diff)} different rows.")
445495
self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
446496
yield from diff

0 commit comments

Comments
 (0)