diff --git a/README.md b/README.md index 244f1560e1..2457f65adb 100644 --- a/README.md +++ b/README.md @@ -305,7 +305,6 @@ Please help us add more systems and run the benchmarks on more types of VMs: - [ ] Hive - [ ] Hydrolix - [ ] Impala -- [ ] InfluxDB - [ ] LocustDB - [ ] Manticore Search - [ ] MS SQL Server with Column Store Index (without publishing) diff --git a/influxdb/README.md b/influxdb/README.md new file mode 100644 index 0000000000..186e0ce78d --- /dev/null +++ b/influxdb/README.md @@ -0,0 +1,37 @@ +# InfluxDB + +This entry uses [InfluxDB 3 Core](https://docs.influxdata.com/influxdb3/core/), the open-source SQL-capable +release of InfluxDB. The query engine is Apache DataFusion, the storage is local Parquet. + +## Caveats + +InfluxDB is a time-series database, not a general analytical database, so loading a flat 100M-row +analytical dataset into it stretches the data model: + +1. **No bulk CSV/Parquet import.** The only ingestion path is line protocol over HTTP + (`/api/v3/write_lp`). `load.py` streams `hits.tsv`, converts each row to a line-protocol point, and + POSTs in batches. The conversion + ingest is the dominant cost of the load phase and is much slower + than e.g. Postgres `\copy` or DuckDB `COPY FROM`. + +2. **Required unique timestamp.** Line protocol merges points that share `(measurement, tags, timestamp)`, + so to preserve all rows we use the row index as the line protocol timestamp (in nanoseconds, offset + from a fixed 2020-01-01 epoch). The original `EventTime` is stored as a regular string field and used + by the queries. + +3. **No tags, all fields.** Tags are indexed at ingest time; for a wide flat schema the indexing cost + is prohibitive. Every column is written as a field instead. Numeric columns use the integer + line-protocol type (`...i`); string and date/time columns are written as strings. + +4. **Query compatibility.** Most ClickBench queries run unchanged. 
Q19 and Q43 cast `EventTime` (stored
+   as string) to a `TIMESTAMP` for `extract(minute ...)` and `date_trunc('minute', ...)`. DataFusion
+   folds unquoted identifiers to lowercase, so `load.py` writes column names in lowercase to keep the
+   standard CamelCase queries portable.
+
+## Run
+
+```
+./benchmark.sh
+```
+
+The server listens on port 8181, stores data under `./influxdb3-data`, and runs without authentication
+(`--without-auth`) for the duration of the benchmark.
diff --git a/influxdb/benchmark.sh b/influxdb/benchmark.sh
new file mode 100755
index 0000000000..786e4ffd96
--- /dev/null
+++ b/influxdb/benchmark.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+
+set -eu
+
+export DEBIAN_FRONTEND=noninteractive
+
+# Install dependencies and the InfluxDB 3 Core binary directly. We bypass the
+# upstream install_influxdb3.sh installer because it is interactive and not
+# suited for unattended runs.
+# wget is listed explicitly because the tarball download below uses it and
+# minimal images do not ship it.
+sudo apt-get update -qq >/dev/null
+sudo apt-get install -y -qq python3 python3-requests curl wget jq time >/dev/null
+
+INFLUX_VERSION=3.9.2
+case "$(uname -m)" in
+    x86_64|amd64) INFLUX_ARTIFACT=linux_amd64 ;;
+    aarch64|arm64) INFLUX_ARTIFACT=linux_arm64 ;;
+    *) echo "Unsupported architecture: $(uname -m)" >&2; exit 1 ;;
+esac
+
+INFLUX_TGZ="influxdb3-core-${INFLUX_VERSION}_${INFLUX_ARTIFACT}.tar.gz"
+wget --continue -q "https://dl.influxdata.com/influxdb/releases/${INFLUX_TGZ}"
+rm -rf "influxdb3-core-${INFLUX_VERSION}"
+tar -xzf "${INFLUX_TGZ}"
+INFLUXDB3="${PWD}/influxdb3-core-${INFLUX_VERSION}/influxdb3"
+# Stop here with a clear message if the archive layout is not what we expect,
+# instead of failing later inside start_server.
+if [ ! -x "${INFLUXDB3}" ]; then
+    echo "influxdb3 binary not found at ${INFLUXDB3}" >&2
+    exit 1
+fi
+
+# Start the server with local-file storage and authentication disabled.
+# The --wal-* tunings reduce per-second fsync churn during the multi-hour
+# load and let more write requests accumulate in memory before being
+# rejected with back-pressure.
+mkdir -p ./influxdb3-data
+
+# start_server: launch `influxdb3 serve` in the background and wait for it.
+# Globals:  INFLUXDB3 (read), INFLUXDB_PID (written)
+# Outputs:  server output goes to ./influxdb3.log; the PID is echoed
+# Returns:  0 once /health answers; 1 if the process exits during startup
+#           or /health does not answer within 300 polls (1 s apart)
+start_server() {
+    nohup "${INFLUXDB3}" serve \
+        --node-id node0 \
+        --object-store file \
+        --data-dir "${PWD}/influxdb3-data" \
+        --http-bind 127.0.0.1:8181 \
+        --without-auth \
+        --wal-max-write-buffer-size 1000000 \
+        --max-http-request-size 67108864 \
+        --exec-mem-pool-bytes 80% \
+        > influxdb3.log 2>&1 &
+    INFLUXDB_PID=$!
+    echo "InfluxDB PID: ${INFLUXDB_PID}"
+
+    local attempt
+    for ((attempt = 0; attempt < 300; attempt++)); do
+        if curl -sf http://localhost:8181/health > /dev/null; then
+            return 0
+        fi
+        # Fail fast: if the process is already gone there is no point in
+        # polling /health for the remaining attempts.
+        if ! kill -0 "${INFLUXDB_PID}" 2>/dev/null; then
+            echo "InfluxDB exited during startup; last log lines:" >&2
+            tail -n 20 influxdb3.log >&2 || true
+            return 1
+        fi
+        sleep 1
+    done
+    echo "Timed out waiting for InfluxDB to start" >&2
+    return 1
+}
+
+# restart_server: stop the running server and start a fresh one.
+# Globals:  INFLUXDB_PID (read, then rewritten by start_server)
+# Returns:  the status of start_server
+restart_server() {
+    # SIGTERM forces the WAL to drain into Parquet and the in-memory write
+    # buffers to flush; the next start comes up with no WAL to replay.
+    # Errors are ignored on purpose: the process may already be gone.
+    kill -TERM "${INFLUXDB_PID}" 2>/dev/null || true
+    wait "${INFLUXDB_PID}" 2>/dev/null || true
+    start_server
+}
+
+start_server
+
+"${INFLUXDB3}" create database hits
+
+# Download the dataset and load it via line protocol.
+../download-hits-tsv
+
+# Load in chunks, restarting the server between each chunk so the WAL drains
+# into Parquet. With one monolithic load, every Parquet file ends up covering
+# the same broad time range (16 parallel writers interleave timestamps across
+# the whole dataset), and InfluxDB 3.9.2's regroup_files optimizer hits an
+# internal "overlapping ranges within same file" assertion at query time.
+# Chunking keeps each Parquet file's [min_time, max_time] bounded to a
+# disjoint slice, so subsequent queries can plan successfully.
+TOTAL_ROWS=99997497
+CHUNKS=10
+# Ceiling division, so CHUNKS slices always cover all TOTAL_ROWS rows.
+CHUNK_ROWS=$(( (TOTAL_ROWS + CHUNKS - 1) / CHUNKS ))
+
+load_t0=$(date +%s)
+for ((i = 0; i < CHUNKS; i++)); do
+    chunk_start=$((i * CHUNK_ROWS))
+    chunk_end=$((chunk_start + CHUNK_ROWS))
+    # The final slice is clamped to the end of the dataset.
+    if ((chunk_end > TOTAL_ROWS)); then
+        chunk_end=${TOTAL_ROWS}
+    fi
+    echo "Chunk $((i + 1))/${CHUNKS}: rows ${chunk_start}..${chunk_end}"
+    python3 load.py --start-row "${chunk_start}" --end-row "${chunk_end}"
+    # Restart after every slice: the shutdown drains the WAL, so this
+    # slice's rows reach Parquet before the next slice is buffered.
+    restart_server
+done
+load_t1=$(date +%s)
+echo "Load time: $((load_t1 - load_t0))"
+
+# The restart that followed the last slice already left a fresh server
+# running, so the query phase starts without another restart.
+
+# Run queries.
+./run.sh | tee log.txt
+
+echo -n "Data size: "
+du -bcs ./influxdb3-data | awk '/total/ {print $1}'
+
+kill "${INFLUXDB_PID}" || true
diff --git a/influxdb/load.py b/influxdb/load.py
new file mode 100755
index 0000000000..9ee906e3cb
--- /dev/null
+++ b/influxdb/load.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python3
+"""Stream hits.tsv into InfluxDB 3 via line protocol over HTTP.
+
+Each row of the TSV becomes one line-protocol point in measurement ``hits``.
+All columns are written as fields (no tags). The line protocol timestamp is
+derived from the row index so that every point is unique.
+
+Batches are encoded and uploaded by a thread pool: encoding happens under
+the GIL, but the HTTP upload releases the GIL while waiting on the socket
+and on the InfluxDB server, so several uploads can be in flight at once.
+
+Optionally accepts --start-row / --end-row so the load can be split into
+chunks across multiple invocations. The wrapper script restarts InfluxDB
+between chunks to drain the WAL into Parquet, which keeps the per-Parquet
+``[min_time, max_time]`` ranges disjoint across chunks.
That layout is what
+keeps InfluxDB 3's ``regroup_files`` optimizer from tripping its
+"overlapping ranges within same file" assertion at query time.
+"""
+
+import argparse
+import csv
+import sys
+import time
+from concurrent.futures import ThreadPoolExecutor
+from itertools import islice
+
+import requests
+
+# Write endpoint, target database and input file used by the loader.
+URL = "http://localhost:8181/api/v3/write_lp"
+DB = "hits"
+INPUT = "hits.tsv"
+# Default for --end-row. benchmark.sh hard-codes the same value as its own
+# TOTAL_ROWS, so the two must be changed together.
+TOTAL_ROWS = 99997497
+
+# Each line-protocol point is roughly 2 KB encoded (105 fields with their
+# names repeated on every row). 2000 rows keeps the body comfortably under
+# the 64 MiB max-http-request-size we set on the server. Larger batches mean
+# fewer HTTP round-trips.
+BATCH_ROWS = 2000
+
+# Number of upload workers. InfluxDB's per-connection ingest is limited; what
+# scales here is concurrent in-flight HTTP requests. 16 workers gives ~30× the
+# throughput of single-threaded loading on this dataset; doubling to 32
+# only adds another ~10%.
+WORKERS = 16
+
+# Bound the in-flight queue so we don't accumulate gigabytes of pending
+# bodies if the server back-pressures.
+MAX_PENDING = WORKERS * 2
+
+# Minimum number of seconds between two progress lines printed by main().
+PROGRESS_INTERVAL_SECONDS = 30
+
+# 2020-01-01 00:00:00 UTC, in nanoseconds. Row i is written at TS_BASE + i ns,
+# guaranteeing per-row uniqueness while keeping timestamps in a reasonable range.
+TS_BASE = 1577836800_000_000_000
+
+# Column schema (name, type) in TSV order. Type 'i' = integer, 's' = string.
+# Mirrors postgresql/create.sql.
+COLUMNS = [ + ("WatchID", "i"), + ("JavaEnable", "i"), + ("Title", "s"), + ("GoodEvent", "i"), + ("EventTime", "s"), + ("EventDate", "s"), + ("CounterID", "i"), + ("ClientIP", "i"), + ("RegionID", "i"), + ("UserID", "i"), + ("CounterClass", "i"), + ("OS", "i"), + ("UserAgent", "i"), + ("URL", "s"), + ("Referer", "s"), + ("IsRefresh", "i"), + ("RefererCategoryID", "i"), + ("RefererRegionID", "i"), + ("URLCategoryID", "i"), + ("URLRegionID", "i"), + ("ResolutionWidth", "i"), + ("ResolutionHeight", "i"), + ("ResolutionDepth", "i"), + ("FlashMajor", "i"), + ("FlashMinor", "i"), + ("FlashMinor2", "s"), + ("NetMajor", "i"), + ("NetMinor", "i"), + ("UserAgentMajor", "i"), + ("UserAgentMinor", "s"), + ("CookieEnable", "i"), + ("JavascriptEnable", "i"), + ("IsMobile", "i"), + ("MobilePhone", "i"), + ("MobilePhoneModel", "s"), + ("Params", "s"), + ("IPNetworkID", "i"), + ("TraficSourceID", "i"), + ("SearchEngineID", "i"), + ("SearchPhrase", "s"), + ("AdvEngineID", "i"), + ("IsArtifical", "i"), + ("WindowClientWidth", "i"), + ("WindowClientHeight", "i"), + ("ClientTimeZone", "i"), + ("ClientEventTime", "s"), + ("SilverlightVersion1", "i"), + ("SilverlightVersion2", "i"), + ("SilverlightVersion3", "i"), + ("SilverlightVersion4", "i"), + ("PageCharset", "s"), + ("CodeVersion", "i"), + ("IsLink", "i"), + ("IsDownload", "i"), + ("IsNotBounce", "i"), + ("FUniqID", "i"), + ("OriginalURL", "s"), + ("HID", "i"), + ("IsOldCounter", "i"), + ("IsEvent", "i"), + ("IsParameter", "i"), + ("DontCountHits", "i"), + ("WithHash", "i"), + ("HitColor", "s"), + ("LocalEventTime", "s"), + ("Age", "i"), + ("Sex", "i"), + ("Income", "i"), + ("Interests", "i"), + ("Robotness", "i"), + ("RemoteIP", "i"), + ("WindowName", "i"), + ("OpenerName", "i"), + ("HistoryLength", "i"), + ("BrowserLanguage", "s"), + ("BrowserCountry", "s"), + ("SocialNetwork", "s"), + ("SocialAction", "s"), + ("HTTPError", "i"), + ("SendTiming", "i"), + ("DNSTiming", "i"), + ("ConnectTiming", "i"), + ("ResponseStartTiming", 
"i"), + ("ResponseEndTiming", "i"), + ("FetchTiming", "i"), + ("SocialSourceNetworkID", "i"), + ("SocialSourcePage", "s"), + ("ParamPrice", "i"), + ("ParamOrderID", "s"), + ("ParamCurrency", "s"), + ("ParamCurrencyID", "i"), + ("OpenstatServiceName", "s"), + ("OpenstatCampaignID", "s"), + ("OpenstatAdID", "s"), + ("OpenstatSourceID", "s"), + ("UTMSource", "s"), + ("UTMMedium", "s"), + ("UTMCampaign", "s"), + ("UTMContent", "s"), + ("UTMTerm", "s"), + ("FromTag", "s"), + ("HasGCLID", "i"), + ("RefererHash", "i"), + ("URLHash", "i"), + ("CLID", "i"), +] + +# String fields need each value individually escaped; integers can be written raw. +_STR_TRANS = str.maketrans({"\\": "\\\\", '"': '\\"', "\n": " ", "\r": " "}) + +# DataFusion (the InfluxDB 3 query engine) folds unquoted identifiers to +# lowercase, so we lowercase field names at load time. That way the standard +# ClickBench queries with CamelCase column references (e.g. ``EventDate``) +# resolve correctly without needing to be quoted. +_LOWER_COLUMNS = [(name.lower(), ty) for name, ty in COLUMNS] +_NUM_COLS = len(_LOWER_COLUMNS) + +_session = requests.Session() +_session.headers["Content-Type"] = "text/plain; charset=utf-8" +_PARAMS = {"db": DB, "precision": "nanosecond", "accept_partial": "false"} + + +def encode_and_upload(rows, ts_start): + out = [] + for i, row in enumerate(rows): + if len(row) != _NUM_COLS: + row = (row + [""] * _NUM_COLS)[:_NUM_COLS] + parts = [] + for (name, ty), value in zip(_LOWER_COLUMNS, row): + if ty == "i": + # Integer field — append 'i' suffix per line-protocol spec. + # Empty/blank cells become 0. 
+ v = value if value else "0" + parts.append(f"{name}={v}i") + else: + v = value.translate(_STR_TRANS) if value else "" + parts.append(f'{name}="{v}"') + out.append(f"hits {','.join(parts)} {ts_start + i}\n") + + body = "".join(out).encode("utf-8") + r = _session.post(URL, params=_PARAMS, data=body, timeout=600) + if r.status_code >= 300: + sys.stderr.write(f"write_lp HTTP {r.status_code}: {r.text[:500]}\n") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--start-row", type=int, default=0) + parser.add_argument("--end-row", type=int, default=TOTAL_ROWS) + args = parser.parse_args() + start_row = args.start_row + end_row = args.end_row + chunk_rows = end_row - start_row + + total = 0 + next_ts = TS_BASE + start_row + last_log = time.monotonic() + pending = [] + + with ThreadPoolExecutor(max_workers=WORKERS) as executor: + with open(INPUT, "r", encoding="utf-8", errors="replace", newline="") as f: + reader = csv.reader( + f, delimiter="\t", quoting=csv.QUOTE_NONE, escapechar=None + ) + # Skip rows before our chunk. csv.QUOTE_NONE means each record is + # exactly one line, so islice over the reader is safe. + if start_row: + for _ in islice(reader, start_row): + pass + + while total < chunk_rows: + take = min(BATCH_ROWS, chunk_rows - total) + batch = list(islice(reader, take)) + if not batch: + break + + pending.append(executor.submit(encode_and_upload, batch, next_ts)) + next_ts += len(batch) + total += len(batch) + + # Drain oldest futures so memory stays bounded and any error + # surfaces promptly. 
+ while len(pending) >= MAX_PENDING: + pending.pop(0).result() + + now = time.monotonic() + if now - last_log > PROGRESS_INTERVAL_SECONDS: + pct = 100.0 * total / chunk_rows + print( + f" {pct:5.2f}% ({total}/{chunk_rows})" + f" rows {start_row}..{start_row + total}", + flush=True, + ) + last_log = now + + for fut in pending: + fut.result() + + print(f"Total rows written: {total} (chunk {start_row}..{start_row + total})") + + +if __name__ == "__main__": + main() diff --git a/influxdb/queries.sql b/influxdb/queries.sql new file mode 100644 index 0000000000..47114dee7f --- /dev/null +++ b/influxdb/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, 
SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM CAST(EventTime AS TIMESTAMP)) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), 
SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), 
SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, 
Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
+SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
+SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
+SELECT DATE_TRUNC('minute', CAST(EventTime AS TIMESTAMP)) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', CAST(EventTime AS TIMESTAMP)) ORDER BY DATE_TRUNC('minute', CAST(EventTime AS TIMESTAMP)) LIMIT 10 OFFSET 1000;
diff --git a/influxdb/run.sh b/influxdb/run.sh
new file mode 100755
index 0000000000..c8b9a6136d
--- /dev/null
+++ b/influxdb/run.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# Runs every query in queries.sql TRIES times against the local server and
+# prints one "[t1, t2, t3]," line per query (seconds, or null on failure).
+
+TRIES=3
+URL="http://localhost:8181/api/v3/query_sql"
+
+# Disable globbing: the queries contain `*`.
+set -f
+# `|| [ -n "$query" ]` keeps the last query even when queries.sql does not
+# end with a newline.
+while IFS= read -r query || [ -n "$query" ]; do
+    [ -z "$query" ] && continue
+    # Caches are dropped once per query, before the first try.
+    sync
+    echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null
+
+    # jq builds the JSON request so the query text is escaped correctly.
+    body=$(jq -n --arg q "$query" '{db: "hits", q: $q, format: "json"}')
+
+    echo -n "["
+    for i in $(seq 1 $TRIES); do
+        t1=$(date +%s%N)
+        # `-s` (silent) without `-S` so transient curl errors like the
+        # "transfer closed" message that DataFusion emits when a query OOMs
+        # don't pollute the captured benchmark log; the non-zero exit code
+        # is enough for us to record a `null` below.
+        curl -s --fail --max-time 600 -H 'Content-Type: application/json' \
+            -X POST "$URL" -d "$body" > /dev/null 2>&1
+        rc=$?
+        t2=$(date +%s%N)
+        if [ "$rc" = "0" ]; then
+            # Subtract in 64-bit shell arithmetic: nanosecond epoch values
+            # exceed the 2^53 range that awk doubles represent exactly.
+            awk -v ns="$((t2 - t1))" 'BEGIN { printf "%.3f", ns / 1000000000 }'
+        else
+            echo -n "null"
+        fi
+        [ "$i" != "$TRIES" ] && echo -n ", "
+    done
+    echo "],"
+done < queries.sql
diff --git a/influxdb/template.json b/influxdb/template.json
new file mode 100644
index 0000000000..51d0e822a8
--- /dev/null
+++ b/influxdb/template.json
@@ -0,0 +1,12 @@
+{
+    "system": "InfluxDB",
+    "proprietary": "no",
+    "hardware": "cpu",
+    "tuned": "no",
+    "tags": [
+        "Rust",
+        "column-oriented",
+        "time-series",
+        "lukewarm-cold-run"
+    ]
+}