Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cedardb-parquet/benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,8 @@
# Thin shim — actual flow is in lib/benchmark-common.sh.
export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single"
export BENCH_DURABLE=yes
export PGHOST="/tmp"
export PGUSER=postgres
export PGDATABASE=postgres

exec ../lib/benchmark-common.sh
2 changes: 1 addition & 1 deletion cedardb-parquet/check
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/bash
set -e

PGPASSWORD=test psql -h localhost -U postgres -c 'SELECT 1' >/dev/null
psql -c 'SELECT 1' >/dev/null
2 changes: 1 addition & 1 deletion cedardb-parquet/create.sql
Original file line number Diff line number Diff line change
Expand Up @@ -105,4 +105,4 @@ SELECT
"RefererHash" AS RefererHash,
"URLHash" AS URLHash,
"CLID" AS CLID
FROM '/data/hits.parquet';
FROM 'hits.parquet';
2 changes: 1 addition & 1 deletion cedardb-parquet/data-size
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
set -eu

# No ingestion — reported size is the parquet file itself.
stat -c%s data/hits.parquet
stat -c%s hits.parquet
14 changes: 10 additions & 4 deletions cedardb-parquet/install
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,15 @@
set -eu

sudo apt-get update -y
sudo apt-get install -y docker.io postgresql-client
sudo apt-get install -y postgresql-client

sudo docker pull cedardb/cedardb:latest
# Stop any running instance before reinstalling
./stop || true

mkdir -p data db
chmod -R 777 data db
# Delete any previously created database
rm -rf ./db

# Fetch the CedarDB installer and pipe it into bash, installing into the
# current directory with the database under ./db and a system-wide systemd
# unit (the start/stop scripts drive it via `systemctl ... cedardb.service`).
# NOTE(review): `-y` is presumably the installer's "assume yes" switch — confirm.
#
# `curl -f` plus pipefail: without them an HTTP error or network failure feeds
# bash an empty script or an error page, and the pipeline can still exit 0.
# -sS silences the progress meter but keeps error messages; -L follows redirects.
set -o pipefail
curl -fsSL https://get.cedardb.com | bash -s -- -y \
  --install-dir "$PWD" \
  --db-dir "$PWD/db" \
  --with-systemd=system
10 changes: 3 additions & 7 deletions cedardb-parquet/load
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
#!/bin/bash
set -eu

# Stage parquet file under ./data so the docker container sees /data/hits.parquet.
mkdir -p data
mv hits.parquet data/
chmod -R 777 data

# create.sql defines a view over the parquet file — no ingestion needed.
PGPASSWORD=test psql -h localhost -U postgres -t < create.sql
# Substitute the absolute path so CedarDB can find the file regardless of
# its working directory.
#
# psql reading a script from stdin keeps going after SQL errors and still
# exits 0 unless ON_ERROR_STOP is set; pipefail additionally surfaces a sed
# failure (e.g. create.sql missing). Together they make a broken load fail
# loudly instead of being reported as a success.
set -o pipefail
sed "s|'hits\.parquet'|'$PWD/hits.parquet'|" create.sql \
  | psql -v ON_ERROR_STOP=1

sync
2 changes: 1 addition & 1 deletion cedardb-parquet/query
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ set -e

query=$(cat)

raw=$(PGPASSWORD=test psql -h localhost -U postgres -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$?
raw=$(psql -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$?

if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^ERROR|psql: error'; then
printf '%s\n' "$raw" >&2
Expand Down
17 changes: 1 addition & 16 deletions cedardb-parquet/start
Original file line number Diff line number Diff line change
@@ -1,19 +1,4 @@
#!/bin/bash
set -eu

if PGPASSWORD=test psql -h localhost -U postgres -c 'SELECT 1' >/dev/null 2>&1; then
exit 0
fi

sudo docker stop cedardb >/dev/null 2>&1 || true
sudo docker rm cedardb >/dev/null 2>&1 || true

sudo docker run -d --rm -p 5432:5432 \
-v "$(pwd)/data:/data" \
-v "$(pwd)/db:/var/lib/cedardb/data" \
-e CEDAR_PASSWORD=test \
--name cedardb cedardb/cedardb:latest >/dev/null

until pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1; do
sleep 1
done
sudo systemctl start cedardb.service
2 changes: 1 addition & 1 deletion cedardb-parquet/stop
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#!/bin/bash

sudo docker stop cedardb >/dev/null 2>&1 || true
sudo systemctl stop cedardb.service || true
6 changes: 5 additions & 1 deletion cedardb/benchmark.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
#!/bin/bash
# Thin shim — actual flow is in lib/benchmark-common.sh.
export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv"
export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single"
export BENCH_DURABLE=yes
export PGHOST="/tmp"
export PGUSER=postgres
export PGDATABASE=postgres

exec ../lib/benchmark-common.sh
2 changes: 1 addition & 1 deletion cedardb/check
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/bash
set -e

PGPASSWORD=test psql -h localhost -U postgres -c 'SELECT 1' >/dev/null
psql -c 'SELECT 1' >/dev/null
3 changes: 1 addition & 2 deletions cedardb/create.sql
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ create table hits (
hasgclid smallint not null,
refererhash bigint not null,
urlhash bigint not null,
clid integer not null,
primary key (counterid, eventdate, userid, eventtime, watchid)
clid integer not null
);

2 changes: 1 addition & 1 deletion cedardb/data-size
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/bash
set -eu

PGPASSWORD=test psql -h localhost -U postgres -q -t -A -c "SELECT pg_total_relation_size('hits');"
psql -q -t -A -c "SELECT pg_total_relation_size('hits');"
14 changes: 10 additions & 4 deletions cedardb/install
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,15 @@
set -eu

sudo apt-get update -y
sudo apt-get install -y docker.io postgresql-client gzip
sudo apt-get install -y postgresql-client

sudo docker pull cedardb/cedardb:latest
# Stop any running instance before reinstalling
./stop || true

mkdir -p data db
chmod -R 777 data db
# Delete any previously created database
rm -rf ./db

# Fetch the CedarDB installer and pipe it into bash, installing into the
# current directory with the database under ./db and a system-wide systemd
# unit.
# NOTE(review): `-y` is presumably the installer's "assume yes" switch — confirm.
#
# `curl -f` plus pipefail: without them an HTTP error or network failure feeds
# bash an empty script or an error page, and the pipeline can still exit 0.
# -sS silences the progress meter but keeps error messages; -L follows redirects.
set -o pipefail
curl -fsSL https://get.cedardb.com | bash -s -- -y \
  --install-dir "$PWD" \
  --db-dir "$PWD/db" \
  --with-systemd=system
17 changes: 8 additions & 9 deletions cedardb/load
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
#!/bin/bash
set -eu

mkdir -p data
mv hits.tsv data/
chmod -R 777 data

PGPASSWORD=test psql -h localhost -U postgres -t < create.sql

PGPASSWORD=test psql -h localhost -U postgres -q -t -c "COPY hits FROM '/data/hits.tsv';"

rm -f data/hits.tsv
# Create the hits table.
psql -f create.sql

# Any failing stage of the import pipeline (e.g. load.sql missing) must fail
# the script rather than be hidden behind the last command's exit status.
set -o pipefail

# CedarDB would like to have ~4 GB of memory per parallel worker for parquet
# import to be efficient, so derive the worker count from MemTotal (reported
# in kB). Integer division yields 0 on hosts with less than 4 GiB of RAM, so
# clamp to at least one worker.
# NOTE(review): assumes 0 is not a meaningful value for debug.parallel — confirm.
mem_total_kb=$(awk '/^MemTotal:/ { print $2 }' /proc/meminfo)
workers=$(( mem_total_kb / (4 * 1024 * 1024) ))
if (( workers < 1 )); then
  workers=1
fi

{
  echo "SET debug.parallel = ${workers};"
  # load.sql refers to 'hits.parquet'; rewrite it to an absolute path.
  sed "s|'hits\.parquet'|'$PWD/hits.parquet'|" load.sql
} | psql -v ON_ERROR_STOP=1
# ON_ERROR_STOP: without it psql exits 0 even if the INSERT fails, and the
# source file below would be deleted after an unsuccessful import.

rm -f hits.parquet
sync
107 changes: 107 additions & 0 deletions cedardb/load.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
INSERT INTO hits SELECT
"WatchID",
"JavaEnable",
"Title",
"GoodEvent",
to_timestamp("EventTime") as "EventTime",
('1970-01-01'::date + "EventDate"::integer) as "EventDate",
"CounterID",
"ClientIP",
"RegionID",
"UserID",
"CounterClass",
"OS",
"UserAgent",
"URL",
"Referer",
"IsRefresh",
"RefererCategoryID",
"RefererRegionID",
"URLCategoryID",
"URLRegionID",
"ResolutionWidth",
"ResolutionHeight",
"ResolutionDepth",
"FlashMajor",
"FlashMinor",
"FlashMinor2",
"NetMajor",
"NetMinor",
"UserAgentMajor",
"UserAgentMinor",
"CookieEnable",
"JavascriptEnable",
"IsMobile",
"MobilePhone",
"MobilePhoneModel",
"Params",
"IPNetworkID",
"TraficSourceID",
"SearchEngineID",
"SearchPhrase",
"AdvEngineID",
"IsArtifical",
"WindowClientWidth",
"WindowClientHeight",
"ClientTimeZone",
to_timestamp("ClientEventTime") as "ClientEventTime",
"SilverlightVersion1",
"SilverlightVersion2",
"SilverlightVersion3",
"SilverlightVersion4",
"PageCharset",
"CodeVersion",
"IsLink",
"IsDownload",
"IsNotBounce",
"FUniqID",
"OriginalURL",
"HID",
"IsOldCounter",
"IsEvent",
"IsParameter",
"DontCountHits",
"WithHash",
"HitColor",
to_timestamp("LocalEventTime") as "LocalEventTime",
"Age",
"Sex",
"Income",
"Interests",
"Robotness",
"RemoteIP",
"WindowName",
"OpenerName",
"HistoryLength",
"BrowserLanguage",
"BrowserCountry",
"SocialNetwork",
"SocialAction",
"HTTPError",
"SendTiming",
"DNSTiming",
"ConnectTiming",
"ResponseStartTiming",
"ResponseEndTiming",
"FetchTiming",
"SocialSourceNetworkID",
"SocialSourcePage",
"ParamPrice",
"ParamOrderID",
"ParamCurrency",
"ParamCurrencyID",
"OpenstatServiceName",
"OpenstatCampaignID",
"OpenstatAdID",
"OpenstatSourceID",
"UTMSource",
"UTMMedium",
"UTMCampaign",
"UTMContent",
"UTMTerm",
"FromTag",
"HasGCLID",
"RefererHash",
"URLHash",
"CLID"
FROM 'hits.parquet';
2 changes: 1 addition & 1 deletion cedardb/query
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ set -e

query=$(cat)

raw=$(PGPASSWORD=test psql -h localhost -U postgres -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$?
raw=$(psql -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$?

if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^ERROR|psql: error'; then
printf '%s\n' "$raw" >&2
Expand Down
58 changes: 0 additions & 58 deletions cedardb/results/20260510/c6a.4xlarge.json

This file was deleted.

Loading