|
2 | 2 |
|
3 | 3 | set -e |
4 | 4 |
|
5 | | -SCALE_FACTOR=1 |
| 5 | +SCALE_FACTOR=${SCALE_FACTOR:-1} |
| 6 | +PARTITIONS=${PARTITIONS:-16} |
| 7 | + |
| 8 | +echo "Generating TPCH dataset with SCALE_FACTOR=${SCALE_FACTOR} and PARTITIONS=${PARTITIONS}" |
6 | 9 |
|
7 | 10 | # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script |
8 | 11 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) |
9 | 12 | DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data} |
10 | 13 | CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"} |
11 | | - |
12 | | -if [ -z "$SCALE_FACTOR" ] ; then |
13 | | - echo "Internal error: Scale factor not specified" |
14 | | - exit 1 |
15 | | -fi |
16 | | - |
17 | 14 | TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}" |
18 | 15 | echo "Creating tpch dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR}..." |
19 | 16 |
|
|
29 | 26 | docker run -v "${TPCH_DIR}":/data -it --rm ghcr.io/scalytics/tpch-docker:main -vf -s "${SCALE_FACTOR}" |
30 | 27 | fi |
31 | 28 |
|
32 | | -# Copy expected answers into the ./data/answers directory if it does not already exist |
33 | | -FILE="${TPCH_DIR}/answers/q1.out" |
34 | | -if test -f "${FILE}"; then |
35 | | - echo " Expected answers exist (${FILE} exists)." |
36 | | -else |
37 | | - echo " Copying answers to ${TPCH_DIR}/answers" |
38 | | - mkdir -p "${TPCH_DIR}/answers" |
39 | | - docker run -v "${TPCH_DIR}":/data -it --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/" |
40 | | -fi |
41 | | - |
42 | 29 | # Create 'parquet' files from tbl |
43 | 30 | FILE="${TPCH_DIR}/supplier" |
44 | 31 | if test -d "${FILE}"; then |
45 | 32 | echo " parquet files exist ($FILE exists)." |
46 | 33 | else |
47 | 34 | echo " creating parquet files using benchmark binary ..." |
48 | 35 | pushd "${SCRIPT_DIR}" > /dev/null |
49 | | - $CARGO_COMMAND -- tpch-convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet |
| 36 | + $CARGO_COMMAND -- tpch-convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet --partitions "$PARTITIONS" |
50 | 37 | popd > /dev/null |
51 | 38 | fi |
52 | | - |
53 | | -# Create 'csv' files from tbl |
54 | | -FILE="${TPCH_DIR}/csv/supplier" |
55 | | -if test -d "${FILE}"; then |
56 | | - echo " csv files exist ($FILE exists)." |
57 | | -else |
58 | | - echo " creating csv files using benchmark binary ..." |
59 | | - pushd "${SCRIPT_DIR}" > /dev/null |
60 | | - $CARGO_COMMAND -- tpch-convert --input "${TPCH_DIR}" --output "${TPCH_DIR}/csv" --format csv |
61 | | - popd > /dev/null |
62 | | -fi |
63 | | - |
0 commit comments