From f2715858c6aafc67dbf20ab919282bb3f1ea0b5e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 8 Mar 2025 08:10:01 -0700 Subject: [PATCH 1/7] draft --- tpch/README.md | 111 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 tpch/README.md diff --git a/tpch/README.md b/tpch/README.md new file mode 100644 index 0000000..7f5342d --- /dev/null +++ b/tpch/README.md @@ -0,0 +1,111 @@ + + +# Benchmarking DataFusion Ray on Kubernetes + +This is a rough guide to deploying and benchmarking DataFusion Ray on Kubernetes. + +set up new venv + +```shell +python3 -m venv venv +source venv/bin/activate +pip3 install maturin +pip3 install ray +pip3 install ray[default] +``` + +Build the project. + +```shell +maturin build --strip +``` + +```yaml +apiVersion: ray.io/v1alpha1 +kind: RayCluster +metadata: + name: datafusion-ray-cluster +spec: + headGroupSpec: + rayStartParams: + num-cpus: "0" + template: + spec: + containers: + - name: ray-head + image: rayproject/ray:2.42.1-py310-cpu + imagePullPolicy: Always + resources: + limits: + cpu: 2 + memory: 8Gi + requests: + cpu: 2 + memory: 8Gi + volumeMounts: + - mountPath: /mnt/bigdata # Mount path inside the container + name: ray-storage + volumes: + - name: ray-storage + persistentVolumeClaim: + claimName: ray-pvc + workerGroupSpecs: + - replicas: 2 + groupName: "datafusion-ray" + rayStartParams: + num-cpus: "4" + template: + spec: + containers: + - name: ray-worker + image: rayproject/ray:2.42.1-py310-cpu + imagePullPolicy: Always + resources: + limits: + cpu: 5 + memory: 64Gi + requests: + cpu: 5 + memory: 64Gi + volumeMounts: + - mountPath: /mnt/bigdata + name: ray-storage + volumes: + - name: ray-storage + persistentVolumeClaim: + claimName: ray-pvc +``` + +```shell +kubectl apply -f datafusion-ray.yaml +``` + +set up port forwarding on head node 8265 + +```shell +ray job submit --address='http://localhost:8265' \ + --runtime-env-json='{"pip":["datafusion", "tabulate", "boto3", "duckdb"], "py_modules":["/home/andy/git/apache/datafusion-ray/target/wheels/datafusion_ray-0.1.0-cp38-abi3-manylinux_2_35_x86_64.whl"], "working_dir":"./", "env_vars":{"RAY_DEDUP_LOGS":"O", "RAY_COLOR_PREFIX":"1"}}' -- \ + python tpcbench.py \ + --data /mnt/bigdata/tpch/sf100 \ + --concurrency 8 \ + --partitions-per-worker 4 \ + --worker-pool-min 30 \ + --listing-tables +``` From 69f387c09479acf13079be9bf0f0e3284b2a6135 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 8 Mar 2025 09:04:44 -0700 Subject: [PATCH 2/7] update --- tpch/README.md | 27 +++++++++------- tpch/datafusion-ray-tpch-1739638664197.json | 14 +++++++++ tpch/datafusion-ray-tpch-1739638680909.json | 35 +++++++++++++++++++++ 3 files changed, 65 insertions(+), 11 deletions(-) create mode 100644 tpch/datafusion-ray-tpch-1739638664197.json create mode 100644 tpch/datafusion-ray-tpch-1739638680909.json diff --git a/tpch/README.md b/tpch/README.md index 7f5342d..e976359 100644 --- a/tpch/README.md +++ b/tpch/README.md @@ -19,24 +19,26 @@ # Benchmarking DataFusion Ray on Kubernetes -This is a rough guide to deploying and benchmarking DataFusion Ray on Kubernetes. +This is a rough guide to deploying and benchmarking DataFusion Ray on Kubernetes as part of the development process. -set up new venv +## Building Wheels -```shell -python3 -m venv venv -source venv/bin/activate -pip3 install maturin -pip3 install ray -pip3 install ray[default] -``` +Follow the instructions in the [contributor guide] to set up a development environment and then build the project +using the following command. -Build the project. +[contributor guide]: ../docs/contributing.md ```shell maturin build --strip ``` +## Create a Ray Cluster + +Create a `datafusion-ray.yaml` file using the following template. It is important that the Ray Docker image uses the +same Python version that was used to build the wheels. This example yaml assumes that the TPC-H data files are +available locally on each node in the cluster at the path `/mnt/bigdata`. If the data is stored on object storage then +the `volume` and `volumeMount` sections can be removed. + ```yaml apiVersion: ray.io/v1alpha1 kind: RayCluster @@ -93,11 +95,14 @@ spec: claimName: ray-pvc ``` +Run the following command to create the cluster: + ```shell kubectl apply -f datafusion-ray.yaml ``` -set up port forwarding on head node 8265 +Once the cluster is running, set up port forwarding on port 8265 on the head node and then run the following +command to run the benchmarks. ```shell ray job submit --address='http://localhost:8265' \ diff --git a/tpch/datafusion-ray-tpch-1739638664197.json b/tpch/datafusion-ray-tpch-1739638664197.json new file mode 100644 index 0000000..c4463c3 --- /dev/null +++ b/tpch/datafusion-ray-tpch-1739638664197.json @@ -0,0 +1,14 @@ +{ + "engine": "datafusion-ray", + "benchmark": "tpch", + "settings": { + "concurrency": 8, + "batch_size": 8182, + "prefetch_buffer_size": 0, + "partitions_per_worker": null + }, + "data_path": "file:///mnt/bigdata/tpch/sf100", + "queries": { + "2": 8.547899007797241 + } +} \ No newline at end of file diff --git a/tpch/datafusion-ray-tpch-1739638680909.json b/tpch/datafusion-ray-tpch-1739638680909.json new file mode 100644 index 0000000..bceda1e --- /dev/null +++ b/tpch/datafusion-ray-tpch-1739638680909.json @@ -0,0 +1,35 @@ +{ + "engine": "datafusion-ray", + "benchmark": "tpch", + "settings": { + "concurrency": 8, + "batch_size": 8182, + "prefetch_buffer_size": 0, + "partitions_per_worker": null + }, + "data_path": "file:///mnt/bigdata/tpch/sf100", + "queries": { + "1": 7.222118854522705, + "2": 8.797776937484741, + "3": 11.183124780654907, + "4": 7.6282007694244385, + "5": 20.619840383529663, + "6": 3.466888427734375, + "7": 29.999598026275635, + "8": 22.716665267944336, + "9": 38.37256050109863, + "10": 25.540525197982788, + "11": 6.4380128383636475, + "12": 10.021047592163086, + "13": 8.462335348129272, + "14": 3.810248851776123, + "15": 0.8309383392333984, + "16": 3.692992925643921, + "17": 32.96640610694885, + "18": 50.401840925216675, + "19": 4.988840818405151, + "20": 7.992424011230469, + "21": 47.60438632965088, + "22": 3.4463324546813965 + } +} \ No newline at end of file From 2d93dee7d18cce5ddc75558c5a3e15f91388dcf2 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 8 Mar 2025 09:06:57 -0700 Subject: [PATCH 3/7] update --- tpch/datafusion-ray-tpch-1739638664197.json | 14 --------- tpch/datafusion-ray-tpch-1739638680909.json | 35 --------------------- 2 files changed, 49 deletions(-) delete mode 100644 tpch/datafusion-ray-tpch-1739638664197.json delete mode 100644 tpch/datafusion-ray-tpch-1739638680909.json diff --git a/tpch/datafusion-ray-tpch-1739638664197.json b/tpch/datafusion-ray-tpch-1739638664197.json deleted file mode 100644 index c4463c3..0000000 --- a/tpch/datafusion-ray-tpch-1739638664197.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "engine": "datafusion-ray", - "benchmark": "tpch", - "settings": { - "concurrency": 8, - "batch_size": 8182, - "prefetch_buffer_size": 0, - "partitions_per_worker": null - }, - "data_path": "file:///mnt/bigdata/tpch/sf100", - "queries": { - "2": 8.547899007797241 - } -} \ No newline at end of file diff --git a/tpch/datafusion-ray-tpch-1739638680909.json b/tpch/datafusion-ray-tpch-1739638680909.json deleted file mode 100644 index bceda1e..0000000 --- a/tpch/datafusion-ray-tpch-1739638680909.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "engine": "datafusion-ray", - "benchmark": "tpch", - "settings": { - "concurrency": 8, - "batch_size": 8182, - "prefetch_buffer_size": 0, - "partitions_per_worker": null - }, - "data_path": "file:///mnt/bigdata/tpch/sf100", - "queries": { - "1": 7.222118854522705, - "2": 8.797776937484741, - "3": 11.183124780654907, - "4": 7.6282007694244385, - "5": 20.619840383529663, - "6": 3.466888427734375, - "7": 29.999598026275635, - "8": 22.716665267944336, - "9": 38.37256050109863, - "10": 25.540525197982788, - "11": 6.4380128383636475, - "12": 10.021047592163086, - "13": 8.462335348129272, - "14": 3.810248851776123, - "15": 0.8309383392333984, - "16": 3.692992925643921, - "17": 32.96640610694885, - "18": 50.401840925216675, - "19": 4.988840818405151, - "20": 7.992424011230469, - "21": 47.60438632965088, - "22": 3.4463324546813965 - } -} \ No newline at end of file From d38c44222be29398eabd4ba6099fac53ae80c635 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 8 Mar 2025 09:20:15 -0700 Subject: [PATCH 4/7] kuberay --- tpch/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tpch/README.md b/tpch/README.md index e976359..75ca1af 100644 --- a/tpch/README.md +++ b/tpch/README.md @@ -34,6 +34,10 @@ maturin build --strip ## Create a Ray Cluster +Follow the instructions in the [Ray on Kubernetes] documentation to install the KubeRay operator. + +[Ray on Kubernetes]: https://docs.ray.io/en/latest/cluster/kubernetes/index.html + Create a `datafusion-ray.yaml` file using the following template. It is important that the Ray Docker image uses the same Python version that was used to build the wheels. This example yaml assumes that the TPC-H data files are available locally on each node in the cluster at the path `/mnt/bigdata`. If the data is stored on object storage then From e3d6a950439aa7706523557568ade968c1fb47f9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 8 Mar 2025 09:26:30 -0700 Subject: [PATCH 5/7] rename arg --- tpch/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tpch/README.md b/tpch/README.md index 75ca1af..551be2f 100644 --- a/tpch/README.md +++ b/tpch/README.md @@ -24,7 +24,7 @@ This is a rough guide to deploying and benchmarking DataFusion Ray on Kubernetes ## Building Wheels Follow the instructions in the [contributor guide] to set up a development environment and then build the project -using the following command. +using the following command. Note that the `--strip` argument is important for keeping the wheel size below 100MB [contributor guide]: ../docs/contributing.md @@ -114,7 +114,7 @@ ray job submit --address='http://localhost:8265' \ python tpcbench.py \ --data /mnt/bigdata/tpch/sf100 \ --concurrency 8 \ - --partitions-per-worker 4 \ + --partitions-per-processor 4 \ --worker-pool-min 30 \ --listing-tables ``` From 6642a307fb3b85086087c69de049abefeee2f6b6 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 8 Mar 2025 09:35:16 -0700 Subject: [PATCH 6/7] update ray docker image version --- tpch/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tpch/README.md b/tpch/README.md index 551be2f..095ca23 100644 --- a/tpch/README.md +++ b/tpch/README.md @@ -56,7 +56,7 @@ spec: spec: containers: - name: ray-head - image: rayproject/ray:2.42.1-py310-cpu + image: rayproject/ray:2.43.0-py312-cpu imagePullPolicy: Always resources: limits: @@ -81,7 +81,7 @@ spec: spec: containers: - name: ray-worker - image: rayproject/ray:2.42.1-py310-cpu + image: rayproject/ray:2.43.0-py312-cpu imagePullPolicy: Always resources: limits: From c36bead4221149177f71509e2ac3c256c6b0c97e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 8 Mar 2025 10:51:45 -0700 Subject: [PATCH 7/7] fix --- tpch/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tpch/README.md b/tpch/README.md index 095ca23..a1f125d 100644 --- a/tpch/README.md +++ b/tpch/README.md @@ -57,7 +57,7 @@ spec: containers: - name: ray-head image: rayproject/ray:2.43.0-py312-cpu - imagePullPolicy: Always + imagePullPolicy: IfNotPresent resources: limits: cpu: 2 @@ -82,7 +82,7 @@ spec: containers: - name: ray-worker image: rayproject/ray:2.43.0-py312-cpu - imagePullPolicy: Always + imagePullPolicy: IfNotPresent resources: limits: cpu: 5