From 05feabc09641b3eec8d7429d8425edda1f6c30c0 Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Wed, 19 Nov 2025 09:32:41 +0100 Subject: [PATCH 1/3] Include instantiation of the pydabs template --- pydabs/.gitignore | 10 ++ pydabs/.vscode/__builtins__.pyi | 3 + pydabs/.vscode/extensions.json | 7 + pydabs/.vscode/settings.json | 39 +++++ pydabs/README.md | 70 ++++++++ pydabs/databricks.yml | 53 +++++++ pydabs/fixtures/.gitkeep | 9 ++ pydabs/pyproject.toml | 30 ++++ pydabs/resources/__init__.py | 16 ++ pydabs/resources/pydabs_etl_pipeline.py | 29 ++++ pydabs/resources/sample_job.py | 81 ++++++++++ pydabs/src/pydabs/__init__.py | 0 pydabs/src/pydabs/main.py | 22 +++ pydabs/src/pydabs/taxis.py | 7 + pydabs/src/pydabs_etl/README.md | 20 +++ .../transformations/sample_trips_pydabs.py | 12 ++ .../transformations/sample_zones_pydabs.py | 17 ++ pydabs/src/sample_notebook.ipynb | 149 ++++++++++++++++++ pydabs/tests/conftest.py | 98 ++++++++++++ pydabs/tests/sample_taxis_test.py | 8 + 20 files changed, 680 insertions(+) create mode 100644 pydabs/.gitignore create mode 100644 pydabs/.vscode/__builtins__.pyi create mode 100644 pydabs/.vscode/extensions.json create mode 100644 pydabs/.vscode/settings.json create mode 100644 pydabs/README.md create mode 100644 pydabs/databricks.yml create mode 100644 pydabs/fixtures/.gitkeep create mode 100644 pydabs/pyproject.toml create mode 100644 pydabs/resources/__init__.py create mode 100644 pydabs/resources/pydabs_etl_pipeline.py create mode 100644 pydabs/resources/sample_job.py create mode 100644 pydabs/src/pydabs/__init__.py create mode 100644 pydabs/src/pydabs/main.py create mode 100644 pydabs/src/pydabs/taxis.py create mode 100644 pydabs/src/pydabs_etl/README.md create mode 100644 pydabs/src/pydabs_etl/transformations/sample_trips_pydabs.py create mode 100644 pydabs/src/pydabs_etl/transformations/sample_zones_pydabs.py create mode 100644 pydabs/src/sample_notebook.ipynb create mode 100644 pydabs/tests/conftest.py create mode 100644 pydabs/tests/sample_taxis_test.py diff --git a/pydabs/.gitignore b/pydabs/.gitignore new file mode 100644 index 0000000..e566c51 --- /dev/null +++ b/pydabs/.gitignore @@ -0,0 +1,10 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +scratch/** +!scratch/README.md +**/explorations/** +**/!explorations/README.md diff --git a/pydabs/.vscode/__builtins__.pyi b/pydabs/.vscode/__builtins__.pyi new file mode 100644 index 0000000..0edd518 --- /dev/null +++ b/pydabs/.vscode/__builtins__.pyi @@ -0,0 +1,3 @@ +# Typings for Pylance in Visual Studio Code +# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md +from databricks.sdk.runtime import * diff --git a/pydabs/.vscode/extensions.json b/pydabs/.vscode/extensions.json new file mode 100644 index 0000000..75a111a --- /dev/null +++ b/pydabs/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "databricks.databricks", + "redhat.vscode-yaml", + "ms-python.black-formatter" + ] +} diff --git a/pydabs/.vscode/settings.json b/pydabs/.vscode/settings.json new file mode 100644 index 0000000..c49593b --- /dev/null +++ b/pydabs/.vscode/settings.json @@ -0,0 +1,39 @@ +{ + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." 
+  ],
+  "files.exclude": {
+    "**/*.egg-info": true,
+    "**/__pycache__": true,
+    ".pytest_cache": true,
+    "dist": true,
+  },
+  "files.associations": {
+    "**/.gitkeep": "markdown"
+  },
+
+  // Pylance settings (VS Code)
+  // Set typeCheckingMode to "basic" to enable type checking!
+  "python.analysis.typeCheckingMode": "off",
+  "python.analysis.extraPaths": ["src", "lib", "resources"],
+  "python.analysis.diagnosticMode": "workspace",
+  "python.analysis.stubPath": ".vscode",
+
+  // Pyright settings (Cursor)
+  // Set typeCheckingMode to "basic" to enable type checking!
+  "cursorpyright.analysis.typeCheckingMode": "off",
+  "cursorpyright.analysis.extraPaths": ["src", "lib", "resources"],
+  "cursorpyright.analysis.diagnosticMode": "workspace",
+  "cursorpyright.analysis.stubPath": ".vscode",
+
+  // General Python settings
+  "python.defaultInterpreterPath": "./.venv/bin/python",
+  "python.testing.unittestEnabled": false,
+  "python.testing.pytestEnabled": true,
+  "[python]": {
+    "editor.defaultFormatter": "ms-python.black-formatter",
+    "editor.formatOnSave": true,
+  },
+}
diff --git a/pydabs/README.md b/pydabs/README.md
new file mode 100644
index 0000000..f7f8eb3
--- /dev/null
+++ b/pydabs/README.md
@@ -0,0 +1,70 @@
+# pydabs
+
+The 'pydabs' project was generated using the default template.
+
+* `src/`: Python source code for this project.
+  * `src/pydabs/`: Shared Python code that can be used by jobs and pipelines.
+* `resources/`: Resource configurations (jobs, pipelines, etc.)
+* `tests/`: Unit tests for the shared Python code.
+* `fixtures/`: Fixtures for data sets (primarily used for testing).
+
+
+## Getting started
+
+Choose how you want to work on this project:
+
+(a) Directly in your Databricks workspace, see
+    https://docs.databricks.com/dev-tools/bundles/workspace.
+
+(b) Locally with an IDE like Cursor or VS Code, see
+    https://docs.databricks.com/dev-tools/vscode-ext.html.
+
+(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html
+
+If you're developing with an IDE, dependencies for this project should be installed using uv:
+
+* Make sure you have the UV package manager installed.
+  It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/.
+* Run `uv sync --dev` to install the project's dependencies.
+
+
+# Using this project with the CLI
+
+The Databricks workspace and IDE extensions provide a graphical interface for working
+with this project. It's also possible to interact with it directly using the CLI:
+
+1. Authenticate to your Databricks workspace, if you have not done so already:
+   ```
+   $ databricks configure
+   ```
+
+2. To deploy a development copy of this project, type:
+   ```
+   $ databricks bundle deploy --target dev
+   ```
+   (Note that "dev" is the default target, so the `--target` parameter
+   is optional here.)
+
+   This deploys everything that's defined for this project.
+   For example, the default template would deploy a pipeline called
+   `[dev yourname] pydabs_etl` to your workspace.
+   You can find that resource by opening your workspace and clicking on **Jobs & Pipelines**.
+
+3. Similarly, to deploy a production copy, type:
+   ```
+   $ databricks bundle deploy --target prod
+   ```
+   Note that the default template includes a job that runs the pipeline every day
+   (defined in resources/sample_job.py). The schedule
+   is paused when deploying in development mode (see
+   https://docs.databricks.com/dev-tools/bundles/deployment-modes.html).
+
+4. 
To run a job or pipeline, use the "run" command: + ``` + $ databricks bundle run + ``` + +5. Finally, to run tests locally, use `pytest`: + ``` + $ uv run pytest + ``` diff --git a/pydabs/databricks.yml b/pydabs/databricks.yml new file mode 100644 index 0000000..6a285dc --- /dev/null +++ b/pydabs/databricks.yml @@ -0,0 +1,53 @@ +# This is a Databricks asset bundle definition for pydabs. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: pydabs + uuid: 4062028b-2184-4acd-9c62-f2ec572f7843 + +python: + venv_path: .venv + # Functions called to load resources defined in Python. See resources/__init__.py + resources: + - "resources:load_resources" + +include: + - resources/*.yml + - resources/*/*.yml + +artifacts: + python_artifact: + type: whl + build: uv build --wheel + +# Variable declarations. These variables are assigned in the dev/prod targets below. +variables: + catalog: + description: The catalog to use + schema: + description: The schema to use + +targets: + dev: + # The default target uses 'mode: development' to create a development copy. + # - Deployed resources get prefixed with '[dev my_user_name]' + # - Any job schedules and triggers are paused by default. + # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. + mode: development + default: true + workspace: + #host: https://company.databricks.com + variables: + catalog: main + schema: ${workspace.current_user.short_name} + prod: + mode: production + workspace: + #host: https://company.databricks.com + # We explicitly deploy to /Workspace/Users/pieter.noordhuis@databricks.com to make sure we only have a single copy. + root_path: /Workspace/Users/pieter.noordhuis@databricks.com/.bundle/${bundle.name}/${bundle.target} + variables: + catalog: main + schema: prod + permissions: + - user_name: pieter.noordhuis@databricks.com + level: CAN_MANAGE diff --git a/pydabs/fixtures/.gitkeep b/pydabs/fixtures/.gitkeep new file mode 100644 index 0000000..77a9066 --- /dev/null +++ b/pydabs/fixtures/.gitkeep @@ -0,0 +1,9 @@ +# Test fixtures directory + +Add JSON or CSV files here. 
In tests, use them with `load_fixture()`:
+
+```
+def test_using_fixture(load_fixture):
+    data = load_fixture("my_data.json")
+    assert data.count() >= 1
+```
diff --git a/pydabs/pyproject.toml b/pydabs/pyproject.toml
new file mode 100644
index 0000000..50286d5
--- /dev/null
+++ b/pydabs/pyproject.toml
@@ -0,0 +1,30 @@
+[project]
+name = "pydabs"
+version = "0.0.1"
+authors = [{ name = "pieter.noordhuis@databricks.com" }]
+requires-python = ">=3.10,<=3.13"
+dependencies = [
+    # Any dependencies for jobs and pipelines in this project can be added here
+    # See also https://docs.databricks.com/dev-tools/bundles/library-dependencies
+    #
+    # LIMITATION: for pipelines, dependencies are cached during development;
+    # add dependencies to the 'environment' section of your pipeline definition instead
+]
+
+[dependency-groups]
+dev = [
+    "pytest",
+    "databricks-dlt",
+    "databricks-connect>=15.4,<15.5",
+    "databricks-bundles==0.277.0",
+]
+
+[project.scripts]
+main = "pydabs.main:main"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.black]
+line-length = 125
diff --git a/pydabs/resources/__init__.py b/pydabs/resources/__init__.py
new file mode 100644
index 0000000..fbcb9dc
--- /dev/null
+++ b/pydabs/resources/__init__.py
@@ -0,0 +1,16 @@
+from databricks.bundles.core import (
+    Bundle,
+    Resources,
+    load_resources_from_current_package_module,
+)
+
+
+def load_resources(bundle: Bundle) -> Resources:
+    """
+    The 'load_resources' function is referenced in databricks.yml and is responsible for loading
+    bundle resources defined in Python code. This function is called by the Databricks CLI during
+    bundle deployment. After deployment, this function is not used.
+    """
+
+    # The default implementation loads all Python files in the 'resources' directory.
+    return load_resources_from_current_package_module()
diff --git a/pydabs/resources/pydabs_etl_pipeline.py b/pydabs/resources/pydabs_etl_pipeline.py
new file mode 100644
index 0000000..51da8cc
--- /dev/null
+++ b/pydabs/resources/pydabs_etl_pipeline.py
@@ -0,0 +1,29 @@
+from databricks.bundles.pipelines import Pipeline
+
+"""
+The main pipeline for pydabs
+"""
+
+pydabs_etl = Pipeline.from_dict(
+    {
+        "name": "pydabs_etl",
+        "catalog": "${var.catalog}",
+        "schema": "${var.schema}",
+        "serverless": True,
+        "root_path": "src/pydabs_etl",
+        "libraries": [
+            {
+                "glob": {
+                    "include": "src/pydabs_etl/transformations/**",
+                },
+            },
+        ],
+        "environment": {
+            "dependencies": [
+                # We include every dependency defined by pyproject.toml by defining an editable environment
+                # that points to the folder where pyproject.toml is deployed.
+                "--editable ${workspace.file_path}",
+            ],
+        },
+    }
+)
diff --git a/pydabs/resources/sample_job.py b/pydabs/resources/sample_job.py
new file mode 100644
index 0000000..9be2163
--- /dev/null
+++ b/pydabs/resources/sample_job.py
@@ -0,0 +1,81 @@
+from databricks.bundles.jobs import Job
+
+"""
+A sample job for pydabs.
+""" + +sample_job = Job.from_dict( + { + "name": "sample_job", + "trigger": { + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + "periodic": { + "interval": 1, + "unit": "DAYS", + }, + }, + # "email_notifications": { + # "on_failure": [ + # "pieter.noordhuis@databricks.com", + # ], + # }, + "parameters": [ + { + "name": "catalog", + "default": "${var.catalog}", + }, + { + "name": "schema", + "default": "${var.schema}", + }, + ], + "tasks": [ + { + "task_key": "notebook_task", + "notebook_task": { + "notebook_path": "src/sample_notebook.ipynb", + }, + }, + { + "task_key": "python_wheel_task", + "depends_on": [ + {"task_key": "notebook_task"}, + ], + "python_wheel_task": { + "package_name": "pydabs", + "entry_point": "main", + "parameters": [ + "--catalog", + "${var.catalog}", + "--schema", + "${var.schema}", + ], + }, + "environment_key": "default", + }, + { + "task_key": "refresh_pipeline", + "depends_on": [ + {"task_key": "notebook_task"}, + ], + "pipeline_task": { + "pipeline_id": "${resources.pipelines.pydabs_etl.id}", + }, + }, + ], + "environments": [ + { + "environment_key": "default", + "spec": { + "environment_version": "2", + "dependencies": [ + # By default we just include the .whl file generated for the pydabs package. + # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html + # for more information on how to add other libraries. + "dist/*.whl", + ], + }, + }, + ], + } +) diff --git a/pydabs/src/pydabs/__init__.py b/pydabs/src/pydabs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pydabs/src/pydabs/main.py b/pydabs/src/pydabs/main.py new file mode 100644 index 0000000..e1b0148 --- /dev/null +++ b/pydabs/src/pydabs/main.py @@ -0,0 +1,22 @@ +import argparse +from databricks.sdk.runtime import spark +from pydabs import taxis + + +def main(): + # Process command-line arguments + parser = argparse.ArgumentParser(description="Databricks job with catalog and schema parameters") + parser.add_argument("--catalog", required=True) + parser.add_argument("--schema", required=True) + args = parser.parse_args() + + # Set the default catalog and schema + spark.sql(f"USE CATALOG {args.catalog}") + spark.sql(f"USE SCHEMA {args.schema}") + + # Example: just find all taxis from a sample catalog + taxis.find_all_taxis().show(5) + + +if __name__ == "__main__": + main() diff --git a/pydabs/src/pydabs/taxis.py b/pydabs/src/pydabs/taxis.py new file mode 100644 index 0000000..a7309cd --- /dev/null +++ b/pydabs/src/pydabs/taxis.py @@ -0,0 +1,7 @@ +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame + + +def find_all_taxis() -> DataFrame: + """Find all taxi data.""" + return spark.read.table("samples.nyctaxi.trips") diff --git a/pydabs/src/pydabs_etl/README.md b/pydabs/src/pydabs_etl/README.md new file mode 100644 index 0000000..9211406 --- /dev/null +++ b/pydabs/src/pydabs_etl/README.md @@ -0,0 +1,20 @@ +# pydabs + +This folder defines all source code for the pydabs pipeline: + +- `explorations/`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations/`: All dataset definitions and transformations. +- `utilities/` (optional): Utility functions and Python modules used in this pipeline. +- `data_sources/` (optional): View definitions describing the source data for this pipeline. 
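+
+For illustration only (this patch does not add these folders, and the file and function
+names below are made up), a small helper module under `utilities/` might look like the
+following sketch, which a transformation could then import and apply to its DataFrame:
+
+```
+# utilities/cleaning.py -- hypothetical helper for the pydabs_etl pipeline
+from pyspark.sql import DataFrame
+from pyspark.sql.functions import col
+
+
+def filter_positive_fares(df: DataFrame) -> DataFrame:
+    """Keep only trips with a positive fare amount."""
+    return df.filter(col("fare_amount") > 0)
+```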
+
+## Getting Started
+
+To get started, go to the `transformations` folder -- most of the relevant source code lives there:
+
+* By convention, every dataset under `transformations` is in a separate file.
+* Take a look at the sample called "sample_trips_pydabs.py" to get familiar with the syntax.
+  Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html.
+* If you're using the workspace UI, use `Run file` to run and preview a single transformation.
+* If you're using the CLI, use `databricks bundle run pydabs_etl --select sample_trips_pydabs` to run a single transformation.
+
+For more tutorials and reference material, see https://docs.databricks.com/dlt.
diff --git a/pydabs/src/pydabs_etl/transformations/sample_trips_pydabs.py b/pydabs/src/pydabs_etl/transformations/sample_trips_pydabs.py
new file mode 100644
index 0000000..3fe9df9
--- /dev/null
+++ b/pydabs/src/pydabs_etl/transformations/sample_trips_pydabs.py
@@ -0,0 +1,12 @@
+from pyspark import pipelines as dp
+from pyspark.sql.functions import col
+
+
+# This file defines a sample transformation.
+# Edit the sample below or add new transformations
+# using "+ Add" in the file browser.
+
+
+@dp.table
+def sample_trips_pydabs():
+    return spark.read.table("samples.nyctaxi.trips")
diff --git a/pydabs/src/pydabs_etl/transformations/sample_zones_pydabs.py b/pydabs/src/pydabs_etl/transformations/sample_zones_pydabs.py
new file mode 100644
index 0000000..8b2d9ae
--- /dev/null
+++ b/pydabs/src/pydabs_etl/transformations/sample_zones_pydabs.py
@@ -0,0 +1,17 @@
+from pyspark import pipelines as dp
+from pyspark.sql.functions import col, sum
+
+
+# This file defines a sample transformation.
+# Edit the sample below or add new transformations
+# using "+ Add" in the file browser.
+
+
+@dp.table
+def sample_zones_pydabs():
+    # Read from the "sample_trips_pydabs" table, then sum all the fares
+    return (
+        spark.read.table("sample_trips_pydabs")
+        .groupBy(col("pickup_zip"))
+        .agg(sum("fare_amount").alias("total_fare"))
+    )
diff --git a/pydabs/src/sample_notebook.ipynb b/pydabs/src/sample_notebook.ipynb
new file mode 100644
index 0000000..fc6620f
--- /dev/null
+++ b/pydabs/src/sample_notebook.ipynb
@@ -0,0 +1,149 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "ee353e42-ff58-4955-9608-12865bd0950e",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "# Default notebook\n",
+    "\n",
+    "This default notebook is executed using a Lakeflow job as defined in resources/sample_job.py."
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a1b2c3d4-e5f6-7890-abcd-ef1234567890", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Set default catalog and schema\n", + "catalog = dbutils.widgets.get(\"catalog\")\n", + "schema = dbutils.widgets.get(\"schema\")\n", + "spark.sql(f\"USE CATALOG {catalog}\")\n", + "spark.sql(f\"USE SCHEMA {schema}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "sys.path.append(\"../src\")\n", + "from pydabs import taxis\n", + "\n", + "taxis.find_all_taxis().show(10)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "dependencies": [ + "--editable .." + ], + "environment_version": "2" + }, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "notebook", + "widgets": { + "catalog": { + "currentValue": "main", + "nuid": "c4t4l0g-w1dg-3t12-3456-789012345678", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "main", + "label": "Catalog", + "name": "catalog", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "main", + "label": "Catalog", + "name": "catalog", + "options": { + "autoCreated": false, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + }, + "schema": { + "currentValue": "pieter_noordhuis", + "nuid": "5ch3m4-w1dg-3t98-7654-321098765432", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "default", + "label": "Schema", + "name": "schema", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "default", + "label": "Schema", + "name": "schema", + "options": { + "autoCreated": false, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + } + } + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/pydabs/tests/conftest.py b/pydabs/tests/conftest.py new file mode 100644 index 0000000..4df274f --- /dev/null +++ b/pydabs/tests/conftest.py @@ -0,0 +1,98 @@ +"""This file configures pytest. + +This file is in the root since it can be used for tests in any place in this +project, including tests under resources/. +""" + +import os, sys, pathlib +from contextlib import contextmanager + + +try: + from databricks.connect import DatabricksSession + from databricks.sdk import WorkspaceClient + from pyspark.sql import SparkSession + import pytest + import json + import csv + import os +except ImportError: + raise ImportError( + "Test dependencies not found.\n\nRun tests using 'uv run pytest'. See http://docs.astral.sh/uv to learn more about uv." + ) + + +@pytest.fixture() +def spark() -> SparkSession: + """Provide a SparkSession fixture for tests. 
+ + Minimal example: + def test_uses_spark(spark): + df = spark.createDataFrame([(1,)], ["x"]) + assert df.count() == 1 + """ + return DatabricksSession.builder.getOrCreate() + + +@pytest.fixture() +def load_fixture(spark: SparkSession): + """Provide a callable to load JSON or CSV from fixtures/ directory. + + Example usage: + + def test_using_fixture(load_fixture): + data = load_fixture("my_data.json") + assert data.count() >= 1 + """ + + def _loader(filename: str): + path = pathlib.Path(__file__).parent.parent / "fixtures" / filename + suffix = path.suffix.lower() + if suffix == ".json": + rows = json.loads(path.read_text()) + return spark.createDataFrame(rows) + if suffix == ".csv": + with path.open(newline="") as f: + rows = list(csv.DictReader(f)) + return spark.createDataFrame(rows) + raise ValueError(f"Unsupported fixture type for: {filename}") + + return _loader + + +def _enable_fallback_compute(): + """Enable serverless compute if no compute is specified.""" + conf = WorkspaceClient().config + if conf.serverless_compute_id or conf.cluster_id or os.environ.get("SPARK_REMOTE"): + return + + url = "https://docs.databricks.com/dev-tools/databricks-connect/cluster-config" + print("☁️ no compute specified, falling back to serverless compute", file=sys.stderr) + print(f" see {url} for manual configuration", file=sys.stdout) + + os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto" + + +@contextmanager +def _allow_stderr_output(config: pytest.Config): + """Temporarily disable pytest output capture.""" + capman = config.pluginmanager.get_plugin("capturemanager") + if capman: + with capman.global_and_fixture_disabled(): + yield + else: + yield + + +def pytest_configure(config: pytest.Config): + """Configure pytest session.""" + with _allow_stderr_output(config): + _enable_fallback_compute() + + # Initialize Spark session eagerly, so it is available even when + # SparkSession.builder.getOrCreate() is used. For DB Connect 15+, + # we validate version compatibility with the remote cluster. 
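+    # ("validateSession" is the Databricks Connect 15+ path mentioned above;
+    # clients without it fall back to a plain getOrCreate() below.)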
+ if hasattr(DatabricksSession.builder, "validateSession"): + DatabricksSession.builder.validateSession().getOrCreate() + else: + DatabricksSession.builder.getOrCreate() diff --git a/pydabs/tests/sample_taxis_test.py b/pydabs/tests/sample_taxis_test.py new file mode 100644 index 0000000..8675204 --- /dev/null +++ b/pydabs/tests/sample_taxis_test.py @@ -0,0 +1,8 @@ +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame +from pydabs import taxis + + +def test_find_all_taxis(): + results = taxis.find_all_taxis() + assert results.count() > 5 From 331007e305ce807e70bb0fdfba7af209866131f3 Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Wed, 19 Nov 2025 09:36:59 +0100 Subject: [PATCH 2/3] Format --- pydabs/src/pydabs/main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pydabs/src/pydabs/main.py b/pydabs/src/pydabs/main.py index e1b0148..7ab6d7b 100644 --- a/pydabs/src/pydabs/main.py +++ b/pydabs/src/pydabs/main.py @@ -5,7 +5,9 @@ def main(): # Process command-line arguments - parser = argparse.ArgumentParser(description="Databricks job with catalog and schema parameters") + parser = argparse.ArgumentParser( + description="Databricks job with catalog and schema parameters" + ) parser.add_argument("--catalog", required=True) parser.add_argument("--schema", required=True) args = parser.parse_args() From fc25547177e71f62bdf37ed451864cee068b304a Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Thu, 20 Nov 2025 00:20:06 -0800 Subject: [PATCH 3/3] Update pydabs/src/pydabs/main.py Co-authored-by: Gleb Kanterov --- pydabs/src/pydabs/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydabs/src/pydabs/main.py b/pydabs/src/pydabs/main.py index 7ab6d7b..0b7d9a0 100644 --- a/pydabs/src/pydabs/main.py +++ b/pydabs/src/pydabs/main.py @@ -6,7 +6,7 @@ def main(): # Process command-line arguments parser = argparse.ArgumentParser( - description="Databricks job with catalog and schema parameters" + description="Databricks job with catalog and schema parameters", ) parser.add_argument("--catalog", required=True) parser.add_argument("--schema", required=True)
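
For reference, the `load_fixture` fixture added in `pydabs/tests/conftest.py` returns a
Spark DataFrame built from a JSON or CSV file in `fixtures/`. A test that uses it could
look like the sketch below (illustrative only; `zones.json` is not a file added by this patch):

```
# pydabs/tests/sample_fixture_test.py (hypothetical)
def test_zones_fixture(load_fixture):
    zones = load_fixture("zones.json")  # DataFrame created from fixtures/zones.json
    assert zones.count() >= 1
```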