10 changes: 10 additions & 0 deletions pydabs/.gitignore
@@ -0,0 +1,10 @@
.databricks/
build/
dist/
__pycache__/
*.egg-info
.venv/
scratch/**
!scratch/README.md
**/explorations/**
!**/explorations/README.md
3 changes: 3 additions & 0 deletions pydabs/.vscode/__builtins__.pyi
@@ -0,0 +1,3 @@
# Typings for Pylance in Visual Studio Code
# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md
from databricks.sdk.runtime import *
7 changes: 7 additions & 0 deletions pydabs/.vscode/extensions.json
@@ -0,0 +1,7 @@
{
"recommendations": [
"databricks.databricks",
"redhat.vscode-yaml",
"ms-python.black-formatter"
]
}
39 changes: 39 additions & 0 deletions pydabs/.vscode/settings.json
@@ -0,0 +1,39 @@
{
"jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\<codecell\\>|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])",
"jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------",
"python.testing.pytestArgs": [
"."
],
"files.exclude": {
"**/*.egg-info": true,
"**/__pycache__": true,
".pytest_cache": true,
"dist": true,
},
"files.associations": {
"**/.gitkeep": "markdown"
},

// Pylance settings (VS Code)
// Set typeCheckingMode to "basic" to enable type checking!
"python.analysis.typeCheckingMode": "off",
"python.analysis.extraPaths": ["src", "lib", "resources"],
"python.analysis.diagnosticMode": "workspace",
"python.analysis.stubPath": ".vscode",

// Pyright settings (Cursor)
// Set typeCheckingMode to "basic" to enable type checking!
"cursorpyright.analysis.typeCheckingMode": "off",
"cursorpyright.analysis.extraPaths": ["src", "lib", "resources"],
"cursorpyright.analysis.diagnosticMode": "workspace",
"cursorpyright.analysis.stubPath": ".vscode",

// General Python settings
"python.defaultInterpreterPath": "./.venv/bin/python",
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnSave": true,
},
}
70 changes: 70 additions & 0 deletions pydabs/README.md
@@ -0,0 +1,70 @@
# pydabs

The 'pydabs' project was generated using the default template.

* `src/`: Python source code for this project.
* `src/pydabs/`: Shared Python code that can be used by jobs and pipelines.
* `resources/`: Resource configurations (jobs, pipelines, etc.)
* `tests/`: Unit tests for the shared Python code.
* `fixtures/`: Fixtures for data sets (primarily used for testing).


## Getting started

Choose how you want to work on this project:

(a) Directly in your Databricks workspace, see
https://docs.databricks.com/dev-tools/bundles/workspace.

(b) Locally with an IDE like Cursor or VS Code, see
https://docs.databricks.com/dev-tools/vscode-ext.html.

(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html

If you're developing with an IDE, dependencies for this project should be installed using uv:

* Make sure you have the uv package manager installed.
It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/.
* Run `uv sync --dev` to install the project's dependencies.


## Using this project from the CLI

The Databricks workspace and IDE extensions provide a graphical interface for working
with this project. It's also possible to interact with it directly using the CLI:

1. Authenticate to your Databricks workspace, if you have not done so already:
```
$ databricks configure
```

2. To deploy a development copy of this project, type:
```
$ databricks bundle deploy --target dev
```
(Note that "dev" is the default target, so the `--target` parameter
is optional here.)

This deploys everything that's defined for this project.
For example, the default template would deploy a pipeline called
`[dev yourname] pydabs_etl` to your workspace.
You can find that resource by opening your workspace and clicking on **Jobs & Pipelines**.

3. Similarly, to deploy a production copy, type:
```
$ databricks bundle deploy --target prod
```
Note that the default template includes a job that runs the pipeline every day
(defined in resources/sample_job.py). The schedule
is paused when deploying in development mode (see
https://docs.databricks.com/dev-tools/bundles/deployment-modes.html).

4. To run a job or pipeline, use the "run" command:
```
$ databricks bundle run
```

5. Finally, to run tests locally, use `pytest`:
```
$ uv run pytest
```
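
As an illustration only (no test files are included in this diff; the file name,
assertions, and session bootstrap below are assumptions), a test for the shared
`taxis` module could look roughly like this, provided Databricks Connect can
authenticate to your workspace:
```
# tests/test_taxis.py -- hypothetical sketch, not part of this PR
from databricks.connect import DatabricksSession
from pyspark.sql import SparkSession

from pydabs import taxis

# Route SparkSession.builder through Databricks Connect so that Spark-dependent
# code in this package works locally. Assumes credentials are already configured
# (for example via `databricks configure`).
SparkSession.builder = DatabricksSession.builder
SparkSession.builder.getOrCreate()


def test_find_all_taxis():
    trips = taxis.find_all_taxis()
    assert trips.count() > 0
```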
53 changes: 53 additions & 0 deletions pydabs/databricks.yml
@@ -0,0 +1,53 @@
# This is a Databricks asset bundle definition for pydabs.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
name: pydabs
uuid: 4062028b-2184-4acd-9c62-f2ec572f7843

python:
venv_path: .venv
# Functions called to load resources defined in Python. See resources/__init__.py
resources:
- "resources:load_resources"

include:
- resources/*.yml
- resources/*/*.yml

artifacts:
python_artifact:
type: whl
build: uv build --wheel

# Variable declarations. These variables are assigned in the dev/prod targets below.
variables:
catalog:
description: The catalog to use
schema:
description: The schema to use

targets:
dev:
# The default target uses 'mode: development' to create a development copy.
# - Deployed resources get prefixed with '[dev my_user_name]'
# - Any job schedules and triggers are paused by default.
# See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html.
mode: development
default: true
workspace:
#host: https://company.databricks.com
variables:
catalog: main
schema: ${workspace.current_user.short_name}
prod:
mode: production
workspace:
#host: https://company.databricks.com
# We explicitly deploy to /Workspace/Users/pieter.noordhuis@databricks.com to make sure we only have a single copy.
root_path: /Workspace/Users/pieter.noordhuis@databricks.com/.bundle/${bundle.name}/${bundle.target}
variables:
catalog: main
schema: prod
permissions:
- user_name: pieter.noordhuis@databricks.com
level: CAN_MANAGE
9 changes: 9 additions & 0 deletions pydabs/fixtures/.gitkeep
@@ -0,0 +1,9 @@
# Test fixtures directory

Add JSON or CSV files here. In tests, use them with `load_fixture()`:

```
def test_using_fixture(load_fixture):
data = load_fixture("my_data.json")
assert len(data) >= 1
```
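
The `load_fixture` fixture itself is not included in this diff. As a rough sketch only
(the fixture that ships with the template's tests may differ), a conftest.py
implementation could look like:

```
# tests/conftest.py -- hypothetical sketch, not the template's actual fixture
import csv
import json
from pathlib import Path

import pytest

# Fixture files live in the project-level 'fixtures' directory.
FIXTURES_DIR = Path(__file__).parent.parent / "fixtures"


@pytest.fixture
def load_fixture():
    """Return a callable that loads a JSON or CSV fixture by file name."""

    def _load(name: str):
        path = FIXTURES_DIR / name
        if path.suffix == ".json":
            return json.loads(path.read_text())
        if path.suffix == ".csv":
            with path.open(newline="") as f:
                return list(csv.DictReader(f))
        raise ValueError(f"Unsupported fixture type: {path.suffix}")

    return _load
```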
30 changes: 30 additions & 0 deletions pydabs/pyproject.toml
@@ -0,0 +1,30 @@
[project]
name = "pydabs"
version = "0.0.1"
authors = [{ name = "pieter.noordhuis@databricks.com" }]
requires-python = ">=3.10,<=3.13"
dependencies = [
# Any dependencies for jobs and pipelines in this project can be added here
# See also https://docs.databricks.com/dev-tools/bundles/library-dependencies
#
# LIMITATION: for pipelines, dependencies are cached during development;
# add dependencies to the 'environment' section of your pipeline definition instead
]

[dependency-groups]
dev = [
"pytest",
"databricks-dlt",
"databricks-connect>=15.4,<15.5",
"databricks-bundles==0.277.0",
]

[project.scripts]
main = "pydabs.main:main"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.black]
line-length = 125
16 changes: 16 additions & 0 deletions pydabs/resources/__init__.py
@@ -0,0 +1,16 @@
from databricks.bundles.core import (
Bundle,
Resources,
load_resources_from_current_package_module,
)


def load_resources(bundle: Bundle) -> Resources:
"""
The 'load_resources' function is referenced in databricks.yml and is responsible for loading
bundle resources defined in Python code. It is called by the Databricks CLI during
bundle deployment and is not used after deployment.
"""

# The default implementation loads all Python files in the 'resources' directory.
return load_resources_from_current_package_module()
29 changes: 29 additions & 0 deletions pydabs/resources/pydabs_etl_pipeline.py
@@ -0,0 +1,29 @@
"""
The main pipeline for pydabs.
"""

from databricks.bundles.pipelines import Pipeline

pydabs_etl = Pipeline.from_dict(
{
"name": "pydabs_etl",
"catalog": "${var.catalog}",
"schema": "${var.schema}",
"serverless": True,
"root_path": "src/pydabs_etl",
"libraries": [
{
"glob": {
"include": "src/pydabs_etl/transformations/**",
},
},
],
"environment": {
"dependencies": [
# We include every dependency defined by pyproject.toml by defining an editable environment
# that points to the folder where pyproject.toml is deployed.
"--editable ${workspace.file_path}",
],
},
}
)
81 changes: 81 additions & 0 deletions pydabs/resources/sample_job.py
@@ -0,0 +1,81 @@
"""
A sample job for pydabs.
"""

from databricks.bundles.jobs import Job

sample_job = Job.from_dict(
{
"name": "sample_job",
"trigger": {
# Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger
"periodic": {
"interval": 1,
"unit": "DAYS",
},
},
# "email_notifications": {
# "on_failure": [
# "pieter.noordhuis@databricks.com",
# ],
# },
"parameters": [
{
"name": "catalog",
"default": "${var.catalog}",
},
{
"name": "schema",
"default": "${var.schema}",
},
],
"tasks": [
{
"task_key": "notebook_task",
"notebook_task": {
"notebook_path": "src/sample_notebook.ipynb",
},
},
{
"task_key": "python_wheel_task",
"depends_on": [
{"task_key": "notebook_task"},
],
"python_wheel_task": {
"package_name": "pydabs",
"entry_point": "main",
"parameters": [
"--catalog",
"${var.catalog}",
"--schema",
"${var.schema}",
],
},
"environment_key": "default",
},
{
"task_key": "refresh_pipeline",
"depends_on": [
{"task_key": "notebook_task"},
],
"pipeline_task": {
"pipeline_id": "${resources.pipelines.pydabs_etl.id}",
},
},
],
"environments": [
{
"environment_key": "default",
"spec": {
"environment_version": "2",
"dependencies": [
# By default we just include the .whl file generated for the pydabs package.
# See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
# for more information on how to add other libraries.
"dist/*.whl",
],
},
},
],
}
)
Empty file added pydabs/src/pydabs/__init__.py
Empty file.
24 changes: 24 additions & 0 deletions pydabs/src/pydabs/main.py
@@ -0,0 +1,24 @@
import argparse
from databricks.sdk.runtime import spark
from pydabs import taxis


def main():
# Process command-line arguments
parser = argparse.ArgumentParser(
description="Databricks job with catalog and schema parameters",
)
parser.add_argument("--catalog", required=True)
parser.add_argument("--schema", required=True)
args = parser.parse_args()

# Set the default catalog and schema
spark.sql(f"USE CATALOG {args.catalog}")
spark.sql(f"USE SCHEMA {args.schema}")

# Example: just find all taxis from a sample catalog
taxis.find_all_taxis().show(5)


if __name__ == "__main__":
main()
7 changes: 7 additions & 0 deletions pydabs/src/pydabs/taxis.py
@@ -0,0 +1,7 @@
from databricks.sdk.runtime import spark
from pyspark.sql import DataFrame


def find_all_taxis() -> DataFrame:
"""Find all taxi data."""
return spark.read.table("samples.nyctaxi.trips")