From 05feabc09641b3eec8d7429d8425edda1f6c30c0 Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Wed, 19 Nov 2025 09:32:41 +0100 Subject: [PATCH 1/3] Include instantiation of the pydabs template --- pydabs/.gitignore | 10 ++ pydabs/.vscode/__builtins__.pyi | 3 + pydabs/.vscode/extensions.json | 7 + pydabs/.vscode/settings.json | 39 +++++ pydabs/README.md | 70 ++++++++ pydabs/databricks.yml | 53 +++++++ pydabs/fixtures/.gitkeep | 9 ++ pydabs/pyproject.toml | 30 ++++ pydabs/resources/__init__.py | 16 ++ pydabs/resources/pydabs_etl_pipeline.py | 29 ++++ pydabs/resources/sample_job.py | 81 ++++++++++ pydabs/src/pydabs/__init__.py | 0 pydabs/src/pydabs/main.py | 22 +++ pydabs/src/pydabs/taxis.py | 7 + pydabs/src/pydabs_etl/README.md | 20 +++ .../transformations/sample_trips_pydabs.py | 12 ++ .../transformations/sample_zones_pydabs.py | 17 ++ pydabs/src/sample_notebook.ipynb | 149 ++++++++++++++++++ pydabs/tests/conftest.py | 98 ++++++++++++ pydabs/tests/sample_taxis_test.py | 8 + 20 files changed, 680 insertions(+) create mode 100644 pydabs/.gitignore create mode 100644 pydabs/.vscode/__builtins__.pyi create mode 100644 pydabs/.vscode/extensions.json create mode 100644 pydabs/.vscode/settings.json create mode 100644 pydabs/README.md create mode 100644 pydabs/databricks.yml create mode 100644 pydabs/fixtures/.gitkeep create mode 100644 pydabs/pyproject.toml create mode 100644 pydabs/resources/__init__.py create mode 100644 pydabs/resources/pydabs_etl_pipeline.py create mode 100644 pydabs/resources/sample_job.py create mode 100644 pydabs/src/pydabs/__init__.py create mode 100644 pydabs/src/pydabs/main.py create mode 100644 pydabs/src/pydabs/taxis.py create mode 100644 pydabs/src/pydabs_etl/README.md create mode 100644 pydabs/src/pydabs_etl/transformations/sample_trips_pydabs.py create mode 100644 pydabs/src/pydabs_etl/transformations/sample_zones_pydabs.py create mode 100644 pydabs/src/sample_notebook.ipynb create mode 100644 pydabs/tests/conftest.py create mode 100644 pydabs/tests/sample_taxis_test.py diff --git a/pydabs/.gitignore b/pydabs/.gitignore new file mode 100644 index 0000000..e566c51 --- /dev/null +++ b/pydabs/.gitignore @@ -0,0 +1,10 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +scratch/** +!scratch/README.md +**/explorations/** +**/!explorations/README.md diff --git a/pydabs/.vscode/__builtins__.pyi b/pydabs/.vscode/__builtins__.pyi new file mode 100644 index 0000000..0edd518 --- /dev/null +++ b/pydabs/.vscode/__builtins__.pyi @@ -0,0 +1,3 @@ +# Typings for Pylance in Visual Studio Code +# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md +from databricks.sdk.runtime import * diff --git a/pydabs/.vscode/extensions.json b/pydabs/.vscode/extensions.json new file mode 100644 index 0000000..75a111a --- /dev/null +++ b/pydabs/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "databricks.databricks", + "redhat.vscode-yaml", + "ms-python.black-formatter" + ] +} diff --git a/pydabs/.vscode/settings.json b/pydabs/.vscode/settings.json new file mode 100644 index 0000000..c49593b --- /dev/null +++ b/pydabs/.vscode/settings.json @@ -0,0 +1,39 @@ +{ + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." 
+  ],
+  "files.exclude": {
+    "**/*.egg-info": true,
+    "**/__pycache__": true,
+    ".pytest_cache": true,
+    "dist": true,
+  },
+  "files.associations": {
+    "**/.gitkeep": "markdown"
+  },
+
+  // Pylance settings (VS Code)
+  // Set typeCheckingMode to "basic" to enable type checking!
+  "python.analysis.typeCheckingMode": "off",
+  "python.analysis.extraPaths": ["src", "lib", "resources"],
+  "python.analysis.diagnosticMode": "workspace",
+  "python.analysis.stubPath": ".vscode",
+
+  // Pyright settings (Cursor)
+  // Set typeCheckingMode to "basic" to enable type checking!
+  "cursorpyright.analysis.typeCheckingMode": "off",
+  "cursorpyright.analysis.extraPaths": ["src", "lib", "resources"],
+  "cursorpyright.analysis.diagnosticMode": "workspace",
+  "cursorpyright.analysis.stubPath": ".vscode",
+
+  // General Python settings
+  "python.defaultInterpreterPath": "./.venv/bin/python",
+  "python.testing.unittestEnabled": false,
+  "python.testing.pytestEnabled": true,
+  "[python]": {
+    "editor.defaultFormatter": "ms-python.black-formatter",
+    "editor.formatOnSave": true,
+  },
+}
diff --git a/pydabs/README.md b/pydabs/README.md
new file mode 100644
index 0000000..f7f8eb3
--- /dev/null
+++ b/pydabs/README.md
@@ -0,0 +1,70 @@
+# pydabs
+
+The 'pydabs' project was generated using the default template.
+
+* `src/`: Python source code for this project.
+  * `src/pydabs/`: Shared Python code that can be used by jobs and pipelines.
+* `resources/`: Resource configurations (jobs, pipelines, etc.)
+* `tests/`: Unit tests for the shared Python code.
+* `fixtures/`: Fixtures for data sets (primarily used for testing).
+
+
+## Getting started
+
+Choose how you want to work on this project:
+
+(a) Directly in your Databricks workspace, see
+    https://docs.databricks.com/dev-tools/bundles/workspace.
+
+(b) Locally with an IDE like Cursor or VS Code, see
+    https://docs.databricks.com/dev-tools/vscode-ext.html.
+
+(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html
+
+If you're developing with an IDE, dependencies for this project should be installed using uv:
+
+* Make sure you have the UV package manager installed.
+  It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/.
+* Run `uv sync --dev` to install the project's dependencies.
+
+
+# Using this project with the CLI
+
+The Databricks workspace and IDE extensions provide a graphical interface for working
+with this project. It's also possible to interact with it directly using the CLI:
+
+1. Authenticate to your Databricks workspace, if you have not done so already:
+   ```
+   $ databricks configure
+   ```
+
+2. To deploy a development copy of this project, type:
+   ```
+   $ databricks bundle deploy --target dev
+   ```
+   (Note that "dev" is the default target, so the `--target` parameter
+   is optional here.)
+
+   This deploys everything that's defined for this project.
+   For example, the default template would deploy a pipeline called
+   `[dev yourname] pydabs_etl` to your workspace.
+   You can find that resource by opening your workspace and clicking on **Jobs & Pipelines**.
+
+3. Similarly, to deploy a production copy, type:
+   ```
+   $ databricks bundle deploy --target prod
+   ```
+   Note that the default template includes a job that runs the pipeline every day
+   (defined in resources/sample_job.py). The schedule
+   is paused when deploying in development mode (see
+   https://docs.databricks.com/dev-tools/bundles/deployment-modes.html).
+
+4. 
To run a job or pipeline, use the "run" command: + ``` + $ databricks bundle run + ``` + +5. Finally, to run tests locally, use `pytest`: + ``` + $ uv run pytest + ``` diff --git a/pydabs/databricks.yml b/pydabs/databricks.yml new file mode 100644 index 0000000..6a285dc --- /dev/null +++ b/pydabs/databricks.yml @@ -0,0 +1,53 @@ +# This is a Databricks asset bundle definition for pydabs. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: pydabs + uuid: 4062028b-2184-4acd-9c62-f2ec572f7843 + +python: + venv_path: .venv + # Functions called to load resources defined in Python. See resources/__init__.py + resources: + - "resources:load_resources" + +include: + - resources/*.yml + - resources/*/*.yml + +artifacts: + python_artifact: + type: whl + build: uv build --wheel + +# Variable declarations. These variables are assigned in the dev/prod targets below. +variables: + catalog: + description: The catalog to use + schema: + description: The schema to use + +targets: + dev: + # The default target uses 'mode: development' to create a development copy. + # - Deployed resources get prefixed with '[dev my_user_name]' + # - Any job schedules and triggers are paused by default. + # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. + mode: development + default: true + workspace: + #host: https://company.databricks.com + variables: + catalog: main + schema: ${workspace.current_user.short_name} + prod: + mode: production + workspace: + #host: https://company.databricks.com + # We explicitly deploy to /Workspace/Users/pieter.noordhuis@databricks.com to make sure we only have a single copy. + root_path: /Workspace/Users/pieter.noordhuis@databricks.com/.bundle/${bundle.name}/${bundle.target} + variables: + catalog: main + schema: prod + permissions: + - user_name: pieter.noordhuis@databricks.com + level: CAN_MANAGE diff --git a/pydabs/fixtures/.gitkeep b/pydabs/fixtures/.gitkeep new file mode 100644 index 0000000..77a9066 --- /dev/null +++ b/pydabs/fixtures/.gitkeep @@ -0,0 +1,9 @@ +# Test fixtures directory + +Add JSON or CSV files here. 
In tests, use them with `load_fixture()`:
+
+```
+def test_using_fixture(load_fixture):
+    data = load_fixture("my_data.json")
+    assert data.count() >= 1
+```
diff --git a/pydabs/pyproject.toml b/pydabs/pyproject.toml
new file mode 100644
index 0000000..50286d5
--- /dev/null
+++ b/pydabs/pyproject.toml
@@ -0,0 +1,30 @@
+[project]
+name = "pydabs"
+version = "0.0.1"
+authors = [{ name = "pieter.noordhuis@databricks.com" }]
+requires-python = ">=3.10,<=3.13"
+dependencies = [
+    # Any dependencies for jobs and pipelines in this project can be added here
+    # See also https://docs.databricks.com/dev-tools/bundles/library-dependencies
+    #
+    # LIMITATION: for pipelines, dependencies are cached during development;
+    # add dependencies to the 'environment' section of your pipeline definition instead
+]
+
+[dependency-groups]
+dev = [
+    "pytest",
+    "databricks-dlt",
+    "databricks-connect>=15.4,<15.5",
+    "databricks-bundles==0.277.0",
+]
+
+[project.scripts]
+main = "pydabs.main:main"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.black]
+line-length = 125
diff --git a/pydabs/resources/__init__.py b/pydabs/resources/__init__.py
new file mode 100644
index 0000000..fbcb9dc
--- /dev/null
+++ b/pydabs/resources/__init__.py
@@ -0,0 +1,16 @@
+from databricks.bundles.core import (
+    Bundle,
+    Resources,
+    load_resources_from_current_package_module,
+)
+
+
+def load_resources(bundle: Bundle) -> Resources:
+    """
+    The 'load_resources' function is referenced in databricks.yml and is responsible for loading
+    bundle resources defined in Python code. This function is called by the Databricks CLI during
+    bundle deployment. After deployment, this function is not used.
+    """
+
+    # The default implementation loads all Python files in the 'resources' directory.
+    return load_resources_from_current_package_module()
diff --git a/pydabs/resources/pydabs_etl_pipeline.py b/pydabs/resources/pydabs_etl_pipeline.py
new file mode 100644
index 0000000..51da8cc
--- /dev/null
+++ b/pydabs/resources/pydabs_etl_pipeline.py
@@ -0,0 +1,29 @@
+from databricks.bundles.pipelines import Pipeline
+
+"""
+The main pipeline for pydabs
+"""
+
+pydabs_etl = Pipeline.from_dict(
+    {
+        "name": "pydabs_etl",
+        "catalog": "${var.catalog}",
+        "schema": "${var.schema}",
+        "serverless": True,
+        "root_path": "src/pydabs_etl",
+        "libraries": [
+            {
+                "glob": {
+                    "include": "src/pydabs_etl/transformations/**",
+                },
+            },
+        ],
+        "environment": {
+            "dependencies": [
+                # We include every dependency defined by pyproject.toml by defining an editable environment
+                # that points to the folder where pyproject.toml is deployed.
+                "--editable ${workspace.file_path}",
+            ],
+        },
+    }
+)
diff --git a/pydabs/resources/sample_job.py b/pydabs/resources/sample_job.py
new file mode 100644
index 0000000..9be2163
--- /dev/null
+++ b/pydabs/resources/sample_job.py
@@ -0,0 +1,81 @@
+from databricks.bundles.jobs import Job
+
+"""
+A sample job for pydabs.
+""" + +sample_job = Job.from_dict( + { + "name": "sample_job", + "trigger": { + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + "periodic": { + "interval": 1, + "unit": "DAYS", + }, + }, + # "email_notifications": { + # "on_failure": [ + # "pieter.noordhuis@databricks.com", + # ], + # }, + "parameters": [ + { + "name": "catalog", + "default": "${var.catalog}", + }, + { + "name": "schema", + "default": "${var.schema}", + }, + ], + "tasks": [ + { + "task_key": "notebook_task", + "notebook_task": { + "notebook_path": "src/sample_notebook.ipynb", + }, + }, + { + "task_key": "python_wheel_task", + "depends_on": [ + {"task_key": "notebook_task"}, + ], + "python_wheel_task": { + "package_name": "pydabs", + "entry_point": "main", + "parameters": [ + "--catalog", + "${var.catalog}", + "--schema", + "${var.schema}", + ], + }, + "environment_key": "default", + }, + { + "task_key": "refresh_pipeline", + "depends_on": [ + {"task_key": "notebook_task"}, + ], + "pipeline_task": { + "pipeline_id": "${resources.pipelines.pydabs_etl.id}", + }, + }, + ], + "environments": [ + { + "environment_key": "default", + "spec": { + "environment_version": "2", + "dependencies": [ + # By default we just include the .whl file generated for the pydabs package. + # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html + # for more information on how to add other libraries. + "dist/*.whl", + ], + }, + }, + ], + } +) diff --git a/pydabs/src/pydabs/__init__.py b/pydabs/src/pydabs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pydabs/src/pydabs/main.py b/pydabs/src/pydabs/main.py new file mode 100644 index 0000000..e1b0148 --- /dev/null +++ b/pydabs/src/pydabs/main.py @@ -0,0 +1,22 @@ +import argparse +from databricks.sdk.runtime import spark +from pydabs import taxis + + +def main(): + # Process command-line arguments + parser = argparse.ArgumentParser(description="Databricks job with catalog and schema parameters") + parser.add_argument("--catalog", required=True) + parser.add_argument("--schema", required=True) + args = parser.parse_args() + + # Set the default catalog and schema + spark.sql(f"USE CATALOG {args.catalog}") + spark.sql(f"USE SCHEMA {args.schema}") + + # Example: just find all taxis from a sample catalog + taxis.find_all_taxis().show(5) + + +if __name__ == "__main__": + main() diff --git a/pydabs/src/pydabs/taxis.py b/pydabs/src/pydabs/taxis.py new file mode 100644 index 0000000..a7309cd --- /dev/null +++ b/pydabs/src/pydabs/taxis.py @@ -0,0 +1,7 @@ +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame + + +def find_all_taxis() -> DataFrame: + """Find all taxi data.""" + return spark.read.table("samples.nyctaxi.trips") diff --git a/pydabs/src/pydabs_etl/README.md b/pydabs/src/pydabs_etl/README.md new file mode 100644 index 0000000..9211406 --- /dev/null +++ b/pydabs/src/pydabs_etl/README.md @@ -0,0 +1,20 @@ +# pydabs + +This folder defines all source code for the pydabs pipeline: + +- `explorations/`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations/`: All dataset definitions and transformations. +- `utilities/` (optional): Utility functions and Python modules used in this pipeline. +- `data_sources/` (optional): View definitions describing the source data for this pipeline. 
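+
+For illustration only (this patch does not add these folders, and the file and function
+names below are made up), a small helper module under `utilities/` might look like the
+following sketch, which a transformation could then import and apply to its DataFrame:
+
+```
+# utilities/cleaning.py -- hypothetical helper for the pydabs_etl pipeline
+from pyspark.sql import DataFrame
+from pyspark.sql.functions import col
+
+
+def filter_positive_fares(df: DataFrame) -> DataFrame:
+    """Keep only trips with a positive fare amount."""
+    return df.filter(col("fare_amount") > 0)
+```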
+
+## Getting Started
+
+To get started, go to the `transformations` folder -- most of the relevant source code lives there:
+
+* By convention, every dataset under `transformations` is in a separate file.
+* Take a look at the sample called "sample_trips_pydabs.py" to get familiar with the syntax.
+  Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html.
+* If you're using the workspace UI, use `Run file` to run and preview a single transformation.
+* If you're using the CLI, use `databricks bundle run pydabs_etl --select sample_trips_pydabs` to run a single transformation.
+
+For more tutorials and reference material, see https://docs.databricks.com/dlt.
diff --git a/pydabs/src/pydabs_etl/transformations/sample_trips_pydabs.py b/pydabs/src/pydabs_etl/transformations/sample_trips_pydabs.py
new file mode 100644
index 0000000..3fe9df9
--- /dev/null
+++ b/pydabs/src/pydabs_etl/transformations/sample_trips_pydabs.py
@@ -0,0 +1,12 @@
+from pyspark import pipelines as dp
+from pyspark.sql.functions import col
+
+
+# This file defines a sample transformation.
+# Edit the sample below or add new transformations
+# using "+ Add" in the file browser.
+
+
+@dp.table
+def sample_trips_pydabs():
+    return spark.read.table("samples.nyctaxi.trips")
diff --git a/pydabs/src/pydabs_etl/transformations/sample_zones_pydabs.py b/pydabs/src/pydabs_etl/transformations/sample_zones_pydabs.py
new file mode 100644
index 0000000..8b2d9ae
--- /dev/null
+++ b/pydabs/src/pydabs_etl/transformations/sample_zones_pydabs.py
@@ -0,0 +1,17 @@
+from pyspark import pipelines as dp
+from pyspark.sql.functions import col, sum
+
+
+# This file defines a sample transformation.
+# Edit the sample below or add new transformations
+# using "+ Add" in the file browser.
+
+
+@dp.table
+def sample_zones_pydabs():
+    # Read from the "sample_trips_pydabs" table, then sum all the fares
+    return (
+        spark.read.table("sample_trips_pydabs")
+        .groupBy(col("pickup_zip"))
+        .agg(sum("fare_amount").alias("total_fare"))
+    )
diff --git a/pydabs/src/sample_notebook.ipynb b/pydabs/src/sample_notebook.ipynb
new file mode 100644
index 0000000..fc6620f
--- /dev/null
+++ b/pydabs/src/sample_notebook.ipynb
@@ -0,0 +1,149 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "ee353e42-ff58-4955-9608-12865bd0950e",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "# Default notebook\n",
+    "\n",
+    "This default notebook is executed using a Lakeflow job as defined in resources/sample_job.py."
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a1b2c3d4-e5f6-7890-abcd-ef1234567890", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Set default catalog and schema\n", + "catalog = dbutils.widgets.get(\"catalog\")\n", + "schema = dbutils.widgets.get(\"schema\")\n", + "spark.sql(f\"USE CATALOG {catalog}\")\n", + "spark.sql(f\"USE SCHEMA {schema}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "sys.path.append(\"../src\")\n", + "from pydabs import taxis\n", + "\n", + "taxis.find_all_taxis().show(10)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "dependencies": [ + "--editable .." + ], + "environment_version": "2" + }, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "notebook", + "widgets": { + "catalog": { + "currentValue": "main", + "nuid": "c4t4l0g-w1dg-3t12-3456-789012345678", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "main", + "label": "Catalog", + "name": "catalog", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "main", + "label": "Catalog", + "name": "catalog", + "options": { + "autoCreated": false, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + }, + "schema": { + "currentValue": "pieter_noordhuis", + "nuid": "5ch3m4-w1dg-3t98-7654-321098765432", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "default", + "label": "Schema", + "name": "schema", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "default", + "label": "Schema", + "name": "schema", + "options": { + "autoCreated": false, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + } + } + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/pydabs/tests/conftest.py b/pydabs/tests/conftest.py new file mode 100644 index 0000000..4df274f --- /dev/null +++ b/pydabs/tests/conftest.py @@ -0,0 +1,98 @@ +"""This file configures pytest. + +This file is in the root since it can be used for tests in any place in this +project, including tests under resources/. +""" + +import os, sys, pathlib +from contextlib import contextmanager + + +try: + from databricks.connect import DatabricksSession + from databricks.sdk import WorkspaceClient + from pyspark.sql import SparkSession + import pytest + import json + import csv + import os +except ImportError: + raise ImportError( + "Test dependencies not found.\n\nRun tests using 'uv run pytest'. See http://docs.astral.sh/uv to learn more about uv." + ) + + +@pytest.fixture() +def spark() -> SparkSession: + """Provide a SparkSession fixture for tests. 
+ + Minimal example: + def test_uses_spark(spark): + df = spark.createDataFrame([(1,)], ["x"]) + assert df.count() == 1 + """ + return DatabricksSession.builder.getOrCreate() + + +@pytest.fixture() +def load_fixture(spark: SparkSession): + """Provide a callable to load JSON or CSV from fixtures/ directory. + + Example usage: + + def test_using_fixture(load_fixture): + data = load_fixture("my_data.json") + assert data.count() >= 1 + """ + + def _loader(filename: str): + path = pathlib.Path(__file__).parent.parent / "fixtures" / filename + suffix = path.suffix.lower() + if suffix == ".json": + rows = json.loads(path.read_text()) + return spark.createDataFrame(rows) + if suffix == ".csv": + with path.open(newline="") as f: + rows = list(csv.DictReader(f)) + return spark.createDataFrame(rows) + raise ValueError(f"Unsupported fixture type for: {filename}") + + return _loader + + +def _enable_fallback_compute(): + """Enable serverless compute if no compute is specified.""" + conf = WorkspaceClient().config + if conf.serverless_compute_id or conf.cluster_id or os.environ.get("SPARK_REMOTE"): + return + + url = "https://docs.databricks.com/dev-tools/databricks-connect/cluster-config" + print("☁️ no compute specified, falling back to serverless compute", file=sys.stderr) + print(f" see {url} for manual configuration", file=sys.stdout) + + os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto" + + +@contextmanager +def _allow_stderr_output(config: pytest.Config): + """Temporarily disable pytest output capture.""" + capman = config.pluginmanager.get_plugin("capturemanager") + if capman: + with capman.global_and_fixture_disabled(): + yield + else: + yield + + +def pytest_configure(config: pytest.Config): + """Configure pytest session.""" + with _allow_stderr_output(config): + _enable_fallback_compute() + + # Initialize Spark session eagerly, so it is available even when + # SparkSession.builder.getOrCreate() is used. For DB Connect 15+, + # we validate version compatibility with the remote cluster. 
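+    # ("validateSession" is the Databricks Connect 15+ path mentioned above;
+    # clients without it fall back to a plain getOrCreate() below.)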
+ if hasattr(DatabricksSession.builder, "validateSession"): + DatabricksSession.builder.validateSession().getOrCreate() + else: + DatabricksSession.builder.getOrCreate() diff --git a/pydabs/tests/sample_taxis_test.py b/pydabs/tests/sample_taxis_test.py new file mode 100644 index 0000000..8675204 --- /dev/null +++ b/pydabs/tests/sample_taxis_test.py @@ -0,0 +1,8 @@ +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame +from pydabs import taxis + + +def test_find_all_taxis(): + results = taxis.find_all_taxis() + assert results.count() > 5 From 331007e305ce807e70bb0fdfba7af209866131f3 Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Wed, 19 Nov 2025 09:36:59 +0100 Subject: [PATCH 2/3] Format --- pydabs/src/pydabs/main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pydabs/src/pydabs/main.py b/pydabs/src/pydabs/main.py index e1b0148..7ab6d7b 100644 --- a/pydabs/src/pydabs/main.py +++ b/pydabs/src/pydabs/main.py @@ -5,7 +5,9 @@ def main(): # Process command-line arguments - parser = argparse.ArgumentParser(description="Databricks job with catalog and schema parameters") + parser = argparse.ArgumentParser( + description="Databricks job with catalog and schema parameters" + ) parser.add_argument("--catalog", required=True) parser.add_argument("--schema", required=True) args = parser.parse_args() From fc25547177e71f62bdf37ed451864cee068b304a Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Thu, 20 Nov 2025 00:20:06 -0800 Subject: [PATCH 3/3] Update pydabs/src/pydabs/main.py Co-authored-by: Gleb Kanterov --- pydabs/src/pydabs/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydabs/src/pydabs/main.py b/pydabs/src/pydabs/main.py index 7ab6d7b..0b7d9a0 100644 --- a/pydabs/src/pydabs/main.py +++ b/pydabs/src/pydabs/main.py @@ -6,7 +6,7 @@ def main(): # Process command-line arguments parser = argparse.ArgumentParser( - description="Databricks job with catalog and schema parameters" + description="Databricks job with catalog and schema parameters", ) parser.add_argument("--catalog", required=True) parser.add_argument("--schema", required=True)
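
For reference, the `load_fixture` fixture added in `pydabs/tests/conftest.py` returns a
Spark DataFrame built from a JSON or CSV file in `fixtures/`. A test that uses it could
look like the sketch below (illustrative only; `zones.json` is not a file added by this patch):

```
# pydabs/tests/sample_fixture_test.py (hypothetical)
def test_zones_fixture(load_fixture):
    zones = load_fixture("zones.json")  # DataFrame created from fixtures/zones.json
    assert zones.count() >= 1
```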