diff --git a/dbt_sql/resources/dbt_sql.job.yml b/dbt_sql/resources/dbt_sql.job.yml index 6722436..3af98c2 100644 --- a/dbt_sql/resources/dbt_sql.job.yml +++ b/dbt_sql/resources/dbt_sql.job.yml @@ -29,6 +29,6 @@ resources: environments: - environment_key: default spec: - environment_version: "2" + environment_version: "4" dependencies: - dbt-databricks>=1.8.0,<2.0.0 diff --git a/default_minimal/.gitignore b/default_minimal/.gitignore new file mode 100644 index 0000000..e566c51 --- /dev/null +++ b/default_minimal/.gitignore @@ -0,0 +1,10 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +scratch/** +!scratch/README.md +**/explorations/** +**/!explorations/README.md diff --git a/default_minimal/.vscode/__builtins__.pyi b/default_minimal/.vscode/__builtins__.pyi new file mode 100644 index 0000000..0edd518 --- /dev/null +++ b/default_minimal/.vscode/__builtins__.pyi @@ -0,0 +1,3 @@ +# Typings for Pylance in Visual Studio Code +# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md +from databricks.sdk.runtime import * diff --git a/default_minimal/.vscode/extensions.json b/default_minimal/.vscode/extensions.json new file mode 100644 index 0000000..75a111a --- /dev/null +++ b/default_minimal/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "databricks.databricks", + "redhat.vscode-yaml", + "ms-python.black-formatter" + ] +} diff --git a/default_minimal/.vscode/settings.json b/default_minimal/.vscode/settings.json new file mode 100644 index 0000000..c49593b --- /dev/null +++ b/default_minimal/.vscode/settings.json @@ -0,0 +1,39 @@ +{ + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." + ], + "files.exclude": { + "**/*.egg-info": true, + "**/__pycache__": true, + ".pytest_cache": true, + "dist": true, + }, + "files.associations": { + "**/.gitkeep": "markdown" + }, + + // Pylance settings (VS Code) + // Set typeCheckingMode to "basic" to enable type checking! + "python.analysis.typeCheckingMode": "off", + "python.analysis.extraPaths": ["src", "lib", "resources"], + "python.analysis.diagnosticMode": "workspace", + "python.analysis.stubPath": ".vscode", + + // Pyright settings (Cursor) + // Set typeCheckingMode to "basic" to enable type checking! + "cursorpyright.analysis.typeCheckingMode": "off", + "cursorpyright.analysis.extraPaths": ["src", "lib", "resources"], + "cursorpyright.analysis.diagnosticMode": "workspace", + "cursorpyright.analysis.stubPath": ".vscode", + + // General Python settings + "python.defaultInterpreterPath": "./.venv/bin/python", + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true, + }, +} diff --git a/default_minimal/README.md b/default_minimal/README.md new file mode 100644 index 0000000..a56a0a1 --- /dev/null +++ b/default_minimal/README.md @@ -0,0 +1,54 @@ +# default_minimal + +The 'default_minimal' project was generated by using the default-minimal template. + +* `src/`: SQL source code for this project. +* `resources/`: Resource configurations (jobs, pipelines, etc.) + +## Getting started + +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. 
+ +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/dev-tools/vscode-ext.html. + +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +If you're developing with an IDE, dependencies for this project should be installed using uv: + +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. + + +# Using this project using the CLI + +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks configure + ``` + +2. To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + + This deploys everything that's defined for this project. + +3. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + +4. To run a job or pipeline, use the "run" command: + ``` + $ databricks bundle run + ``` diff --git a/default_minimal/databricks.yml b/default_minimal/databricks.yml new file mode 100644 index 0000000..6e4dd55 --- /dev/null +++ b/default_minimal/databricks.yml @@ -0,0 +1,42 @@ +# This is a Databricks asset bundle definition for default_minimal. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: default_minimal + uuid: 8127e9c1-adac-4c9c-b006-d3450874f663 + +include: + - resources/*.yml + - resources/*/*.yml + +# Variable declarations. These variables are assigned in the dev/prod targets below. +variables: + catalog: + description: The catalog to use + schema: + description: The schema to use + +targets: + dev: + # The default target uses 'mode: development' to create a development copy. + # - Deployed resources get prefixed with '[dev my_user_name]' + # - Any job schedules and triggers are paused by default. + # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. + mode: development + default: true + workspace: + host: https://company.databricks.com + variables: + catalog: catalog + schema: ${workspace.current_user.short_name} + prod: + mode: production + workspace: + host: https://company.databricks.com + # We explicitly deploy to /Workspace/Users/user@company.com to make sure we only have a single copy. 
+ root_path: /Workspace/Users/user@company.com/.bundle/${bundle.name}/${bundle.target} + variables: + catalog: catalog + schema: prod + permissions: + - user_name: user@company.com + level: CAN_MANAGE diff --git a/default_minimal/resources/.gitkeep b/default_minimal/resources/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/default_minimal/resources/.gitkeep @@ -0,0 +1 @@ + diff --git a/default_minimal/src/.gitkeep b/default_minimal/src/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/default_minimal/src/.gitkeep @@ -0,0 +1 @@ + diff --git a/default_python/.gitignore b/default_python/.gitignore index 0dab7f4..e566c51 100644 --- a/default_python/.gitignore +++ b/default_python/.gitignore @@ -6,3 +6,5 @@ __pycache__/ .venv/ scratch/** !scratch/README.md +**/explorations/** +**/!explorations/README.md diff --git a/default_python/.vscode/extensions.json b/default_python/.vscode/extensions.json index 5d15eba..75a111a 100644 --- a/default_python/.vscode/extensions.json +++ b/default_python/.vscode/extensions.json @@ -1,7 +1,7 @@ { "recommendations": [ "databricks.databricks", - "ms-python.vscode-pylance", - "redhat.vscode-yaml" + "redhat.vscode-yaml", + "ms-python.black-formatter" ] } diff --git a/default_python/.vscode/settings.json b/default_python/.vscode/settings.json index 8ee87c3..c49593b 100644 --- a/default_python/.vscode/settings.json +++ b/default_python/.vscode/settings.json @@ -1,16 +1,39 @@ { - "python.analysis.stubPath": ".vscode", "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ "." ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - "python.analysis.extraPaths": ["src"], "files.exclude": { "**/*.egg-info": true, "**/__pycache__": true, ".pytest_cache": true, + "dist": true, + }, + "files.associations": { + "**/.gitkeep": "markdown" + }, + + // Pylance settings (VS Code) + // Set typeCheckingMode to "basic" to enable type checking! + "python.analysis.typeCheckingMode": "off", + "python.analysis.extraPaths": ["src", "lib", "resources"], + "python.analysis.diagnosticMode": "workspace", + "python.analysis.stubPath": ".vscode", + + // Pyright settings (Cursor) + // Set typeCheckingMode to "basic" to enable type checking! + "cursorpyright.analysis.typeCheckingMode": "off", + "cursorpyright.analysis.extraPaths": ["src", "lib", "resources"], + "cursorpyright.analysis.diagnosticMode": "workspace", + "cursorpyright.analysis.stubPath": ".vscode", + + // General Python settings + "python.defaultInterpreterPath": "./.venv/bin/python", + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true, }, } diff --git a/default_python/README.md b/default_python/README.md index f5a6a22..2f2032b 100644 --- a/default_python/README.md +++ b/default_python/README.md @@ -2,8 +2,12 @@ The 'default_python' project was generated by using the default-python template. -For documentation on the Databricks Asset Bundles format use for this project, -and for CI/CD configuration, see https://docs.databricks.com/aws/en/dev-tools/bundles. +* `src/`: Python source code for this project. + * `src/default_python/`: Shared Python code that can be used by jobs and pipelines. 
+* `resources/`: Resource configurations (jobs, pipelines, etc.)
+* `tests/`: Unit tests for the shared Python code.
+* `fixtures/`: Fixtures for data sets (primarily used for testing).
+
 
 ## Getting started
 
@@ -13,17 +17,17 @@ Choose how you want to work on this project:
    https://docs.databricks.com/dev-tools/bundles/workspace.
 
 (b) Locally with an IDE like Cursor or VS Code, see
-   https://docs.databricks.com/vscode-ext.
+   https://docs.databricks.com/dev-tools/vscode-ext.html.
 
 (c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html
 
-
-Dependencies for this project should be installed using uv:
+If you're developing with an IDE, dependencies for this project should be installed using uv:
 
 * Make sure you have the UV package manager installed.
   It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/.
 * Run `uv sync --dev` to install the project's dependencies.
 
+
 # Using this project using the CLI
 
 The Databricks workspace and IDE extensions provide a graphical interface for working
@@ -42,17 +46,16 @@ with this project. It's also possible to interact with it directly using the CLI
    is optional here.)
 
    This deploys everything that's defined for this project.
-   For example, the default template would deploy a job called
-   `[dev yourname] default_python_job` to your workspace.
-   You can find that job by opening your workpace and clicking on **Jobs & Pipelines**.
+   For example, the default template would deploy a pipeline called
+   `[dev yourname] default_python_etl` to your workspace.
+   You can find that resource by opening your workspace and clicking on **Jobs & Pipelines**.
 
 3. Similarly, to deploy a production copy, type:
    ```
    $ databricks bundle deploy --target prod
    ```
-
-   Note that the default job from the template has a schedule that runs every day
-   (defined in resources/default_python.job.yml). The schedule
+   Note that the default template includes a job that runs the pipeline every day
+   (defined in resources/sample_job.job.yml). The schedule
    is paused when deploying in development mode (see
    https://docs.databricks.com/dev-tools/bundles/deployment-modes.html).
 
diff --git a/default_python/databricks.yml b/default_python/databricks.yml
index 079edb9..aa15077 100644
--- a/default_python/databricks.yml
+++ b/default_python/databricks.yml
@@ -4,14 +4,21 @@ bundle:
   name: default_python
   uuid: 87d5a23e-7bc7-4f52-98ee-e374b67d5681
 
+include:
+  - resources/*.yml
+  - resources/*/*.yml
+
 artifacts:
   python_artifact:
     type: whl
     build: uv build --wheel
 
-include:
-  - resources/*.yml
-  - resources/*/*.yml
+# Variable declarations. These variables are assigned in the dev/prod targets below.
+variables:
+  catalog:
+    description: The catalog to use
+  schema:
+    description: The schema to use
 
 targets:
   dev:
@@ -23,13 +30,18 @@ targets:
     default: true
     workspace:
       host: https://company.databricks.com
-
+    variables:
+      catalog: catalog
+      schema: ${workspace.current_user.short_name}
   prod:
     mode: production
     workspace:
       host: https://company.databricks.com
       # We explicitly deploy to /Workspace/Users/user@company.com to make sure we only have a single copy.
root_path: /Workspace/Users/user@company.com/.bundle/${bundle.name}/${bundle.target} + variables: + catalog: catalog + schema: prod permissions: - user_name: user@company.com level: CAN_MANAGE diff --git a/default_python/fixtures/.gitkeep b/default_python/fixtures/.gitkeep index fa25d27..77a9066 100644 --- a/default_python/fixtures/.gitkeep +++ b/default_python/fixtures/.gitkeep @@ -1,22 +1,9 @@ -# Fixtures +# Test fixtures directory -This folder is reserved for fixtures, such as CSV files. - -Below is an example of how to load fixtures as a data frame: +Add JSON or CSV files here. In tests, use them with `load_fixture()`: ``` -import pandas as pd -import os - -def get_absolute_path(*relative_parts): - if 'dbutils' in globals(): - base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore - path = os.path.normpath(os.path.join(base_dir, *relative_parts)) - return path if path.startswith("/Workspace") else "/Workspace" + path - else: - return os.path.join(*relative_parts) - -csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") -df = pd.read_csv(csv_file) -display(df) +def test_using_fixture(load_fixture): + data = load_fixture("my_data.json") + assert len(data) >= 1 ``` diff --git a/default_python/pyproject.toml b/default_python/pyproject.toml index 279d7f3..a90910c 100644 --- a/default_python/pyproject.toml +++ b/default_python/pyproject.toml @@ -2,34 +2,28 @@ name = "default_python" version = "0.0.1" authors = [{ name = "user@company.com" }] -requires-python = ">=3.10,<=3.13" +requires-python = ">=3.10,<3.13" +dependencies = [ + # Any dependencies for jobs and pipelines in this project can be added here + # See also https://docs.databricks.com/dev-tools/bundles/library-dependencies + # + # LIMITATION: for pipelines, dependencies are cached during development; + # add dependencies to the 'environment' section of your pipeline.yml file instead +] [dependency-groups] dev = [ "pytest", - - # Code completion support for Lakeflow Declarative Pipelines, also install databricks-connect "databricks-dlt", - - # databricks-connect can be used to run parts of this project locally. - # Note that for local development, you should use a version that is not newer - # than the remote cluster or serverless compute you connect to. - # See also https://docs.databricks.com/dev-tools/databricks-connect.html. "databricks-connect>=15.4,<15.5", ] -[tool.pytest.ini_options] -pythonpath = "src" -testpaths = [ - "tests", -] +[project.scripts] +main = "default_python.main:main" [build-system] requires = ["hatchling"] build-backend = "hatchling.build" -[tool.hatch.build.targets.wheel] -packages = ["src/default_python"] - -[project.scripts] -main = "default_python.main:main" +[tool.black] +line-length = 125 diff --git a/default_python/resources/default_python.job.yml b/default_python/resources/default_python.job.yml deleted file mode 100644 index d99eb4d..0000000 --- a/default_python/resources/default_python.job.yml +++ /dev/null @@ -1,45 +0,0 @@ -# The main job for default_python. 
-resources: - jobs: - default_python_job: - name: default_python_job - - trigger: - # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger - periodic: - interval: 1 - unit: DAYS - - #email_notifications: - # on_failure: - # - your_email@example.com - - tasks: - - task_key: notebook_task - notebook_task: - notebook_path: ../src/notebook.ipynb - - - task_key: refresh_pipeline - depends_on: - - task_key: notebook_task - pipeline_task: - pipeline_id: ${resources.pipelines.default_python_pipeline.id} - - - task_key: main_task - depends_on: - - task_key: refresh_pipeline - environment_key: default - python_wheel_task: - package_name: default_python - entry_point: main - - # A list of task execution environment specifications that can be referenced by tasks of this job. - environments: - - environment_key: default - - # Full documentation of this spec can be found at: - # https://docs.databricks.com/api/workspace/jobs/create#environments-spec - spec: - environment_version: "2" - dependencies: - - ../dist/*.whl diff --git a/default_python/resources/default_python.pipeline.yml b/default_python/resources/default_python.pipeline.yml deleted file mode 100644 index 7954922..0000000 --- a/default_python/resources/default_python.pipeline.yml +++ /dev/null @@ -1,14 +0,0 @@ -# The main pipeline for default_python -resources: - pipelines: - default_python_pipeline: - name: default_python_pipeline - catalog: main - schema: default_python_${bundle.target} - serverless: true - libraries: - - notebook: - path: ../src/pipeline.ipynb - - configuration: - bundle.sourcePath: ${workspace.file_path}/src diff --git a/default_python/resources/default_python_etl.pipeline.yml b/default_python/resources/default_python_etl.pipeline.yml new file mode 100644 index 0000000..e3383a9 --- /dev/null +++ b/default_python/resources/default_python_etl.pipeline.yml @@ -0,0 +1,20 @@ +# The main pipeline for default_python + +resources: + pipelines: + default_python_etl: + name: default_python_etl + catalog: ${var.catalog} + schema: ${var.schema} + serverless: true + root_path: "../src/default_python_etl" + + libraries: + - glob: + include: ../src/default_python_etl/transformations/** + + environment: + dependencies: + # We include every dependency defined by pyproject.toml by defining an editable environment + # that points to the folder where pyproject.toml is deployed. + - --editable ${workspace.file_path} diff --git a/default_python/resources/sample_job.job.yml b/default_python/resources/sample_job.job.yml new file mode 100644 index 0000000..da5e0ea --- /dev/null +++ b/default_python/resources/sample_job.job.yml @@ -0,0 +1,54 @@ +# A sample job for default_python. 
+ +resources: + jobs: + sample_job: + name: sample_job + + trigger: + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + periodic: + interval: 1 + unit: DAYS + + #email_notifications: + # on_failure: + # - your_email@example.com + + parameters: + - name: catalog + default: ${var.catalog} + - name: schema + default: ${var.schema} + + tasks: + - task_key: notebook_task + notebook_task: + notebook_path: ../src/sample_notebook.ipynb + - task_key: python_wheel_task + depends_on: + - task_key: notebook_task + python_wheel_task: + package_name: default_python + entry_point: main + parameters: + - "--catalog" + - "${var.catalog}" + - "--schema" + - "${var.schema}" + environment_key: default + - task_key: refresh_pipeline + depends_on: + - task_key: notebook_task + pipeline_task: + pipeline_id: ${resources.pipelines.default_python_etl.id} + + environments: + - environment_key: default + spec: + environment_version: "4" + dependencies: + # By default we just include the .whl file generated for the default_python package. + # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html + # for more information on how to add other libraries. + - ../dist/*.whl diff --git a/default_python/scratch/README.md b/default_python/scratch/README.md deleted file mode 100644 index e6cfb81..0000000 --- a/default_python/scratch/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# scratch - -This folder is reserved for personal, exploratory notebooks. -By default these are not committed to Git, as 'scratch' is listed in .gitignore. diff --git a/default_python/scratch/exploration.ipynb b/default_python/scratch/exploration.ipynb deleted file mode 100644 index 57a9c97..0000000 --- a/default_python/scratch/exploration.ipynb +++ /dev/null @@ -1,61 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "sys.path.append(\"../src\")\n", - "from default_python import main\n", - "\n", - "main.get_taxis().show(10)" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "ipynb-notebook", - "widgets": {} - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/default_python/src/default_python/main.py b/default_python/src/default_python/main.py index 04e8be4..0cb0e73 100644 --- a/default_python/src/default_python/main.py +++ b/default_python/src/default_python/main.py @@ -1,13 +1,21 @@ +import argparse from databricks.sdk.runtime import spark -from pyspark.sql import DataFrame +from default_python import taxis -def find_all_taxis() -> DataFrame: - return spark.read.table("samples.nyctaxi.trips") +def main(): + # Process command-line arguments + parser = argparse.ArgumentParser(description="Databricks job with catalog and schema parameters") + parser.add_argument("--catalog", 
required=True) + parser.add_argument("--schema", required=True) + args = parser.parse_args() + # Set the default catalog and schema + spark.sql(f"USE CATALOG {args.catalog}") + spark.sql(f"USE SCHEMA {args.schema}") -def main(): - find_all_taxis().show(5) + # Example: just find all taxis from a sample catalog + taxis.find_all_taxis().show(5) if __name__ == "__main__": diff --git a/default_python/src/default_python/taxis.py b/default_python/src/default_python/taxis.py new file mode 100644 index 0000000..a7309cd --- /dev/null +++ b/default_python/src/default_python/taxis.py @@ -0,0 +1,7 @@ +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame + + +def find_all_taxis() -> DataFrame: + """Find all taxi data.""" + return spark.read.table("samples.nyctaxi.trips") diff --git a/default_python/src/default_python_etl/README.md b/default_python/src/default_python_etl/README.md new file mode 100644 index 0000000..de5e26a --- /dev/null +++ b/default_python/src/default_python_etl/README.md @@ -0,0 +1,20 @@ +# default_python + +This folder defines all source code for the default_python pipeline: + +- `explorations/`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations/`: All dataset definitions and transformations. +- `utilities/` (optional): Utility functions and Python modules used in this pipeline. +- `data_sources/` (optional): View definitions describing the source data for this pipeline. + +## Getting Started + +To get started, go to the `transformations` folder -- most of the relevant source code lives there: + +* By convention, every dataset under `transformations` is in a separate file. +* Take a look at the sample called "sample_trips_default_python.py" to get familiar with the syntax. + Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. +* If you're using the workspace UI, use `Run file` to run and preview a single transformation. +* If you're using the CLI, use `databricks bundle run default_python_etl --select sample_trips_default_python` to run a single transformation. + +For more tutorials and reference material, see https://docs.databricks.com/dlt. diff --git a/default_python/src/default_python_etl/transformations/sample_trips_default_python.py b/default_python/src/default_python_etl/transformations/sample_trips_default_python.py new file mode 100644 index 0000000..6106a11 --- /dev/null +++ b/default_python/src/default_python_etl/transformations/sample_trips_default_python.py @@ -0,0 +1,12 @@ +from pyspark import pipelines as dp +from pyspark.sql.functions import col + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser. + + +@dp.table +def sample_trips_default_python(): + return spark.read.table("samples.nyctaxi.trips") diff --git a/default_python/src/default_python_etl/transformations/sample_zones_default_python.py b/default_python/src/default_python_etl/transformations/sample_zones_default_python.py new file mode 100644 index 0000000..c56c4db --- /dev/null +++ b/default_python/src/default_python_etl/transformations/sample_zones_default_python.py @@ -0,0 +1,17 @@ +from pyspark import pipelines as dp +from pyspark.sql.functions import col, sum + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser. 
+ + +@dp.table +def sample_zones_default_python(): + # Read from the "sample_trips" table, then sum all the fares + return ( + spark.read.table(f"sample_trips_default_python") + .groupBy(col("pickup_zip")) + .agg(sum("fare_amount").alias("total_fare")) + ) diff --git a/default_python/src/notebook.ipynb b/default_python/src/notebook.ipynb deleted file mode 100644 index fd49e5b..0000000 --- a/default_python/src/notebook.ipynb +++ /dev/null @@ -1,75 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Default notebook\n", - "\n", - "This default notebook is executed using Databricks Workflows as defined in resources/default_python.job.yml." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "from default_python import main\n", - "\n", - "main.find_all_taxis().show(10)" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "notebook", - "widgets": {} - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/default_python/src/pipeline.ipynb b/default_python/src/pipeline.ipynb deleted file mode 100644 index 7c55138..0000000 --- a/default_python/src/pipeline.ipynb +++ /dev/null @@ -1,90 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "9a626959-61c8-4bba-84d2-2a4ecab1f7ec", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Lakeflow Declarative Pipeline\n", - "\n", - "This Lakeflow Declarative Pipeline definition is executed using a pipeline defined in resources/default_python.pipeline.yml." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "9198e987-5606-403d-9f6d-8f14e6a4017f", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Import DLT and src/default_python\n", - "import dlt\n", - "import sys\n", - "\n", - "sys.path.append(spark.conf.get(\"bundle.sourcePath\", \".\"))\n", - "from pyspark.sql.functions import expr\n", - "from default_python import main" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "3fc19dba-61fd-4a89-8f8c-24fee63bfb14", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "@dlt.view\n", - "def taxi_raw():\n", - " return main.find_all_taxis()\n", - "\n", - "\n", - "@dlt.table\n", - "def filtered_taxis():\n", - " return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "pipeline", - "widgets": {} - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/default_python/src/sample_notebook.ipynb b/default_python/src/sample_notebook.ipynb new file mode 100644 index 0000000..63723b5 --- /dev/null +++ b/default_python/src/sample_notebook.ipynb @@ -0,0 +1,149 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Default notebook\n", + "\n", + "This default notebook is executed using a Lakeflow job as defined in resources/sample_job.job.yml." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a1b2c3d4-e5f6-7890-abcd-ef1234567890", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Set default catalog and schema\n", + "catalog = dbutils.widgets.get(\"catalog\")\n", + "schema = dbutils.widgets.get(\"schema\")\n", + "spark.sql(f\"USE CATALOG {catalog}\")\n", + "spark.sql(f\"USE SCHEMA {schema}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "sys.path.append(\"../src\")\n", + "from default_python import taxis\n", + "\n", + "taxis.find_all_taxis().show(10)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "dependencies": [ + "--editable .." 
+ ], + "environment_version": "4" + }, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "notebook", + "widgets": { + "catalog": { + "currentValue": "catalog", + "nuid": "c4t4l0g-w1dg-3t12-3456-789012345678", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "catalog", + "label": "Catalog", + "name": "catalog", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "catalog", + "label": "Catalog", + "name": "catalog", + "options": { + "autoCreated": false, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + }, + "schema": { + "currentValue": "user_name", + "nuid": "5ch3m4-w1dg-3t98-7654-321098765432", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "default", + "label": "Schema", + "name": "schema", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "default", + "label": "Schema", + "name": "schema", + "options": { + "autoCreated": false, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + } + } + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/default_python/tests/conftest.py b/default_python/tests/conftest.py index f80cb43..4df274f 100644 --- a/default_python/tests/conftest.py +++ b/default_python/tests/conftest.py @@ -1,4 +1,8 @@ -"""This file configures pytest.""" +"""This file configures pytest. + +This file is in the root since it can be used for tests in any place in this +project, including tests under resources/. +""" import os, sys, pathlib from contextlib import contextmanager @@ -9,13 +13,54 @@ from databricks.sdk import WorkspaceClient from pyspark.sql import SparkSession import pytest + import json + import csv + import os except ImportError: raise ImportError( "Test dependencies not found.\n\nRun tests using 'uv run pytest'. See http://docs.astral.sh/uv to learn more about uv." ) -def enable_fallback_compute(): +@pytest.fixture() +def spark() -> SparkSession: + """Provide a SparkSession fixture for tests. + + Minimal example: + def test_uses_spark(spark): + df = spark.createDataFrame([(1,)], ["x"]) + assert df.count() == 1 + """ + return DatabricksSession.builder.getOrCreate() + + +@pytest.fixture() +def load_fixture(spark: SparkSession): + """Provide a callable to load JSON or CSV from fixtures/ directory. 
+ + Example usage: + + def test_using_fixture(load_fixture): + data = load_fixture("my_data.json") + assert data.count() >= 1 + """ + + def _loader(filename: str): + path = pathlib.Path(__file__).parent.parent / "fixtures" / filename + suffix = path.suffix.lower() + if suffix == ".json": + rows = json.loads(path.read_text()) + return spark.createDataFrame(rows) + if suffix == ".csv": + with path.open(newline="") as f: + rows = list(csv.DictReader(f)) + return spark.createDataFrame(rows) + raise ValueError(f"Unsupported fixture type for: {filename}") + + return _loader + + +def _enable_fallback_compute(): """Enable serverless compute if no compute is specified.""" conf = WorkspaceClient().config if conf.serverless_compute_id or conf.cluster_id or os.environ.get("SPARK_REMOTE"): @@ -23,13 +68,13 @@ def enable_fallback_compute(): url = "https://docs.databricks.com/dev-tools/databricks-connect/cluster-config" print("☁️ no compute specified, falling back to serverless compute", file=sys.stderr) - print(f" see {url} for manual configuration", file=sys.stderr) + print(f" see {url} for manual configuration", file=sys.stdout) os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto" @contextmanager -def allow_stderr_output(config: pytest.Config): +def _allow_stderr_output(config: pytest.Config): """Temporarily disable pytest output capture.""" capman = config.pluginmanager.get_plugin("capturemanager") if capman: @@ -41,8 +86,8 @@ def allow_stderr_output(config: pytest.Config): def pytest_configure(config: pytest.Config): """Configure pytest session.""" - with allow_stderr_output(config): - enable_fallback_compute() + with _allow_stderr_output(config): + _enable_fallback_compute() # Initialize Spark session eagerly, so it is available even when # SparkSession.builder.getOrCreate() is used. For DB Connect 15+, @@ -51,9 +96,3 @@ def pytest_configure(config: pytest.Config): DatabricksSession.builder.validateSession().getOrCreate() else: DatabricksSession.builder.getOrCreate() - - -@pytest.fixture(scope="session") -def spark() -> SparkSession: - """Provide a SparkSession fixture for tests.""" - return DatabricksSession.builder.getOrCreate() diff --git a/default_python/tests/main_test.py b/default_python/tests/main_test.py deleted file mode 100644 index 66c2702..0000000 --- a/default_python/tests/main_test.py +++ /dev/null @@ -1,6 +0,0 @@ -from default_python import main - - -def test_find_all_taxis(): - taxis = main.find_all_taxis() - assert taxis.count() > 5 diff --git a/default_python/tests/sample_taxis_test.py b/default_python/tests/sample_taxis_test.py new file mode 100644 index 0000000..d5d6544 --- /dev/null +++ b/default_python/tests/sample_taxis_test.py @@ -0,0 +1,8 @@ +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame +from default_python import taxis + + +def test_find_all_taxis(): + results = taxis.find_all_taxis() + assert results.count() > 5 diff --git a/lakeflow_pipelines_python/README.md b/lakeflow_pipelines_python/README.md index b910a0f..adf30de 100644 --- a/lakeflow_pipelines_python/README.md +++ b/lakeflow_pipelines_python/README.md @@ -1,6 +1,6 @@ # lakeflow_pipelines_python -The 'lakeflow_pipelines_python' project was generated by using the default template. +The 'lakeflow_pipelines_python' project was generated by using the lakeflow-pipelines template. * `src/`: Python source code for this project. * `resources/`: Resource configurations (jobs, pipelines, etc.) 
@@ -13,7 +13,7 @@ Choose how you want to work on this project: https://docs.databricks.com/dev-tools/bundles/workspace. (b) Locally with an IDE like Cursor or VS Code, see - https://docs.databricks.com/vscode-ext. + https://docs.databricks.com/dev-tools/vscode-ext.html. (c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html @@ -36,7 +36,7 @@ with this project. It's also possible to interact with it directly using the CLI This deploys everything that's defined for this project. For example, the default template would deploy a pipeline called - `[dev yourname] pipelines_python_etl` to your workspace. + `[dev yourname] lakeflow_pipelines_python_etl` to your workspace. You can find that resource by opening your workpace and clicking on **Jobs & Pipelines**. 3. Similarly, to deploy a production copy, type: diff --git a/lakeflow_pipelines_python/pyproject.toml b/lakeflow_pipelines_python/pyproject.toml index 7886f28..5e565ad 100644 --- a/lakeflow_pipelines_python/pyproject.toml +++ b/lakeflow_pipelines_python/pyproject.toml @@ -2,13 +2,13 @@ name = "lakeflow_pipelines_python" version = "0.0.1" authors = [{ name = "user@company.com" }] -requires-python = ">=3.10,<=3.13" +requires-python = ">=3.10,<3.13" dependencies = [ # Any dependencies for jobs and pipelines in this project can be added here # See also https://docs.databricks.com/dev-tools/bundles/library-dependencies # # LIMITATION: for pipelines, dependencies are cached during development; - # add dependencies to the 'environment' section of pipeline.yml file instead + # add dependencies to the 'environment' section of your pipeline.yml file instead ] [dependency-groups] @@ -25,5 +25,8 @@ main = "lakeflow_pipelines_python.main:main" requires = ["hatchling"] build-backend = "hatchling.build" +[tool.hatch.build.targets.wheel] +packages = ["src"] + [tool.black] line-length = 125 diff --git a/lakeflow_pipelines_python/resources/pipelines_python_etl.pipeline.yml b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_etl.pipeline.yml similarity index 68% rename from lakeflow_pipelines_python/resources/pipelines_python_etl.pipeline.yml rename to lakeflow_pipelines_python/resources/lakeflow_pipelines_python_etl.pipeline.yml index bd1f31b..52b9843 100644 --- a/lakeflow_pipelines_python/resources/pipelines_python_etl.pipeline.yml +++ b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_etl.pipeline.yml @@ -2,16 +2,16 @@ resources: pipelines: - pipelines_python_etl: - name: pipelines_python_etl + lakeflow_pipelines_python_etl: + name: lakeflow_pipelines_python_etl catalog: ${var.catalog} schema: ${var.schema} serverless: true - root_path: "../src/pipelines_python_etl" + root_path: "../src/lakeflow_pipelines_python_etl" libraries: - glob: - include: ../src/pipelines_python_etl/transformations/** + include: ../src/lakeflow_pipelines_python_etl/transformations/** environment: dependencies: diff --git a/lakeflow_pipelines_python/resources/sample_job.job.yml b/lakeflow_pipelines_python/resources/sample_job.job.yml index 3fe7701..a762860 100644 --- a/lakeflow_pipelines_python/resources/sample_job.job.yml +++ b/lakeflow_pipelines_python/resources/sample_job.job.yml @@ -24,9 +24,9 @@ resources: tasks: - task_key: refresh_pipeline pipeline_task: - pipeline_id: ${resources.pipelines.pipelines_python_etl.id} + pipeline_id: ${resources.pipelines.lakeflow_pipelines_python_etl.id} environments: - environment_key: default spec: - environment_version: "2" + environment_version: "4" diff --git 
a/lakeflow_pipelines_python/src/pipelines_python_etl/README.md b/lakeflow_pipelines_python/src/lakeflow_pipelines_python_etl/README.md similarity index 86% rename from lakeflow_pipelines_python/src/pipelines_python_etl/README.md rename to lakeflow_pipelines_python/src/lakeflow_pipelines_python_etl/README.md index ba1c3dd..aac5d43 100644 --- a/lakeflow_pipelines_python/src/pipelines_python_etl/README.md +++ b/lakeflow_pipelines_python/src/lakeflow_pipelines_python_etl/README.md @@ -15,6 +15,6 @@ To get started, go to the `transformations` folder -- most of the relevant sourc * Take a look at the sample called "sample_trips_lakeflow_pipelines_python.py" to get familiar with the syntax. Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. * If you're using the workspace UI, use `Run file` to run and preview a single transformation. -* If you're using the CLI, use `databricks bundle run pipelines_python_etl --select sample_trips_lakeflow_pipelines_python` to run a single transformation. +* If you're using the CLI, use `databricks bundle run lakeflow_pipelines_python_etl --select sample_trips_lakeflow_pipelines_python` to run a single transformation. For more tutorials and reference material, see https://docs.databricks.com/dlt. diff --git a/lakeflow_pipelines_python/src/pipelines_python_etl/transformations/sample_trips_lakeflow_pipelines_python.py b/lakeflow_pipelines_python/src/lakeflow_pipelines_python_etl/transformations/sample_trips_lakeflow_pipelines_python.py similarity index 100% rename from lakeflow_pipelines_python/src/pipelines_python_etl/transformations/sample_trips_lakeflow_pipelines_python.py rename to lakeflow_pipelines_python/src/lakeflow_pipelines_python_etl/transformations/sample_trips_lakeflow_pipelines_python.py diff --git a/lakeflow_pipelines_python/src/pipelines_python_etl/transformations/sample_zones_lakeflow_pipelines_python.py b/lakeflow_pipelines_python/src/lakeflow_pipelines_python_etl/transformations/sample_zones_lakeflow_pipelines_python.py similarity index 100% rename from lakeflow_pipelines_python/src/pipelines_python_etl/transformations/sample_zones_lakeflow_pipelines_python.py rename to lakeflow_pipelines_python/src/lakeflow_pipelines_python_etl/transformations/sample_zones_lakeflow_pipelines_python.py diff --git a/lakeflow_pipelines_sql/README.md b/lakeflow_pipelines_sql/README.md index 1de3c6c..9be8611 100644 --- a/lakeflow_pipelines_sql/README.md +++ b/lakeflow_pipelines_sql/README.md @@ -1,6 +1,6 @@ # lakeflow_pipelines_sql -The 'lakeflow_pipelines_sql' project was generated by using the default template. +The 'lakeflow_pipelines_sql' project was generated by using the lakeflow-pipelines template. * `src/`: SQL source code for this project. * `resources/`: Resource configurations (jobs, pipelines, etc.) @@ -13,7 +13,7 @@ Choose how you want to work on this project: https://docs.databricks.com/dev-tools/bundles/workspace. (b) Locally with an IDE like Cursor or VS Code, see - https://docs.databricks.com/vscode-ext. + https://docs.databricks.com/dev-tools/vscode-ext.html. (c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html @@ -36,7 +36,7 @@ with this project. It's also possible to interact with it directly using the CLI This deploys everything that's defined for this project. For example, the default template would deploy a pipeline called - `[dev yourname] pipelines_sql_etl` to your workspace. + `[dev yourname] lakeflow_pipelines_sql_etl` to your workspace. 
You can find that resource by opening your workpace and clicking on **Jobs & Pipelines**. 3. Similarly, to deploy a production copy, type: diff --git a/lakeflow_pipelines_sql/pyproject.toml b/lakeflow_pipelines_sql/pyproject.toml deleted file mode 100644 index 5e53a63..0000000 --- a/lakeflow_pipelines_sql/pyproject.toml +++ /dev/null @@ -1,29 +0,0 @@ -[project] -name = "lakeflow_pipelines_sql" -version = "0.0.1" -authors = [{ name = "user@company.com" }] -requires-python = ">=3.10,<=3.13" -dependencies = [ - # Any dependencies for jobs and pipelines in this project can be added here - # See also https://docs.databricks.com/dev-tools/bundles/library-dependencies - # - # LIMITATION: for pipelines, dependencies are cached during development; - # add dependencies to the 'environment' section of pipeline.yml file instead -] - -[dependency-groups] -dev = [ - "pytest", - "databricks-dlt", - "databricks-connect>=15.4,<15.5", -] - -[project.scripts] -main = "lakeflow_pipelines_sql.main:main" - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.black] -line-length = 125 diff --git a/lakeflow_pipelines_sql/resources/pipelines_sql_etl.pipeline.yml b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_etl.pipeline.yml similarity index 69% rename from lakeflow_pipelines_sql/resources/pipelines_sql_etl.pipeline.yml rename to lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_etl.pipeline.yml index e8e5c18..3b04805 100644 --- a/lakeflow_pipelines_sql/resources/pipelines_sql_etl.pipeline.yml +++ b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_etl.pipeline.yml @@ -2,16 +2,16 @@ resources: pipelines: - pipelines_sql_etl: - name: pipelines_sql_etl + lakeflow_pipelines_sql_etl: + name: lakeflow_pipelines_sql_etl catalog: ${var.catalog} schema: ${var.schema} serverless: true - root_path: "../src/pipelines_sql_etl" + root_path: "../src/lakeflow_pipelines_sql_etl" libraries: - glob: - include: ../src/pipelines_sql_etl/transformations/** + include: ../src/lakeflow_pipelines_sql_etl/transformations/** environment: dependencies: diff --git a/lakeflow_pipelines_sql/resources/sample_job.job.yml b/lakeflow_pipelines_sql/resources/sample_job.job.yml index d883b05..ea0f8e1 100644 --- a/lakeflow_pipelines_sql/resources/sample_job.job.yml +++ b/lakeflow_pipelines_sql/resources/sample_job.job.yml @@ -24,9 +24,9 @@ resources: tasks: - task_key: refresh_pipeline pipeline_task: - pipeline_id: ${resources.pipelines.pipelines_sql_etl.id} + pipeline_id: ${resources.pipelines.lakeflow_pipelines_sql_etl.id} environments: - environment_key: default spec: - environment_version: "2" + environment_version: "4" diff --git a/lakeflow_pipelines_sql/src/pipelines_sql_etl/README.md b/lakeflow_pipelines_sql/src/lakeflow_pipelines_sql_etl/README.md similarity index 86% rename from lakeflow_pipelines_sql/src/pipelines_sql_etl/README.md rename to lakeflow_pipelines_sql/src/lakeflow_pipelines_sql_etl/README.md index b1178a5..e994a91 100644 --- a/lakeflow_pipelines_sql/src/pipelines_sql_etl/README.md +++ b/lakeflow_pipelines_sql/src/lakeflow_pipelines_sql_etl/README.md @@ -15,6 +15,6 @@ To get started, go to the `transformations` folder -- most of the relevant sourc * Take a look at the sample called "sample_trips_lakeflow_pipelines_sql.py" to get familiar with the syntax. Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. * If you're using the workspace UI, use `Run file` to run and preview a single transformation. 
-* If you're using the CLI, use `databricks bundle run pipelines_sql_etl --select sample_trips_lakeflow_pipelines_sql` to run a single transformation. +* If you're using the CLI, use `databricks bundle run lakeflow_pipelines_sql_etl --select sample_trips_lakeflow_pipelines_sql` to run a single transformation. For more tutorials and reference material, see https://docs.databricks.com/dlt. diff --git a/lakeflow_pipelines_sql/src/pipelines_sql_etl/transformations/sample_trips_lakeflow_pipelines_sql.sql b/lakeflow_pipelines_sql/src/lakeflow_pipelines_sql_etl/transformations/sample_trips_lakeflow_pipelines_sql.sql similarity index 100% rename from lakeflow_pipelines_sql/src/pipelines_sql_etl/transformations/sample_trips_lakeflow_pipelines_sql.sql rename to lakeflow_pipelines_sql/src/lakeflow_pipelines_sql_etl/transformations/sample_trips_lakeflow_pipelines_sql.sql diff --git a/lakeflow_pipelines_sql/src/pipelines_sql_etl/transformations/sample_zones_lakeflow_pipelines_sql.sql b/lakeflow_pipelines_sql/src/lakeflow_pipelines_sql_etl/transformations/sample_zones_lakeflow_pipelines_sql.sql similarity index 100% rename from lakeflow_pipelines_sql/src/pipelines_sql_etl/transformations/sample_zones_lakeflow_pipelines_sql.sql rename to lakeflow_pipelines_sql/src/lakeflow_pipelines_sql_etl/transformations/sample_zones_lakeflow_pipelines_sql.sql diff --git a/pydabs/README.md b/pydabs/README.md index f7f8eb3..aa91bad 100644 --- a/pydabs/README.md +++ b/pydabs/README.md @@ -1,6 +1,6 @@ # pydabs -The 'pydabs' project was generated by using the default template. +The 'pydabs' project was generated by using the PyDABs template. * `src/`: Python source code for this project. * `src/pydabs/`: Shared Python code that can be used by jobs and pipelines. diff --git a/pydabs/databricks.yml b/pydabs/databricks.yml index 6a285dc..57634dd 100644 --- a/pydabs/databricks.yml +++ b/pydabs/databricks.yml @@ -35,19 +35,19 @@ targets: mode: development default: true workspace: - #host: https://company.databricks.com + host: https://company.databricks.com variables: - catalog: main + catalog: catalog schema: ${workspace.current_user.short_name} prod: mode: production workspace: - #host: https://company.databricks.com - # We explicitly deploy to /Workspace/Users/pieter.noordhuis@databricks.com to make sure we only have a single copy. - root_path: /Workspace/Users/pieter.noordhuis@databricks.com/.bundle/${bundle.name}/${bundle.target} + host: https://company.databricks.com + # We explicitly deploy to /Workspace/Users/user@company.com to make sure we only have a single copy. 
+ root_path: /Workspace/Users/user@company.com/.bundle/${bundle.name}/${bundle.target} variables: - catalog: main + catalog: catalog schema: prod permissions: - - user_name: pieter.noordhuis@databricks.com + - user_name: user@company.com level: CAN_MANAGE diff --git a/pydabs/pyproject.toml b/pydabs/pyproject.toml index 50286d5..8621579 100644 --- a/pydabs/pyproject.toml +++ b/pydabs/pyproject.toml @@ -1,8 +1,8 @@ [project] name = "pydabs" version = "0.0.1" -authors = [{ name = "pieter.noordhuis@databricks.com" }] -requires-python = ">=3.10,<=3.13" +authors = [{ name = "user@company.com" }] +requires-python = ">=3.10,<3.13" dependencies = [ # Any dependencies for jobs and pipelines in this project can be added here # See also https://docs.databricks.com/dev-tools/bundles/library-dependencies @@ -16,7 +16,7 @@ dev = [ "pytest", "databricks-dlt", "databricks-connect>=15.4,<15.5", - "databricks-bundles==0.277.0", + "databricks-bundles==0.279.0", ] [project.scripts] diff --git a/pydabs/resources/sample_job.py b/pydabs/resources/sample_job.py index 9be2163..68093c7 100644 --- a/pydabs/resources/sample_job.py +++ b/pydabs/resources/sample_job.py @@ -16,7 +16,7 @@ }, # "email_notifications": { # "on_failure": [ - # "pieter.noordhuis@databricks.com", + # "user@company.com", # ], # }, "parameters": [ @@ -67,7 +67,7 @@ { "environment_key": "default", "spec": { - "environment_version": "2", + "environment_version": "4", "dependencies": [ # By default we just include the .whl file generated for the pydabs package. # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html diff --git a/pydabs/src/pydabs/main.py b/pydabs/src/pydabs/main.py index 0b7d9a0..7ab6d7b 100644 --- a/pydabs/src/pydabs/main.py +++ b/pydabs/src/pydabs/main.py @@ -6,7 +6,7 @@ def main(): # Process command-line arguments parser = argparse.ArgumentParser( - description="Databricks job with catalog and schema parameters", + description="Databricks job with catalog and schema parameters" ) parser.add_argument("--catalog", required=True) parser.add_argument("--schema", required=True) diff --git a/pydabs/src/sample_notebook.ipynb b/pydabs/src/sample_notebook.ipynb index fc6620f..a5ce76c 100644 --- a/pydabs/src/sample_notebook.ipynb +++ b/pydabs/src/sample_notebook.ipynb @@ -72,7 +72,7 @@ "dependencies": [ "--editable .." 
], - "environment_version": "2" + "environment_version": "4" }, "language": "python", "notebookMetadata": { @@ -81,11 +81,11 @@ "notebookName": "notebook", "widgets": { "catalog": { - "currentValue": "main", + "currentValue": "catalog", "nuid": "c4t4l0g-w1dg-3t12-3456-789012345678", "typedWidgetInfo": { "autoCreated": false, - "defaultValue": "main", + "defaultValue": "catalog", "label": "Catalog", "name": "catalog", "options": { @@ -95,7 +95,7 @@ "parameterDataType": "String" }, "widgetInfo": { - "defaultValue": "main", + "defaultValue": "catalog", "label": "Catalog", "name": "catalog", "options": { @@ -107,7 +107,7 @@ } }, "schema": { - "currentValue": "pieter_noordhuis", + "currentValue": "user_name", "nuid": "5ch3m4-w1dg-3t98-7654-321098765432", "typedWidgetInfo": { "autoCreated": false, @@ -141,7 +141,7 @@ }, "language_info": { "name": "python", - "version": "3.11.4" + "version": "3.12" } }, "nbformat": 4, diff --git a/scripts/update_from_templates.sh b/scripts/update_from_templates.sh index c56a4b8..d66cb3e 100755 --- a/scripts/update_from_templates.sh +++ b/scripts/update_from_templates.sh @@ -1,3 +1,4 @@ + #!/bin/bash set -euo pipefail @@ -19,22 +20,19 @@ function init_bundle() { local TEMPLATE_NAME="$1" local BUNDLE_UUID="${2:-}" local CONFIG_JSON="$3" - + # Extract project_name from JSON local PROJECT_NAME=$(echo "$CONFIG_JSON" | grep -o '"project_name"[[:space:]]*:[[:space:]]*"[^"]*"' | cut -d'"' -f4) - - # Use 'cli' if available, otherwise fall back to 'databricks' - local CLI_CMD="databricks" - if command -v cli >/dev/null 2>&1; then - CLI_CMD="cli" - fi - + + # Use CLI_COMMAND if set, otherwise default to 'databricks' + local CLI_COMMAND="${CLI_COMMAND:-databricks}" + echo echo "# $PROJECT_NAME" - + rm -rf "$PROJECT_NAME" echo "$CONFIG_JSON" > /tmp/config.json - $CLI_CMD bundle init "$TEMPLATE_NAME" --config-file /tmp/config.json + $CLI_COMMAND bundle init "$TEMPLATE_NAME" --config-file /tmp/config.json cleanup "$PROJECT_NAME" "$BUNDLE_UUID" } @@ -63,12 +61,20 @@ fi cd $(dirname $0)/.. +# Use the 'databricks' CLI by default +# To use a custom CLI, set: export CLI_COMMAND=/path/to/cli +echo "Using Databricks CLI: ${CLI_COMMAND:-databricks}" +${CLI_COMMAND:-databricks} --version +echo + init_bundle "default-python" "87d5a23e-7bc7-4f52-98ee-e374b67d5681" '{ "project_name": "default_python", - "include_notebook": "yes", - "include_dlt": "yes", + "include_job": "yes", + "include_pipeline": "yes", "include_python": "yes", - "serverless": "yes" + "serverless": "yes", + "default_catalog": "catalog", + "personal_schemas": "yes" }' init_bundle "default-sql" "853cd9bc-631c-4d4f-bca0-3195c7540854" '{ @@ -81,6 +87,7 @@ init_bundle "default-sql" "853cd9bc-631c-4d4f-bca0-3195c7540854" '{ init_bundle "dbt-sql" "5e5ca8d5-0388-473e-84a1-1414ed89c5df" '{ "project_name": "dbt_sql", "http_path": "/sql/1.0/warehouses/abcdef1234567890", + "serverless": "yes", "default_catalog": "catalog", "personal_schemas": "yes, use a schema based on the current user name during development" }' @@ -100,12 +107,23 @@ init_bundle "lakeflow-pipelines" "87a174ba-60e4-4867-a140-1936bc9b00de" '{ "language": "python" }' -cd contrib -( - init_bundle "templates/data-engineering" "e5f6g7h8-i9j0-1234-efgh-567890123456" '{ - "project_name": "data_engineering", - "default_catalog": "catalog", - "personal_schemas": "yes, use a schema based on the current user name during development" - }' -) -cd .. 
\ No newline at end of file
+init_bundle "default-minimal" "8127e9c1-adac-4c9c-b006-d3450874f663" '{
+  "project_name": "default_minimal",
+  "default_catalog": "catalog",
+  "personal_schemas": "yes",
+  "language_choice": "sql"
+}'
+
+# Add .gitkeep files to empty directories in default_minimal
+echo > default_minimal/src/.gitkeep
+echo > default_minimal/resources/.gitkeep
+
+init_bundle "pydabs" "4062028b-2184-4acd-9c62-f2ec572f7843" '{
+  "project_name": "pydabs",
+  "include_job": "yes",
+  "include_pipeline": "yes",
+  "include_python": "yes",
+  "serverless": "yes",
+  "default_catalog": "catalog",
+  "personal_schemas": "yes"
+}'
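The updated script above falls back to the installed `databricks` CLI unless the `CLI_COMMAND` environment variable points at another binary. A minimal usage sketch, assuming the script is invoked from the repository root (the CLI path shown is a placeholder, not part of this change):

```bash
# Regenerate every example project from its template using a locally built CLI.
# /path/to/dev/cli is illustrative; unset CLI_COMMAND to use the stock 'databricks' CLI.
export CLI_COMMAND=/path/to/dev/cli
./scripts/update_from_templates.sh
```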