# Add DABs template for scala job #66
### `contrib/templates/scala-job/README.md`

````markdown
# scala-job

This is an (experimental) template for using Scala with Databricks Asset Bundles. It uses sbt to compile and package Scala files, and can be used with Databricks Connect for local development.

Run

```
databricks bundle init --template-dir contrib/templates/scala-job https://github.com/databricks/bundle-examples
```

and follow the generated README.md to get started.
````
### `databricks_template_schema.json`

```json
{
  "welcome_message": "\nWelcome to the scala-job template for Databricks Asset Bundles!\n\nA workspace was selected based on your current profile. For information about how to change this, see https://docs.databricks.com/dev-tools/cli/profiles.html.\nworkspace_host: {{workspace_host}}",
  "properties": {
    "project_name": {
      "type": "string",
      "description": "\nPlease provide a unique name for this project.\nproject_name",
      "order": 1,
      "pattern": "^[A-Za-z_][A-Za-z0-9-_]+$",
      "pattern_match_failure_message": "Name must consist of letters, numbers, dashes, and underscores."
    },
    "artifacts_dest_path": {
      "type": "string",
      "description": "\nPlease provide the Volumes destination path in Databricks where the directory will be created containing the JAR and other artifacts to store. [example: /Volumes/abcdef1234567890].\nNote: Your admin must allowlist the Volumes JAR path you specify for your workspace (see https://docs.databricks.com/en/data-governance/unity-catalog/manage-privileges/allowlist.html)",
      "order": 2,
      "pattern": "^/Volumes(?:/[a-z0-9_-]+)+$",
      "pattern_match_failure_message": "Path must be of the form ^/Volumes(?:/[a-z0-9_-]+)+$"
    }
  }
}
```
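For illustration, here is a minimal Scala sketch of which inputs the `artifacts_dest_path` pattern accepts; the object name and sample paths are made up for this example:

```scala
object PatternCheck extends App {
  // Same pattern as "artifacts_dest_path" in databricks_template_schema.json.
  val volumesPattern = "^/Volumes(?:/[a-z0-9_-]+)+$"

  // Hypothetical inputs, for illustration only.
  Seq(
    "/Volumes/abcdef1234567890",       // matches: the example from the prompt
    "/Volumes/main/default/artifacts", // matches: a nested catalog/schema/volume path
    "/Volumes/Main",                   // no match: uppercase letters are rejected
    "/Volumes"                         // no match: at least one path segment is required
  ).foreach(p => println(s"$p -> ${p.matches(volumesPattern)}"))
}
```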
### Template helpers (`library/`)

```
{{ define `version` -}}
0.1
{{- end }}

{{ define `dbr_version` -}}
16.2
{{- end }}

{{ define `scala_major_minor_version` -}}
2.12
{{- end }}

{{ define `scala_version` -}}
{{template `scala_major_minor_version` .}}.18
{{- end}}

{{ define `java_version` -}}
17
{{- end}}

{{ define `organization` -}}
com.examples
{{- end }}

{{ define `main_class_name` -}}
{{template `organization` .}}.Main
{{- end }}

{{ define `jar_path` -}}
./target/scala-{{template `scala_major_minor_version` .}}/{{.project_name}}-assembly-{{template `version` .}}.jar
{{- end }}
```

A review note on the `organization` helper:

> **Reviewer:** Nit: this should probably just be hardcoded? Since it's also hardcoded in the …
>
> **Author:** There's another use in `build.sbt.tmpl`, so keeping it as a helper reduces the hardcoding as much as possible.
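As a worked example, for a hypothetical `project_name` of `my_project`, the `jar_path` helper expands to:

```
./target/scala-2.12/my_project-assembly-0.1.jar
```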
### `.gitignore`

```
.databricks/
```
### `README.md.tmpl` (generated project README)

````markdown
# {{.project_name}}

The '{{.project_name}}' project was generated by using the scala-job template.

## Getting started

1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/install.html. The version must be v0.226.0 or later.

2. Authenticate to your Databricks workspace (if you have not done so already):
   ```
   $ databricks configure
   ```

3. To deploy a development copy of this project, type:
   ```
   $ databricks bundle deploy --target dev
   ```
   (Note that "dev" is the default target, so the `--target` parameter is optional here.)

   This deploys everything that's defined for this project. For example, the default template would deploy a job called `[dev yourname] {{.project_name}}` to your workspace. You can find that job by opening your workspace and clicking on **Workflows**.

4. Similarly, to deploy a production copy, type:
   ```
   $ databricks bundle deploy --target prod
   ```

5. To run a job, use the "run" command:
   ```
   $ databricks bundle run
   ```

6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from https://docs.databricks.com/dev-tools/vscode-ext.html.

7. For documentation on the Databricks Asset Bundles format used for this project, and for CI/CD configuration, see https://docs.databricks.com/dev-tools/bundles/index.html.

## Local development loop

### Prerequisites
- sbt v1.10.2 or later
- Java 17

1. Import the current directory (where `build.sbt` is located) into your IDE (we recommend IntelliJ) and verify that it is imported as an sbt project.
2. If you don't have Java 17 configured in IntelliJ, go to File -> Project Structure -> SDKs -> use the + sign to add 17 -> OK. Then go to Run -> Edit Configurations and set the version to Java 17 from the dropdown.
3. You should now be able to run the code from the UI, or simply run `sbt run` in the terminal.

## Customizations

### Job configuration
The bundle piggybacks on the same configuration used by the Jobs API. If you want to use an existing cluster instead of spinning one up every time, replace `job_cluster_key` in the task with `existing_cluster_id: <your_cluster_id>`, as sketched below.

You can also change to an all-purpose (dedicated) cluster by removing the `data_security_mode` of the created cluster.
````
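As a sketch of that customization (the project name and cluster ID below are hypothetical), the task in the job resource file would change to:

```yaml
resources:
  jobs:
    my_project:
      name: my_project
      tasks:
        - task_key: main_task
          # Replaces job_cluster_key: my_project_job_cluster; the ID below is made up.
          existing_cluster_id: "0123-456789-abcdefgh"
          spark_jar_task:
            main_class_name: com.examples.Main
          libraries:
            - jar: ../target/scala-2.12/my_project-assembly-0.1.jar
```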
### `build.sbt.tmpl`

```scala
// This file is used to build the sbt project with Databricks Connect.
// It also contains the instructions for creating the JAR uploaded via databricks bundle.
scalaVersion := "{{template `scala_version` .}}"

name := "{{.project_name}}"
organization := "{{template `organization` .}}"
version := "{{template `version` .}}"

libraryDependencies += "com.databricks" % "databricks-connect" % "{{template `dbr_version` .}}.+"
libraryDependencies += "org.slf4j" % "slf4j-simple" % "2.0.16"

// Exclude the Scala standard library from the assembly JAR; the cluster provides it.
assembly / assemblyOption ~= { _.withIncludeScala(false) }
assembly / assemblyExcludedJars := {
  val cp = (assembly / fullClasspath).value
  cp filter { _.data.getName.matches("scala-.*") } // remove Scala libraries
}

assemblyMergeStrategy := {
  case _ => MergeStrategy.preferProject
}

// To run with new JVM options, a fork is required; otherwise the run uses the same options as the sbt process.
run / fork := true
run / javaOptions += "--add-opens=java.base/java.nio=ALL-UNNAMED"
```
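The same build the bundle runs (see `databricks.yml.tmpl` below) can be exercised locally. Assuming a hypothetical project named `my_project`, the assembly JAR lands at the `jar_path` location (other build outputs omitted):

```
$ sbt package && sbt assembly
$ ls ./target/scala-2.12/
my_project-assembly-0.1.jar
```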
### `databricks.yml.tmpl`

```yaml
# This is a Databricks asset bundle definition for {{.project_name}}.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
  name: {{.project_name}}
  uuid: {{bundle_uuid}}

include:
  - resources/*.yml

workspace:
  host: {{workspace_host}}
  artifact_path: {{.artifacts_dest_path}}/${bundle.name}/${bundle.target}/${workspace.current_user.short_name}

artifacts:
  default:
    type: jar
    build: sbt package && sbt assembly
    path: .
    files:
      - source: {{template `jar_path` .}}

targets:
  dev:
    # The default target uses 'mode: development' to create a development copy.
    # - Deployed resources get prefixed with '[dev my_user_name]'
    # - Any job schedules and triggers are paused by default.
    # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html.
    mode: development
    default: true
    workspace:
      host: {{workspace_host}}

  prod:
    mode: production
    workspace:
      host: {{workspace_host}}
      # We explicitly deploy to /Workspace/Users/{{user_name}} to make sure we only have a single copy.
      root_path: /Workspace/Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target}
    permissions:
      - {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}}
        level: CAN_MANAGE
```
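To make the interpolation concrete: with a hypothetical `artifacts_dest_path` of `/Volumes/main/default/artifacts`, a project named `my_project`, the `dev` target, and a user short name of `jane`, `artifact_path` resolves to:

```
/Volumes/main/default/artifacts/my_project/dev/jane
```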
### `project/plugins.sbt`

```scala
// The project folder is used to store sbt-specific project files.
// This file defines the plugins used in the sbt project.
// In particular, this includes the assembly plugin used to generate an uber JAR.
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.0.0")
```
### `resources/README.md`

```
This folder is reserved for Databricks Asset Bundles resource definitions.
```
### Job resource definition (`resources/*.yml`)

```yaml
# The main job for {{.project_name}}

resources:
  jobs:
    {{.project_name}}:
      name: {{.project_name}}
      tasks:
        - task_key: main_task
          job_cluster_key: {{.project_name}}_job_cluster
          spark_jar_task:
            main_class_name: {{template `main_class_name` .}}
          libraries:
            - jar: ../{{template `jar_path` .}}
      job_clusters:
        - job_cluster_key: {{.project_name}}_job_cluster
          new_cluster:
            spark_version: {{template `dbr_version` .}}.x-scala{{template `scala_major_minor_version` .}}
            node_type_id: i3.xlarge # Default instance type (can be changed)
            autoscale:
              min_workers: 1
              max_workers: 4
            data_security_mode: USER_ISOLATION
```
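With the `dbr_version` and `scala_major_minor_version` helpers defined above, the `spark_version` line renders as:

```
spark_version: 16.2.x-scala2.12
```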
### `src/main/scala/com/examples/Main.scala`

```scala
/*
 This project is a simple example of how to use the Databricks Connect Scala client to run on
 serverless or on a Databricks cluster.
*/
package com.examples

import com.databricks.connect.DatabricksSession
import org.apache.spark.sql.{SparkSession, functions => F}
import org.apache.spark.sql.functions.udf

object Main {
  def main(args: Array[String]): Unit = {
    println("Hello, World!")

    val spark = getSession()
    println("Showing range ...")
    spark.range(3).show()

    println("Showing nyctaxi trips ...")
    val df = spark.read.table("samples.nyctaxi.trips").limit(10)

    // Define a simple UDF that prefixes its input with "test: "
    val testudf = udf((count: String) => s"test: $count")

    // Apply the UDF to the dropoff_zip column
    val transformedDF = df.withColumn("testresult", testudf(F.col("dropoff_zip")))

    // Show the transformed DataFrame
    transformedDF.show()
  }

  def getSession(): SparkSession = {
    // The DATABRICKS_RUNTIME_VERSION environment variable is only set on a Databricks cluster.
    if (sys.env.contains("DATABRICKS_RUNTIME_VERSION")) {
      println("Running in a Databricks cluster")
      SparkSession.builder().getOrCreate()
    } else {
      println("Running outside Databricks")
      DatabricksSession.builder()
        .serverless()
        .addCompiledArtifacts(Main.getClass.getProtectionDomain.getCodeSource.getLocation.toURI)
        .getOrCreate()
    }
  }
}
```
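When run locally via `sbt run` (per the generated README), the program's own `println` calls produce roughly the following before the DataFrame output; exact formatting will vary:

```
$ sbt run
Hello, World!
Running outside Databricks
Showing range ...
```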