Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
5f79aec
Add dabs scala template job
garlandz-db Feb 13, 2025
919e989
remove
garlandz-db Feb 13, 2025
58fb57a
update
garlandz-db Feb 14, 2025
4a95fa7
.
garlandz-db Feb 14, 2025
f73143e
.
garlandz-db Feb 14, 2025
126f47b
move
garlandz-db Feb 14, 2025
54f5265
update scala version
garlandz-db Feb 14, 2025
42cfeba
update scala version
garlandz-db Feb 14, 2025
c231994
update with example
garlandz-db Feb 14, 2025
0155fab
format
garlandz-db Feb 14, 2025
4769a07
fix
garlandz-db Feb 14, 2025
322f75a
use variables
garlandz-db Feb 14, 2025
9826168
changes
garlandz-db Feb 14, 2025
a1e4260
We add a whole damn unofficial release of dbconnect
garlandz-db Feb 14, 2025
4b2274f
clarify
garlandz-db Feb 14, 2025
bb3002a
add slf4j
garlandz-db Feb 17, 2025
8263434
clarify
garlandz-db Feb 17, 2025
8cd6917
fix this
garlandz-db Feb 17, 2025
310fff3
fix this
garlandz-db Feb 17, 2025
d0f52f8
simplify my life
garlandz-db Feb 17, 2025
9a63eb1
Changes
garlandz-db Feb 20, 2025
feed77a
fix
garlandz-db Feb 20, 2025
c7ef5ff
comments
garlandz-db Feb 20, 2025
e0ee690
fix wording
garlandz-db Feb 20, 2025
ad28acf
Move
garlandz-db Feb 20, 2025
8452f8e
Changes from feedback
garlandz-db Feb 24, 2025
f0c4dbf
fix
garlandz-db Feb 24, 2025
c3ffd78
Update contrib/templates/scala-job/README.md
garlandz-db Feb 28, 2025
6be2d4b
Update contrib/templates/scala-job/README.md
garlandz-db Feb 28, 2025
704453f
Update contrib/templates/scala-job/databricks_template_schema.json
garlandz-db Feb 28, 2025
47f1364
Update contrib/templates/scala-job/databricks_template_schema.json
garlandz-db Feb 28, 2025
7724ad8
Update contrib/templates/scala-job/template/{{.project_name}}/README.…
garlandz-db Feb 28, 2025
fdffc50
Update contrib/templates/scala-job/template/{{.project_name}}/README.…
garlandz-db Feb 28, 2025
972b5a6
Update contrib/templates/scala-job/template/{{.project_name}}/databri…
garlandz-db Feb 28, 2025
d2ad6b0
Update contrib/templates/scala-job/template/{{.project_name}}/databri…
garlandz-db Feb 28, 2025
587b9e3
Update contrib/templates/scala-job/template/{{.project_name}}/README.…
garlandz-db Feb 28, 2025
52e1785
add template organization variable to build.sbt.tmpl
garlandz-db Feb 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions contrib/templates/scala-job/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# scala-job

This is an (experimental) template for using Scala with Databricks Asset Bundles. It uses sbt to compile and package Scala files, and can be used with Databricks Connect for local development.

Run
```
databricks bundle init --template-dir contrib/templates/scala-job https://github.com/databricks/bundle-examples
```

and follow the generated README.md to get started.
19 changes: 19 additions & 0 deletions contrib/templates/scala-job/databricks_template_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"welcome_message": "\nWelcome to the scala-job template for Databricks Asset Bundles!\n\nA workspace was selected based on your current profile. For information about how to change this, see https://docs.databricks.com/dev-tools/cli/profiles.html.\nworkspace_host: {{workspace_host}}",
"properties": {
"project_name": {
"type": "string",
"description": "\nPlease provide a unique name for this project.\nproject_name",
"order": 1,
"pattern": "^[A-Za-z_][A-Za-z0-9-_]+$",
"pattern_match_failure_message": "Name must consist of letters, numbers, dashes, and underscores."
},
"artifacts_dest_path": {
"type": "string",
"description": "\nPlease provide the Volumes destination path in Databricks where the directory will be created containing the JAR and other artifacts to store. [example: /Volumes/abcdef1234567890].\nNote: Your admin must allowlist the Volumes JAR path you specify for your workspace (see https://docs.databricks.com/en/data-governance/unity-catalog/manage-privileges/allowlist.html)",
"order": 2,
"pattern": "^/Volumes(?:/[a-z0-9_-]+)+$",
"pattern_match_failure_message": "Path must be of the form ^/Volumes(?:/[a-z0-9_-]+)+$"
}
}
}
31 changes: 31 additions & 0 deletions contrib/templates/scala-job/library/template_variables.tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{{ define `version` -}}
0.1
{{- end }}

{{ define `dbr_version` -}}
16.2
{{- end }}

{{ define `scala_major_minor_version` -}}
2.12
{{- end }}

{{ define `scala_version` -}}
{{template `scala_major_minor_version` .}}.18
{{- end}}

{{ define `java_version` -}}
17
{{- end}}

{{ define `organization` -}}
com.examples
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: this should probably just be hardcoded? Since it's also hardcoded in the src/main.scala/com/examples path below?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

theres another use in build.sbt.tmpl so this will just reduce the hardcodedness as much as possible

{{- end }}

{{ define `main_class_name` -}}
{{template `organization` .}}.Main
{{- end }}

{{ define `jar_path` -}}
./target/scala-{{template `scala_major_minor_version` .}}/{{.project_name}}-assembly-{{template `version` .}}.jar
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.databricks/
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# {{.project_name}}

The '{{.project_name}}' project was generated by using the scala-job template.

## Getting started

1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/install.html. The version must be v0.226.0 or later.

2. Authenticate to your Databricks workspace (if you have not done so already):
```
$ databricks configure
```

3. To deploy a development copy of this project, type:
```
$ databricks bundle deploy --target dev
```
(Note that "dev" is the default target, so the `--target` parameter
is optional here.)

This deploys everything that's defined for this project.
For example, the default template would deploy a job called
`[dev yourname] {{.project_name}}_job` to your workspace.
You can find that job by opening your workspace and clicking on **Workflows**.

4. Similarly, to deploy a production copy, type:
```
$ databricks bundle deploy --target prod
```

5. To run a job, use the "run" command:
```
$ databricks bundle run
```

6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
https://docs.databricks.com/dev-tools/vscode-ext.html.

7. For documentation on the Databricks Asset Bundles format used
for this project, and for CI/CD configuration, see
https://docs.databricks.com/dev-tools/bundles/index.html.

## Local development
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
## Local Devloop
## Local development


### Prerequisites
- sbt v1.10.2 or later
- Java 17
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
- java 17
- Java 17


1. Import the current directory in your IDE (we recommend IntelliJ) where build.sbt is located. Verify it is imported as an sbt project.
2. If you don’t have Java, in IntelliJ, go to File -> Project Structure, SDKs -> + sign to add 17 -> OK
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
2. If you don’t have java, in Intellij, go to File -> Project Structure, SDKs -> + sign to add 17 -> OK
2. If you don’t have Java, in IntelliJ, go to File -> Project Structure, SDKs -> + sign to add 17 -> OK


Then Run -> Edit Configurations -> Set the version to Java 17 from the dropdown.
3. You should now be able to run the code from the UI, but you can also simply run `sbt run` in the terminal.

## Customizations

### Job configuration
The bundle piggybacks off the same configuration used in the APIs. If you want to use an existing cluster instead of spinning one up every time, replace job_cluster_key in tasks with existing_cluster_id: <your_cluster_id>

You can also change to an all-purpose (dedicated) cluster by removing the data_security_mode of the created cluster

Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Build definition for the generated project, wired up for Databricks Connect.
// It also configures how the uber JAR uploaded via `databricks bundle deploy` is created
// (see the sbt-assembly settings below and the `artifacts` section of databricks.yml).
scalaVersion := "{{template `scala_version` .}}"

name := "{{.project_name}}"
organization := "{{template `organization` .}}"
version := "{{template `version` .}}"

// `.+` picks up the latest patch release of Databricks Connect for this DBR version.
libraryDependencies += "com.databricks" % "databricks-connect" % "{{template `dbr_version` .}}.+"
libraryDependencies += "org.slf4j" % "slf4j-simple" % "2.0.16"

// Exclude the Scala runtime from the uber JAR: the Databricks runtime already provides it.
assembly / assemblyOption ~= { _.withIncludeScala(false) }
assembly / assemblyExcludedJars := {
val cp = (assembly / fullClasspath).value
cp filter { _.data.getName.matches("scala-.*") } // drop scala-* jars from the assembly classpath
}

// On duplicate files across dependencies, prefer this project's copy.
assemblyMergeStrategy := {
case _ => MergeStrategy.preferProject
}

// Forking is required so `run` gets its own JVM options; otherwise it inherits the sbt process's.
run / fork := true
run / javaOptions += "--add-opens=java.base/java.nio=ALL-UNNAMED"
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# This is a Databricks asset bundle definition for {{.project_name}}.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
name: {{.project_name}}
uuid: {{bundle_uuid}}

include:
- resources/*.yml

workspace:
host: {{workspace_host}}
# Built artifacts (the assembled JAR) are uploaded under this Volumes path;
# artifacts_dest_path is collected from the user at `bundle init` time.
artifact_path: {{.artifacts_dest_path}}/${bundle.name}/${bundle.target}/${workspace.current_user.short_name}

# Builds the uber JAR with sbt-assembly and uploads it on deploy.
artifacts:
default:
type: jar
build: sbt package && sbt assembly
path: .
files:
- source: {{template `jar_path` .}}

targets:
dev:
# The default target uses 'mode: development' to create a development copy.
# - Deployed resources get prefixed with '[dev my_user_name]'
# - Any job schedules and triggers are paused by default.
# See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html.
mode: development
default: true
workspace:
host: {{workspace_host}}

prod:
mode: production
workspace:
host: {{workspace_host}}
# We explicitly deploy to /Workspace/Users/{{user_name}} to make sure we only have a single copy.
root_path: /Workspace/Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target}
permissions:
- {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}}
level: CAN_MANAGE
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// The project/ folder is used to store sbt-specific build configuration.
// This file declares the plugins used by the sbt build.
// In particular, sbt-assembly produces the uber JAR that the bundle deploys
// (see the assembly settings in build.sbt).
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.0.0")
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@

This folder is reserved for Databricks Asset Bundles resource definitions.

Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# The main job for {{.project_name}}

resources:
jobs:
{{.project_name}}:
name: {{.project_name}}
tasks:
- task_key: main_task
job_cluster_key: {{.project_name}}_job_cluster
# Runs the main class from the assembled JAR built by sbt-assembly.
spark_jar_task:
main_class_name: {{template `main_class_name` .}}
libraries:
- jar: ../{{template `jar_path` .}}
job_clusters:
- job_cluster_key: {{.project_name}}_job_cluster
new_cluster:
spark_version: {{template `dbr_version` .}}.x-scala{{template `scala_major_minor_version` .}}
node_type_id: i3.xlarge # Default instance type (can be changed)
autoscale:
min_workers: 1
max_workers: 4
# Per the project README, remove data_security_mode to use a dedicated
# (all-purpose) cluster instead.
data_security_mode: USER_ISOLATION
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
This project is a simple example of how to use the Databricks Connect Scala client to run on
serverless or on a Databricks cluster.
*/
package com.examples

import com.databricks.connect.DatabricksSession
import org.apache.spark.sql.{SparkSession, functions => F}
import org.apache.spark.sql.functions.udf

object Main {

  /** Entry point: obtains a Spark session, shows a small range, then reads a
    * sample of the `samples.nyctaxi.trips` table and applies a demo UDF to it.
    */
  def main(args: Array[String]): Unit = {
    println("Hello, World!")

    val spark = getSession()

    println("Showing range ...")
    spark.range(3).show()

    println("Showing nyctaxi trips ...")
    val trips = spark.read.table("samples.nyctaxi.trips").limit(10)

    // Demo UDF that prefixes a string cell with "test: ".
    // NOTE(review): the original comment said "passenger count", but the UDF
    // is applied to dropoff_zip below — confirm which column was intended.
    val prefixUdf = udf((count: String) => s"test: $count")

    // Add a "testresult" column derived from dropoff_zip and display it.
    val withPrefix = trips.withColumn("testresult", prefixUdf(F.col("dropoff_zip")))
    withPrefix.show()
  }

  /** Returns a Spark session appropriate for the current environment.
    *
    * When DATABRICKS_RUNTIME_VERSION is set (i.e. running on a Databricks
    * cluster), the cluster's own session is used. Otherwise a serverless
    * Databricks Connect session is created, registering this JAR so UDFs
    * compiled here can run remotely.
    */
  def getSession(): SparkSession =
    if (sys.env.contains("DATABRICKS_RUNTIME_VERSION")) {
      println("Running in a Databricks cluster")
      SparkSession.builder().getOrCreate()
    } else {
      println("Running outside Databricks")
      DatabricksSession
        .builder()
        .serverless()
        // Upload this program's compiled classes so the UDF can execute server-side.
        .addCompiledArtifacts(Main.getClass.getProtectionDomain.getCodeSource.getLocation.toURI)
        .getOrCreate()
    }
}