# Add DABs template for scala job #66
### `contrib/templates/scala-job/README.md`

````markdown
# scala-job

This is an (experimental) template for using Scala with Databricks Asset Bundles. It uses sbt to compile and package Scala files, and can be used with Databricks Connect for local development.

Run

```
databricks bundle init --template-dir contrib/templates/scala-job https://github.com/databricks/bundle-examples
```

and follow the generated README.md to get started.
````
### `databricks_template_schema.json`

```json
{
  "welcome_message": "\nWelcome to the scala-job template for Databricks Asset Bundles!\n\nA workspace was selected based on your current profile. For information about how to change this, see https://docs.databricks.com/dev-tools/cli/profiles.html.\nworkspace_host: {{workspace_host}}",
  "properties": {
    "project_name": {
      "type": "string",
      "description": "\nPlease provide a unique name for this project.\nproject_name",
      "order": 1,
      "pattern": "^[A-Za-z_][A-Za-z0-9-_]+$",
      "pattern_match_failure_message": "Name must consist of letters, numbers, dashes, and underscores."
    },
    "artifacts_dest_path": {
      "type": "string",
      "description": "\nPlease provide the Volumes destination path in Databricks where the directory will be created containing the JAR and other artifacts to store. [example: /Volumes/abcdef1234567890].\nNote: Your admin must allowlist the Volumes JAR path you specify for your workspace (see https://docs.databricks.com/en/data-governance/unity-catalog/manage-privileges/allowlist.html)",
      "order": 2,
      "pattern": "^/Volumes(?:/[a-z0-9_-]+)+$",
      "pattern_match_failure_message": "Path must be of the form ^/Volumes(?:/[a-z0-9_-]+)+$"
    }
  }
}
```
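For illustration, here is a minimal Scala sketch of which inputs the `artifacts_dest_path` pattern accepts; the object name and sample paths are made up for this example:

```scala
object PatternCheck extends App {
  // Same pattern as "artifacts_dest_path" in databricks_template_schema.json.
  val volumesPattern = "^/Volumes(?:/[a-z0-9_-]+)+$"

  // Hypothetical inputs, for illustration only.
  Seq(
    "/Volumes/abcdef1234567890",       // matches: the example from the prompt
    "/Volumes/main/default/artifacts", // matches: a nested catalog/schema/volume path
    "/Volumes/Main",                   // no match: uppercase letters are rejected
    "/Volumes"                         // no match: at least one path segment is required
  ).foreach(p => println(s"$p -> ${p.matches(volumesPattern)}"))
}
```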
### Template helpers (`library/`)

```
{{ define `version` -}}
0.1
{{- end }}

{{ define `dbr_version` -}}
16.2
{{- end }}

{{ define `scala_major_minor_version` -}}
2.12
{{- end }}

{{ define `scala_version` -}}
{{template `scala_major_minor_version` .}}.18
{{- end}}

{{ define `java_version` -}}
17
{{- end}}

{{ define `organization` -}}
com.examples
{{- end }}

{{ define `main_class_name` -}}
{{template `organization` .}}.Main
{{- end }}

{{ define `jar_path` -}}
./target/scala-{{template `scala_major_minor_version` .}}/{{.project_name}}-assembly-{{template `version` .}}.jar
{{- end }}
```

A review note on the `organization` helper:

> **Reviewer:** Nit: this should probably just be hardcoded? Since it's also hardcoded in the …
>
> **Author:** There's another use in `build.sbt.tmpl`, so keeping it as a helper reduces the hardcoding as much as possible.
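As a worked example, for a hypothetical `project_name` of `my_project`, the `jar_path` helper expands to:

```
./target/scala-2.12/my_project-assembly-0.1.jar
```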
### `.gitignore`

```
.databricks/
```
### `README.md.tmpl` (generated project README)

````markdown
# {{.project_name}}

The '{{.project_name}}' project was generated by using the scala-job template.

## Getting started

1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/install.html. The version must be v0.226.0 or later.

2. Authenticate to your Databricks workspace (if you have not done so already):
   ```
   $ databricks configure
   ```

3. To deploy a development copy of this project, type:
   ```
   $ databricks bundle deploy --target dev
   ```
   (Note that "dev" is the default target, so the `--target` parameter is optional here.)

   This deploys everything that's defined for this project. For example, the default template would deploy a job called `[dev yourname] {{.project_name}}` to your workspace. You can find that job by opening your workspace and clicking on **Workflows**.

4. Similarly, to deploy a production copy, type:
   ```
   $ databricks bundle deploy --target prod
   ```

5. To run a job, use the "run" command:
   ```
   $ databricks bundle run
   ```

6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from https://docs.databricks.com/dev-tools/vscode-ext.html.

7. For documentation on the Databricks Asset Bundles format used for this project, and for CI/CD configuration, see https://docs.databricks.com/dev-tools/bundles/index.html.

## Local development loop

### Prerequisites
- sbt v1.10.2 or later
- Java 17

1. Import the current directory (where `build.sbt` is located) into your IDE (we recommend IntelliJ) and verify that it is imported as an sbt project.
2. If you don't have Java 17 configured in IntelliJ, go to File -> Project Structure -> SDKs -> use the + sign to add 17 -> OK. Then go to Run -> Edit Configurations and set the version to Java 17 from the dropdown.
3. You should now be able to run the code from the UI, or simply run `sbt run` in the terminal.

## Customizations

### Job configuration
The bundle piggybacks on the same configuration used by the Jobs API. If you want to use an existing cluster instead of spinning one up every time, replace `job_cluster_key` in the task with `existing_cluster_id: <your_cluster_id>`, as sketched below.

You can also change to an all-purpose (dedicated) cluster by removing the `data_security_mode` of the created cluster.
````
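As a sketch of that customization (the project name and cluster ID below are hypothetical), the task in the job resource file would change to:

```yaml
resources:
  jobs:
    my_project:
      name: my_project
      tasks:
        - task_key: main_task
          # Replaces job_cluster_key: my_project_job_cluster; the ID below is made up.
          existing_cluster_id: "0123-456789-abcdefgh"
          spark_jar_task:
            main_class_name: com.examples.Main
          libraries:
            - jar: ../target/scala-2.12/my_project-assembly-0.1.jar
```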
### `build.sbt.tmpl`

```scala
// This file is used to build the sbt project with Databricks Connect.
// It also contains the instructions for creating the JAR uploaded via databricks bundle.
scalaVersion := "{{template `scala_version` .}}"

name := "{{.project_name}}"
organization := "{{template `organization` .}}"
version := "{{template `version` .}}"

libraryDependencies += "com.databricks" % "databricks-connect" % "{{template `dbr_version` .}}.+"
libraryDependencies += "org.slf4j" % "slf4j-simple" % "2.0.16"

// Exclude the Scala standard library from the assembly JAR; the cluster provides it.
assembly / assemblyOption ~= { _.withIncludeScala(false) }
assembly / assemblyExcludedJars := {
  val cp = (assembly / fullClasspath).value
  cp filter { _.data.getName.matches("scala-.*") } // remove Scala libraries
}

assemblyMergeStrategy := {
  case _ => MergeStrategy.preferProject
}

// To run with new JVM options, a fork is required; otherwise the run uses the same options as the sbt process.
run / fork := true
run / javaOptions += "--add-opens=java.base/java.nio=ALL-UNNAMED"
```
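The same build the bundle runs (see `databricks.yml.tmpl` below) can be exercised locally. Assuming a hypothetical project named `my_project`, the assembly JAR lands at the `jar_path` location (other build outputs omitted):

```
$ sbt package && sbt assembly
$ ls ./target/scala-2.12/
my_project-assembly-0.1.jar
```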
### `databricks.yml.tmpl`

```yaml
# This is a Databricks asset bundle definition for {{.project_name}}.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
  name: {{.project_name}}
  uuid: {{bundle_uuid}}

include:
  - resources/*.yml

workspace:
  host: {{workspace_host}}
  artifact_path: {{.artifacts_dest_path}}/${bundle.name}/${bundle.target}/${workspace.current_user.short_name}

artifacts:
  default:
    type: jar
    build: sbt package && sbt assembly
    path: .
    files:
      - source: {{template `jar_path` .}}

targets:
  dev:
    # The default target uses 'mode: development' to create a development copy.
    # - Deployed resources get prefixed with '[dev my_user_name]'
    # - Any job schedules and triggers are paused by default.
    # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html.
    mode: development
    default: true
    workspace:
      host: {{workspace_host}}

  prod:
    mode: production
    workspace:
      host: {{workspace_host}}
      # We explicitly deploy to /Workspace/Users/{{user_name}} to make sure we only have a single copy.
      root_path: /Workspace/Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target}
    permissions:
      - {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}}
        level: CAN_MANAGE
```
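To make the interpolation concrete: with a hypothetical `artifacts_dest_path` of `/Volumes/main/default/artifacts`, a project named `my_project`, the `dev` target, and a user short name of `jane`, `artifact_path` resolves to:

```
/Volumes/main/default/artifacts/my_project/dev/jane
```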
### `project/plugins.sbt`

```scala
// The project folder is used to store sbt-specific project files.
// This file defines the plugins used in the sbt project.
// In particular, this includes the assembly plugin used to generate an uber JAR.
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.0.0")
```
### `resources/README.md`

```
This folder is reserved for Databricks Asset Bundles resource definitions.
```
### Job resource definition (`resources/*.yml`)

```yaml
# The main job for {{.project_name}}

resources:
  jobs:
    {{.project_name}}:
      name: {{.project_name}}
      tasks:
        - task_key: main_task
          job_cluster_key: {{.project_name}}_job_cluster
          spark_jar_task:
            main_class_name: {{template `main_class_name` .}}
          libraries:
            - jar: ../{{template `jar_path` .}}
      job_clusters:
        - job_cluster_key: {{.project_name}}_job_cluster
          new_cluster:
            spark_version: {{template `dbr_version` .}}.x-scala{{template `scala_major_minor_version` .}}
            node_type_id: i3.xlarge # Default instance type (can be changed)
            autoscale:
              min_workers: 1
              max_workers: 4
            data_security_mode: USER_ISOLATION
```
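With the `dbr_version` and `scala_major_minor_version` helpers defined above, the `spark_version` line renders as:

```
spark_version: 16.2.x-scala2.12
```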
### `src/main/scala/com/examples/Main.scala`

```scala
/*
 This project is a simple example of how to use the Databricks Connect Scala client to run on
 serverless or on a Databricks cluster.
*/
package com.examples

import com.databricks.connect.DatabricksSession
import org.apache.spark.sql.{SparkSession, functions => F}
import org.apache.spark.sql.functions.udf

object Main {
  def main(args: Array[String]): Unit = {
    println("Hello, World!")

    val spark = getSession()
    println("Showing range ...")
    spark.range(3).show()

    println("Showing nyctaxi trips ...")
    val df = spark.read.table("samples.nyctaxi.trips").limit(10)

    // Define a simple UDF that prefixes its input with "test: "
    val testudf = udf((count: String) => s"test: $count")

    // Apply the UDF to the dropoff_zip column
    val transformedDF = df.withColumn("testresult", testudf(F.col("dropoff_zip")))

    // Show the transformed DataFrame
    transformedDF.show()
  }

  def getSession(): SparkSession = {
    // The DATABRICKS_RUNTIME_VERSION environment variable is only set on a Databricks cluster.
    if (sys.env.contains("DATABRICKS_RUNTIME_VERSION")) {
      println("Running in a Databricks cluster")
      SparkSession.builder().getOrCreate()
    } else {
      println("Running outside Databricks")
      DatabricksSession.builder()
        .serverless()
        .addCompiledArtifacts(Main.getClass.getProtectionDomain.getCodeSource.getLocation.toURI)
        .getOrCreate()
    }
  }
}
```
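When run locally via `sbt run` (per the generated README), the program's own `println` calls produce roughly the following before the DataFrame output; exact formatting will vary:

```
$ sbt run
Hello, World!
Running outside Databricks
Showing range ...
```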