
Commit 3dcdd81

Spark 3.4: Change ExprUtils to implicit
1 parent a8bdcbf commit 3dcdd81

6 files changed: 50 additions and 43 deletions

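The commit title is terse, so here is a minimal sketch, using invented names rather than the connector's actual API, of the overall refactor: a helper class that carried the FunctionRegistry as a constructor argument becomes a stateless object whose methods accept the registry through an implicit parameter list.

    // Hypothetical, simplified types; not the connector's real code.
    trait Registry { def contains(name: String): Boolean }

    // Before: every call site built an instance, e.g. new UtilsBefore(registry).supported("f")
    class UtilsBefore(registry: Registry) {
      def supported(name: String): Boolean = registry.contains(name)
    }

    // After: a singleton object; the registry is threaded through implicitly.
    object UtilsAfter {
      def supported(name: String)(implicit registry: Registry): Boolean =
        registry.contains(name)
    }

Call sites can either have an implicit registry in scope or pass it explicitly in the second parameter list, which is exactly the split visible in the file diffs below.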

spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClusterShardByTransformSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -106,7 +106,7 @@ class ClusterShardByTransformSuite extends SparkClickHouseClusterTest {
       ("cityHash64", Array("value"))
     ).foreach {
       case (func_name: String, func_args: Array[String]) =>
-        test(s"shard by $func_name")(runTest(func_name, func_args))
+        test(s"shard by $func_name(${func_args.mkString(",")})")(runTest(func_name, func_args))
     }
 
 }
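A side note on the test change above: the generated test name now embeds the argument list, presumably so each case produced by the foreach loop gets a distinct, self-describing name. A tiny sketch of the naming idiom, with an invented object name:

    // Hypothetical helper showing how the interpolated name is built.
    object TestNameDemo {
      def name(func: String, args: Array[String]): String =
        s"shard by $func(${args.mkString(",")})"

      def main(argv: Array[String]): Unit =
        // prints: shard by cityHash64(value)
        println(name("cityHash64", Array("value")))
    }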

spark-3.4/clickhouse-spark/src/main/scala/org/apache/spark/sql/clickhouse/ExprUtils.scala

Lines changed: 39 additions & 34 deletions
@@ -31,34 +31,40 @@ import xenon.clickhouse.spec.ClusterSpec
 
 import scala.util.{Failure, Success, Try}
 
-class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with Serializable {
+object ExprUtils extends SQLConfHelper with Serializable {
 
-  def toSparkPartitions(partitionKey: Option[List[Expr]]): Array[Transform] =
-    partitionKey.seq.flatten.flatten(toSparkTransformOpt).toArray
+  def toSparkPartitions(partitionKey: Option[List[Expr]])(implicit
+    functionRegistry: FunctionRegistry
+  ): Array[Transform] =
+    partitionKey.seq.flatten.flatten(toSparkTransformOpt(_)).toArray
 
-  def toSparkSplits(shardingKey: Option[Expr], partitionKey: Option[List[Expr]]): Array[Transform] =
-    (shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt).toArray
+  def toSparkSplits(shardingKey: Option[Expr], partitionKey: Option[List[Expr]])(implicit
+    functionRegistry: FunctionRegistry
+  ): Array[Transform] =
+    (shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt(_)).toArray
 
   def toSparkSortOrders(
     shardingKeyIgnoreRand: Option[Expr],
     partitionKey: Option[List[Expr]],
     sortingKey: Option[List[OrderExpr]],
     cluster: Option[ClusterSpec]
-  ): Array[SortOrder] =
+  )(implicit functionRegistry: FunctionRegistry): Array[SortOrder] =
     toSparkSplits(
       shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight)),
       partitionKey
     ).map(Expressions.sort(_, SortDirection.ASCENDING)) ++:
       sortingKey.seq.flatten.flatten { case OrderExpr(expr, asc, nullFirst) =>
         val direction = if (asc) SortDirection.ASCENDING else SortDirection.DESCENDING
         val nullOrder = if (nullFirst) NullOrdering.NULLS_FIRST else NullOrdering.NULLS_LAST
-        toSparkTransformOpt(expr).map(trans => Expressions.sort(trans, direction, nullOrder))
+        toSparkTransformOpt(expr).map(trans =>
+          Expressions.sort(trans, direction, nullOrder)
+        )
       }.toArray
 
   private def loadV2FunctionOpt(
     name: String,
     args: Seq[Expression]
-  ): Option[BoundFunction] = {
+  )(implicit functionRegistry: FunctionRegistry): Option[BoundFunction] = {
     def loadFunction(ident: Identifier): UnboundFunction =
       functionRegistry.load(ident.name).getOrElse(throw new NoSuchFunctionException(ident))
     val inputType = StructType(args.zipWithIndex.map {
@@ -77,7 +83,10 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with Serializable {
     }
   }
 
-  def toCatalyst(v2Expr: V2Expression, fields: Array[StructField]): Expression =
+  def toCatalyst(
+    v2Expr: V2Expression,
+    fields: Array[StructField]
+  )(implicit functionRegistry: FunctionRegistry): Expression =
     v2Expr match {
       case IdentityTransform(ref) => toCatalyst(ref, fields)
       case ref: NamedReference if ref.fieldNames.length == 1 =>
@@ -88,35 +97,35 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with Serializable {
         BoundReference(ordinal, field.dataType, field.nullable)
       case t: Transform =>
         val catalystArgs = t.arguments().map(toCatalyst(_, fields))
-        loadV2FunctionOpt(t.name(), catalystArgs).map { bound =>
-          TransformExpression(bound, catalystArgs)
-        }.getOrElse {
+        loadV2FunctionOpt(t.name(), catalystArgs).map(bound => TransformExpression(bound, catalystArgs)).getOrElse {
           throw CHClientException(s"Unsupported expression: $v2Expr")
         }
       case _ => throw CHClientException(
        s"Unsupported expression: $v2Expr"
      )
    }
 
-  def toSparkTransformOpt(expr: Expr): Option[Transform] = Try(toSparkExpression(expr)) match {
-    // need this function because spark `Table`'s `partitioning` field should be `Transform`
-    case Success(t: Transform) => Some(t)
-    case Success(_) => None
-    case Failure(_) if conf.getConf(IGNORE_UNSUPPORTED_TRANSFORM) => None
-    case Failure(rethrow) => throw new AnalysisException(rethrow.getMessage, cause = Some(rethrow))
-  }
+  def toSparkTransformOpt(expr: Expr)(implicit functionRegistry: FunctionRegistry): Option[Transform] =
+    Try(toSparkExpression(expr)) match {
+      // need this function because spark `Table`'s `partitioning` field should be `Transform`
+      case Success(t: Transform) => Some(t)
+      case Success(_) => None
+      case Failure(_) if conf.getConf(IGNORE_UNSUPPORTED_TRANSFORM) => None
+      case Failure(rethrow) => throw new AnalysisException(rethrow.getMessage, cause = Some(rethrow))
+    }
 
-  def toSparkExpression(expr: Expr): V2Expression = expr match {
-    case FieldRef(col) => identity(col)
-    case StringLiteral(value) => literal(value)
-    case FuncExpr("rand", Nil) => apply("rand")
-    case FuncExpr("toYYYYMMDD", List(FuncExpr("toDate", List(FieldRef(col))))) => identity(col)
-    case FuncExpr(funName, args) if functionRegistry.getFuncMappingByCk.contains(funName) =>
-      apply(functionRegistry.getFuncMappingByCk(funName), args.map(toSparkExpression): _*)
-    case unsupported => throw CHClientException(s"Unsupported ClickHouse expression: $unsupported")
-  }
+  def toSparkExpression(expr: Expr)(implicit functionRegistry: FunctionRegistry): V2Expression =
+    expr match {
+      case FieldRef(col) => identity(col)
+      case StringLiteral(value) => literal(value)
+      case FuncExpr("rand", Nil) => apply("rand")
+      case FuncExpr("toYYYYMMDD", List(FuncExpr("toDate", List(FieldRef(col))))) => identity(col)
+      case FuncExpr(funName, args) if functionRegistry.getFuncMappingByCk.contains(funName) =>
+        apply(functionRegistry.getFuncMappingByCk(funName), args.map(toSparkExpression): _*)
+      case unsupported => throw CHClientException(s"Unsupported ClickHouse expression: $unsupported")
+    }
 
-  def toClickHouse(transform: Transform): Expr = transform match {
+  def toClickHouse(transform: Transform)(implicit functionRegistry: FunctionRegistry): Expr = transform match {
     case IdentityTransform(fieldRefs) => FieldRef(fieldRefs.describe)
     case ApplyTransform(name, args) if functionRegistry.getFuncMappingBySpark.contains(name) =>
       FuncExpr(functionRegistry.getFuncMappingBySpark(name), args.map(arg => SQLExpr(arg.describe())).toList)
@@ -128,7 +137,7 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with Serializable {
     primarySchema: StructType,
     secondarySchema: StructType,
     transform: Transform
-  ): StructField = transform match {
+  )(implicit functionRegistry: FunctionRegistry): StructField = transform match {
     case IdentityTransform(FieldReference(Seq(col))) => primarySchema.find(_.name == col)
       .orElse(secondarySchema.find(_.name == col))
       .getOrElse(throw CHClientException(s"Invalid partition column: $col"))
@@ -142,10 +151,6 @@ class ExprUtils(functionRegistry: FunctionRegistry) extends SQLConfHelper with Serializable {
     case bucket: BucketTransform => throw CHClientException(s"Bucket transform not support yet: $bucket")
     case other: Transform => throw CHClientException(s"Unsupported transform: $other")
   }
-}
-
-object ExprUtils {
-  def apply(functionRegistry: FunctionRegistry): ExprUtils = new ExprUtils(functionRegistry)
 
   def toSplitWithModulo(shardingKey: Expr, weight: Int): FuncExpr =
     FuncExpr("positiveModulo", List(shardingKey, StringLiteral(weight.toString)))

spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala

Lines changed: 1 addition & 1 deletion
@@ -209,7 +209,7 @@ class ClickHouseCatalog extends TableCatalog
 
     val partitionsClause = partitions match {
       case transforms if transforms.nonEmpty =>
-        transforms.map(ExprUtils(functionRegistry).toClickHouse(_).sql).mkString("PARTITION BY (", ", ", ")")
+        transforms.map(ExprUtils.toClickHouse(_)(functionRegistry).sql).mkString("PARTITION BY (", ", ", ")")
       case _ => ""
     }
 
spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseTable.scala

Lines changed: 2 additions & 2 deletions
@@ -133,11 +133,11 @@ case class ClickHouseTable(
   private lazy val metadataSchema: StructType =
     StructType(metadataColumns.map(_.asInstanceOf[ClickHouseMetadataColumn].toStructField))
 
-  override lazy val partitioning: Array[Transform] = ExprUtils(functionRegistry).toSparkPartitions(partitionKey)
+  override lazy val partitioning: Array[Transform] = ExprUtils.toSparkPartitions(partitionKey)(functionRegistry)
 
   override lazy val partitionSchema: StructType = StructType(
     partitioning.map(partTransform =>
-      ExprUtils(functionRegistry).inferTransformSchema(schema, metadataSchema, partTransform)
+      ExprUtils.inferTransformSchema(schema, metadataSchema, partTransform)(functionRegistry)
     )
   )
 
spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/ClickHouseWriter.scala

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription)
   protected lazy val shardExpr: Option[Expression] = writeJob.sparkShardExpr match {
     case None => None
     case Some(v2Expr) =>
-      val catalystExpr = ExprUtils(writeJob.functionRegistry).toCatalyst(v2Expr, writeJob.dataSetSchema.fields)
+      val catalystExpr = ExprUtils.toCatalyst(v2Expr, writeJob.dataSetSchema.fields)(writeJob.functionRegistry)
       catalystExpr match {
         case BoundReference(_, dataType, _)
           if dataType.isInstanceOf[ByteType] // list all integral types here because we can not access `IntegralType`
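The catalog, table, and writer above have no implicit FunctionRegistry in scope, so each supplies the registry explicitly in the second parameter list, as in ExprUtils.toClickHouse(_)(functionRegistry). A minimal sketch of passing an implicit parameter explicitly, with invented names:

    case class Registry(known: Set[String])

    object Render {
      def describe(name: String)(implicit r: Registry): String =
        if (r.known.contains(name)) s"known($name)" else s"unknown($name)"
    }

    object CatalogLikeCaller {
      // No implicit Registry here, so it is passed explicitly, like `(functionRegistry)`.
      private val registry = Registry(Set("xxHash64"))
      val clause: String =
        List("xxHash64", "rand").map(Render.describe(_)(registry)).mkString("PARTITION BY (", ", ", ")")
    }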

spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/write/WriteJobDescription.scala

Lines changed: 6 additions & 4 deletions
@@ -41,6 +41,8 @@ case class WriteJobDescription(
   functionRegistry: FunctionRegistry
 ) {
 
+  implicit val _functionRegistry: FunctionRegistry = functionRegistry
+
   def targetDatabase(convert2Local: Boolean): String = tableEngineSpec match {
     case dist: DistributedEngineSpec if convert2Local => dist.local_db
     case _ => tableSpec.database
@@ -57,7 +59,7 @@ case class WriteJobDescription(
   }
 
   def sparkShardExpr: Option[Expression] = shardingKeyIgnoreRand match {
-    case Some(expr) => ExprUtils(functionRegistry).toSparkTransformOpt(expr)
+    case Some(expr) => ExprUtils.toSparkTransformOpt(expr)
     case _ => None
   }
 
@@ -69,12 +71,12 @@ case class WriteJobDescription(
     // need to apply module during sorting in `toSparkSortOrders`), data belongs to shard 1 will be sorted in the
     // front for all tasks, resulting in instant high pressure for shard 1 when stage starts.
     if (writeOptions.repartitionByPartition) {
-      ExprUtils(functionRegistry).toSparkSplits(
+      ExprUtils.toSparkSplits(
         shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 5)),
         partitionKey
       )
     } else {
-      ExprUtils(functionRegistry).toSparkSplits(
+      ExprUtils.toSparkSplits(
         shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 5)),
         None
       )
@@ -83,6 +85,6 @@ case class WriteJobDescription(
   def sparkSortOrders: Array[SortOrder] = {
     val _partitionKey = if (writeOptions.localSortByPartition) partitionKey else None
     val _sortingKey = if (writeOptions.localSortByKey) sortingKey else None
-    ExprUtils(functionRegistry).toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey, cluster)
+    ExprUtils.toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey, cluster)
   }
 }
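WriteJobDescription takes the opposite route: instead of repeating (functionRegistry) at each call, it re-exposes the constructor field as an implicit val, so every ExprUtils call inside the class body picks it up automatically. A minimal sketch of the pattern, with invented names:

    case class Registry(known: Set[String])

    object Exprs {
      def supported(name: String)(implicit r: Registry): Boolean = r.known.contains(name)
    }

    case class JobDescription(registry: Registry, shardFunc: String) {
      // Mirrors `implicit val _functionRegistry` above: one declaration puts the
      // registry in implicit scope for the whole class body.
      implicit val _registry: Registry = registry

      // The implicit val is found automatically here, no explicit argument needed.
      def shardFuncSupported: Boolean = Exprs.supported(shardFunc)
    }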
