Skip to content

Commit 4e201d6

Browse files
committed
Spark 3.4: Adapt to hash function under clickhouse-core
1 parent 85a025f commit 4e201d6

File tree

16 files changed

+60
-493
lines changed

16 files changed

+60
-493
lines changed

spark-3.4/clickhouse-spark-it/src/test/scala/org/apache/spark/sql/clickhouse/cluster/ClickHouseClusterHashUDFSuite.scala

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,12 @@
1515
package org.apache.spark.sql.clickhouse.cluster
1616

1717
import org.apache.spark.sql.clickhouse.TestUtils.om
18-
import xenon.clickhouse.func.{CompositeFunctionRegistry, DynamicFunctionRegistry, StaticFunctionRegistry}
19-
import xenon.clickhouse.func.clickhouse.ClickHouseXxHash64Shard
18+
import xenon.clickhouse.func.{
19+
ClickHouseXxHash64Shard,
20+
CompositeFunctionRegistry,
21+
DynamicFunctionRegistry,
22+
StaticFunctionRegistry
23+
}
2024

2125
import java.lang.{Long => JLong}
2226

@@ -30,15 +34,6 @@ class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest {
3034
new CompositeFunctionRegistry(Array(StaticFunctionRegistry, dynamicFunctionRegistry))
3135
}
3236

33-
def product[A](xs: Seq[Seq[A]]): Seq[Seq[A]] =
34-
xs.toList match {
35-
case Nil => Seq(Seq())
36-
case head :: tail => for {
37-
h <- head
38-
t <- product(tail)
39-
} yield h +: t
40-
}
41-
4237
def runTest(funcSparkName: String, funcCkName: String, stringVal: String): Unit = {
4338
val sparkResult = spark.sql(
4439
s"""SELECT

spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/ClickHouseCatalog.scala

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,7 @@ import xenon.clickhouse.Constants._
2626
import xenon.clickhouse.client.NodeClient
2727
import xenon.clickhouse.exception.CHClientException
2828
import xenon.clickhouse.exception.ClickHouseErrCode._
29-
import xenon.clickhouse.func.clickhouse.ClickHouseXxHash64Shard
30-
import xenon.clickhouse.func.{FunctionRegistry, _}
29+
import xenon.clickhouse.func.{ClickHouseXxHash64Shard, FunctionRegistry, _}
3130
import xenon.clickhouse.spec._
3231

3332
import java.time.ZoneId
Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,29 +12,15 @@
1212
* limitations under the License.
1313
*/
1414

15-
package xenon.clickhouse.func.clickhouse
15+
package xenon.clickhouse.func
1616

17-
import io.netty.buffer.{ByteBuf, Unpooled}
18-
import org.apache.spark.unsafe.types.UTF8String
19-
import xenon.clickhouse.func.MultiArgsHash
20-
import xenon.clickhouse.func.clickhouse.cityhash.{CityHash_v1_0_2, UInt128}
17+
import xenon.clickhouse.hash
2118

22-
object CityHash64 extends MultiArgsHash {
19+
object CityHash64 extends MultiStringArgsHash {
2320
// https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L694
2421

2522
override protected def funcName: String = "clickhouse_cityHash64"
2623
override val ckFuncNames: Array[String] = Array("cityHash64")
2724

28-
def convertToByteBuf(array: Array[Byte]): ByteBuf = {
29-
val byteBuf = Unpooled.buffer(array.length).writeBytes(array)
30-
byteBuf
31-
}
32-
33-
override def invokeBase(value: UTF8String): Long = {
34-
// ignore UInt64 vs Int64
35-
val data = value.getBytes
36-
CityHash_v1_0_2.CityHash64(convertToByteBuf(data), 0, data.length)
37-
}
38-
39-
override def combineHashes(v1: Long, v2: Long): Long = CityHash_v1_0_2.Hash128to64(new UInt128(v1, v2))
25+
override def applyHash(input: Array[Any]): Long = hash.CityHash64(input)
4026
}
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,10 @@
1212
* limitations under the License.
1313
*/
1414

15-
package xenon.clickhouse.func.clickhouse
15+
package xenon.clickhouse.func
1616

1717
import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction}
1818
import org.apache.spark.sql.types._
19-
import xenon.clickhouse.func.ClickhouseEquivFunction
2019

2120
import java.time.LocalDate
2221
import java.time.format.DateTimeFormatter

spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/FunctionRegistry.scala

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
package xenon.clickhouse.func
1616

1717
import org.apache.spark.sql.connector.catalog.functions.UnboundFunction
18-
import xenon.clickhouse.func.clickhouse._
1918

2019
import scala.collection.mutable
2120

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,12 @@
1212
* limitations under the License.
1313
*/
1414

15-
package xenon.clickhouse.func.clickhouse
15+
package xenon.clickhouse.func
1616

1717
import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction}
1818
import org.apache.spark.sql.types._
19-
import xenon.clickhouse.func.ClickhouseEquivFunction
2019

21-
import java.sql.{Date, Timestamp}
20+
import java.sql.Timestamp
2221
import java.text.SimpleDateFormat
2322

2423
object Hours extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction {
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,10 @@
1212
* limitations under the License.
1313
*/
1414

15-
package xenon.clickhouse.func.clickhouse
15+
package xenon.clickhouse.func
1616

1717
import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction}
1818
import org.apache.spark.sql.types._
19-
import xenon.clickhouse.func.ClickhouseEquivFunction
2019

2120
object Mod extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction {
2221

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,10 @@
1212
* limitations under the License.
1313
*/
1414

15-
package xenon.clickhouse.func.clickhouse
15+
package xenon.clickhouse.func
1616

1717
import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction}
1818
import org.apache.spark.sql.types._
19-
import xenon.clickhouse.func.ClickhouseEquivFunction
2019

2120
import java.time.LocalDate
2221
import java.time.format.DateTimeFormatter

spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiArgsHash.scala renamed to spark-3.4/clickhouse-spark/src/main/scala/xenon/clickhouse/func/MultiStringArgsHash.scala

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -19,32 +19,41 @@ import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFu
1919
import org.apache.spark.sql.types._
2020
import org.apache.spark.unsafe.types.UTF8String
2121

22-
abstract class MultiArgsHash extends UnboundFunction with ClickhouseEquivFunction {
22+
abstract class MultiStringArgsHash extends UnboundFunction with ClickhouseEquivFunction {
23+
24+
def applyHash(input: Array[Any]): Long
25+
26+
protected def funcName: String
27+
28+
override val ckFuncNames: Array[String]
29+
30+
override def description: String = s"$name: (value: string, ...) => hash_value: long"
31+
2332
private def isExceptedType(dt: DataType): Boolean =
2433
dt.isInstanceOf[StringType]
2534

2635
final override def name: String = funcName
36+
2737
final override def bind(inputType: StructType): BoundFunction = {
2838
val inputDataTypes = inputType.fields.map(_.dataType)
29-
if (inputDataTypes.forall(isExceptedType)) new ScalarFunction[Long] {
30-
override def inputTypes(): Array[DataType] = inputDataTypes
31-
override def name: String = funcName
32-
override def canonicalName: String = s"clickhouse.$name"
33-
override def resultType: DataType = LongType
34-
override def toString: String = name
35-
override def produceResult(input: InternalRow): Long = {
36-
val inputStrings: Seq[UTF8String] =
37-
input.toSeq(Seq.fill(input.numFields)(StringType)).asInstanceOf[Seq[UTF8String]]
38-
inputStrings.map(invokeBase).reduce(combineHashes)
39+
if (inputDataTypes.forall(isExceptedType)) {
40+
// a new ScalarFunction instance must be created for each bind,
41+
// because we do not know the number of arguments in advance
42+
new ScalarFunction[Long] {
43+
override def inputTypes(): Array[DataType] = inputDataTypes
44+
override def name: String = funcName
45+
override def canonicalName: String = s"clickhouse.$name"
46+
override def resultType: DataType = LongType
47+
override def toString: String = name
48+
override def produceResult(input: InternalRow): Long = {
49+
val inputStrings: Array[Any] =
50+
input.toSeq(Seq.fill(input.numFields)(StringType)).asInstanceOf[Seq[UTF8String]].toArray
51+
.map(_.getBytes)
52+
applyHash(inputStrings)
53+
}
3954
}
40-
}
41-
else throw new UnsupportedOperationException(s"Expect multiple STRING argument. $description")
55+
} else throw new UnsupportedOperationException(s"Expect multiple STRING argument. $description")
4256

4357
}
4458

45-
protected def funcName: String
46-
override val ckFuncNames: Array[String]
47-
override def description: String = s"$name: (value: string, ...) => hash_value: long"
48-
def invokeBase(value: UTF8String): Long
49-
def combineHashes(v1: Long, v2: Long): Long
5059
}
Lines changed: 7 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -12,40 +12,25 @@
1212
* limitations under the License.
1313
*/
1414

15-
package xenon.clickhouse.func.clickhouse
15+
package xenon.clickhouse.func
1616

17-
import org.apache.commons.codec.digest.{MurmurHash2, MurmurHash3}
18-
import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction}
19-
import org.apache.spark.sql.types._
20-
import org.apache.spark.unsafe.types.UTF8String
21-
import xenon.clickhouse.func.{ClickhouseEquivFunction, MultiArgsHash, Util}
17+
import xenon.clickhouse.hash
18+
import xenon.clickhouse.hash.HashUtils
2219

23-
object MurmurHash2_64 extends MultiArgsHash {
20+
object MurmurHash2_64 extends MultiStringArgsHash {
2421
// https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L460
2522

2623
override protected def funcName: String = "clickhouse_murmurHash2_64"
2724
override val ckFuncNames: Array[String] = Array("murmurHash2_64")
2825

29-
override def invokeBase(value: UTF8String): Long = {
30-
// ignore UInt64 vs Int64
31-
val data = value.getBytes
32-
MurmurHash2.hash64(data, data.length, 0)
33-
}
34-
35-
override def combineHashes(v1: Long, v2: Long): Long = Util.intHash64Impl(v1) ^ v2
26+
override def applyHash(input: Array[Any]): Long = hash.Murmurhash2_64(input)
3627
}
3728

38-
object MurmurHash2_32 extends MultiArgsHash {
29+
object MurmurHash2_32 extends MultiStringArgsHash {
3930
// https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L519
4031

4132
override protected def funcName: String = "clickhouse_murmurHash2_32"
4233
override val ckFuncNames: Array[String] = Array("murmurHash2_32")
4334

44-
override def invokeBase(value: UTF8String): Long = {
45-
val data = value.getBytes
46-
val v = MurmurHash2.hash32(data, data.length, 0)
47-
Util.toUInt32Range(v)
48-
}
49-
50-
override def combineHashes(v1: Long, v2: Long): Long = Util.toUInt32Range(Util.int32Impl(v1) ^ v2)
35+
override def applyHash(input: Array[Any]): Long = HashUtils.toUInt32(hash.Murmurhash2_32(input))
5136
}

0 commit comments

Comments
 (0)