Skip to content

Commit 6e8395f

Browse files
committed
YT-26680: Optimize rows digest computer (8-33x faster)
Before (MD5 + sort for each row):
```
--------------------------------------------------------------------------------------------------
Benchmark                                                        Time             CPU   Iterations
--------------------------------------------------------------------------------------------------
BM_TRowsDigestComputer_ProcessRowsFewColumns_mean              203 ns          203 ns            5
BM_TRowsDigestComputer_ProcessRowsManyColumns_mean            7678 ns         7677 ns            5
BM_TRowsDigestComputer_ProcessRowsRandomOrder_mean            2794 ns         2794 ns            5
BM_TRowsDigestComputer_ProcessRowsMixedTypes_mean              496 ns          496 ns            5
BM_TRowsDigestComputer_ProcessRowsLongStrings_mean            1707 ns         1707 ns            5
BM_TRowsDigestComputer_ProcessRowsDynamicColumns_mean          885 ns          885 ns            5
BM_TRowsDigestComputer_ProcessRowsSparse_mean                  435 ns          435 ns            5
```
After (XXH3 + (binary search + insert for each new column) + batching buffer):
```
--------------------------------------------------------------------------------------------------
Benchmark                                                        Time             CPU   Iterations
--------------------------------------------------------------------------------------------------
BM_TRowsDigestComputer_ProcessRowsFewColumns_mean             23.8 ns         23.8 ns            5
BM_TRowsDigestComputer_ProcessRowsManyColumns_mean             232 ns          232 ns            5
BM_TRowsDigestComputer_ProcessRowsRandomOrder_mean            99.5 ns         99.5 ns            5
BM_TRowsDigestComputer_ProcessRowsMixedTypes_mean             38.4 ns         38.4 ns            5
BM_TRowsDigestComputer_ProcessRowsLongStrings_mean            89.0 ns         89.0 ns            5
BM_TRowsDigestComputer_ProcessRowsDynamicColumns_mean         56.9 ns         56.9 ns            5
BM_TRowsDigestComputer_ProcessRowsSparse_mean                 35.0 ns         35.0 ns            5
```
Without batching (XXH3 + (binary search + insert for each new column)):
```
--------------------------------------------------------------------------------------------------
Benchmark                                                        Time             CPU   Iterations
--------------------------------------------------------------------------------------------------
BM_TRowsDigestComputer_ProcessRowsFewColumns_mean             42.4 ns         42.4 ns            5
BM_TRowsDigestComputer_ProcessRowsManyColumns_mean             554 ns          554 ns            5
BM_TRowsDigestComputer_ProcessRowsRandomOrder_mean             228 ns          228 ns            5
BM_TRowsDigestComputer_ProcessRowsMixedTypes_mean             73.3 ns         73.3 ns            5
BM_TRowsDigestComputer_ProcessRowsLongStrings_mean            94.7 ns         94.7 ns            5
BM_TRowsDigestComputer_ProcessRowsDynamicColumns_mean          117 ns          117 ns            5
BM_TRowsDigestComputer_ProcessRowsSparse_mean                 65.9 ns         65.9 ns            5
```
XXH64 is slower than XXH3 by about 5-46% (XXH64 + (binary search + insert for each new column)):
```
--------------------------------------------------------------------------------------------------
Benchmark                                                        Time             CPU   Iterations
--------------------------------------------------------------------------------------------------
BM_TRowsDigestComputer_ProcessRowsFewColumns_mean             47.2 ns         47.2 ns            5
BM_TRowsDigestComputer_ProcessRowsManyColumns_mean             712 ns          712 ns            5
BM_TRowsDigestComputer_ProcessRowsRandomOrder_mean             289 ns          289 ns            5
BM_TRowsDigestComputer_ProcessRowsMixedTypes_mean             87.8 ns         87.8 ns            5
BM_TRowsDigestComputer_ProcessRowsLongStrings_mean             129 ns          129 ns            5
BM_TRowsDigestComputer_ProcessRowsDynamicColumns_mean          144 ns          144 ns            5
BM_TRowsDigestComputer_ProcessRowsSparse_mean                 79.0 ns         79.0 ns            5
```
commit_hash:5e99e75ed9baf15ffdd32bd0a7a132a7911f0829
1 parent fbafe03 commit 6e8395f

14 files changed

+22
-14
lines changed

yt/yt/client/api/rowset.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ class TSchemafulRowsetWriter
172172
return VoidFuture;
173173
}
174174

175-
std::optional<TMD5Hash> GetDigest() const override
175+
std::optional<TRowsDigest> GetDigest() const override
176176
{
177177
return std::nullopt;
178178
}

yt/yt/client/formats/schemaful_writer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ TFuture<void> TSchemafulWriter::GetReadyEvent()
114114
return Result_;
115115
}
116116

117-
std::optional<TMD5Hash> TSchemafulWriter::GetDigest() const
117+
std::optional<TRowsDigest> TSchemafulWriter::GetDigest() const
118118
{
119119
return std::nullopt;
120120
}

yt/yt/client/formats/schemaful_writer.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ class TSchemafulWriter
3434

3535
TFuture<void> GetReadyEvent() override;
3636

37-
std::optional<NCrypto::TMD5Hash> GetDigest() const override;
37+
std::optional<NTableClient::TRowsDigest> GetDigest() const override;
3838

3939
private:
4040
const NConcurrency::IAsyncOutputStreamPtr Stream_;

yt/yt/client/queue_client/producer_client.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
#include "private.h"
44

5+
#include <yt/yt/client/table_client/public.h>
6+
57
#include <yt/yt/client/api/client.h>
68
#include <yt/yt/client/api/transaction.h>
79

@@ -175,7 +177,7 @@ class TProducerSession
175177
}));
176178
}
177179

178-
std::optional<TMD5Hash> GetDigest() const override
180+
std::optional<TRowsDigest> GetDigest() const override
179181
{
180182
return std::nullopt;
181183
}

yt/yt/client/table_client/adapters.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ class TSchemalessApiFromWriterAdapter
128128
return Schema_;
129129
}
130130

131-
std::optional<TMD5Hash> GetDigest() const override
131+
std::optional<TRowsDigest> GetDigest() const override
132132
{
133133
return std::nullopt;
134134
}

yt/yt/client/table_client/pipe.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ class TSchemafulPipe::TWriter
234234
return Data_->WriterReadyEvent;
235235
}
236236

237-
std::optional<TMD5Hash> GetDigest() const override
237+
std::optional<TRowsDigest> GetDigest() const override
238238
{
239239
return std::nullopt;
240240
}

yt/yt/client/table_client/public.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -480,4 +480,8 @@ YT_DEFINE_STRONG_TYPEDEF(TSignedWriteFragmentResultPtr, NSignature::TSignaturePt
480480

481481
////////////////////////////////////////////////////////////////////////////////
482482

483+
YT_DEFINE_STRONG_TYPEDEF(TRowsDigest, ui64);
484+
485+
////////////////////////////////////////////////////////////////////////////////
486+
483487
} // namespace NYT::NTableClient

yt/yt/client/table_client/schemaless_dynamic_table_writer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ class TSchemalessDynamicTableWriter
6565
return NameTable_;
6666
}
6767

68-
std::optional<TMD5Hash> GetDigest() const override
68+
std::optional<TRowsDigest> GetDigest() const override
6969
{
7070
return std::nullopt;
7171
}

yt/yt/client/table_client/unversioned_writer.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,16 @@ struct IUnversionedRowsetWriter
3232
[[nodiscard]] virtual bool Write(TRange<TUnversionedRow> rows) = 0;
3333

3434
/*!
35+
* TODO(apollo1321): Should be removed from base class and implemented as wrapper.
36+
*
3537
* Returns the digest of the written rows.
3638
*
3739
* Useful for checking the determinism of user jobs.
3840
* Returns nullopt when hash is not computed.
3941
*
4042
* Must not be called concurrently with Write method.
4143
*/
42-
virtual std::optional<NCrypto::TMD5Hash> GetDigest() const = 0;
44+
virtual std::optional<TRowsDigest> GetDigest() const = 0;
4345
};
4446

4547
DEFINE_REFCOUNTED_TYPE(IUnversionedRowsetWriter)

yt/yt/client/table_client/wire_protocol.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1282,7 +1282,7 @@ class TWireProtocolRowsetWriter
12821282
return CompressedBlocks_;
12831283
}
12841284

1285-
std::optional<TMD5Hash> GetDigest() const override
1285+
std::optional<TRowsDigest> GetDigest() const override
12861286
{
12871287
return std::nullopt;
12881288
}

0 commit comments

Comments
 (0)