diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 21e87bee4..d214fbb27 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -62,6 +62,10 @@ set(ICEBERG_SOURCES partition_field.cc partition_spec.cc partition_summary.cc + puffin/blob.cc + puffin/blob_metadata.cc + puffin/file_metadata.cc + puffin/puffin_compression_codec.cc row/arrow_array_wrapper.cc row/manifest_wrapper.cc row/partition_values.cc @@ -166,6 +170,7 @@ add_subdirectory(catalog) add_subdirectory(data) add_subdirectory(expression) add_subdirectory(manifest) +add_subdirectory(puffin) add_subdirectory(row) add_subdirectory(update) add_subdirectory(util) diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index bfc502fd8..9379cc4e0 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -80,6 +80,10 @@ iceberg_sources = files( 'partition_field.cc', 'partition_spec.cc', 'partition_summary.cc', + 'puffin/blob.cc', + 'puffin/blob_metadata.cc', + 'puffin/file_metadata.cc', + 'puffin/puffin_compression_codec.cc', 'row/arrow_array_wrapper.cc', 'row/manifest_wrapper.cc', 'row/partition_values.cc', @@ -221,6 +225,7 @@ install_headers( subdir('catalog') subdir('expression') subdir('manifest') +subdir('puffin') subdir('row') subdir('update') subdir('util') diff --git a/src/iceberg/puffin/CMakeLists.txt b/src/iceberg/puffin/CMakeLists.txt new file mode 100644 index 000000000..087ea09cb --- /dev/null +++ b/src/iceberg/puffin/CMakeLists.txt @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +iceberg_install_all_headers(iceberg/puffin) diff --git a/src/iceberg/puffin/blob.cc b/src/iceberg/puffin/blob.cc new file mode 100644 index 000000000..e1bec278c --- /dev/null +++ b/src/iceberg/puffin/blob.cc @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
/// \brief Render a Blob as a compact, single-line description.
///
/// The mandatory part is
/// `Blob[type='T',inputFields=F,snapshotId=S,sequenceNumber=N,dataSize=D`,
/// where `D` is `blob.data.size()`; the payload itself is never printed.
/// `,requestedCompression=<codec>` is appended only when a codec was requested
/// and `,properties=<map>` only when the property map is non-empty. The string
/// is always terminated by `]`.
///
/// \param blob The blob to describe.
/// \return The formatted description.
std::string ToString(const Blob& blob) {
  std::string out = "Blob[";
  // A back-inserter only stores a pointer to `out`, so one instance can be
  // reused for every append below.
  auto sink = std::back_inserter(out);

  std::format_to(sink, "type='{}',inputFields={},", blob.type, blob.input_fields);
  std::format_to(sink, "snapshotId={},sequenceNumber={},", blob.snapshot_id,
                 blob.sequence_number);
  std::format_to(sink, "dataSize={}", blob.data.size());

  if (const auto& codec = blob.requested_compression; codec.has_value()) {
    // Fully qualified so the codec overload is selected explicitly.
    std::format_to(sink, ",requestedCompression={}",
                   iceberg::puffin::ToString(*codec));
  }
  if (const auto& props = blob.properties; !props.empty()) {
    std::format_to(sink, ",properties={}", props);
  }

  std::format_to(sink, "]");
  return out;
}
+ */ + +#pragma once + +/// \file iceberg/puffin/blob.h +/// Blob data structure for Puffin files. + +#include +#include +#include +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/puffin/puffin_compression_codec.h" + +namespace iceberg::puffin { + +/// \brief A blob to be written to a Puffin file. +/// +/// This represents the uncompressed blob data along with its metadata. +/// The actual compression is handled during writing. +struct ICEBERG_EXPORT Blob { + /// Type of the blob. See StandardBlobTypes for known types. + std::string type; + /// List of field IDs the blob was computed for. + /// The order of items is used to compute sketches stored in the blob. + std::vector input_fields; + /// ID of the Iceberg table's snapshot the blob was computed from. + int64_t snapshot_id; + /// Sequence number of the Iceberg table's snapshot the blob was computed from. + int64_t sequence_number; + /// The uncompressed blob data. + std::vector data; + /// Requested compression codec. If not set, the writer's default will be used. + std::optional requested_compression; + /// Additional properties of the blob. + std::unordered_map properties; + + /// \brief Compare two Blobs for equality. + friend bool operator==(const Blob& lhs, const Blob& rhs) = default; +}; + +/// \brief Returns a string representation of a Blob. +ICEBERG_EXPORT std::string ToString(const Blob& blob); + +} // namespace iceberg::puffin diff --git a/src/iceberg/puffin/blob_metadata.cc b/src/iceberg/puffin/blob_metadata.cc new file mode 100644 index 000000000..b2ba97cbc --- /dev/null +++ b/src/iceberg/puffin/blob_metadata.cc @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
/// \brief Render a BlobMetadata as a compact, single-line description.
///
/// The mandatory part is
/// `BlobMetadata[type='T',inputFields=F,snapshotId=S,sequenceNumber=N,offset=O,length=L`.
/// `,compressionCodec='<name>'` is appended only when a codec name is set and
/// `,properties=<map>` only when the property map is non-empty. The string is
/// always terminated by `]`.
///
/// \param blob_metadata The metadata entry to describe.
/// \return The formatted description.
std::string ToString(const BlobMetadata& blob_metadata) {
  std::string out = "BlobMetadata[";
  // A back-inserter only stores a pointer to `out`, so one instance can be
  // reused for every append below.
  auto sink = std::back_inserter(out);

  std::format_to(sink, "type='{}',inputFields={},", blob_metadata.type,
                 blob_metadata.input_fields);
  std::format_to(sink, "snapshotId={},sequenceNumber={},", blob_metadata.snapshot_id,
                 blob_metadata.sequence_number);
  std::format_to(sink, "offset={},length={}", blob_metadata.offset,
                 blob_metadata.length);

  if (const auto& codec = blob_metadata.compression_codec; codec.has_value()) {
    std::format_to(sink, ",compressionCodec='{}'", *codec);
  }
  if (const auto& props = blob_metadata.properties; !props.empty()) {
    std::format_to(sink, ",properties={}", props);
  }

  std::format_to(sink, "]");
  return out;
}
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/puffin/blob_metadata.h +/// Blob metadata structure for Puffin files. + +#include +#include +#include +#include +#include + +#include "iceberg/iceberg_export.h" + +namespace iceberg::puffin { + +/// \brief Metadata about a blob stored in a Puffin file. +/// +/// This represents the metadata stored in the Puffin file footer, +/// including the blob's location within the file. +struct ICEBERG_EXPORT BlobMetadata { + /// Type of the blob. See StandardBlobTypes for known types. + std::string type; + /// List of field IDs the blob was computed for. + std::vector input_fields; + /// ID of the Iceberg table's snapshot the blob was computed from. + int64_t snapshot_id; + /// Sequence number of the Iceberg table's snapshot the blob was computed from. + int64_t sequence_number; + /// Offset in the file where the blob data starts. + int64_t offset; + /// Length of the blob data in the file. If compression_codec is set, this is + /// the compressed size; otherwise it is the raw size. + int64_t length; + /// Compression codec name (e.g. "lz4", "zstd"), or std::nullopt if the blob + /// is not compressed. + std::optional compression_codec; + /// Additional properties of the blob. 
+ std::unordered_map properties; + + /// \brief Compare two BlobMetadatas for equality. + friend bool operator==(const BlobMetadata& lhs, const BlobMetadata& rhs) = default; +}; + +/// \brief Returns a string representation of a BlobMetadata. +ICEBERG_EXPORT std::string ToString(const BlobMetadata& blob_metadata); + +} // namespace iceberg::puffin diff --git a/src/iceberg/puffin/file_metadata.cc b/src/iceberg/puffin/file_metadata.cc new file mode 100644 index 000000000..b5233e40c --- /dev/null +++ b/src/iceberg/puffin/file_metadata.cc @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
/// \brief Render a FileMetadata as a compact, single-line description.
///
/// The result is `FileMetadata[blobs=[<b0>,<b1>,...]` where each `<bi>` is the
/// string form of the corresponding blob metadata entry, in file order and
/// separated by commas (an empty list prints as `blobs=[]`).
/// `,properties=<map>` is appended only when the file-level property map is
/// non-empty. The string is always terminated by `]`.
///
/// \param file_metadata The file metadata to describe.
/// \return The formatted description.
std::string ToString(const FileMetadata& file_metadata) {
  std::string out = "FileMetadata[";
  out += "blobs=[";

  // Emit a comma before every entry except the first one.
  bool is_first = true;
  for (const auto& blob : file_metadata.blobs) {
    if (!is_first) {
      out += ",";
    }
    is_first = false;
    out += ToString(blob);
  }
  out += "]";

  if (const auto& props = file_metadata.properties; !props.empty()) {
    std::format_to(std::back_inserter(out), ",properties={}", props);
  }

  out += "]";
  return out;
}
+ +#include +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/puffin/blob_metadata.h" + +namespace iceberg::puffin { + +/// \brief Metadata about a Puffin file. +/// +/// This represents the metadata stored in the Puffin file footer, +/// including information about all blobs in the file. +struct ICEBERG_EXPORT FileMetadata { + /// List of blob metadata for all blobs in the file. + std::vector blobs; + /// File-level properties. + std::unordered_map properties; + + /// \brief Compare two FileMetadatas for equality. + friend bool operator==(const FileMetadata& lhs, const FileMetadata& rhs) = default; +}; + +/// \brief Returns a string representation of a FileMetadata. +ICEBERG_EXPORT std::string ToString(const FileMetadata& file_metadata); + +} // namespace iceberg::puffin diff --git a/src/iceberg/puffin/meson.build b/src/iceberg/puffin/meson.build new file mode 100644 index 000000000..bb5bc3a5e --- /dev/null +++ b/src/iceberg/puffin/meson.build @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
# Install the Puffin headers into the 'iceberg/puffin' subdirectory of the
# include directory. Every header that should be installed must be listed here
# explicitly.
install_headers(
    [
        'blob.h',
        'blob_metadata.h',
        'file_metadata.h',
        'puffin_compression_codec.h',
        'types.h',
    ],
    subdir: 'iceberg/puffin',
)
+ */ + +#include "iceberg/puffin/puffin_compression_codec.h" + +#include +#include + +namespace iceberg::puffin { + +namespace { +constexpr std::string_view kLz4CodecName = "lz4"; +constexpr std::string_view kZstdCodecName = "zstd"; +} // namespace + +std::optional CodecName(PuffinCompressionCodec codec) { + switch (codec) { + case PuffinCompressionCodec::kNone: + return std::nullopt; + case PuffinCompressionCodec::kLz4: + return kLz4CodecName; + case PuffinCompressionCodec::kZstd: + return kZstdCodecName; + } + std::unreachable(); +} + +Result PuffinCompressionCodecFromName( + std::optional codec_name) { + if (!codec_name.has_value()) { + return PuffinCompressionCodec::kNone; + } + if (codec_name == kLz4CodecName) { + return PuffinCompressionCodec::kLz4; + } + if (codec_name == kZstdCodecName) { + return PuffinCompressionCodec::kZstd; + } + return InvalidArgument("Unknown codec name: {}", *codec_name); +} + +std::string ToString(PuffinCompressionCodec codec) { + auto name = CodecName(codec); + if (name.has_value()) { + return std::string(*name); + } + return "none"; +} + +} // namespace iceberg::puffin diff --git a/src/iceberg/puffin/puffin_compression_codec.h b/src/iceberg/puffin/puffin_compression_codec.h new file mode 100644 index 000000000..629988951 --- /dev/null +++ b/src/iceberg/puffin/puffin_compression_codec.h @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/puffin/puffin_compression_codec.h +/// Compression codec definitions for Puffin files. + +#include +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" + +namespace iceberg::puffin { + +/// \brief Compression codecs supported by Puffin files. +enum class PuffinCompressionCodec { + /// No compression + kNone, + /// LZ4 single compression frame with content size present + kLz4, + /// Zstandard single compression frame with content size present + kZstd, +}; + +/// \brief Get the codec name for a compression codec. +ICEBERG_EXPORT std::optional CodecName(PuffinCompressionCodec codec); + +/// \brief Get the compression codec from a codec name. +ICEBERG_EXPORT Result PuffinCompressionCodecFromName( + std::optional codec_name); + +/// \brief Returns a string representation of a PuffinCompressionCodec. +ICEBERG_EXPORT std::string ToString(PuffinCompressionCodec codec); + +} // namespace iceberg::puffin diff --git a/src/iceberg/puffin/types.h b/src/iceberg/puffin/types.h new file mode 100644 index 000000000..9a925a9a9 --- /dev/null +++ b/src/iceberg/puffin/types.h @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
/// \brief Names of the standard blob types defined by the Iceberg specification.
///
/// These are the values stored in the `type` member of a blob and of its
/// metadata entry.
struct StandardBlobTypes {
  /// A serialized form of a "compact" Theta sketch produced by the
  /// Apache DataSketches library.
  static constexpr std::string_view kApacheDatasketchesThetaV1{
      "apache-datasketches-theta-v1"};

  /// A serialized deletion vector according to the Iceberg spec.
  static constexpr std::string_view kDeletionVectorV1{"deletion-vector-v1"};
};

/// \brief Keys of the standard file-level properties for Puffin files.
struct StandardPuffinProperties {
  /// Human-readable identification of the application writing the file,
  /// along with its version. Example: "Trino version 381".
  static constexpr std::string_view kCreatedBy{"created-by"};
};
// ============================================================================
// PuffinCompressionCodec Tests
// ============================================================================

// kNone has no codec name; the compressing codecs map to lower-case names.
TEST(PuffinCompressionCodecTest, CodecName) {
  EXPECT_EQ(CodecName(PuffinCompressionCodec::kNone), std::nullopt);
  EXPECT_EQ(CodecName(PuffinCompressionCodec::kLz4), "lz4");
  EXPECT_EQ(CodecName(PuffinCompressionCodec::kZstd), "zstd");
}

// A missing name means "no compression"; known names map to their codec.
TEST(PuffinCompressionCodecTest, FromName) {
  auto result_none = PuffinCompressionCodecFromName(std::nullopt);
  ASSERT_THAT(result_none, IsOk());
  EXPECT_EQ(result_none.value(), PuffinCompressionCodec::kNone);

  auto result_lz4 = PuffinCompressionCodecFromName("lz4");
  ASSERT_THAT(result_lz4, IsOk());
  EXPECT_EQ(result_lz4.value(), PuffinCompressionCodec::kLz4);

  auto result_zstd = PuffinCompressionCodecFromName("zstd");
  ASSERT_THAT(result_zstd, IsOk());
  EXPECT_EQ(result_zstd.value(), PuffinCompressionCodec::kZstd);
}

// Unknown, differently-cased and empty names are all rejected.
TEST(PuffinCompressionCodecTest, FromNameUnknown) {
  EXPECT_THAT(PuffinCompressionCodecFromName("unknown"),
              IsError(ErrorKind::kInvalidArgument));
  EXPECT_THAT(PuffinCompressionCodecFromName("LZ4"),
              IsError(ErrorKind::kInvalidArgument));
  EXPECT_THAT(PuffinCompressionCodecFromName(""), IsError(ErrorKind::kInvalidArgument));
}

// ToString renders kNone as "none" rather than as an absent name.
TEST(PuffinCompressionCodecTest, ToString) {
  EXPECT_EQ(ToString(PuffinCompressionCodec::kNone), "none");
  EXPECT_EQ(ToString(PuffinCompressionCodec::kLz4), "lz4");
  EXPECT_EQ(ToString(PuffinCompressionCodec::kZstd), "zstd");
}

// CodecName and PuffinCompressionCodecFromName are inverses of each other.
TEST(PuffinCompressionCodecTest, RoundTrip) {
  for (auto codec : {PuffinCompressionCodec::kLz4, PuffinCompressionCodec::kZstd}) {
    auto name = CodecName(codec);
    ASSERT_TRUE(name.has_value());
    auto result = PuffinCompressionCodecFromName(*name);
    ASSERT_THAT(result, IsOk());
    EXPECT_EQ(result.value(), codec);
  }
  // kNone round-trips through nullopt
  EXPECT_EQ(CodecName(PuffinCompressionCodec::kNone), std::nullopt);
  auto result = PuffinCompressionCodecFromName(std::nullopt);
  ASSERT_THAT(result, IsOk());
  EXPECT_EQ(result.value(), PuffinCompressionCodec::kNone);
}

// ============================================================================
// Blob Tests
// ============================================================================

// A copy compares equal to its source.
TEST(BlobTest, Equality) {
  Blob blob1{
      .type = "test-blob",
      .input_fields = {1, 2},
      .snapshot_id = 100,
      .sequence_number = 1,
      .data = {0x01, 0x02},
  };
  Blob blob2 = blob1;
  EXPECT_EQ(blob1, blob2);
}

// Blobs differing only in `type` are not equal.
TEST(BlobTest, InequalityByType) {
  Blob blob1{.type = "a", .snapshot_id = 1, .sequence_number = 0};
  Blob blob2{.type = "b", .snapshot_id = 1, .sequence_number = 0};
  EXPECT_NE(blob1, blob2);
}

// Blobs differing only in payload bytes are not equal.
TEST(BlobTest, InequalityByData) {
  Blob blob1{.type = "a", .snapshot_id = 1, .sequence_number = 0, .data = {1}};
  Blob blob2{.type = "a", .snapshot_id = 1, .sequence_number = 0, .data = {2}};
  EXPECT_NE(blob1, blob2);
}

// An unset requested codec differs from an explicitly requested one.
TEST(BlobTest, InequalityByCompression) {
  Blob blob1{.type = "a", .snapshot_id = 1, .sequence_number = 0};
  Blob blob2{.type = "a",
             .snapshot_id = 1,
             .sequence_number = 0,
             .requested_compression = PuffinCompressionCodec::kZstd};
  EXPECT_NE(blob1, blob2);
}

// With every member set, all of them show up in the string form; the payload
// is reported by size only.
TEST(BlobTest, ToStringWithAllFields) {
  Blob blob{
      .type = "test-blob",
      .input_fields = {1, 2, 3},
      .snapshot_id = 12345,
      .sequence_number = 67,
      .data = {0x01, 0x02, 0x03},
      .requested_compression = PuffinCompressionCodec::kZstd,
      .properties = {{"key", "value"}},
  };

  auto str = ToString(blob);
  EXPECT_THAT(str, testing::HasSubstr("test-blob"));
  EXPECT_THAT(str, testing::HasSubstr("12345"));
  EXPECT_THAT(str, testing::HasSubstr("67"));
  EXPECT_THAT(str, testing::HasSubstr("dataSize=3"));
  EXPECT_THAT(str, testing::HasSubstr("zstd"));
  EXPECT_THAT(str, testing::HasSubstr("properties="));
}

// Optional sections are omitted when the codec is unset and the map is empty.
TEST(BlobTest, ToStringMinimal) {
  Blob blob{
      .type = "minimal",
      .snapshot_id = 1,
      .sequence_number = 0,
  };

  auto str = ToString(blob);
  EXPECT_THAT(str, testing::HasSubstr("minimal"));
  EXPECT_THAT(str, testing::HasSubstr("dataSize=0"));
  EXPECT_THAT(str, testing::Not(testing::HasSubstr("requestedCompression")));
  EXPECT_THAT(str, testing::Not(testing::HasSubstr("properties=")));
}

// ============================================================================
// BlobMetadata Tests
// ============================================================================

// A copy compares equal to its source.
TEST(BlobMetadataTest, Equality) {
  BlobMetadata bm1{
      .type = "test",
      .input_fields = {1},
      .snapshot_id = 10,
      .sequence_number = 1,
      .offset = 0,
      .length = 100,
  };
  BlobMetadata bm2 = bm1;
  EXPECT_EQ(bm1, bm2);
}

// Entries differing only in `offset` are not equal.
TEST(BlobMetadataTest, InequalityByOffset) {
  BlobMetadata bm1{
      .type = "a", .snapshot_id = 1, .sequence_number = 0, .offset = 0, .length = 10};
  BlobMetadata bm2 = bm1;
  bm2.offset = 999;
  EXPECT_NE(bm1, bm2);
}

// An unset codec name differs from a set one.
TEST(BlobMetadataTest, InequalityByCompressionCodec) {
  BlobMetadata bm1{
      .type = "a", .snapshot_id = 1, .sequence_number = 0, .offset = 0, .length = 10};
  BlobMetadata bm2 = bm1;
  bm2.compression_codec = "zstd";
  EXPECT_NE(bm1, bm2);
}

// With every member set, location, codec and properties appear in the output.
TEST(BlobMetadataTest, ToStringWithAllFields) {
  BlobMetadata bm{
      .type = "test-blob",
      .input_fields = {1, 2},
      .snapshot_id = 12345,
      .sequence_number = 67,
      .offset = 100,
      .length = 200,
      .compression_codec = "zstd",
      .properties = {{"key", "value"}},
  };

  auto str = ToString(bm);
  EXPECT_THAT(str, testing::HasSubstr("test-blob"));
  EXPECT_THAT(str, testing::HasSubstr("offset=100"));
  EXPECT_THAT(str, testing::HasSubstr("length=200"));
  EXPECT_THAT(str, testing::HasSubstr("zstd"));
  EXPECT_THAT(str, testing::HasSubstr("properties="));
}

// Optional sections are omitted when the codec is unset and the map is empty.
TEST(BlobMetadataTest, ToStringMinimal) {
  BlobMetadata bm{
      .type = "minimal",
      .snapshot_id = 1,
      .sequence_number = 0,
      .offset = 0,
      .length = 0,
  };

  auto str = ToString(bm);
  EXPECT_THAT(str, testing::HasSubstr("minimal"));
  EXPECT_THAT(str, testing::Not(testing::HasSubstr("compressionCodec")));
  EXPECT_THAT(str, testing::Not(testing::HasSubstr("properties=")));
}

// ============================================================================
// FileMetadata Tests
// ============================================================================

// A copy (blob list and properties) compares equal to its source.
TEST(FileMetadataTest, Equality) {
  FileMetadata fm1{
      .blobs = {BlobMetadata{.type = "a",
                             .input_fields = {1},
                             .snapshot_id = 1,
                             .sequence_number = 0,
                             .offset = 0,
                             .length = 10}},
      .properties = {{"k", "v"}},
  };
  FileMetadata fm2 = fm1;
  EXPECT_EQ(fm1, fm2);
}

// Same key with a different value makes the metadata unequal.
TEST(FileMetadataTest, InequalityByProperties) {
  FileMetadata fm1{.blobs = {}, .properties = {{"k", "v"}}};
  FileMetadata fm2{.blobs = {}, .properties = {{"k", "changed"}}};
  EXPECT_NE(fm1, fm2);
}

// An empty blob list differs from a non-empty one.
TEST(FileMetadataTest, InequalityByBlobs) {
  FileMetadata fm1{.blobs = {}};
  FileMetadata fm2{
      .blobs = {BlobMetadata{.type = "a",
                             .snapshot_id = 1,
                             .sequence_number = 0,
                             .offset = 0,
                             .length = 10}},
  };
  EXPECT_NE(fm1, fm2);
}

// Both the nested blob entry and the file-level properties are printed.
TEST(FileMetadataTest, ToStringWithBlobs) {
  FileMetadata fm{
      .blobs = {BlobMetadata{.type = "blob1",
                             .input_fields = {1},
                             .snapshot_id = 100,
                             .sequence_number = 1,
                             .offset = 0,
                             .length = 50}},
      .properties = {{"created-by", "test"}},
  };

  auto str = ToString(fm);
  EXPECT_THAT(str, testing::HasSubstr("blob1"));
  EXPECT_THAT(str, testing::HasSubstr("created-by"));
}

// A default-constructed FileMetadata prints an empty blob list.
TEST(FileMetadataTest, ToStringEmpty) {
  FileMetadata fm{};
  auto str = ToString(fm);
  EXPECT_THAT(str, testing::HasSubstr("FileMetadata"));
  EXPECT_THAT(str, testing::HasSubstr("blobs=[]"));
}

// Every blob entry is included when there is more than one.
TEST(FileMetadataTest, ToStringMultipleBlobs) {
  FileMetadata fm{
      .blobs =
          {
              BlobMetadata{.type = "first",
                           .snapshot_id = 1,
                           .sequence_number = 0,
                           .offset = 0,
                           .length = 10},
              BlobMetadata{.type = "second",
                           .snapshot_id = 2,
                           .sequence_number = 1,
                           .offset = 10,
                           .length = 20},
          },
  };

  auto str = ToString(fm);
  EXPECT_THAT(str, testing::HasSubstr("first"));
  EXPECT_THAT(str, testing::HasSubstr("second"));
}

// ============================================================================
// Types Tests
// ============================================================================

// Pins the exact spelling of the standard blob type names.
TEST(TypesTest, StandardBlobTypes) {
  EXPECT_EQ(StandardBlobTypes::kApacheDatasketchesThetaV1,
            "apache-datasketches-theta-v1");
  EXPECT_EQ(StandardBlobTypes::kDeletionVectorV1, "deletion-vector-v1");
}

// Pins the exact spelling of the standard file-level property keys.
TEST(TypesTest, StandardPuffinProperties) {
  EXPECT_EQ(StandardPuffinProperties::kCreatedBy, "created-by");
}