@@ -43,7 +43,7 @@ pub struct DataFile {
4343 pub ( crate ) file_path : String ,
4444 /// field id: 101
4545 ///
46- /// String file format name, avro, orc or parquet
46+ /// String file format name: `avro`, `orc`, `parquet`, or `puffin`
4747 pub ( crate ) file_format : DataFileFormat ,
4848 /// field id: 102
4949 ///
@@ -52,7 +52,7 @@ pub struct DataFile {
5252 pub ( crate ) partition : Struct ,
5353 /// field id: 103
5454 ///
55- /// Number of records in this file
55+ /// Number of records in this file, or the cardinality of a deletion vector
5656 pub ( crate ) record_count : u64 ,
5757 /// field id: 104
5858 ///
@@ -148,9 +148,35 @@ pub struct DataFile {
148148 /// delete files.
149149 #[ builder( default , setter( strip_option) ) ]
150150 pub ( crate ) sort_order_id : Option < i32 > ,
151+ /// field id: 142
152+ ///
153+ /// The _row_id for the first row in the data file.
154+ /// For more details, refer to <https://github.com/apache/iceberg/blob/main/format/spec.md#first-row-id-inheritance>
155+ #[ builder( default ) ]
156+ pub ( crate ) first_row_id : Option < i64 > ,
151157 /// This field is not included in the spec. It is only stored in the in-memory representation
152158 /// used during processing.
153159 pub ( crate ) partition_spec_id : i32 ,
160+ /// field id: 143
161+ ///
162+ /// Fully qualified location (URI with FS scheme) of a data file that all deletes reference.
163+ /// Position delete metadata can use `referenced_data_file` when all deletes tracked by the
164+ /// entry are in a single data file. Setting the referenced file is required for deletion vectors.
165+ #[ builder( default ) ]
166+ pub ( crate ) referenced_data_file : Option < String > ,
167+ /// field id: 144
168+ ///
169+ /// The offset in the file where the content starts.
170+ /// The `content_offset` and `content_size_in_bytes` fields are used to reference a specific blob
171+ /// for direct access to a deletion vector. For deletion vectors, these values are required and must
172+ /// exactly match the `offset` and `length` stored in the Puffin footer for the deletion vector blob.
173+ #[ builder( default ) ]
174+ pub ( crate ) content_offset : Option < i64 > ,
175+ /// field id: 145
176+ ///
177+ /// The length of the referenced content stored in the file; required if `content_offset` is present.
178+ #[ builder( default ) ]
179+ pub ( crate ) content_size_in_bytes : Option < i64 > ,
154180}
155181
156182impl DataFile {
@@ -226,6 +252,10 @@ impl DataFile {
226252 pub fn equality_ids ( & self ) -> & [ i32 ] {
227253 & self . equality_ids
228254 }
255+ /// Get the `_row_id` of the first row in the data file (spec field id 142), or `None` if it is not set.
256+ pub fn first_row_id ( & self ) -> Option < i64 > {
257+ self . first_row_id
258+ }
229259 /// Get the sort order id of the data file.
230260 /// Only data files and equality delete files should be
231261 /// written with a non-null order id. Position deletes are required to be
@@ -235,6 +265,21 @@ impl DataFile {
235265 pub fn sort_order_id ( & self ) -> Option < i32 > {
236266 self . sort_order_id
237267 }
268+ /// Get the fully qualified location of the data file that all deletes in this entry reference.
269+ /// Position delete files may have this field set, and deletion vectors must have it set. Returns an owned copy.
270+ pub fn referenced_data_file ( & self ) -> Option < String > {
271+ self . referenced_data_file . clone ( )
272+ }
273+ /// Get the offset in the file where the blob content starts, or `None` if it is not set.
274+ /// Only meaningful for Puffin blobs, and required for deletion vectors.
275+ pub fn content_offset ( & self ) -> Option < i64 > {
276+ self . content_offset
277+ }
278+ /// Get the length in bytes of the referenced Puffin blob content, or `None` if it is not set.
279+ /// Only meaningful for Puffin blobs; required for deletion vectors and whenever `content_offset` is present.
280+ pub fn content_size_in_bytes ( & self ) -> Option < i64 > {
281+ self . content_size_in_bytes
282+ }
238283}
239284
240285/// Convert data files to avro bytes and write to writer.
@@ -323,6 +368,8 @@ pub enum DataFileFormat {
323368 Orc ,
324369 /// Parquet file format: <https://parquet.apache.org/>
325370 Parquet ,
371+ /// Puffin file format: <https://iceberg.apache.org/puffin-spec/>
372+ Puffin ,
326373}
327374
328375impl FromStr for DataFileFormat {
@@ -333,6 +380,7 @@ impl FromStr for DataFileFormat {
333380 "avro" => Ok ( Self :: Avro ) ,
334381 "orc" => Ok ( Self :: Orc ) ,
335382 "parquet" => Ok ( Self :: Parquet ) ,
383+ "puffin" => Ok ( Self :: Puffin ) ,
336384 _ => Err ( Error :: new (
337385 ErrorKind :: DataInvalid ,
338386 format ! ( "Unsupported data file format: {}" , s) ,
@@ -347,6 +395,7 @@ impl std::fmt::Display for DataFileFormat {
347395 DataFileFormat :: Avro => write ! ( f, "avro" ) ,
348396 DataFileFormat :: Orc => write ! ( f, "orc" ) ,
349397 DataFileFormat :: Parquet => write ! ( f, "parquet" ) ,
398+ DataFileFormat :: Puffin => write ! ( f, "puffin" ) ,
350399 }
351400 }
352401}
0 commit comments