From 6333f8bec7d45c3a1ba13b05e46a15ddaaacc62f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 30 Sep 2025 06:31:28 -0400 Subject: [PATCH 1/3] Hack off parsing statistics --- parquet/src/file/metadata/thrift_gen.rs | 49 ++++++++++++++----------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 921a212a93a3..1e693622c8cc 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -125,7 +125,7 @@ struct FileMetaData<'a> { 1: required i32 version 2: required list<'a> schema; 3: required i64 num_rows - 4: required list<'a> row_groups + 4: required list row_groups 5: optional list key_value_metadata 6: optional string created_by 7: optional list column_orders; @@ -135,8 +135,8 @@ struct FileMetaData<'a> { ); thrift_struct!( -struct RowGroup<'a> { - 1: required list<'a> columns +struct RowGroup { + 1: required list columns 2: required i64 total_byte_size 3: required i64 num_rows 4: optional list sorting_columns @@ -163,10 +163,10 @@ struct ColumnChunk<'a> { ); #[cfg(not(feature = "encryption"))] thrift_struct!( -struct ColumnChunk<'a> { +struct ColumnChunk { 1: optional string file_path 2: required i64 file_offset = 0 - 3: optional ColumnMetaData<'a> meta_data + 3: optional ColumnMetaData meta_data 4: optional i64 offset_index_offset 5: optional i32 offset_index_length 6: optional i64 column_index_offset @@ -176,7 +176,7 @@ struct ColumnChunk<'a> { type CompressionCodec = Compression; thrift_struct!( -struct ColumnMetaData<'a> { +struct ColumnMetaData { 1: required Type type_ 2: required list encodings // we don't expose path_in_schema so skip @@ -190,11 +190,14 @@ struct ColumnMetaData<'a> { 9: required i64 data_page_offset 10: optional i64 index_page_offset 11: optional i64 dictionary_page_offset - 12: optional Statistics<'a> statistics + // TEMP HACK: skip statistics to see how fast + // thrift can be parsed without them 
+ //12: optional Statistics<'a> statistics 13: optional list encoding_stats; 14: optional i64 bloom_filter_offset; 15: optional i32 bloom_filter_length; - 16: optional SizeStatistics size_statistics; + // TEMP HACK: skip size_statistics to see how fast we can decode without it + //16: optional SizeStatistics size_statistics; 17: optional GeospatialStatistics geospatial_statistics; } ); @@ -317,7 +320,8 @@ fn convert_column( let data_page_offset = col_metadata.data_page_offset; let index_page_offset = col_metadata.index_page_offset; let dictionary_page_offset = col_metadata.dictionary_page_offset; - let statistics = convert_stats(column_type, col_metadata.statistics)?; + //let statistics = convert_stats(column_type, col_metadata.statistics)?; + let statistics = None; let encoding_stats = col_metadata.encoding_stats; let bloom_filter_offset = col_metadata.bloom_filter_offset; let bloom_filter_length = col_metadata.bloom_filter_length; @@ -325,21 +329,24 @@ fn convert_column( let offset_index_length = column.offset_index_length; let column_index_offset = column.column_index_offset; let column_index_length = column.column_index_length; - let (unencoded_byte_array_data_bytes, repetition_level_histogram, definition_level_histogram) = - if let Some(size_stats) = col_metadata.size_statistics { - ( - size_stats.unencoded_byte_array_data_bytes, - size_stats.repetition_level_histogram, - size_stats.definition_level_histogram, - ) - } else { - (None, None, None) - }; + let unencoded_byte_array_data_bytes = None; + // let (unencoded_byte_array_data_bytes, repetition_level_histogram, definition_level_histogram) = + // if let Some(size_stats) = col_metadata.size_statistics { + // ( + // size_stats.unencoded_byte_array_data_bytes, + // size_stats.repetition_level_histogram, + // size_stats.definition_level_histogram, + // ) + // } else { + // (None, None, None) + // }; let geo_statistics = convert_geo_stats(col_metadata.geospatial_statistics); - let repetition_level_histogram = 
repetition_level_histogram.map(LevelHistogram::from); - let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from); + //let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from); + //let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from); + let repetition_level_histogram = None; + let definition_level_histogram = None; let result = ColumnChunkMetaData { column_descr, From 4fb76163880c08944969c86eacecf32ba1a57069 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 30 Sep 2025 07:05:28 -0400 Subject: [PATCH 2/3] Do not parse offset/column index if not needed --- parquet/src/file/metadata/thrift_gen.rs | 30 ++++++++++++++----------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 1e693622c8cc..fdf2b3272bae 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -153,12 +153,12 @@ struct ColumnChunk<'a> { 1: optional string<'a> file_path 2: required i64 file_offset = 0 3: optional ColumnMetaData<'a> meta_data - 4: optional i64 offset_index_offset - 5: optional i32 offset_index_length - 6: optional i64 column_index_offset - 7: optional i32 column_index_length + //4: optional i64 offset_index_offset + //5: optional i32 offset_index_length + //6: optional i64 column_index_offset + //7: optional i32 column_index_length 8: optional ColumnCryptoMetaData crypto_metadata - 9: optional binary<'a> encrypted_column_metadata + //9: optional binary<'a> encrypted_column_metadata } ); #[cfg(not(feature = "encryption"))] @@ -167,10 +167,10 @@ struct ColumnChunk { 1: optional string file_path 2: required i64 file_offset = 0 3: optional ColumnMetaData meta_data - 4: optional i64 offset_index_offset - 5: optional i32 offset_index_length - 6: optional i64 column_index_offset - 7: optional i32 column_index_length + //4: optional i64 offset_index_offset + 
//5: optional i32 offset_index_length + //6: optional i64 column_index_offset + //7: optional i32 column_index_length } ); @@ -325,10 +325,14 @@ fn convert_column( let encoding_stats = col_metadata.encoding_stats; let bloom_filter_offset = col_metadata.bloom_filter_offset; let bloom_filter_length = col_metadata.bloom_filter_length; - let offset_index_offset = column.offset_index_offset; - let offset_index_length = column.offset_index_length; - let column_index_offset = column.column_index_offset; - let column_index_length = column.column_index_length; + //let offset_index_offset = column.offset_index_offset; + //let offset_index_length = column.offset_index_length; + //let column_index_offset = column.column_index_offset; + //let column_index_length = column.column_index_length; + let offset_index_offset = None; + let offset_index_length = None; + let column_index_offset = None; + let column_index_length = None; let unencoded_byte_array_data_bytes = None; // let (unencoded_byte_array_data_bytes, repetition_level_histogram, definition_level_histogram) = // if let Some(size_stats) = col_metadata.size_statistics { From d24ac4fba7877b49ec54360e7baeb8669520bf04 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 30 Sep 2025 07:11:20 -0400 Subject: [PATCH 3/3] No unnecessary statistics fields --- parquet/src/file/metadata/thrift_gen.rs | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index fdf2b3272bae..2775c7bdc88a 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -193,12 +193,12 @@ struct ColumnMetaData { // TEMP HACK: skip statistics to see how fast // thrift can be parsed without them //12: optional Statistics<'a> statistics - 13: optional list encoding_stats; - 14: optional i64 bloom_filter_offset; - 15: optional i32 bloom_filter_length; + //13: optional list encoding_stats; + //14: optional i64 
bloom_filter_offset; + //15: optional i32 bloom_filter_length; // TEMP HACK: skip size_statistics to see how fast we can decode without it //16: optional SizeStatistics size_statistics; - 17: optional GeospatialStatistics geospatial_statistics; + //17: optional GeospatialStatistics geospatial_statistics; } ); @@ -322,13 +322,16 @@ fn convert_column( let dictionary_page_offset = col_metadata.dictionary_page_offset; //let statistics = convert_stats(column_type, col_metadata.statistics)?; let statistics = None; - let encoding_stats = col_metadata.encoding_stats; - let bloom_filter_offset = col_metadata.bloom_filter_offset; - let bloom_filter_length = col_metadata.bloom_filter_length; + //let encoding_stats = col_metadata.encoding_stats; + //let bloom_filter_offset = col_metadata.bloom_filter_offset; + //let bloom_filter_length = col_metadata.bloom_filter_length; //let offset_index_offset = column.offset_index_offset; //let offset_index_length = column.offset_index_length; //let column_index_offset = column.column_index_offset; //let column_index_length = column.column_index_length; + let encoding_stats = None; + let bloom_filter_offset = None; + let bloom_filter_length = None; let offset_index_offset = None; let offset_index_length = None; let column_index_offset = None; @@ -345,7 +348,8 @@ fn convert_column( // (None, None, None) // }; - let geo_statistics = convert_geo_stats(col_metadata.geospatial_statistics); + // let geo_statistics = convert_geo_stats(col_metadata.geospatial_statistics); + let geo_statistics = None; //let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from); //let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from);