Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 59 additions & 44 deletions parquet/src/file/metadata/thrift_gen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ struct FileMetaData<'a> {
1: required i32 version
2: required list<'a><SchemaElement> schema;
3: required i64 num_rows
4: required list<'a><RowGroup> row_groups
4: required list<RowGroup> row_groups
5: optional list<KeyValue> key_value_metadata
6: optional string created_by
7: optional list<ColumnOrder> column_orders;
Expand All @@ -135,8 +135,8 @@ struct FileMetaData<'a> {
);

thrift_struct!(
struct RowGroup<'a> {
1: required list<'a><ColumnChunk> columns
struct RowGroup {
1: required list<ColumnChunk> columns
2: required i64 total_byte_size
3: required i64 num_rows
4: optional list<SortingColumn> sorting_columns
Expand All @@ -153,30 +153,30 @@ struct ColumnChunk<'a> {
1: optional string<'a> file_path
2: required i64 file_offset = 0
3: optional ColumnMetaData<'a> meta_data
4: optional i64 offset_index_offset
5: optional i32 offset_index_length
6: optional i64 column_index_offset
7: optional i32 column_index_length
//4: optional i64 offset_index_offset
//5: optional i32 offset_index_length
//6: optional i64 column_index_offset
//7: optional i32 column_index_length
8: optional ColumnCryptoMetaData crypto_metadata
9: optional binary<'a> encrypted_column_metadata
//9: optional binary<'a> encrypted_column_metadata
}
);
#[cfg(not(feature = "encryption"))]
thrift_struct!(
struct ColumnChunk<'a> {
struct ColumnChunk {
1: optional string file_path
2: required i64 file_offset = 0
3: optional ColumnMetaData<'a> meta_data
4: optional i64 offset_index_offset
5: optional i32 offset_index_length
6: optional i64 column_index_offset
7: optional i32 column_index_length
3: optional ColumnMetaData meta_data
//4: optional i64 offset_index_offset
//5: optional i32 offset_index_length
//6: optional i64 column_index_offset
//7: optional i32 column_index_length
}
);

type CompressionCodec = Compression;
thrift_struct!(
struct ColumnMetaData<'a> {
struct ColumnMetaData {
1: required Type type_
2: required list<Encoding> encodings
// we don't expose path_in_schema so skip
Expand All @@ -190,12 +190,15 @@ struct ColumnMetaData<'a> {
9: required i64 data_page_offset
10: optional i64 index_page_offset
11: optional i64 dictionary_page_offset
12: optional Statistics<'a> statistics
13: optional list<PageEncodingStats> encoding_stats;
14: optional i64 bloom_filter_offset;
15: optional i32 bloom_filter_length;
16: optional SizeStatistics size_statistics;
17: optional GeospatialStatistics geospatial_statistics;
// TEMP HACK: skip statistics to see how fast
// thrift can be parsed without them
//12: optional Statistics<'a> statistics
//13: optional list<PageEncodingStats> encoding_stats;
//14: optional i64 bloom_filter_offset;
//15: optional i32 bloom_filter_length;
// TEMP HACK: skip size_statistics to see how fast we can decode without it
//16: optional SizeStatistics size_statistics;
//17: optional GeospatialStatistics geospatial_statistics;
}
);

Expand Down Expand Up @@ -317,29 +320,41 @@ fn convert_column(
let data_page_offset = col_metadata.data_page_offset;
let index_page_offset = col_metadata.index_page_offset;
let dictionary_page_offset = col_metadata.dictionary_page_offset;
let statistics = convert_stats(column_type, col_metadata.statistics)?;
let encoding_stats = col_metadata.encoding_stats;
let bloom_filter_offset = col_metadata.bloom_filter_offset;
let bloom_filter_length = col_metadata.bloom_filter_length;
let offset_index_offset = column.offset_index_offset;
let offset_index_length = column.offset_index_length;
let column_index_offset = column.column_index_offset;
let column_index_length = column.column_index_length;
let (unencoded_byte_array_data_bytes, repetition_level_histogram, definition_level_histogram) =
if let Some(size_stats) = col_metadata.size_statistics {
(
size_stats.unencoded_byte_array_data_bytes,
size_stats.repetition_level_histogram,
size_stats.definition_level_histogram,
)
} else {
(None, None, None)
};

let geo_statistics = convert_geo_stats(col_metadata.geospatial_statistics);

let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from);
let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from);
//let statistics = convert_stats(column_type, col_metadata.statistics)?;
let statistics = None;
//let encoding_stats = col_metadata.encoding_stats;
//let bloom_filter_offset = col_metadata.bloom_filter_offset;
//let bloom_filter_length = col_metadata.bloom_filter_length;
//let offset_index_offset = column.offset_index_offset;
//let offset_index_length = column.offset_index_length;
//let column_index_offset = column.column_index_offset;
//let column_index_length = column.column_index_length;
let encoding_stats = None;
let bloom_filter_offset = None;
let bloom_filter_length = None;
let offset_index_offset = None;
let offset_index_length = None;
let column_index_offset = None;
let column_index_length = None;
let unencoded_byte_array_data_bytes = None;
// let (unencoded_byte_array_data_bytes, repetition_level_histogram, definition_level_histogram) =
// if let Some(size_stats) = col_metadata.size_statistics {
// (
// size_stats.unencoded_byte_array_data_bytes,
// size_stats.repetition_level_histogram,
// size_stats.definition_level_histogram,
// )
// } else {
// (None, None, None)
// };

// let geo_statistics = convert_geo_stats(col_metadata.geospatial_statistics);
let geo_statistics = None;

//let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from);
//let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from);
let repetition_level_histogram = None;
let definition_level_histogram = None;

let result = ColumnChunkMetaData {
column_descr,
Expand Down
Loading