Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit. Hold shift + click to select a range
d7700cf
feat(parquet): dictionary fallback heuristics
mzabaluev Apr 9, 2026
3aacbe5
test: fallback with OnUnfavorableCompression
mzabaluev Apr 13, 2026
83ded36
refactor(parquet): simplify dict_encoding_size
mzabaluev Apr 13, 2026
674a1b0
fix: unset plain data counter when flushing dict
mzabaluev Apr 13, 2026
13e042e
chore: fix clippy
mzabaluev Apr 13, 2026
becdcef
chore: license on plain_counter
mzabaluev Apr 13, 2026
1de7b01
refactor(parquet): dict_encoded_size cleanup
mzabaluev Apr 14, 2026
701ff2b
chore(parquet): rename uncompressed_data_size
mzabaluev Apr 14, 2026
1b6dd37
test: compression fallback wins
mzabaluev Apr 15, 2026
da73778
feat: DictionaryFallback::OnUnfavorableAfter
mzabaluev Apr 16, 2026
b392738
refactor: more compact plain data counter init
mzabaluev Apr 16, 2026
bd914bb
test(parquet): fix up compression fallback
mzabaluev Apr 16, 2026
898e7e5
test(parquet): test dictionary_fallback property
mzabaluev Apr 16, 2026
1e81173
chore: suggestions from code review
mzabaluev Apr 16, 2026
34b819d
chore: add missing import
mzabaluev Apr 17, 2026
692018e
chore: rename dict_encoding_size
mzabaluev Apr 17, 2026
4511917
fix: rework plain_counter to DictFallbackCounter
mzabaluev Apr 17, 2026
289e4e1
test: adjust dict fallback tests as per review
mzabaluev Apr 17, 2026
3ff12c8
feat: make dict fallback decision one-off
mzabaluev Apr 17, 2026
1fcea02
refactor: revert counter disabling, rename
mzabaluev Apr 20, 2026
389d038
test: verify encoded data after dict fallback
mzabaluev Apr 20, 2026
8873bc1
perf(parquet): eager cutoff for fallback counter
mzabaluev Apr 20, 2026
8ba290b
test: assert efficiency of dictionary fallback
mzabaluev Apr 21, 2026
1827d04
test: remove page stats printout
mzabaluev Apr 21, 2026
87a19c4
docs: clarify purpose of plain_encoded_size
mzabaluev Apr 21, 2026
14ab09e
refactor: consistently panic on bool
mzabaluev Apr 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 47 additions & 6 deletions parquet/src/arrow/arrow_writer/byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@ use crate::column::writer::encoder::{
ColumnValueEncoder, DataPageValues, DictionaryPage, create_bloom_filter,
};
use crate::data_type::{AsBytes, ByteArray, Int32Type};
use crate::encodings::encoding::{DeltaBitPackEncoder, Encoder};
use crate::encodings::encoding::{DeltaBitPackEncoder, DictFallbackCounter, Encoder};
use crate::encodings::rle::RleEncoder;
use crate::errors::{ParquetError, Result};
use crate::file::properties::{EnabledStatistics, WriterProperties, WriterVersion};
use crate::file::properties::{
DictionaryFallback, EnabledStatistics, WriterProperties, WriterVersion,
};
use crate::geospatial::accumulator::{GeoStatsAccumulator, try_new_geo_stats_accumulator};
use crate::geospatial::statistics::GeospatialStatistics;
use crate::schema::types::ColumnDescPtr;
Expand Down Expand Up @@ -421,6 +423,7 @@ impl DictEncoder {
pub struct ByteArrayEncoder {
fallback: FallbackEncoder,
dict_encoder: Option<DictEncoder>,
dict_fallback_counter: Option<DictFallbackCounter>,
statistics_enabled: EnabledStatistics,
min_value: Option<ByteArray>,
max_value: Option<ByteArray>,
Expand All @@ -442,10 +445,17 @@ impl ColumnValueEncoder for ByteArrayEncoder {
where
Self: Sized,
{
let dictionary = props
let dict_encoder = props
.dictionary_enabled(descr.path())
.then(DictEncoder::default);

let dict_fallback_counter = match props.dictionary_fallback(descr.path()) {
DictionaryFallback::OnUnfavorableAfter(min_sample_len) if dict_encoder.is_some() => {
Some(DictFallbackCounter::new(descr, min_sample_len))
}
_ => None,
};

let fallback = FallbackEncoder::new(descr, props)?;

let (bloom_filter, bloom_filter_target_fpp) = create_bloom_filter(props, descr)?;
Expand All @@ -459,7 +469,8 @@ impl ColumnValueEncoder for ByteArrayEncoder {
statistics_enabled,
bloom_filter,
bloom_filter_target_fpp,
dict_encoder: dictionary,
dict_encoder,
dict_fallback_counter,
min_value: None,
max_value: None,
geo_stats_accumulator,
Expand Down Expand Up @@ -521,6 +532,20 @@ impl ColumnValueEncoder for ByteArrayEncoder {
}
}

fn is_dict_encoding_unfavorable(&self) -> Option<bool> {
match (&self.dict_encoder, &self.dict_fallback_counter) {
(Some(encoder), Some(counter)) => {
let dict_size = encoder.estimated_dict_page_size();
counter.is_dict_encoding_unfavorable(dict_size)
}
_ => None,
}
}

fn disable_dict_fallback_accounting(&mut self) {
self.dict_fallback_counter = None;
}

fn flush_dict_page(&mut self) -> Result<Option<DictionaryPage>> {
match self.dict_encoder.take() {
Some(encoder) => {
Expand All @@ -530,6 +555,8 @@ impl ColumnValueEncoder for ByteArrayEncoder {
));
}

self.dict_fallback_counter = None;

Ok(Some(encoder.flush_dict_page()))
}
_ => Ok(None),
Expand All @@ -541,7 +568,13 @@ impl ColumnValueEncoder for ByteArrayEncoder {
let max_value = self.max_value.take();

match &mut self.dict_encoder {
Some(encoder) => Ok(encoder.flush_data_page(min_value, max_value)),
Some(encoder) => {
let data_page = encoder.flush_data_page(min_value, max_value);
if let Some(counter) = self.dict_fallback_counter.as_mut() {
counter.commit_page(&data_page);
}
Ok(data_page)
}
_ => self.fallback.flush_data_page(min_value, max_value),
}
}
Expand Down Expand Up @@ -582,7 +615,15 @@ where
}

match &mut encoder.dict_encoder {
Some(dict_encoder) => dict_encoder.encode(values, indices),
Some(dict_encoder) => {
dict_encoder.encode(values, indices);
if let Some(counter) = encoder.dict_fallback_counter.as_mut() {
for idx in indices {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a little worried about performance here. It would be nice if after we've collected enough samples and decided on dict vs fallback, we stop gathering these statistics.

Copy link
Copy Markdown
Contributor Author

@mzabaluev mzabaluev Apr 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had a mind to keep comparing after every encoded data page, for cases when the configured minimal sample is still not indicative of the overall value distribution and the efficiency degrades somewhere farther down the page chunk. But I understand the concern. Since this behavior is tunable per column through the writer API, I think it's OK to cut counting. For consistency, this should be also done in the generic encoder, I assume?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I didn't want to flag it in both places. 😄

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had a mind to keep comparing after every encoded data page, for cases when the configured minimal sample is still not indicative of the overall value distribution and the efficiency degrades somewhere farther down the page chunk.

Fair...but then perhaps the size limit will catch it. In any event, we should stop collecting after we have actually fallen back 😉

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should stop collecting after we have actually fallen back 😉

That's already the case, with the plain_data_size_counter member set to None in both flush_dict_page implementations, and the collecting is also not happening in the put methods in case there is no dictionary. Though if I implement a fix for #9739, this may need to be refactored.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The counting shuts down after reaching the sample size threshold in 3ff12c8.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still not too granular, I guess, if it only happens on page flush and the fallback might be decided on an earlier write batch.

Copy link
Copy Markdown
Contributor Author

@mzabaluev mzabaluev Apr 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Further reworked in 8873bc1.

let value = values.value(*idx);
counter.update_byte_array(value.as_ref());
}
}
}
None => encoder.fallback.encode(values, indices),
}
}
Expand Down
169 changes: 157 additions & 12 deletions parquet/src/arrow/arrow_writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1669,6 +1669,7 @@ mod tests {

use crate::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
use crate::arrow::{ARROW_SCHEMA_META_KEY, PARQUET_FIELD_ID_META_KEY};
use crate::basic::PageType;
use crate::column::page::{Page, PageReader};
use crate::file::metadata::thrift::PageHeader;
use crate::file::page_index::column_index::ColumnIndexMetaData;
Expand All @@ -1691,13 +1692,14 @@ mod tests {
use crate::data_type::AsBytes;
use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaData, ParquetMetaDataReader};
use crate::file::properties::{
BloomFilterPosition, EnabledStatistics, ReaderProperties, WriterVersion,
BloomFilterPosition, DictionaryFallback, EnabledStatistics, ReaderProperties, WriterVersion,
};
use crate::file::serialized_reader::ReadOptionsBuilder;
use crate::file::{
reader::{FileReader, SerializedFileReader},
statistics::Statistics,
};
use crate::record::RowAccessor;

#[test]
fn arrow_writer() {
Expand Down Expand Up @@ -2572,6 +2574,91 @@ mod tests {
);
}

#[test]
fn arrow_writer_dictionary_fallback_on_unfavorable_compression() {
let schema = Arc::new(Schema::new(vec![Field::new("col", DataType::Utf8, false)]));

let mut builder = StringBuilder::with_capacity(100, 329 * 10_000);

// Generate an array of 10 unique 10 character strings.
// This results in a dictionary encoding larger than the plain encoded data,
// which should trigger a fallback to PLAIN encoding.
for i in 0..10 {
let value = i
.to_string()
.repeat(10)
.chars()
.take(10)
.collect::<String>();

builder.append_value(value);
}

let array = Arc::new(builder.finish());

let batch = RecordBatch::try_new(schema, vec![array.clone()]).unwrap();

let file = tempfile::tempfile().unwrap();

// Set dictionary fallback to trigger fallback to PLAIN encoding on unfavorable compression
let props = WriterProperties::builder()
.set_dictionary_fallback(DictionaryFallback::OnUnfavorableAfter(1))
.set_data_page_row_count_limit(2)
.set_write_batch_size(1)
.build();

let mut writer =
ArrowWriter::try_new(file.try_clone().unwrap(), batch.schema(), Some(props))
.expect("Unable to write file");
writer.write(&batch).unwrap();
writer.close().unwrap();

let options = ReadOptionsBuilder::new()
.with_encoding_stats_as_mask(false)
.build();
let reader =
SerializedFileReader::new_with_options(file.try_clone().unwrap(), options).unwrap();

let column = reader.metadata().row_group(0).columns();

assert_eq!(column.len(), 1);

// check page encoding stats, should be one dict page, one dict encoded page, and 5
// plain encoded pages
let stats = column[0].page_encoding_stats().unwrap();
assert!(
stats
.iter()
.any(|s| s.page_type == PageType::DICTIONARY_PAGE),
"stats are {stats:?}"
);
let num_dict_encoded: i32 = stats
.iter()
.filter(|s| {
s.page_type == PageType::DATA_PAGE && s.encoding == Encoding::RLE_DICTIONARY
})
.map(|s| s.count)
.sum();
assert_eq!(num_dict_encoded, 1);
let num_plain_encoded: i32 = stats
.iter()
.filter(|s| s.page_type == PageType::DATA_PAGE && s.encoding == Encoding::PLAIN)
.map(|s| s.count)
.sum();
assert_eq!(num_plain_encoded, 5);

// Read back the values and confirm they match the original array.
let rows: Vec<_> = reader
.get_row_iter(None)
.unwrap()
.map(|r| r.unwrap())
.collect();
assert_eq!(rows.len(), array.len());
for (i, row) in rows.iter().enumerate() {
assert_eq!(row.get_string(0).unwrap(), array.value(i));
}
}

#[test]
fn arrow_writer_float_nans() {
let f16_field = Field::new("a", DataType::Float16, false);
Expand Down Expand Up @@ -4789,6 +4876,15 @@ mod tests {
assert_eq!(chunk_page_stats, file_page_stats);
}

fn get_dict_page_size(meta: &ColumnChunkMetaData, data: Bytes) -> usize {
let mut reader = SerializedPageReader::new(Arc::new(data), meta, 0, None).unwrap();
let page = reader.get_next_page().unwrap().unwrap();
match page {
Page::DictionaryPage { buf, .. } => buf.len(),
_ => panic!("expected DictionaryPage"),
}
}

#[test]
fn test_different_dict_page_size_limit() {
let array = Arc::new(Int64Array::from_iter(0..1024 * 1024));
Expand All @@ -4813,18 +4909,67 @@ mod tests {
let col0_meta = metadata.row_group(0).column(0);
let col1_meta = metadata.row_group(0).column(1);

let get_dict_page_size = move |meta: &ColumnChunkMetaData| {
let mut reader =
SerializedPageReader::new(Arc::new(data.clone()), meta, 0, None).unwrap();
let page = reader.get_next_page().unwrap().unwrap();
match page {
Page::DictionaryPage { buf, .. } => buf.len(),
_ => panic!("expected DictionaryPage"),
}
};
assert_eq!(get_dict_page_size(col0_meta, data.clone()), 1024 * 1024);
assert_eq!(get_dict_page_size(col1_meta, data.clone()), 1024 * 1024 * 4);
}

#[test]
fn test_dict_page_size_decided_by_compression_fallback() {
Copy link
Copy Markdown
Contributor

@etseidl etseidl Apr 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As a test, I saved the output from this and examined the sizing. Without the heuristic, the encoded size for col0 is 8658384 bytes (the default fallback mechanism kicked in after 7 pages). With the heuristic, col1 is 8391126 bytes, a savings of 3%.

I also modified the test to mod the index with 32767. In that instance, col1 was still 8391126 bytes, but col0 was only 2231581, nearly 4X smaller.

I know this is not entirely representative, but it does again point out the pitfalls of too simplistic an approach.

Edit: I did a test of spark with the latter file (32k cardinality). By default, it opts to fallback for all pages, so the file is even larger. If I modify the global parquet.page.row.count.limit to 132000, it then opts for dictionary encoding as it should.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have modified the test in 1b6dd37 to demonstrate a case when even an early fallback decision brings about 12% compression. But I generally agree with your assessment, so more work is needed.

Another quirk is seen in this test: a dictionary page is still flushed to encode the first data page, even though there is no benefit. Parquet-java takes care to hand over the accumulated values to the plain encoder to be re-encoded.

// Generate values that are well dispersed across a range approximating (0..256 * 1024)
let array = Arc::new(Int32Array::from_iter(
(0i32..1024 * 1024).map(|x| x.wrapping_mul(163019) % 262139),
));
Comment on lines +4918 to +4921
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this is i32, the wrapping_mul ultimately does wrap around to negative numbers, so the cardinality will actually approach 512k.

Overall I don't think this test is really necessary. The earlier test covers the early fall back.

let schema = Arc::new(Schema::new(vec![Field::new(
"col0",
arrow_schema::DataType::Int32,
false,
)]));
let batch = arrow_array::RecordBatch::try_new(schema.clone(), vec![array]).unwrap();

let props = WriterProperties::builder()
.set_dictionary_page_size_limit(1024 * 1024)
.build();
let mut writer = ArrowWriter::try_new(Vec::new(), schema.clone(), Some(props)).unwrap();
writer.write(&batch).unwrap();
let data = Bytes::from(writer.into_inner().unwrap());

let file_length_dict = data.len();

let mut metadata = ParquetMetaDataReader::new();
metadata.try_parse(&data).unwrap();
let metadata = metadata.finish().unwrap();
let full_dict_meta = metadata.row_group(0).column(0);
assert_eq!(get_dict_page_size(full_dict_meta, data.clone()), 1_048_576);

assert_eq!(get_dict_page_size(col0_meta), 1024 * 1024);
assert_eq!(get_dict_page_size(col1_meta), 1024 * 1024 * 4);
let props = WriterProperties::builder()
.set_dictionary_page_size_limit(1024 * 1024)
.set_column_dictionary_fallback(
ColumnPath::from("col0"),
DictionaryFallback::OnUnfavorableAfter(32_768),
)
.build();
let mut writer = ArrowWriter::try_new(Vec::new(), schema.clone(), Some(props)).unwrap();
writer.write(&batch).unwrap();
let data = Bytes::from(writer.into_inner().unwrap());

let file_length_fallback = data.len();

let mut metadata = ParquetMetaDataReader::new();
metadata.try_parse(&data).unwrap();
let metadata = metadata.finish().unwrap();
let fallback_meta = metadata.row_group(0).column(0);
assert_eq!(
get_dict_page_size(fallback_meta, data.clone()),
32_768 * std::mem::size_of::<u32>()
);

let compression_ratio = file_length_fallback as f64 / file_length_dict as f64;
assert!(
compression_ratio < 0.9,
"File encoded with dictionary fallback encoding does not result in sufficient compression,
got {file_length_fallback} vs {file_length_dict} ({:.2}%)",
compression_ratio * 100.0
);
}

struct WriteBatchesShape {
Expand Down
Loading
Loading