diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5a9cf352fa..7135d0e8f8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -81,6 +81,7 @@
 - Aligned `core_lib::math::u256` user docs with unified LE stack limb ordering (`a0/b0` on top), removing conflicting `[b7..b0, a7..a0]` notation ([#3066](https://github.com/0xMiden/miden-vm/pull/3066)).
 - Made all internal `core::math` procedures natively little-endian ([#3084](https://github.com/0xMiden/miden-vm/pull/3084)).
 - [BREAKING] Updated the Miden crypto stack to `miden-crypto` v0.25, and switched SMT leaf hashing to use Poseidon2 domain separation so masm-side leaf digests match `SmtLeaf::hash()` ([#3095](https://github.com/0xMiden/miden-vm/pull/3095)).
+- Improved performance of auxiliary trace generation ([#3119](https://github.com/0xMiden/miden-vm/pull/3119)).
 - [BREAKING] Reject post-last operation-indexed decorators in block assembly and serialized MAST forests; use `after_exit` for decorators that run after a block exits ([#3114](https://github.com/0xMiden/miden-vm/pull/3114)).
 - [BREAKING] Removed `Continuation::AfterExitDecoratorsBasicBlock`. New MAST merges operation-indexed decorators at the post-last-op sentinel index into `after_exit` at build time; execution uses `AfterExitDecorators` only, with legacy forests still supported ([#2633](https://github.com/0xMiden/miden-vm/issues/2633)).
diff --git a/air/src/lookup/aux_builder.rs b/air/src/lookup/aux_builder.rs
index eae0ad5592..eb7ccae1a2 100644
--- a/air/src/lookup/aux_builder.rs
+++ b/air/src/lookup/aux_builder.rs
@@ -54,7 +54,7 @@ use super::{Challenges, LookupAir, ProverLookupBuilder, prover::build_lookup_fra
 /// [`crate::trace::main_trace::ROW_MAJOR_CHUNK_SIZE`] so we stay consistent with the
 /// repo's row-major tuning: ~512 rows × avg shape ~3 ≈ 1.5 K fractions per chunk and
 /// ~24 KiB of chunk-local scratch, comfortably L1-resident on any modern x86/arm core.
-const ACCUMULATE_ROWS_PER_CHUNK: usize = 512;
+pub(crate) const ACCUMULATE_ROWS_PER_CHUNK: usize = 512;
 
 // TOP-LEVEL DRIVER
 // ================================================================================================
@@ -174,6 +174,26 @@
         }
     }
 
+    #[cfg(feature = "concurrent")]
+    /// Build a `LookupFractions` from already-populated `fractions` and `counts` buffers.
+    pub(super) fn from_parts(
+        shape: Vec<usize>,
+        num_rows: usize,
+        fractions: Vec<(F, EF)>,
+        counts: Vec<usize>,
+    ) -> Self {
+        let num_cols = shape.len();
+        debug_assert_eq!(counts.len(), num_rows * num_cols);
+
+        Self {
+            fractions,
+            counts,
+            shape,
+            num_rows,
+            num_cols,
+        }
+    }
+
     /// Number of permutation columns.
     pub fn num_columns(&self) -> usize {
         self.num_cols
diff --git a/air/src/lookup/prover.rs b/air/src/lookup/prover.rs
index bb4c33a19e..548798a6cc 100644
--- a/air/src/lookup/prover.rs
+++ b/air/src/lookup/prover.rs
@@ -158,6 +158,7 @@ pub fn build_lookup_fractions(
 where
     F: Field,
     EF: ExtensionField<F>,
+    A: Sync,
     for<'a> A: LookupAir<RowWindow<'a, F>>,
 {
     let num_rows = main_trace.height();
@@ -165,25 +166,59 @@ where
     let flat: &[F] = main_trace.values.borrow();
     let shape = air.column_shape().to_vec();
 
-    let mut fractions = LookupFractions::from_shape(shape, num_rows);
-
-    // Per-row periodic slice, filled in place each row — no per-iteration allocation.
-    let mut periodic_row: Vec<F> = vec![F::ZERO; periodic_columns.len()];
-
-    for r in 0..num_rows {
-        let curr = &flat[r * width..(r + 1) * width];
-        let nxt_idx = (r + 1) % num_rows;
-        let next = &flat[nxt_idx * width..(nxt_idx + 1) * width];
-        let window = RowWindow::from_two_rows(curr, next);
+    // Fill one chunk of rows into a fresh per-chunk `LookupFractions`.
+    let process_chunk = |row_lo: usize, row_hi: usize| -> LookupFractions<F, EF> {
+        let mut chunk = LookupFractions::from_shape(shape.clone(), row_hi - row_lo);
+        let mut periodic_row: Vec<F> = vec![F::ZERO; periodic_columns.len()];
+        for r in row_lo..row_hi {
+            let curr = &flat[r * width..(r + 1) * width];
+            let nxt_idx = (r + 1) % num_rows;
+            let next = &flat[nxt_idx * width..(nxt_idx + 1) * width];
+            let window = RowWindow::from_two_rows(curr, next);
+            for (i, col) in periodic_columns.iter().enumerate() {
+                periodic_row[i] = col[r % col.len()];
+            }
+            let mut lb =
+                ProverLookupBuilder::new(window, &periodic_row, challenges, air, &mut chunk);
+            air.eval(&mut lb);
+        }
+        chunk
+    };
 
-        for (i, col) in periodic_columns.iter().enumerate() {
-            periodic_row[i] = col[r % col.len()];
+    #[cfg(not(feature = "concurrent"))]
+    let fractions = process_chunk(0, num_rows);
+
+    // Concatenation after parallel processing preserves global row order because chunks
+    // tile `0..num_rows` contiguously and each chunk's `fractions` / `counts` are
+    // row-major within the chunk.
+    #[cfg(feature = "concurrent")]
+    let fractions = {
+        use miden_crypto::parallel::*;
+
+        let num_cols = shape.len();
+        let rows_per_chunk = crate::lookup::aux_builder::ACCUMULATE_ROWS_PER_CHUNK;
+        let num_chunks = num_rows.div_ceil(rows_per_chunk);
+
+        let chunks: Vec<LookupFractions<F, EF>> = (0..num_chunks)
+            .into_par_iter()
+            .map(|chunk_idx| {
+                let row_lo = chunk_idx * rows_per_chunk;
+                let row_hi = (row_lo + rows_per_chunk).min(num_rows);
+                process_chunk(row_lo, row_hi)
+            })
+            .collect();
+
+        let total_fractions: usize = chunks.iter().map(|c| c.fractions.len()).sum();
+        let mut fractions_vec: Vec<(F, EF)> = Vec::with_capacity(total_fractions);
+        let mut counts_vec: Vec<usize> = Vec::with_capacity(num_rows * num_cols);
+        for chunk in chunks {
+            fractions_vec.extend(chunk.fractions);
+            counts_vec.extend(chunk.counts);
         }
-        let mut lb =
-            ProverLookupBuilder::new(window, &periodic_row, challenges, air, &mut fractions);
-        air.eval(&mut lb);
-    }
+        LookupFractions::from_parts(shape, num_rows, fractions_vec, counts_vec)
+    };
 
     debug_assert_eq!(
         fractions.counts().len(),
diff --git a/processor/src/trace/chiplets/memory/mod.rs b/processor/src/trace/chiplets/memory/mod.rs
index 95c8fe723c..c575ad1082 100644
--- a/processor/src/trace/chiplets/memory/mod.rs
+++ b/processor/src/trace/chiplets/memory/mod.rs
@@ -280,7 +280,7 @@ impl Memory {
         };
         let (delta_hi, delta_lo) = split_u32_into_u16(delta);
 
-        range.add_range_checks(row, &[delta_lo, delta_hi]);
+        range.add_range_checks(&[delta_lo, delta_hi]);
 
         // word index decomposition range checks: prove addr is a valid 32-bit value
         // by checking w0, w1, and 4*w1 are all in [0, 2^16).
diff --git a/processor/src/trace/parallel/mod.rs b/processor/src/trace/parallel/mod.rs
index 75eff9dc19..bcc706c313 100644
--- a/processor/src/trace/parallel/mod.rs
+++ b/processor/src/trace/parallel/mod.rs
@@ -422,8 +422,8 @@ fn initialize_range_checker(
     let mut range_checker = RangeChecker::new();
 
     // Add all u32 range checks recorded during execution
-    for (clk, values) in range_checker_replay.into_iter() {
-        range_checker.add_range_checks(clk, &values);
+    for (_clk, values) in range_checker_replay.into_iter() {
+        range_checker.add_range_checks(&values);
     }
 
     // Add all memory-related range checks
diff --git a/processor/src/trace/range/mod.rs b/processor/src/trace/range/mod.rs
index bd8e68bd71..18323d7607 100644
--- a/processor/src/trace/range/mod.rs
+++ b/processor/src/trace/range/mod.rs
@@ -3,7 +3,6 @@
 use core::mem::MaybeUninit;
 
 use miden_air::trace::RANGE_CHECK_TRACE_WIDTH;
-use super::RowIndex;
 use crate::{
     Felt, ZERO,
     utils::{assume_init_vec, uninit_vector},
@@ -50,10 +49,6 @@ pub struct RangeCheckTrace {
 pub struct RangeChecker {
     /// Tracks lookup count for each checked value.
     lookups: BTreeMap<u16, usize>,
-    /// Range check lookups performed by all user operations, grouped and sorted by clock cycle.
-    /// Each cycle is mapped to a vector of the range checks requested at that cycle, which can
-    /// come from the stack, memory, or both.
-    cycle_lookups: BTreeMap<RowIndex, Vec<u16>>,
 }
 
 impl RangeChecker {
@@ -66,7 +61,7 @@
         // range checker table are initialized. this simplifies trace table building later on.
         lookups.insert(0, 0);
         lookups.insert(u16::MAX, 0);
-        Self { lookups, cycle_lookups: BTreeMap::new() }
+        Self { lookups }
     }
 
     // TRACE MUTATORS
@@ -78,7 +73,7 @@
     }
 
     /// Adds range check lookups from the stack or memory to this [RangeChecker] instance.
-    pub fn add_range_checks(&mut self, clk: RowIndex, values: &[u16]) {
+    pub fn add_range_checks(&mut self, values: &[u16]) {
         // range checks requests only come from memory or from the stack, which always request 2 or
         // 4 lookups respectively.
         debug_assert!(values.len() == 2 || values.len() == 4);
@@ -87,17 +82,6 @@
         for value in values.iter() {
             // add the specified value to the trace of this range checker's lookups.
             self.add_value(*value);
         }
-
-        // track the range check requests at each cycle
-        // TODO: optimize this to use a struct instead of vectors, e.g. (#2793):
-        // struct MemoryLookupValues {
-        //     num_lookups: u8,
-        //     lookup_values: [u16; 6],
-        // }
-        self.cycle_lookups
-            .entry(clk)
-            .and_modify(|entry| entry.extend_from_slice(values))
-            .or_insert_with(|| values.to_vec());
     }
     // EXECUTION TRACE GENERATION (INTERNAL)