From ad6d2f6f96d928a31cdd102f1482ab423ceac0b3 Mon Sep 17 00:00:00 2001 From: Paul Conner Date: Mon, 2 Feb 2026 14:46:47 -0800 Subject: [PATCH] Speed up Stim Sampling with Faster Ref Sample This PR speeds up `stim sample` by switching the reference sample calculation from the `TableauSimulator` to the `ReferenceSampleTree`. Calculating the reference sample takes a large portion of the time for larger codes. Testing of performance for larger codes (distance 25 at 1M rounds) was done by building stim with `bazel build :stim`, then running the following CLI command: `time bazel-bin/stim --gen surface_code --task rotated_memory_x --distance 25 --rounds 1000000 --after_clifford_depolarization 0.001 | bazel-bin/stim sample --shots 10 --out_format=r8 > ./debug.r8` Metrics given are based on my machine (Linux), but all metrics should be considered relative to each other. The time taken for generating the circuit is considered trivial (< 0.1s). Before this change, this sample took ~7m 23s. With this change, this sample took ~2m 12s, a ~3.4x speedup (about as fast as not calculating a reference sample at all). I also looked into `FrameSimulator`'s logic to look for more speedup opportunities. The only real opportunity seen is to use multi-threading with worker threads. In particular, any of the overloads for `simd_bits_range_ref::for_each_word()` could likely benefit from being done in parallel across multiple worker threads. Async file IO (either using native OS APIs such as `OVERLAPPED`, or hand-rolling queued writes where `putc()` is called from another thread) could also possibly help to bring down total sample duration. However, any multi-threaded work can be handled/discussed in another PR. Changes: * Added an overload for `ReferenceSampleTree::decompress_into()` that works with `simd_bits`. * Uses the `vector` overload (instead of using `operator[]` on the tree directly in the loop) as it is roughly the same speed when built normally, but much faster in debug (from what I saw). 
* Updated `stim::command_sample()` to use `ReferenceSampleTree` instead of `TableauSimulator` for calculating the reference sample. * The output sample is still fully expanded out into a flat `simd_bits` for use with the compare / file writing logic. * Added a `--skip_loop_folding` CLI flag to disable `ReferenceSampleTree`, falling back to `TableauSimulator`. * Updated `command_sample_help()` to document this new flag. --- doc/usage_command_line.md | 25 ++++++++++++ src/stim/cmd/command_sample.cc | 43 ++++++++++++++++++++- src/stim/util_top/reference_sample_tree.h | 4 ++ src/stim/util_top/reference_sample_tree.inl | 13 +++++++ 4 files changed, 83 insertions(+), 2 deletions(-) diff --git a/doc/usage_command_line.md b/doc/usage_command_line.md index b5f107f60..2e5418e32 100644 --- a/doc/usage_command_line.md +++ b/doc/usage_command_line.md @@ -1676,6 +1676,7 @@ SYNOPSIS [--out_format 01|b8|r8|ptb64|hits|dets] \ [--seed int] \ [--shots int] \ + [--skip_loop_folding] \ [--skip_reference_sample] DESCRIPTION @@ -1762,6 +1763,30 @@ OPTIONS Must be an integer between 0 and a quintillion (10^18). + --skip_loop_folding + Skips loop folding logic on the reference sample calculation. + + When this argument is specified, the reference sample (that is used + to convert measurement flip data from frame simulations into actual + measurement data) is generated by iterating through the entire + flattened circuit with no loop detection. + + Loop folding can enormously improve performance for circuits + containing REPEAT blocks with large repeat counts, by detecting + periodicity in loops and fast-forwarding across them when computing + the reference sample for the circuit. However, in some cases the + analysis is not able to detect the periodicity that is present. For + example, this has been observed in honeycomb code circuits. When + this happens, the folding-capable analysis is slower than simply + analyzing the flattened circuit without any specialized loop logic. 
+ The `--skip_loop_folding` flag can be used to just analyze the + flattened circuit, bypassing this slowdown for circuits such as + honeycomb code circuits. + + By default, loop detection is enabled. Pass this flag to disable + it (when appropriate by use case). + + --skip_reference_sample Asserts the circuit can produce a noiseless sample that is just 0s. diff --git a/src/stim/cmd/command_sample.cc b/src/stim/cmd/command_sample.cc index 2d94acdc1..01d85f59b 100644 --- a/src/stim/cmd/command_sample.cc +++ b/src/stim/cmd/command_sample.cc @@ -21,18 +21,20 @@ #include "stim/simulators/tableau_simulator.h" #include "stim/util_bot/arg_parse.h" #include "stim/util_bot/probability_util.h" +#include "stim/util_top/reference_sample_tree.h" using namespace stim; int stim::command_sample(int argc, const char **argv) { check_for_unknown_arguments( - {"--seed", "--skip_reference_sample", "--out_format", "--out", "--in", "--shots"}, + {"--seed", "--skip_reference_sample", "--skip_loop_folding", "--out_format", "--out", "--in", "--shots"}, {"--sample", "--frame0"}, "sample", argc, argv); const auto &out_format = find_enum_argument("--out_format", "01", format_name_to_enum_map(), argc, argv); bool skip_reference_sample = find_bool_argument("--skip_reference_sample", argc, argv); + bool skip_loop_folding = find_bool_argument("--skip_loop_folding", argc, argv); uint64_t num_shots = find_argument("--shots", argc, argv) ? (uint64_t)find_int64_argument("--shots", 1, 0, INT64_MAX, argc, argv) : find_argument("--sample", argc, argv) ? 
(uint64_t)find_int64_argument("--sample", 1, 0, INT64_MAX, argc, argv) @@ -56,7 +58,13 @@ int stim::command_sample(int argc, const char **argv) { auto circuit = Circuit::from_file(in); simd_bits ref(0); if (!skip_reference_sample) { - ref = TableauSimulator::reference_sample_circuit(circuit); + if (skip_loop_folding) { + ref = TableauSimulator::reference_sample_circuit(circuit); + } else { + ReferenceSampleTree reference_sample_measurement_bits = + ReferenceSampleTree::from_circuit_reference_sample(circuit.aliased_noiseless_circuit()); + reference_sample_measurement_bits.decompress_into(ref); + } } sample_batch_measurements_writing_results_to_disk(circuit, ref, num_shots, out, out_format.id, rng); } @@ -128,6 +136,37 @@ SubCommandHelp stim::command_sample_help() { )PARAGRAPH"), }); + result.flags.push_back( + SubCommandHelpFlag{ + "--skip_loop_folding", + "bool", + "false", + {"[none]", "[switch]"}, + clean_doc_string(R"PARAGRAPH( + Skips loop folding logic on the reference sample calculation. + + When this argument is specified, the reference sample (that is used + to convert measurement flip data from frame simulations into actual + measurement data) is generated by iterating through the entire + flattened circuit with no loop detection. + + Loop folding can enormously improve performance for circuits + containing REPEAT blocks with large repeat counts, by detecting + periodicity in loops and fast-forwarding across them when computing + the reference sample for the circuit. However, in some cases the + analysis is not able to detect the periodicity that is present. For + example, this has been observed in honeycomb code circuits. When + this happens, the folding-capable analysis is slower than simply + analyzing the flattened circuit without any specialized loop logic. + The `--skip_loop_folding` flag can be used to just analyze the + flattened circuit, bypassing this slowdown for circuits such as + honeycomb code circuits. 
+ + By default, loop detection is enabled. Pass this flag to disable + it (when appropriate by use case). + )PARAGRAPH"), + }); + result.flags.push_back( SubCommandHelpFlag{ "--out_format", diff --git a/src/stim/util_top/reference_sample_tree.h b/src/stim/util_top/reference_sample_tree.h index 92dfcf144..366f4e7d4 100644 --- a/src/stim/util_top/reference_sample_tree.h +++ b/src/stim/util_top/reference_sample_tree.h @@ -37,6 +37,10 @@ struct ReferenceSampleTree { /// Writes the contents of the tree into the given output vector. void decompress_into(std::vector &output) const; + /// Writes the contents of the tree into the given output simd_bits. + template + void decompress_into(simd_bits &output) const; + /// Folds redundant children into the repetition count, if they repeat this many times. /// /// For example, if the tree's children are [A, B, C, A, B, C] and the tree has no diff --git a/src/stim/util_top/reference_sample_tree.inl b/src/stim/util_top/reference_sample_tree.inl index 94c52e27c..670ad97c8 100644 --- a/src/stim/util_top/reference_sample_tree.inl +++ b/src/stim/util_top/reference_sample_tree.inl @@ -2,6 +2,19 @@ namespace stim { +template +void ReferenceSampleTree::decompress_into(simd_bits &output) const { + std::vector v; + this->decompress_into(v); + + simd_bits result(v.size()); + for (size_t k = 0; k < v.size(); k++) { + result[k] ^= v[k]; + } + + output = std::move(result); +} + template ReferenceSampleTree CompressedReferenceSampleHelper::do_loop_with_no_folding(const Circuit &loop, uint64_t reps) { ReferenceSampleTree result;