diff --git a/Cargo.lock b/Cargo.lock index 650553b..a536860 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -343,6 +343,12 @@ dependencies = [ "writeable", ] +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -371,9 +377,12 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.16.1" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" +dependencies = [ + "foldhash", +] [[package]] name = "icu_calendar" @@ -630,9 +639,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.13.0" +version = "2.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", "hashbrown", @@ -781,6 +790,7 @@ dependencies = [ name = "message-format-compiler" version = "0.1.0" dependencies = [ + "hashbrown", "icu_normalizer", "message-format-runtime", "serde", diff --git a/apps/message-format-cli/src/main.rs b/apps/message-format-cli/src/main.rs index 3ce21eb..f106c39 100644 --- a/apps/message-format-cli/src/main.rs +++ b/apps/message-format-cli/src/main.rs @@ -25,8 +25,9 @@ use std::io::{self, IsTerminal, Write}; use std::path::{Path, PathBuf}; use message_format_compiler::{ - BuildError, CompileOptions, CompileReport, DiagnosticSeverity, FunctionManifest, ResourceInput, - SourceKind, compile_resources, compile_resources_with_manifest, + BuildError, CompileOptions, CompileReport, DiagnosticSeverity, FunctionManifest, + LiteralDeduplication, LiteralStats, ResourceInput, SourceKind, 
compile_resources, + compile_resources_with_manifest, }; use message_format_resource_json::{JsonProfile, parse_json_resource}; use message_format_resource_toml::parse_resource_toml; @@ -55,6 +56,8 @@ fn run(args: impl IntoIterator) -> Result { source_map_output, functions_manifest, check_only, + literal_deduplication, + literal_stats, } => { compile_command( &inputs, @@ -63,6 +66,8 @@ fn run(args: impl IntoIterator) -> Result { source_map_output.as_deref(), functions_manifest.as_deref(), check_only, + literal_deduplication, + literal_stats, )?; Ok(RunOutcome::Done) } @@ -79,6 +84,8 @@ enum Command { source_map_output: Option, functions_manifest: Option, check_only: bool, + literal_deduplication: LiteralDeduplication, + literal_stats: bool, }, } @@ -113,6 +120,8 @@ fn parse_args(args: impl IntoIterator) -> Result let mut source_map_output = None; let mut functions_manifest = None; let mut check_only = false; + let mut literal_deduplication = LiteralDeduplication::Enabled; + let mut literal_stats = false; while let Some(arg) = args.next() { match arg.as_str() { @@ -143,6 +152,15 @@ fn parse_args(args: impl IntoIterator) -> Result "--check" => { check_only = true; } + "--no-lits-dedup" => { + literal_deduplication = LiteralDeduplication::Disabled; + } + "--measure-lits-dedup" => { + literal_deduplication = LiteralDeduplication::MeasureOnly; + } + "--literal-stats" => { + literal_stats = true; + } "-h" | "--help" => { return Ok(Command::Help(usage())); } @@ -169,6 +187,8 @@ fn parse_args(args: impl IntoIterator) -> Result source_map_output, functions_manifest, check_only, + literal_deduplication, + literal_stats, }) } @@ -179,6 +199,8 @@ fn compile_command( source_map_output: Option<&Path>, functions_manifest: Option<&Path>, check_only: bool, + literal_deduplication: LiteralDeduplication, + literal_stats: bool, ) -> Result<(), String> { let manifest = if let Some(path) = functions_manifest { Some(load_function_manifest(path)?) 
@@ -187,13 +209,21 @@ fn compile_command( }; let compiled = match input_format { - InputFormat::ResourceToml => compile_toml_resource_inputs(input_paths, manifest.as_ref()), - InputFormat::JsonFlat => { - compile_json_resource_inputs(input_paths, manifest.as_ref(), JsonProfile::Flat) - } - InputFormat::JsonChrome => { - compile_json_resource_inputs(input_paths, manifest.as_ref(), JsonProfile::Chrome) + InputFormat::ResourceToml => { + compile_toml_resource_inputs(input_paths, manifest.as_ref(), literal_deduplication) } + InputFormat::JsonFlat => compile_json_resource_inputs( + input_paths, + manifest.as_ref(), + JsonProfile::Flat, + literal_deduplication, + ), + InputFormat::JsonChrome => compile_json_resource_inputs( + input_paths, + manifest.as_ref(), + JsonProfile::Chrome, + literal_deduplication, + ), }; if compiled.has_errors() { return Err(render_compile_report(compiled)); @@ -202,6 +232,10 @@ fn compile_command( .compiled .ok_or_else(|| String::from("compile report completed without catalog or errors"))?; + if literal_stats { + eprintln!("{}", render_literal_stats(compiled.literal_stats)); + } + if check_only { return Ok(()); } @@ -231,8 +265,15 @@ fn compile_command( fn compile_toml_resource_inputs( input_paths: &[PathBuf], manifest: Option<&FunctionManifest>, + literal_deduplication: LiteralDeduplication, ) -> CompileReport { - compile_resource_inputs(input_paths, manifest, parse_resource_toml, "resource-toml") + compile_resource_inputs( + input_paths, + manifest, + parse_resource_toml, + "resource-toml", + literal_deduplication, + ) } fn render_compile_report(report: CompileReport) -> String { @@ -254,6 +295,7 @@ fn compile_json_resource_inputs( input_paths: &[PathBuf], manifest: Option<&FunctionManifest>, profile: JsonProfile, + literal_deduplication: LiteralDeduplication, ) -> CompileReport { let label = match profile { JsonProfile::Flat => "json-flat", @@ -264,6 +306,7 @@ fn compile_json_resource_inputs( manifest, |name, source| 
parse_json_resource(name, source, profile), label, + literal_deduplication, ) } @@ -272,6 +315,7 @@ fn compile_resource_inputs( manifest: Option<&FunctionManifest>, parse: impl Fn(String, &str) -> Result, label: &str, + literal_deduplication: LiteralDeduplication, ) -> CompileReport where E: std::fmt::Display, @@ -305,10 +349,14 @@ where } } + let options = CompileOptions { + literal_deduplication, + ..CompileOptions::default() + }; let mut report = if let Some(manifest) = manifest { - compile_resources_with_manifest(inputs, CompileOptions::default(), manifest) + compile_resources_with_manifest(inputs, options, manifest) } else { - compile_resources(inputs, CompileOptions::default()) + compile_resources(inputs, options) }; if !diagnostics.is_empty() { report.compiled = None; @@ -324,6 +372,21 @@ fn load_function_manifest(path: &Path) -> Result { .map_err(|err| format!("failed to parse {}: {err}", path.display())) } +fn render_literal_stats(stats: LiteralStats) -> String { + format!( + "literal-stats mode={:?} slices={} unique={} duplicates={} input_bytes={} unique_bytes={} duplicate_bytes={} emitted_bytes={} saved_bytes={}", + stats.deduplication, + stats.literal_slices, + stats.unique_literals, + stats.duplicate_literals, + stats.input_literal_bytes, + stats.unique_literal_bytes, + stats.duplicate_literal_bytes, + stats.emitted_literal_bytes, + stats.saved_literal_bytes, + ) +} + fn render_source_map_json(source_map: &message_format_compiler::SourceMap) -> String { let mut out = String::new(); out.push_str("{\n \"sources\": [\n"); @@ -409,13 +472,13 @@ fn json_escape(value: &str) -> String { fn usage() -> String { String::from( - "usage:\n message-format-cli compile [--check] --input-format resource-toml|json-flat|json-chrome [-o OUTPUT] [--source-map PATH] [--functions PATH] INPUT...", + "usage:\n message-format-cli compile [--check] [--literal-stats] [--no-lits-dedup|--measure-lits-dedup] --input-format resource-toml|json-flat|json-chrome [-o OUTPUT] 
/// `--literal-stats` and `--measure-lits-dedup` are both recognised by the
/// `compile` subcommand and land in the parsed `Command::Compile`.
#[test]
fn parse_compile_command_with_literal_dedup_options() {
    let argv = [
        "compile",
        "--literal-stats",
        "--measure-lits-dedup",
        "--input-format",
        "resource-toml",
        "messages.toml",
    ]
    .map(String::from);

    let parsed = parse_args(argv).expect("parsed");

    let expected = Command::Compile {
        inputs: vec![PathBuf::from("messages.toml")],
        input_format: InputFormat::ResourceToml,
        output: None,
        source_map_output: None,
        functions_manifest: None,
        check_only: false,
        literal_deduplication: LiteralDeduplication::MeasureOnly,
        literal_stats: true,
    };
    assert_eq!(parsed, expected);
}
[dependencies] +hashbrown = { default-features = false, features = [ + "default-hasher", + "raw-entry", +], version = "0.17.0" } icu_normalizer = { default-features = false, features = ["compiled_data"], version = "2.1.1" } message-format-runtime = { workspace = true } serde = { features = ["derive"], version = "1.0" } diff --git a/crates/message-format-compiler/src/compile/lowering.rs b/crates/message-format-compiler/src/compile/lowering.rs index 334ddf9..ce21cc9 100644 --- a/crates/message-format-compiler/src/compile/lowering.rs +++ b/crates/message-format-compiler/src/compile/lowering.rs @@ -1,8 +1,12 @@ // Copyright 2026 the Message Format Authors // SPDX-License-Identifier: Apache-2.0 OR MIT -use std::collections::BTreeMap; +use std::{ + collections::{BTreeMap, hash_map::DefaultHasher}, + hash::{Hash, Hasher}, +}; +use hashbrown::{HashMap, hash_map::RawEntryMut}; use message_format_runtime::schema; use crate::semantic::{ @@ -11,7 +15,121 @@ use crate::semantic::{ }; use super::interning::{FunctionCatalogKey, function_catalog_key}; -use super::{CompileError, escape_fallback_literal, function_dynamic_options}; +use super::{ + CompileError, LiteralDeduplication, LiteralStats, escape_fallback_literal, + function_dynamic_options, +}; + +#[derive(Debug)] +pub(super) struct LiteralPool { + mode: LiteralDeduplication, + bytes: String, + offsets: HashMap, + stats: LiteralStats, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +struct LiteralSpan { + off: u32, + len: u32, +} + +impl LiteralPool { + pub(super) fn new(mode: LiteralDeduplication) -> Self { + Self { + mode, + bytes: String::new(), + offsets: HashMap::new(), + stats: LiteralStats { + deduplication: mode, + ..LiteralStats::default() + }, + } + } + + pub(super) fn intern(&mut self, value: &str) -> Result<(u32, u32), CompileError> { + let len = + u32::try_from(value.len()).map_err(|_| CompileError::size_overflow("literal data"))?; + self.stats.literal_slices += 1; + self.stats.input_literal_bytes += 
value.len(); + + if value.is_empty() { + return Ok((0, len)); + } + + if self.mode == LiteralDeduplication::Disabled { + return self.append(value, len); + } + + let hash = literal_hash(value); + match self + .offsets + .raw_entry_mut() + .from_hash(hash, |span| span_matches(self.bytes.as_str(), *span, value)) + { + RawEntryMut::Occupied(entry) => { + self.stats.duplicate_literals += 1; + self.stats.duplicate_literal_bytes += value.len(); + if self.mode == LiteralDeduplication::Enabled { + self.stats.saved_literal_bytes += value.len(); + return Ok((entry.key().off, len)); + } + } + RawEntryMut::Vacant(entry) => { + let (offset, len) = append_literal(&mut self.bytes, &mut self.stats, value, len)?; + entry.insert_with_hasher(hash, LiteralSpan { off: offset, len }, (), |span| { + literal_hash( + span_text(self.bytes.as_str(), *span) + .expect("literal span must reference appended bytes"), + ) + }); + self.stats.unique_literals += 1; + self.stats.unique_literal_bytes += value.len(); + return Ok((offset, len)); + } + } + + self.append(value, len) + } + + fn append(&mut self, value: &str, len: u32) -> Result<(u32, u32), CompileError> { + append_literal(&mut self.bytes, &mut self.stats, value, len) + } + + pub(super) fn into_parts(self) -> (String, LiteralStats) { + (self.bytes, self.stats) + } +} + +fn span_matches(bytes: &str, span: LiteralSpan, value: &str) -> bool { + span_text(bytes, span) == Some(value) +} + +fn span_text(bytes: &str, span: LiteralSpan) -> Option<&str> { + let start = span.off as usize; + let len = span.len as usize; + let end = start.checked_add(len)?; + bytes.get(start..end) +} + +fn append_literal( + bytes: &mut String, + stats: &mut LiteralStats, + value: &str, + len: u32, +) -> Result<(u32, u32), CompileError> { + let offset = + u32::try_from(bytes.len()).map_err(|_| CompileError::size_overflow("literal data"))?; + bytes.push_str(value); + stats.emitted_literal_bytes += value.len(); + Ok((offset, len)) +} + +fn literal_hash(value: &str) -> u64 { 
+ let mut hasher = DefaultHasher::new(); + value.hash(&mut hasher); + hasher.finish() +} /// Compute the fallback string for a call part. fn compute_fallback(part: &Part) -> String { @@ -46,27 +164,19 @@ pub(super) fn lower_parts( parts: &[Part], string_map: &BTreeMap, func_map: &BTreeMap, - literals: &mut String, + literals: &mut LiteralPool, code: &mut Vec, ) -> Result<(), CompileError> { for part in parts { match part { Part::Text(value) => { - let off = u32::try_from(literals.len()) - .map_err(|_| CompileError::size_overflow("literal data"))?; - literals.push_str(value); - let len = u32::try_from(value.len()) - .map_err(|_| CompileError::size_overflow("literal data"))?; + let (off, len) = literals.intern(value)?; code.push(schema::OP_OUT_SLICE); code.extend_from_slice(&off.to_le_bytes()); code.extend_from_slice(&len.to_le_bytes()); } Part::Literal(value) => { - let off = u32::try_from(literals.len()) - .map_err(|_| CompileError::size_overflow("literal data"))?; - literals.push_str(value); - let len = u32::try_from(value.len()) - .map_err(|_| CompileError::size_overflow("literal data"))?; + let (off, len) = literals.intern(value)?; code.push(schema::OP_OUT_EXPR); code.extend_from_slice(&off.to_le_bytes()); code.extend_from_slice(&len.to_le_bytes()); @@ -176,7 +286,7 @@ fn lower_select( select: &SelectExpr, string_map: &BTreeMap, func_map: &BTreeMap, - literals: &mut String, + literals: &mut LiteralPool, code: &mut Vec, ) -> Result<(), CompileError> { emit_selector_start(&select.selector, string_map, func_map, code)?; diff --git a/crates/message-format-compiler/src/compile/mod.rs b/crates/message-format-compiler/src/compile/mod.rs index c9f2825..b426254 100644 --- a/crates/message-format-compiler/src/compile/mod.rs +++ b/crates/message-format-compiler/src/compile/mod.rs @@ -33,10 +33,22 @@ use frontend::parse_single_message_with_source; use interning::{ collect_functions, collect_strings, escape_fallback_literal, function_dynamic_options, }; -use 
/// Compiler statistics for literal text emitted into the catalog `LITS` chunk.
///
/// The duplicate-tracking counters (`unique_*`, `duplicate_*`) are only
/// updated when the deduplication mode is `Enabled` or `MeasureOnly`; with
/// `Disabled` they stay at zero. Empty literals are never tracked or emitted.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct LiteralStats {
    /// Literal deduplication mode used for this compilation.
    pub deduplication: LiteralDeduplication,
    /// Number of literal text/expression slices encountered during lowering.
    ///
    /// Empty slices are included in this count.
    pub literal_slices: usize,
    /// Number of distinct non-empty literal strings encountered when duplicate
    /// tracking was enabled.
    pub unique_literals: usize,
    /// Number of literal slices that matched an earlier non-empty literal when
    /// duplicate tracking was enabled.
    ///
    /// In `MeasureOnly` mode these slices are counted here but still emitted.
    pub duplicate_literals: usize,
    /// Total bytes represented by all lowered literal slices before sharing.
    pub input_literal_bytes: usize,
    /// Bytes represented by distinct non-empty literal strings when duplicate
    /// tracking was enabled.
    pub unique_literal_bytes: usize,
    /// Bytes represented by duplicate literal slices when duplicate tracking
    /// was enabled.
    pub duplicate_literal_bytes: usize,
    /// Actual bytes emitted into the catalog `LITS` chunk.
    pub emitted_literal_bytes: usize,
    /// Bytes avoided in `LITS` by reusing earlier literal slices.
    ///
    /// Only grows in `Enabled` mode; it stays zero for `Disabled` and
    /// `MeasureOnly`, where every non-empty slice is appended.
    pub saved_literal_bytes: usize,
}
@@ -529,7 +589,7 @@ pub fn compile_with_manifest( manifest: &FunctionManifest, ) -> Result, CompileError> { let message = parse_single_message_with_source(source, options, Some(SourceId(0)))?; - compile_parsed_messages(vec![message], Some(manifest)) + compile_parsed_messages(vec![message], Some(manifest), options) } /// Compile multiple explicitly keyed MF2 message inputs into one catalog with provenance. @@ -641,7 +701,7 @@ fn compile_builder_report( mut diagnostics: Vec, ) -> CompileReport { let CatalogBuilder { - options: _, + options, function_manifest, sources, messages, @@ -655,11 +715,12 @@ fn compile_builder_report( )); if diagnostics.is_empty() { - match encode_messages(&messages) { - Ok(bytes) => CompileReport::success( + match encode_messages(&messages, options) { + Ok(encoded) => CompileReport::success( CompiledCatalog { - bytes, + bytes: encoded.bytes, source_map: build_source_map(messages, sources), + literal_stats: encoded.literal_stats, }, diagnostics, ), @@ -677,9 +738,10 @@ fn compile_builder_report( fn compile_parsed_messages( messages: Vec, function_manifest: Option<&FunctionManifest>, + options: CompileOptions, ) -> Result, CompileError> { validate_messages(&messages, function_manifest)?; - encode_messages(&messages) + encode_messages(&messages, options).map(|encoded| encoded.bytes) } fn validate_messages( @@ -718,7 +780,15 @@ fn collect_build_errors( diagnostics } -fn encode_messages(messages: &[Message]) -> Result, CompileError> { +struct EncodedMessages { + bytes: Vec, + literal_stats: LiteralStats, +} + +fn encode_messages( + messages: &[Message], + options: CompileOptions, +) -> Result { let mut all_strings = BTreeSet::new(); collect_strings(messages, &mut all_strings); @@ -761,7 +831,7 @@ fn encode_messages(messages: &[Message]) -> Result, CompileError> { }) .collect(); - let mut literals = String::new(); + let mut literals = LiteralPool::new(options.literal_deduplication); let mut code = Vec::new(); let mut entries = Vec::new(); @@ 
/// Test helper: returns the length field of the chunk-table entry whose tag
/// equals `tag`.
///
/// The chunk count is read from bytes `16..20` and the chunk-table offset
/// from bytes `20..24`, both little-endian. Table entries are 16 bytes wide;
/// the tag occupies the first four bytes of an entry and the returned length
/// is the little-endian `u32` at entry offset `8..12`.
///
/// # Panics
///
/// Panics when no entry carries `tag`, or when `bytes` is too short for any
/// of the reads above.
fn chunk_len(bytes: &[u8], tag: [u8; 4]) -> u32 {
    let le_u32 = |at: usize, what: &str| -> u32 {
        u32::from_le_bytes(bytes[at..at + 4].try_into().expect(what))
    };

    let entry_count = le_u32(16, "chunk count") as usize;
    let table_start = le_u32(20, "chunk table offset") as usize;

    (0..entry_count)
        .map(|idx| table_start + idx * 16)
        .find(|&pos| bytes[pos..pos + 4] == tag)
        .map(|pos| le_u32(pos + 8, "chunk len"))
        .unwrap_or_else(|| panic!("missing chunk {tag:?}"))
}
/// With deduplication disabled, two identical messages each get their own
/// copy of the literal text and no duplicates are recorded.
#[test]
fn disabled_literal_deduplication_keeps_append_only_literals_without_duplicate_tracking() {
    let inputs = [
        CompileInput {
            name: "one.mf2",
            message_id: "one",
            source: "Hello",
            kind: SourceKind::MessageFormat,
        },
        CompileInput {
            name: "two.mf2",
            message_id: "two",
            source: "Hello",
            kind: SourceKind::MessageFormat,
        },
    ];
    let options = CompileOptions {
        literal_deduplication: LiteralDeduplication::Disabled,
        ..CompileOptions::default()
    };

    let compiled = expect_compiled(compile_inputs(inputs, options));
    let catalog = Catalog::from_bytes(&compiled.bytes).expect("catalog");
    let stats = compiled.literal_stats;

    // Both copies of "Hello" are stored back to back.
    assert_eq!(chunk_len(&compiled.bytes, *b"LITS"), 10);
    assert_eq!(stats.literal_slices, 2);
    assert_eq!(stats.duplicate_literals, 0);
    assert_eq!(stats.input_literal_bytes, 10);
    assert_eq!(stats.emitted_literal_bytes, 10);
    assert_eq!(stats.saved_literal_bytes, 0);

    // Each message: OUT_SLICE, offset (LE u32), length (LE u32), HALT.
    let expected_code = [
        schema::OP_OUT_SLICE, 0, 0, 0, 0, 5, 0, 0, 0, schema::OP_HALT,
        schema::OP_OUT_SLICE, 5, 0, 0, 0, 5, 0, 0, 0, schema::OP_HALT,
    ];
    assert_eq!(catalog.code(), &expected_code);
}
/// In measure-only mode the duplicate is counted, yet the emitted literal
/// data and bytecode offsets match the append-only layout.
#[test]
fn measure_only_literal_deduplication_counts_opportunities_without_rewriting_literals() {
    let inputs = [
        CompileInput {
            name: "one.mf2",
            message_id: "one",
            source: "Hello",
            kind: SourceKind::MessageFormat,
        },
        CompileInput {
            name: "two.mf2",
            message_id: "two",
            source: "Hello",
            kind: SourceKind::MessageFormat,
        },
    ];
    let options = CompileOptions {
        literal_deduplication: LiteralDeduplication::MeasureOnly,
        ..CompileOptions::default()
    };

    let compiled = expect_compiled(compile_inputs(inputs, options));
    let catalog = Catalog::from_bytes(&compiled.bytes).expect("catalog");
    let stats = compiled.literal_stats;

    // Nothing is shared: both copies of "Hello" are emitted.
    assert_eq!(chunk_len(&compiled.bytes, *b"LITS"), 10);
    assert_eq!(stats.literal_slices, 2);
    assert_eq!(stats.unique_literals, 1);
    assert_eq!(stats.duplicate_literals, 1);
    assert_eq!(stats.duplicate_literal_bytes, 5);
    assert_eq!(stats.input_literal_bytes, 10);
    assert_eq!(stats.emitted_literal_bytes, 10);
    assert_eq!(stats.saved_literal_bytes, 0);

    // Each message: OUT_SLICE, offset (LE u32), length (LE u32), HALT.
    let expected_code = [
        schema::OP_OUT_SLICE, 0, 0, 0, 0, 5, 0, 0, 0, schema::OP_HALT,
        schema::OP_OUT_SLICE, 5, 0, 0, 0, 5, 0, 0, 0, schema::OP_HALT,
    ];
    assert_eq!(catalog.code(), &expected_code);
}
schema::OP_HALT, + ] + ); +} + #[test] fn compile_resources_merges_named_message_bodies() { let compiled = expect_compiled(compile_resources( @@ -1301,6 +1540,7 @@ fn default_bidi_isolation_rewrites_bare_interpolation_to_string_call() { "{ $name }", CompileOptions { default_bidi_isolation: true, + ..CompileOptions::default() }, ) .expect("parsed"); @@ -1327,6 +1567,7 @@ fn default_bidi_isolation_rewrites_bare_literal_expression_to_string_call() { "{ hello }", CompileOptions { default_bidi_isolation: true, + ..CompileOptions::default() }, ) .expect("parsed"); diff --git a/crates/message-format-compiler/src/lib.rs b/crates/message-format-compiler/src/lib.rs index 608716f..99ffc4f 100644 --- a/crates/message-format-compiler/src/lib.rs +++ b/crates/message-format-compiler/src/lib.rs @@ -125,9 +125,9 @@ pub use compile::{ BuildError, BuildErrorContext, CatalogBuilder, CompileError, CompileInput, CompileOptions, - CompileReport, CompiledCatalog, DiagnosticContext, DiagnosticSeverity, MessageSource, - SourceMap, compile, compile_inputs, compile_inputs_with_manifest, compile_resources, - compile_resources_with_manifest, compile_str, compile_with_manifest, + CompileReport, CompiledCatalog, DiagnosticContext, DiagnosticSeverity, LiteralDeduplication, + LiteralStats, MessageSource, SourceMap, compile, compile_inputs, compile_inputs_with_manifest, + compile_resources, compile_resources_with_manifest, compile_str, compile_with_manifest, }; pub use emit::escape_text; pub use manifest::{ diff --git a/crates/message-format-conformance/src/harness.rs b/crates/message-format-conformance/src/harness.rs index ec1d2ca..d7891bd 100644 --- a/crates/message-format-conformance/src/harness.rs +++ b/crates/message-format-conformance/src/harness.rs @@ -448,6 +448,7 @@ fn run_wg_test_detail(test: &WgTest) -> String { fn run_wg_test_result(test: &WgTest) -> (bool, String) { let compile_options = CompileOptions { default_bidi_isolation: test.bidi_isolation.as_deref().unwrap_or("none") == 
"default", + ..CompileOptions::default() }; match compile(&test.src, compile_options) { Ok(bytes) => { diff --git a/crates/message-format-conformance/src/tr35/formatting.rs b/crates/message-format-conformance/src/tr35/formatting.rs index e65d29b..c504bc0 100644 --- a/crates/message-format-conformance/src/tr35/formatting.rs +++ b/crates/message-format-conformance/src/tr35/formatting.rs @@ -162,6 +162,7 @@ fn bidi_isolation_wraps_placeholders() { &[("name", Value::Str("World".into()))], CompileOptions { default_bidi_isolation: true, + ..CompileOptions::default() }, ); // FSI = U+2068, PDI = U+2069 @@ -189,6 +190,7 @@ fn no_bidi_without_string_function() { &[("name", Value::Str("World".into()))], CompileOptions { default_bidi_isolation: false, + ..CompileOptions::default() }, ); assert!(