Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sjsonnet/src/sjsonnet/Evaluator.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1344,7 +1344,7 @@ class Evaluator(
case Val.Num(_, rd) => Val.cachedNum(pos, ld % rd)
case _ => failBinOp(l, e.op, r, pos)
}
case ls: Val.Str => Val.Str(pos, Format.format(ls.str, r, pos))
case ls: Val.Str => Format.format(ls.str, r, pos)
case _ => failBinOp(l, e.op, r, pos)
}

Expand Down
106 changes: 94 additions & 12 deletions sjsonnet/src/sjsonnet/Format.scala
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,13 @@ object Format {
* this case we can use a fast path that caches the object key lookup and avoids widenRaw
* entirely.
*/
val allSimpleNamedString: Boolean)
val allSimpleNamedString: Boolean,
/**
* True when every literal segment (leading + inter-spec literals) contains only printable
* ASCII with no `"` or `\`. Computed once at parse time; combined at format time with the
* ASCII-safety of each interpolated value to set the result's [[Val.Str._asciiSafe]] flag.
*/
val literalsAsciiSafe: Boolean)
extends CompiledFormat

final class FormatSpec private (val bits: Long) extends AnyVal {
Expand Down Expand Up @@ -264,7 +270,7 @@ object Format {
}
}

def format(s: String, values0: Val, pos: Position)(implicit evaluator: EvalScope): String = {
def format(s: String, values0: Val, pos: Position)(implicit evaluator: EvalScope): Val.Str = {
val parsed = parseFormatCached(s, evaluator.formatCache)
format(parsed, values0, pos)
}
Expand All @@ -287,6 +293,22 @@ object Format {
}
}

/**
 * Reports whether every character of `s` in the half-open window `[from, to)` can be emitted
 * into a JSON string without escaping: code unit in `32..127` and neither `"` nor `\`. An empty
 * window (`from >= to`) is trivially safe.
 *
 * Intended to mirror the predicate of [[Platform.isAsciiJsonSafe]] (not visible here; keep the
 * two in sync). Evaluated while parsing a format string so the answer can be cached on
 * [[RuntimeFormat]] instead of being recomputed on every format call.
 *
 * @param s
 *   source string to inspect
 * @param from
 *   index of the first character to check (inclusive)
 * @param to
 *   index one past the last character to check (exclusive)
 * @return
 *   `true` if no character in the window needs JSON escaping or is non-ASCII
 */
private def isAsciiJsonSafeRange(s: String, from: Int, to: Int): Boolean = {
  var cursor = from
  var safe = true
  // Stop at the first offending character; `safe` doubles as the loop guard and the result.
  while (safe && cursor < to) {
    val ch = s.charAt(cursor)
    safe = ch >= 32 && ch < 128 && ch != '"' && ch != '\\'
    cursor += 1
  }
  safe
}

/**
* Hand-written format string scanner. Replaces the fastparse-based parser with direct
* `String.indexOf('%')` scanning, which is a JVM intrinsic / native SIMD-optimized operation. For
Expand All @@ -306,12 +328,15 @@ object Format {
var lastLabel: String = null
var firstNamedLabel: String = null
var allNamedLabelsSame = true
var allLiteralsAscii = true

// Find the first '%' to extract the leading literal
var pos = s.indexOf('%')
val leadingStart = 0
val leadingEnd = if (pos < 0) len else pos
staticChars += leadingEnd - leadingStart
if (allLiteralsAscii && !isAsciiJsonSafeRange(s, leadingStart, leadingEnd))
allLiteralsAscii = false

while (pos >= 0 && pos < len) {
pos += 1 // skip the '%'
Expand Down Expand Up @@ -451,6 +476,8 @@ object Format {
litStartsBuilder += litStart
litEndsBuilder += litEnd
staticChars += litEnd - litStart
if (allLiteralsAscii && !isAsciiJsonSafeRange(s, litStart, litEnd))
allLiteralsAscii = false

pos = nextPct
}
Expand Down Expand Up @@ -483,7 +510,8 @@ object Format {
litStarts,
litEnds,
singleNamedLabel,
allSimpleNamed
allSimpleNamed,
allLiteralsAscii
)
}

Expand All @@ -499,6 +527,7 @@ object Format {
var staticChars = leading.length
var hasAnyStar = false
var allSimpleNamed = true
var allLiteralsAscii = Platform.isAsciiJsonSafe(leading)
var idx = 0
while (idx < size) {
val (formatted, literal) = chunks(idx)
Expand All @@ -510,6 +539,7 @@ object Format {
staticChars += literal.length
hasAnyStar ||= formatted.widthStar || formatted.precisionStar
allSimpleNamed = false
if (allLiteralsAscii && !Platform.isAsciiJsonSafe(literal)) allLiteralsAscii = false
idx += 1
}
// No source string available for fastparse path; offset arrays are unused
Expand All @@ -526,12 +556,13 @@ object Format {
emptyStarts,
emptyEnds,
null,
allSimpleNamed
allSimpleNamed,
allLiteralsAscii
)
}

def format(leading: String, chunks: scala.Seq[(FormatSpec, String)], values0: Val, pos: Position)(
implicit evaluator: EvalScope): String = {
implicit evaluator: EvalScope): Val.Str = {
format(lowerParsedFormat((leading, chunks)), values0, pos)
}

Expand All @@ -551,7 +582,7 @@ object Format {
}

private def format(parsed: RuntimeFormat, values0: Val, pos: Position)(implicit
evaluator: EvalScope): String = {
evaluator: EvalScope): Val.Str = {

// Super-fast path: all specs are simple %(key)s with an object value.
// Avoids per-spec pattern matching, widenRaw, and uses offset-based literal appends.
Expand Down Expand Up @@ -581,6 +612,9 @@ object Format {
else new java.lang.StringBuilder(parsed.staticChars + specBits.length * 8)
if (!singleSpecNoStatic) appendLeading(output, parsed)
var singleFormatted: String = null
// Result ASCII-safety: starts from the format string's literal ASCII-safety, then ANDs with
// each spec's output ASCII-safety. Once false, stays false.
var resultAsciiSafe = parsed.literalsAsciiSafe
var i = 0
var idx = 0
// Use while-loop instead of for/zipWithIndex to avoid iterator allocation
Expand Down Expand Up @@ -662,6 +696,8 @@ object Format {
// This avoids the overhead of materializing to ujson.Value and then matching on it,
// which is a significant cost for format-heavy workloads like large_string_template.
val rawVal = raw.value
if (resultAsciiSafe && !specOutputAsciiSafe(rawVal, formatted.conversion))
resultAsciiSafe = false
val formattedValue = rawVal match {
case f: Val.Func => Error.fail("Cannot format function value", f)
case vs: Val.Str =>
Expand Down Expand Up @@ -748,7 +784,8 @@ object Format {
"Too many values to format: %d, expected %d".format(valuesArr.length, i)
)
}
if (singleSpecNoStatic) singleFormatted else output.toString()
val resultStr = if (singleSpecNoStatic) singleFormatted else output.toString()
if (resultAsciiSafe) Val.Str.asciiSafe(pos, resultStr) else Val.Str(pos, resultStr)
}

/**
Expand All @@ -758,27 +795,33 @@ object Format {
* redundant object lookups and the generic dispatch overhead.
*/
private def formatSimpleNamedString(parsed: RuntimeFormat, obj: Val.Obj, pos: Position)(implicit
evaluator: EvalScope): String = {
evaluator: EvalScope): Val.Str = {
val output = new java.lang.StringBuilder(parsed.staticChars + parsed.specBits.length * 16)

// Append leading literal using offsets if source is available, else use string
appendLeading(output, parsed)

var resultAsciiSafe = parsed.literalsAsciiSafe

val singleLabel = parsed.singleNamedLabel
if (singleLabel != null) {
val str = simpleStringValue(obj.value(singleLabel, pos)(evaluator).value)
val rawVal = obj.value(singleLabel, pos)(evaluator).value
if (resultAsciiSafe && !simpleStringValueAsciiSafe(rawVal)) resultAsciiSafe = false
val str = simpleStringValue(rawVal)
var idx = 0
while (idx < parsed.specBits.length) {
output.append(str)
appendLiteral(output, parsed, idx)
idx += 1
}
return output.toString
val resultStr = output.toString
return if (resultAsciiSafe) Val.Str.asciiSafe(pos, resultStr) else Val.Str(pos, resultStr)
}

// Cache for repeated key lookups: most format strings reuse the same key many times
var cachedKey: String = null
var cachedStr: String = null
var cachedAsciiSafe: Boolean = true

var idx = 0
while (idx < parsed.specBits.length) {
Expand All @@ -791,19 +834,23 @@ object Format {
else {
val rawVal = obj.value(key, pos)(evaluator).value
val s = simpleStringValue(rawVal)
val safe = simpleStringValueAsciiSafe(rawVal)
cachedKey = key
cachedStr = s
cachedAsciiSafe = safe
s
}

if (resultAsciiSafe && !cachedAsciiSafe) resultAsciiSafe = false
output.append(str)

// Append trailing literal using offsets if source is available
appendLiteral(output, parsed, idx)

idx += 1
}
output.toString
val resultStr = output.toString
if (resultAsciiSafe) Val.Str.asciiSafe(pos, resultStr) else Val.Str(pos, resultStr)
}

private def simpleStringValue(rawVal: Val)(implicit evaluator: EvalScope): String =
Expand All @@ -826,6 +873,41 @@ object Format {
value.toString
}

/**
 * ASCII-safety companion to [[simpleStringValue]], consulted by the simple `%(name)s` fast path
 * to decide whether an interpolated value keeps the result ASCII-safe.
 *
 *   - `Val.Str` reports its own cached `_asciiSafe` flag.
 *   - `Val.Num`, `Val.True`, `Val.False` and `Val.Null` are reported as safe.
 *   - Every other value is conservatively reported as unsafe.
 *
 * @param rawVal
 *   the already-evaluated value about to be interpolated
 * @return
 *   `true` only when the rendered text is known to be ASCII-safe
 */
@inline private def simpleStringValueAsciiSafe(rawVal: Val): Boolean = rawVal match {
  case str: Val.Str                                          => str._asciiSafe
  case _: Val.Num | _: Val.True | _: Val.False | _: Val.Null => true
  case _                                                     => false
}

/**
 * ASCII-safety of the text produced by one format spec on the general [[format]] path.
 *
 *   - `Val.Str` reports its own cached `_asciiSafe` flag.
 *   - `Val.Num` under `%c` is safe only when the truncated code point is in `32..126` and is
 *     neither `"` nor `\`; under any other conversion a number is reported as safe.
 *   - `Val.True`, `Val.False` and `Val.Null` are reported as safe.
 *   - Every other value is conservatively reported as unsafe.
 *
 * @param rawVal
 *   the already-evaluated value consumed by the spec
 * @param conversion
 *   the spec's conversion character (e.g. `s`, `d`, `c`)
 * @return
 *   `true` only when the spec's output is known to be ASCII-safe
 */
@inline private def specOutputAsciiSafe(rawVal: Val, conversion: Char): Boolean = rawVal match {
  case str: Val.Str => str._asciiSafe
  case num: Val.Num if conversion == 'c' =>
    // %c emits the character for this code point, so safety depends on the value itself.
    val codePoint = num.asDouble.toInt
    !(codePoint < 32 || codePoint >= 127 || codePoint == '"' || codePoint == '\\')
  case _: Val.Num | _: Val.True | _: Val.False | _: Val.Null => true
  case _                                                     => false
}

private def formatInteger(formatted: FormatSpec, s: Double): String = {
// Fast path: if the value fits in a Long (and isn't Long.MinValue where
// negation overflows), avoid BigInt allocation entirely
Expand Down Expand Up @@ -1013,6 +1095,6 @@ object Format {
// Each PartialApplyFmt instance caches its own parsed format, so no external cache needed.
private val parsed = scanFormat(fmt)
def evalRhs(values0: Eval, ev: EvalScope, pos: Position): Val =
Val.Str(pos, format(parsed, values0.value, pos)(ev))
format(parsed, values0.value, pos)(ev)
}
}
2 changes: 1 addition & 1 deletion sjsonnet/src/sjsonnet/stdlib/MathModule.scala
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ object MathModule extends AbstractFunctionModule {
def evalRhs(a: Eval, b: Eval, ev: EvalScope, pos: Position): Val = {
(a.value, b.value) match {
case (x: Val.Num, y: Val.Num) => Val.cachedNum(pos, x.asDouble % y.asDouble)
case _ => Val.Str(pos, Format.format(a.value.asString, b.value, pos)(ev))
case _ => Format.format(a.value.asString, b.value, pos)(ev)
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion sjsonnet/src/sjsonnet/stdlib/StringModule.scala
Original file line number Diff line number Diff line change
Expand Up @@ -958,7 +958,7 @@ object StringModule extends AbstractFunctionModule {
*/
private object Format_ extends Val.Builtin2("format", "str", "vals") {
def evalRhs(str: Eval, vals: Eval, ev: EvalScope, pos: Position): Val =
Val.Str(pos, Format.format(str.value.asString, vals.value, pos)(ev))
Format.format(str.value.asString, vals.value, pos)(ev)
override def specialize(args: Array[Expr], tailstrict: Boolean): (Val.Builtin, Array[Expr]) =
args match {
case Array(str, fmt: Val.Str) =>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Coverage for Format ASCII-safety propagation.
// Checks that formatting yields the correct string on every path that sets Val.Str._asciiSafe:
//   - simple %(name)s fast path with ASCII / non-ASCII literals and values
//   - general format path with %s / %d / %c / %o / %x conversions
//   - mixed ASCII literals + non-ASCII string interpolations (output must be correct)
//   - characters that JSON must escape (`"`, `\`, control chars) in literals, values and %c output

std.assertEqual("hello %s" % "world", "hello world") &&
std.assertEqual("héllo %s" % "world", "héllo world") &&
std.assertEqual("hello %s" % "wörld", "hello wörld") &&
std.assertEqual("héllo %s" % "wörld", "héllo wörld") &&

// Simple %(name)s fast path: ASCII format + ASCII value
std.assertEqual("name=%(n)s" % { n: "alice" }, "name=alice") &&
// Simple %(name)s fast path: non-ASCII literal
std.assertEqual("námé=%(n)s" % { n: "alice" }, "námé=alice") &&
// Simple %(name)s fast path: non-ASCII value
std.assertEqual("name=%(n)s" % { n: "álice" }, "name=álice") &&
// Simple %(name)s fast path: repeated key with non-ASCII
std.assertEqual("%(n)s/%(n)s" % { n: "ümlaut" }, "ümlaut/ümlaut") &&

// Numeric conversions are always ASCII
std.assertEqual("%d" % 42, "42") &&
std.assertEqual("%05d" % 7, "00007") &&
std.assertEqual("%x" % 255, "ff") &&
std.assertEqual("%o" % 8, "10") &&
std.assertEqual("%.2f" % 3.14159, "3.14") &&

// %c with ASCII codepoint -> ASCII output
std.assertEqual("%c" % 65, "A") &&
// %c with non-ASCII codepoint -> non-ASCII output (must still render correctly)
std.assertEqual("%c" % 233, "é") &&

// Boolean / null
std.assertEqual("%s" % true, "true") &&
std.assertEqual("%s" % null, "null") &&

// Multi-spec
std.assertEqual("%s = %d, %s" % ["x", 1, "y"], "x = 1, y") &&

// Verify output is JSON-renderable correctly (this hits ByteRenderer's asciiSafe path)
std.assertEqual(std.manifestJson("héllo %s" % "wörld"), '"héllo wörld"') &&
std.assertEqual(std.manifestJson("hello %s" % "world"), '"hello world"') &&

// Pure-ASCII output that still needs JSON escaping: a result wrongly flagged ASCII-safe
// would be emitted without escapes, so each case below must come back escaped.
// `"` in the format literal
std.assertEqual('say "%s"' % "hi", 'say "hi"') &&
std.assertEqual(std.manifestJson('say "%s"' % "hi"), '"say \\"hi\\""') &&
// `"` and `\` in an interpolated value (general path)
std.assertEqual(std.manifestJson("v=%s" % 'a"b'), '"v=a\\"b"') &&
std.assertEqual(std.manifestJson("v=%s" % "a\\b"), '"v=a\\\\b"') &&
// `"` in an interpolated value (simple %(name)s fast path)
std.assertEqual(std.manifestJson("v=%(k)s" % { k: 'a"b' }), '"v=a\\"b"') &&
// Control character in a value and in a literal
std.assertEqual(std.manifestJson("a%s" % "\n"), '"a\\n"') &&
std.assertEqual(std.manifestJson("a\tb %s" % "x"), '"a\\tb x"') &&
// %c producing `"` (34) and `\` (92)
std.assertEqual(std.manifestJson("%c" % 34), '"\\""') &&
std.assertEqual(std.manifestJson("%c" % 92), '"\\\\"') &&

true
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
true
Loading