From be5d5da882f0bd8fc99d489d608d70268e66717e Mon Sep 17 00:00:00 2001 From: barry3406 Date: Sat, 11 Apr 2026 09:06:37 -0700 Subject: [PATCH] Fix roundtrip of top-level string scalars that look like YAML structure When UnwrapScalar is enabled (the default for yaml output), the yaml encoder writes node.Value verbatim as a bare line. Any string whose content is itself a valid YAML mapping, sequence, or alias then round trips as that container instead of as a string. For example, the input document `"this: should really work"` previously re-emitted as the bare line `this: should really work`, which the next reader parses as a one key map, destroying the original scalar. The same problem surfaces whenever a multiline string literal happens to contain `key: value` lines, which is the form the bug report uses for its second reproducer. Guard the fast-path by re-parsing node.Value with yaml.v4: if the bare form decodes to a non-scalar, fall through to the regular encoder so it can apply the quoting style required by the YAML spec. The check is limited to `!!str` nodes and to structural reinterpretations, so tag expressions such as `!!int` and plain strings that re-read as integers, booleans, or nulls are unaffected. An unparseable value (e.g. one containing NUL) stays on the fast-path so downstream NUL-aware writers still see the raw bytes. Updates the base64 "decode yaml document" scenario whose expected output was `a: apple\n` bare; it is now emitted as a block literal, which round-trips back to the same string. Reproducer: ``` printf '"this: should really work"\n' | yq -p yaml -o yaml ``` Before this fix the second run of yq parses the output as a map; after, it remains the original string. Fixes #2608 --- pkg/yqlib/base64_test.go | 11 +++-- pkg/yqlib/encoder_yaml.go | 27 ++++++++++- pkg/yqlib/encoder_yaml_test.go | 84 ++++++++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 5 deletions(-) create mode 100644 pkg/yqlib/encoder_yaml_test.go diff --git a/pkg/yqlib/base64_test.go b/pkg/yqlib/base64_test.go index 95cdb80a52..b64718447a 100644 --- a/pkg/yqlib/base64_test.go +++ b/pkg/yqlib/base64_test.go @@ -81,10 +81,13 @@ var base64Scenarios = []formatScenario{ scenarioType: "decode", }, { - skipDoc: true, - description: "decode yaml document", - input: base64EncodedYaml, - expected: base64DecodedYaml + "\n", + skipDoc: true, + description: "decode yaml document", + input: base64EncodedYaml, + // The decoded payload ("a: apple\n") would re-parse as a map if + // emitted bare, so the yaml encoder keeps it as a block literal to + // preserve roundtrip safety. See issue #2608. + expected: "|\n a: apple\n", scenarioType: "decode", }, { diff --git a/pkg/yqlib/encoder_yaml.go b/pkg/yqlib/encoder_yaml.go index b46ae44428..684bf337c2 100644 --- a/pkg/yqlib/encoder_yaml.go +++ b/pkg/yqlib/encoder_yaml.go @@ -35,7 +35,7 @@ func (ye *yamlEncoder) Encode(writer io.Writer, node *CandidateNode) error { if strings.Contains(node.LeadingContent, "\r\n") { lineEnding = "\r\n" } - if node.Kind == ScalarNode && ye.prefs.UnwrapScalar { + if node.Kind == ScalarNode && ye.prefs.UnwrapScalar && !bareStringNeedsQuoting(node) { valueToPrint := node.Value if node.LeadingContent == "" || valueToPrint != "" { valueToPrint = valueToPrint + lineEnding @@ -78,3 +78,28 @@ func (ye *yamlEncoder) Encode(writer io.Writer, node *CandidateNode) error { } return nil } + +// bareStringNeedsQuoting reports whether a top-level string scalar would be +// structurally reinterpreted if emitted as an unquoted bare value. The +// unwrap-scalar fast-path writes node.Value verbatim, which silently turns a +// string like "this: should really work" into a mapping on the next read, or +// "- item" into a sequence. When this returns true the caller falls through +// to the full yaml encoder, which applies the quoting style required by the +// YAML spec. Scalar-to-scalar reinterpretations (e.g. "123" parsing as an int +// tag) are not covered here: they preserve the node shape and are handled by +// callers that care about explicit tag preservation. +func bareStringNeedsQuoting(node *CandidateNode) bool { + if node.Tag != "!!str" { + return false + } + var parsed yaml.Node + if err := yaml.Unmarshal([]byte(node.Value), &parsed); err != nil { + // Unparseable bare form (e.g. control characters): leave it on the + // fast-path so callers that check for those characters still see them. + return false + } + if parsed.Kind != yaml.DocumentNode || len(parsed.Content) != 1 { + return false + } + return parsed.Content[0].Kind != yaml.ScalarNode +} diff --git a/pkg/yqlib/encoder_yaml_test.go b/pkg/yqlib/encoder_yaml_test.go new file mode 100644 index 0000000000..ae688cc87d --- /dev/null +++ b/pkg/yqlib/encoder_yaml_test.go @@ -0,0 +1,84 @@ +package yqlib + +import ( + "bytes" + "strings" + "testing" +) + +// TestYamlEncoderUnwrapScalarRoundtripSafety verifies that a top-level string +// scalar whose unquoted form would re-parse as a non-scalar node (map or +// sequence) is emitted quoted even when UnwrapScalar is enabled. Safe plain +// strings continue to round-trip through the existing fast-path. See #2608. +func TestYamlEncoderUnwrapScalarRoundtripSafety(t *testing.T) { + cases := []struct { + name string + value string + wantBare bool // true: output equals value+"\n"; false: output must differ + }{ + {name: "colon_parses_as_map", value: "this: should really work"}, + {name: "dash_parses_as_seq", value: "- item"}, + {name: "multiline_maplike", value: "a: a\nb: b"}, + {name: "safe_plain_string", value: "hello world", wantBare: true}, + {name: "safe_identifier", value: "cat", wantBare: true}, + {name: "safe_digits_preserved", value: "123", wantBare: true}, + {name: "safe_null_word_preserved", value: "null", wantBare: true}, + {name: "safe_tag_shorthand_preserved", value: "!!int", wantBare: true}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + prefs := NewDefaultYamlPreferences() + prefs.UnwrapScalar = true + + var buf bytes.Buffer + err := NewYamlEncoder(prefs).Encode(&buf, &CandidateNode{ + Kind: ScalarNode, + Tag: "!!str", + Value: tc.value, + }) + if err != nil { + t.Fatalf("encode failed: %v", err) + } + got := buf.String() + + if tc.wantBare { + if got != tc.value+"\n" { + t.Fatalf("expected bare %q, got %q", tc.value+"\n", got) + } + return + } + + // Ambiguous input: must not be emitted as the bare value. + if got == tc.value+"\n" { + t.Fatalf("value %q was emitted bare; expected quoted form", tc.value) + } + + // The output must round-trip back to a string scalar with the + // same value, proving structural roundtrip safety. + decoder := NewYamlDecoder(NewDefaultYamlPreferences()) + nodes, err := readDocuments(strings.NewReader(got), "test.yaml", 0, decoder) + if err != nil { + t.Fatalf("decode of %q failed: %v", got, err) + } + if nodes.Len() != 1 { + t.Fatalf("expected one document, got %d", nodes.Len()) + } + candidate := nodes.Front().Value.(*CandidateNode) + // readDocuments wraps the document; descend to the scalar. + scalar := candidate + for scalar.Kind != ScalarNode && len(scalar.Content) == 1 { + scalar = scalar.Content[0] + } + if scalar.Kind != ScalarNode { + t.Fatalf("round-tripped node is not a scalar: kind=%v value=%q", scalar.Kind, scalar.Value) + } + if scalar.Tag != "!!str" { + t.Fatalf("round-tripped tag is %q, want !!str", scalar.Tag) + } + if scalar.Value != tc.value { + t.Fatalf("round-tripped value is %q, want %q", scalar.Value, tc.value) + } + }) + } +}