From 78d58d01c15508770abf70e2c31b1c4928d2d897 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 19:22:47 +0200 Subject: [PATCH 1/5] HTML API: Add tests for serializing decoded carriage returns. Red TDD step: decoded carriage returns in text and attribute values must serialize as &#13; so that normalized output is idempotent: a raw CR in serialized output would be normalized to a line feed when parsed again. The raw-CR attribute and class-update cases pass already through the preprocessing-correct getters and pin that behavior. See #65372. --- .../html-api/wpHtmlProcessor-serialize.php | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php index e516addb6c314..c421dd57c77eb 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php @@ -509,6 +509,96 @@ public static function data_provider_normalized_fuzzer_cases_that_should_be_idem ); } + /** + * Ensures that decoded carriage returns are serialized as character references. + * + * @ticket 65372 + * + * @dataProvider data_provider_decoded_carriage_returns + * + * @param string $input HTML input containing a decoded carriage return. + * @param string $expected Expected normalized output. + */ + public function test_normalize_serializes_decoded_carriage_returns_as_character_references( string $input, string $expected ) { + $normalized = WP_HTML_Processor::normalize( $input ); + + $this->assertSame( $expected, $normalized, 'Should have serialized the carriage return as a character reference.' ); + $this->assertSame( + $expected, + WP_HTML_Processor::normalize( $normalized ), + 'Normalizing already-normalized HTML should not change the serialized carriage return.' + ); + } + + /** + * Data provider. 
+ * + * @return array[] + */ + public static function data_provider_decoded_carriage_returns() { + return array( + 'Regular text' => array( '

a b

', '

a b

' ), + 'Regular text with non-canonical character reference' => array( '

a b

', '

a b

' ), + 'RCDATA title' => array( 'a b', 'a b' ), + 'RCDATA textarea with leading-newline preservation' => array( '', "" ), + 'Attribute value' => array( '

', '

' ), + 'Table text' => array( '
x
', '
x
' ), + 'Template text' => array( '', '' ), + ); + } + + /** + * Ensures that raw carriage returns in attribute values are serialized as line feeds. + * + * @ticket 65372 + * + * @dataProvider data_provider_raw_attribute_carriage_returns + * + * @param string $input HTML input containing raw carriage returns. + * @param string $expected Expected normalized output. + */ + public function test_normalize_serializes_raw_attribute_carriage_returns_as_line_feeds( string $input, string $expected ) { + $normalized = WP_HTML_Processor::normalize( $input ); + + $this->assertSame( $expected, $normalized, 'Should have serialized raw attribute carriage returns as line feeds.' ); + $this->assertSame( + $expected, + WP_HTML_Processor::normalize( $normalized ), + 'Normalizing already-normalized HTML should not change raw attribute newlines.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_provider_raw_attribute_carriage_returns() { + return array( + 'Raw carriage return' => array( "

", "

" ), + 'Raw CRLF pair' => array( "

", "

" ), + ); + } + + /** + * Ensures that raw carriage returns are normalized before class updates are serialized. + * + * @ticket 65372 + */ + public function test_serialize_token_normalizes_raw_class_carriage_returns_before_class_updates() { + $processor = WP_HTML_Processor::create_fragment( "

" ); + + $this->assertTrue( $processor->next_tag( 'P' ), 'Should find the P element.' ); + + $processor->add_class( 'c' ); + + $this->assertSame( + "

", + $processor->serialize_token(), + 'Should have serialized raw class carriage returns as line feeds before adding classes.' + ); + } + /** * Data provider. * From 4127e365f5b25780a471b57249fa230b0c9a6637 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 19:23:44 +0200 Subject: [PATCH 2/5] HTML API: Serialize decoded carriage returns as character references. The serializer emitted decoded carriage returns raw into text and attribute values, where input preprocessing turns them into line feeds on the next parse: normalized output never reached a fixed point for documents containing . Escaping CR after htmlspecialchars() keeps the character through parse/serialize round trips. Attribute values read through get_attribute(), whose input preprocessing guarantees raw source carriage returns already arrive normalized to line feeds, so only genuinely decoded CRs are escaped. See #65372. --- .../html-api/class-wp-html-processor.php | 27 ++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index c46151f05d9be..afab3a3f5f4e2 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1383,7 +1383,7 @@ public function serialize_token(): string { break; case '#text': - $html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); + $html .= self::serialize_decoded_text( $this->get_modifiable_text() ); break; // Unlike the `<>` which is interpreted as plaintext, this is ignored entirely. @@ -1446,7 +1446,7 @@ public function serialize_token(): string { $value = $this->get_attribute( $attribute_name ); if ( is_string( $value ) ) { - $html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . '"'; + $html .= '="' . self::serialize_decoded_text( $value ) . 
'"'; } $previous_attribute_was_true = true === $value; @@ -1501,7 +1501,7 @@ public function serialize_token(): string { break; default: - $text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); + $text = self::serialize_decoded_text( $text ); } $html .= "{$text}"; @@ -1510,6 +1510,27 @@ public function serialize_token(): string { return $html; } + /** + * Serializes decoded text for use in text nodes and attribute values. + * + * A decoded carriage return must serialize as a character reference: + * the HTML parser's input preprocessing turns a raw CR into a line + * feed, so emitting it raw would change the text on the next parse + * and serialized output would never reach a fixed point. + * + * @since 7.1.0 + * + * @param string $text Decoded text to serialize. + * @return string Serialized text. + */ + private static function serialize_decoded_text( string $text ): string { + return str_replace( + "\r", + ' ', + htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ) + ); + } + /** * Parses next element in the 'initial' insertion mode. * From 9c3302fd069b826954e278d46fb93f9ff71a930f Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 19:25:53 +0200 Subject: [PATCH 3/5] HTML API: Pin serialization of NULL bytes in API-supplied values. An attribute value set through set_attribute() may contain NULL bytes; serializing them as U+FFFD keeps normalized output idempotent, where browsers' innerHTML emits the raw byte and loses it to replacement on the next parse. This pins the behavior ahead of consolidating the serializer's NULL handling. See #65372. 
--- .../html-api/wpHtmlProcessor-serialize.php | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php index c421dd57c77eb..6d14974964861 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php @@ -599,6 +599,30 @@ public function test_serialize_token_normalizes_raw_class_carriage_returns_befor ); } + /** + * Ensures NULL bytes in attribute values set through the API serialize + * as U+FFFD so that serialized output parses back to the same value. + * + * Browsers serialize the raw NULL byte in innerHTML, which does not + * round-trip: re-parsing replaces it with U+FFFD. Serializing U+FFFD + * directly is a benign deviation which keeps output idempotent, like + * serializing decoded carriage returns as &#13;. + * + * @ticket 65372 + */ + public function test_serialize_token_replaces_null_bytes_in_enqueued_attribute_values() { + $processor = WP_HTML_Processor::create_fragment( '

' ); + + $this->assertTrue( $processor->next_tag( 'P' ), 'Should find the P element.' ); + $this->assertTrue( $processor->set_attribute( 'title', "a\x00b" ), 'Should have set the attribute.' ); + + $this->assertSame( + "

", + $processor->serialize_token(), + 'Should have serialized the NULL byte as U+FFFD.' + ); + } + /** * Data provider. * From 96c6fb8ce3fcded03aac45ed391d1cdd09e72db9 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 19:26:42 +0200 Subject: [PATCH 4/5] HTML API: Consolidate serializer NULL-byte handling. The getters now expose tag and attribute names with NULL bytes already replaced by U+FFFD, leaving the serializer's name scrubbing dead, and the only live input to the per-attribute whole-buffer scrub was an API-supplied attribute value. That replacement moves into serialize_decoded_text() next to the carriage-return escaping, which exists for the same reason: emitting bytes the next parse would transform. UTF-8 scrubbing of qualified names remains, as invalid sequences can still reach serialization through source names. See #65372. --- .../html-api/class-wp-html-processor.php | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index afab3a3f5f4e2..bca775ce62d02 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1404,10 +1404,9 @@ public function serialize_token(): string { return $html; } - $tag_name = str_replace( "\x00", "\u{FFFD}", $this->get_tag() ); + $tag_name = $this->get_tag(); $in_html = 'html' === $this->get_namespace(); $qualified_name = $in_html ? 
strtolower( $tag_name ) : $this->get_qualified_tag_name(); - $qualified_name = str_replace( "\x00", "\u{FFFD}", $qualified_name ); if ( $this->is_tag_closer() ) { $html .= ""; @@ -1426,7 +1425,6 @@ public function serialize_token(): string { $seen_attribute_names = array(); foreach ( $attribute_names as $attribute_name ) { $qualified_attribute_name = $this->get_qualified_attribute_name( $attribute_name ); - $qualified_attribute_name = str_replace( "\x00", "\u{FFFD}", $qualified_attribute_name ); $qualified_attribute_name = wp_scrub_utf8( $qualified_attribute_name ); if ( isset( $seen_attribute_names[ $qualified_attribute_name ] ) ) { continue; @@ -1450,7 +1448,6 @@ public function serialize_token(): string { } $previous_attribute_was_true = true === $value; - $html = str_replace( "\x00", "\u{FFFD}", $html ); } if ( ! $in_html && $this->has_self_closing_flag() ) { @@ -1518,17 +1515,20 @@ public function serialize_token(): string { * feed, so emitting it raw would change the text on the next parse * and serialized output would never reach a fixed point. * + * NULL bytes, possible in API-supplied values, serialize as U+FFFD + * for the same reason: the tokenizer would replace or remove a raw + * NULL byte on the next parse. + * * @since 7.1.0 * * @param string $text Decoded text to serialize. * @return string Serialized text. */ private static function serialize_decoded_text( string $text ): string { - return str_replace( - "\r", - ' ', - htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ) - ); + $text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); + $text = str_replace( "\r", ' ', $text ); + + return str_replace( "\x00", "\u{FFFD}", $text ); } /** From 3a74497a392d05171da93895da0861fc23df92bd Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 19:37:00 +0200 Subject: [PATCH 5/5] HTML API: Pin rawtext serialization and reparse round trips. 
From adversarial review: pins that SCRIPT and STYLE contents serialize without escaping, where character references do not decode, and that serialize_token() output for modified class and NULL-containing attribute values parses back to the same decoded values. See #65372. --- .../html-api/wpHtmlProcessor-serialize.php | 45 ++++++++++++++++++- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php index 6d14974964861..d09cada99ed50 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php @@ -592,11 +592,47 @@ public function test_serialize_token_normalizes_raw_class_carriage_returns_befor $processor->add_class( 'c' ); + $serialized = $processor->serialize_token(); $this->assertSame( "

", - $processor->serialize_token(), + $serialized, 'Should have serialized raw class carriage returns as line feeds before adding classes.' ); + + $reparsed = WP_HTML_Processor::create_fragment( $serialized ); + $this->assertTrue( $reparsed->next_tag( 'P' ), 'Should find the reparsed P element.' ); + $this->assertSame( "a\nb c", $reparsed->get_attribute( 'class' ), 'The serialized class should parse back to the same value.' ); + } + + /** + * Ensures rawtext element contents serialize without escaping: + * character references do not decode inside SCRIPT and STYLE, so + * escaping their contents or emitting ` ` there would corrupt them. + * + * @ticket 65372 + * + * @dataProvider data_provider_rawtext_contents + * + * @param string $html HTML whose rawtext contents must serialize unchanged. + */ + public function test_normalize_preserves_rawtext_contents( string $html ) { + $this->assertSame( + $html, + WP_HTML_Processor::normalize( $html ), + 'Should have serialized the rawtext contents unchanged.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_provider_rawtext_contents() { + return array( + 'SCRIPT with character references' => array( '' ), + 'STYLE with character references' => array( '' ), + ); } /** @@ -616,11 +652,16 @@ public function test_serialize_token_replaces_null_bytes_in_enqueued_attribute_v $this->assertTrue( $processor->next_tag( 'P' ), 'Should find the P element.' ); $this->assertTrue( $processor->set_attribute( 'title', "a\x00b" ), 'Should have set the attribute.' ); + $serialized = $processor->serialize_token(); $this->assertSame( "

", - $processor->serialize_token(), + $serialized, 'Should have serialized the NULL byte as U+FFFD.' ); + + $reparsed = WP_HTML_Processor::create_fragment( $serialized ); + $this->assertTrue( $reparsed->next_tag( 'P' ), 'Should find the reparsed P element.' ); + $this->assertSame( "a\u{FFFD}b", $reparsed->get_attribute( 'title' ), 'The serialized title should parse back to the same value.' ); } /**