Skip to content
Open
35 changes: 28 additions & 7 deletions src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -1390,7 +1390,7 @@ public function serialize_token(): string {
break;

case '#text':
$html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
$html .= self::serialize_decoded_text( $this->get_modifiable_text() );
break;

// Unlike the `<>` which is interpreted as plaintext, this is ignored entirely.
Expand All @@ -1411,10 +1411,9 @@ public function serialize_token(): string {
return $html;
}

$tag_name = str_replace( "\x00", "\u{FFFD}", $this->get_tag() );
$tag_name = $this->get_tag();
$in_html = 'html' === $this->get_namespace();
$qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name();
$qualified_name = str_replace( "\x00", "\u{FFFD}", $qualified_name );

if ( $this->is_tag_closer() ) {
$html .= "</{$qualified_name}>";
Expand All @@ -1433,7 +1432,6 @@ public function serialize_token(): string {
$seen_attribute_names = array();
foreach ( $attribute_names as $attribute_name ) {
$qualified_attribute_name = $this->get_qualified_attribute_name( $attribute_name );
$qualified_attribute_name = str_replace( "\x00", "\u{FFFD}", $qualified_attribute_name );
$qualified_attribute_name = wp_scrub_utf8( $qualified_attribute_name );
/**
* Spaces only appear via the foreign attribute adjustment table.
Expand All @@ -1458,11 +1456,10 @@ public function serialize_token(): string {
$value = $this->get_attribute( $attribute_name );

if ( is_string( $value ) ) {
$html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . '"';
$html .= '="' . self::serialize_decoded_text( $value ) . '"';
}

$previous_attribute_was_true = true === $value;
$html = str_replace( "\x00", "\u{FFFD}", $html );
}

if ( ! $in_html && $this->has_self_closing_flag() ) {
Expand Down Expand Up @@ -1514,7 +1511,7 @@ public function serialize_token(): string {
break;

default:
$text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
$text = self::serialize_decoded_text( $text );
}

$html .= "{$text}</{$qualified_name}>";
Expand All @@ -1523,6 +1520,30 @@ public function serialize_token(): string {
return $html;
}

/**
 * Escapes decoded text so that it can be emitted inside a text node or
 * a quoted attribute value.
 *
 * On top of the regular HTML escaping, two characters are rewritten so
 * that parsing the output again yields the same decoded text:
 *
 *  - A carriage return is written as the `&#13;` character reference.
 *    Emitted raw, the HTML parser's input preprocessing would turn it
 *    into a line feed on the next parse, and serialized output would
 *    never reach a fixed point.
 *  - A NULL byte, possible in API-supplied values, is written as
 *    U+FFFD, because the tokenizer would replace or remove a raw NULL
 *    byte on the next parse.
 *
 * @since 7.1.0
 *
 * @param string $text Decoded text to serialize.
 * @return string Serialized text.
 */
private static function serialize_decoded_text( string $text ): string {
	$escaped = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );

	// Runs after escaping so the ampersand of `&#13;` is not escaped itself.
	return strtr(
		$escaped,
		array(
			"\r"   => '&#13;',
			"\x00" => "\u{FFFD}",
		)
	);
}

/**
* Parses next element in the 'initial' insertion mode.
*
Expand Down
155 changes: 155 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,161 @@ public static function data_provider_normalized_fuzzer_cases_that_should_be_idem
);
}

/**
 * Verifies that a carriage return obtained by decoding a character
 * reference is written back out as a character reference, and that
 * normalizing the result a second time leaves it unchanged.
 *
 * @ticket 65372
 *
 * @dataProvider data_provider_decoded_carriage_returns
 *
 * @param string $input    HTML input containing a decoded carriage return.
 * @param string $expected Expected normalized output.
 */
public function test_normalize_serializes_decoded_carriage_returns_as_character_references( string $input, string $expected ) {
	$first_pass = WP_HTML_Processor::normalize( $input );
	$this->assertSame( $expected, $first_pass, 'Should have serialized the carriage return as a character reference.' );

	// Normalization must be idempotent: a second pass yields the same output.
	$second_pass = WP_HTML_Processor::normalize( $first_pass );
	$this->assertSame(
		$expected,
		$second_pass,
		'Normalizing already-normalized HTML should not change the serialized carriage return.'
	);
}

/**
 * Data provider.
 *
 * Each dataset is a pair of HTML input and its expected normalized
 * output, keyed by a description of the context the encoded carriage
 * return appears in.
 *
 * @return array[]
 */
public static function data_provider_decoded_carriage_returns() {
	$cases = array();

	$cases['Regular text'] = array(
		'<p>a&#13;b</p>',
		'<p>a&#13;b</p>',
	);

	$cases['Regular text with non-canonical character reference'] = array(
		'<p>a&#x0D;b</p>',
		'<p>a&#13;b</p>',
	);

	$cases['RCDATA title'] = array(
		'<title>a&#13;b</title>',
		'<title>a&#13;b</title>',
	);

	$cases['RCDATA textarea with leading-newline preservation'] = array(
		'<textarea>a&#13;b</textarea>',
		"<textarea>\na&#13;b</textarea>",
	);

	$cases['Attribute value'] = array(
		'<p title="a&#13;b"></p>',
		'<p title="a&#13;b"></p>',
	);

	$cases['Table text'] = array(
		'<table><tr><td>x&#13;</td></tr></table>',
		'<table><tbody><tr><td>x&#13;</td></tr></tbody></table>',
	);

	$cases['Template text'] = array(
		'<template><p>a&#13;b</p></template>',
		'<template><p>a&#13;b</p></template>',
	);

	return $cases;
}

/**
 * Verifies that carriage returns written raw inside an attribute value
 * come out of normalization as line feeds, and that normalizing the
 * result a second time leaves it unchanged.
 *
 * @ticket 65372
 *
 * @dataProvider data_provider_raw_attribute_carriage_returns
 *
 * @param string $input    HTML input containing raw carriage returns.
 * @param string $expected Expected normalized output.
 */
public function test_normalize_serializes_raw_attribute_carriage_returns_as_line_feeds( string $input, string $expected ) {
	$first_pass = WP_HTML_Processor::normalize( $input );
	$this->assertSame( $expected, $first_pass, 'Should have serialized raw attribute carriage returns as line feeds.' );

	// Normalization must be idempotent: a second pass yields the same output.
	$second_pass = WP_HTML_Processor::normalize( $first_pass );
	$this->assertSame(
		$expected,
		$second_pass,
		'Normalizing already-normalized HTML should not change raw attribute newlines.'
	);
}

/**
 * Data provider.
 *
 * Each dataset is a pair of HTML input, with a raw carriage return in
 * an attribute value, and its expected normalized output.
 *
 * @return array[]
 */
public static function data_provider_raw_attribute_carriage_returns() {
	$cases = array();

	$cases['Raw carriage return'] = array(
		"<p title=\"a\rb\"></p>",
		"<p title=\"a\nb\"></p>",
	);

	$cases['Raw CRLF pair'] = array(
		"<p title=\"a\r\nb\"></p>",
		"<p title=\"a\nb\"></p>",
	);

	return $cases;
}

/**
 * Verifies that a raw carriage return in a `class` attribute is
 * serialized as a line feed when a class is added through the API, and
 * that the serialized tag parses back to the same class value.
 *
 * @ticket 65372
 */
public function test_serialize_token_normalizes_raw_class_carriage_returns_before_class_updates() {
	$source = WP_HTML_Processor::create_fragment( "<p class=\"a\rb\"></p>" );
	$this->assertTrue( $source->next_tag( 'P' ), 'Should find the P element.' );

	$source->add_class( 'c' );
	$token_html = $source->serialize_token();

	$this->assertSame(
		"<p class=\"a\nb c\">",
		$token_html,
		'Should have serialized raw class carriage returns as line feeds before adding classes.'
	);

	// Round trip: parse the serialized tag and read the class back.
	$round_trip = WP_HTML_Processor::create_fragment( $token_html );
	$this->assertTrue( $round_trip->next_tag( 'P' ), 'Should find the reparsed P element.' );

	$reparsed_class = $round_trip->get_attribute( 'class' );
	$this->assertSame( "a\nb c", $reparsed_class, 'The serialized class should parse back to the same value.' );
}

/**
 * Verifies that the contents of rawtext elements pass through
 * normalization untouched.
 *
 * Character references do not decode inside SCRIPT and STYLE, so
 * escaping their contents or emitting `&#13;` there would corrupt them.
 *
 * @ticket 65372
 *
 * @dataProvider data_provider_rawtext_contents
 *
 * @param string $html HTML whose rawtext contents must serialize unchanged.
 */
public function test_normalize_preserves_rawtext_contents( string $html ) {
	$normalized = WP_HTML_Processor::normalize( $html );

	$this->assertSame( $html, $normalized, 'Should have serialized the rawtext contents unchanged.' );
}

/**
 * Data provider.
 *
 * Each dataset holds a single HTML string: a rawtext element whose
 * contents include text that looks like character references.
 *
 * @return array[]
 */
public static function data_provider_rawtext_contents() {
	$cases = array();

	$cases['SCRIPT with character references'] = array( '<script>a&#13;&amp;b</script>' );
	$cases['STYLE with character references']  = array( '<style>a&#13;&amp;b</style>' );

	return $cases;
}

/**
 * Verifies that a NULL byte in an attribute value set through the API
 * is serialized as U+FFFD, so that the serialized output parses back
 * to the same value.
 *
 * Browsers serialize the raw NULL byte in innerHTML, and that does not
 * round-trip: re-parsing replaces it with U+FFFD. Writing U+FFFD
 * directly is a benign deviation that keeps output idempotent, in the
 * same way as serializing decoded carriage returns as &#13;.
 *
 * @ticket 65372
 */
public function test_serialize_token_replaces_null_bytes_in_enqueued_attribute_values() {
	$source = WP_HTML_Processor::create_fragment( '<p title="x"></p>' );
	$this->assertTrue( $source->next_tag( 'P' ), 'Should find the P element.' );

	$was_set = $source->set_attribute( 'title', "a\x00b" );
	$this->assertTrue( $was_set, 'Should have set the attribute.' );

	$token_html = $source->serialize_token();
	$this->assertSame( "<p title=\"a\u{FFFD}b\">", $token_html, 'Should have serialized the NULL byte as U+FFFD.' );

	// Round trip: parse the serialized tag and read the title back.
	$round_trip = WP_HTML_Processor::create_fragment( $token_html );
	$this->assertTrue( $round_trip->next_tag( 'P' ), 'Should find the reparsed P element.' );

	$reparsed_title = $round_trip->get_attribute( 'title' );
	$this->assertSame( "a\u{FFFD}b", $reparsed_title, 'The serialized title should parse back to the same value.' );
}

/**
* Data provider.
*
Expand Down