-
Notifications
You must be signed in to change notification settings - Fork 3.5k
HTML API: Add normalization functions. #7331
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
d372c97
07e89bb
754118e
9c6e34c
7f5783e
2aa89fa
7029b25
00a5773
99799b7
db24c11
7b8aa53
18b5005
47f7f08
92558b9
6167887
95756e5
fca6481
b37b312
dd4ff16
588020f
0abf367
37b7fe8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -1029,6 +1029,180 @@ public function get_current_depth(): int { | |||||
| return count( $this->breadcrumbs ); | ||||||
| } | ||||||
|
|
||||||
| /** | ||||||
| * Normalize an HTML fragment by serializing it. | ||||||
| * | ||||||
| * This method assumes that the given HTML snippet is found in BODY context. | ||||||
| * For normalizing full documents or fragments found in other contexts, create | ||||||
| * a new processor using {@see WP_HTML_Processor::create_fragment} or | ||||||
| * {@see WP_HTML_Processor::create_full_parser} and call {@see WP_HTML_Processor::serialize} | ||||||
| * on the created instances. | ||||||
| * | ||||||
| * Many aspects of an input HTML fragment may be changed during normalization. | ||||||
| * | ||||||
| * - Attribute values will be double-quoted. | ||||||
| * - Duplicate attributes will be removed. | ||||||
| * - Omitted tags will be added. | ||||||
| * - Tag and attribute name casing will be lower-cased, | ||||||
| * except for specific SVG and MathML tags or attributes. | ||||||
| * - Text will be re-encoded, null bytes handled, | ||||||
| * and invalid UTF-8 replaced with U+FFFD. | ||||||
| * - Any incomplete syntax trailing at the end will be omitted, | ||||||
| * for example, an unclosed comment opener will be removed. | ||||||
|
dmsnell marked this conversation as resolved.
|
||||||
| * | ||||||
| * Example: | ||||||
| * | ||||||
| * echo WP_HTML_Processor::normalize( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' ); | ||||||
| * // <a href="#anchor" v="5" enabled>One</a> | ||||||
| * | ||||||
| * echo WP_HTML_Processor::normalize( '<div></p>fun<table><td>cell</div>' ); | ||||||
| * // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div> | ||||||
| * | ||||||
| * echo WP_HTML_Processor::normalize( '<![CDATA[invalid comment]]> syntax < <> "oddities"' ); | ||||||
| * // <!--invalid comment--> syntax < <> "oddities" | ||||||
| * | ||||||
| * @since 6.7.0 | ||||||
| * | ||||||
| * @param string $html Input HTML to normalize. | ||||||
| * | ||||||
| * @return string|null Normalized output, or `null` if unable to normalize. | ||||||
| */ | ||||||
| public static function normalize( string $html ): ?string { | ||||||
| return static::create_fragment( $html )->serialize(); | ||||||
| } | ||||||
|
|
||||||
| /** | ||||||
| * Return normalized HTML for a fragment by serializing it. | ||||||
|
westonruter marked this conversation as resolved.
Outdated
|
||||||
| * | ||||||
| * This differs from {@see WP_HTML_Processor::normalize} in that it starts with | ||||||
| * a specific HTML Processor, which _must_ not have already started scanning; | ||||||
| * it must be in the initial ready state and will be in the completed state once | ||||||
| * serialization is complete. | ||||||
| * | ||||||
| * Many aspects of an input HTML fragment may be changed during normalization. | ||||||
| * | ||||||
| * - Attribute values will be double-quoted. | ||||||
| * - Duplicate attributes will be removed. | ||||||
| * - Omitted tags will be added. | ||||||
| * - Tag and attribute name casing will be lower-cased, | ||||||
| * except for specific SVG and MathML tags or attributes. | ||||||
| * - Text will be re-encoded, null bytes handled, | ||||||
| * and invalid UTF-8 replaced with U+FFFD. | ||||||
| * - Any incomplete syntax trailing at the end will be omitted, | ||||||
| * for example, an unclosed comment opener will be removed. | ||||||
| * | ||||||
| * Example: | ||||||
| * | ||||||
| * $processor = WP_HTML_Processor::create_fragment( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' ); | ||||||
| * echo $processor->serialize(); | ||||||
| * // <a href="#anchor" v="5" enabled>One</a> | ||||||
| * | ||||||
| * $processor = WP_HTML_Processor::create_fragment( '<div></p>fun<table><td>cell</div>' ); | ||||||
| * echo $processor->serialize(); | ||||||
| * // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div> | ||||||
| * | ||||||
| * $processor = WP_HTML_Processor::create_fragment( '<![CDATA[invalid comment]]> syntax < <> "oddities"' ); | ||||||
| * echo $processor->serialize(); | ||||||
| * // <!--invalid comment--> syntax < <> "oddities" | ||||||
| * | ||||||
| * @since 6.7.0 | ||||||
| * | ||||||
| * @return string|null Normalized HTML markup represented by processor, | ||||||
| * or `null` if unable to generate serialization. | ||||||
| */ | ||||||
| public function serialize(): ?string { | ||||||
| if ( WP_HTML_Tag_Processor::STATE_READY !== $this->parser_state ) { | ||||||
| return null; | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would this make sense to throw an exception?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've tried to avoid throwing exceptions in use code. Tell me more about the value of potentially crashing vs. returning
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well, I guess to give more information about why it is returning
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ah okay I see now. another thought I had was resetting to the beginning, parsing, and returning to the previous location, which involves double-parsing if already mid-way through a document.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've called |
||||||
| } | ||||||
|
|
||||||
| $html = ''; | ||||||
| while ( $this->next_token() ) { | ||||||
| $token_type = $this->get_token_type(); | ||||||
|
|
||||||
| switch ( $token_type ) { | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ah good catch: it should also serialize the PI node tag name, which would match what you wrote. looks like this needs a review of all of the invalid comment syntax
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. these should all be updated now. if something lingers I'd like to fix it, but ultimately if we botch an invalid comment, I'm guessing it's not the end of the world. these will go into test cases. |
||||||
| case '#text': | ||||||
| $html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); | ||||||
| break; | ||||||
|
|
||||||
| case '#funky-comment': | ||||||
| case '#comment': | ||||||
| $html .= "<!--{$this->get_modifiable_text()}-->"; | ||||||
| break; | ||||||
|
|
||||||
| case '#cdata-section': | ||||||
| $html .= "<![CDATA[{$this->get_modifiable_text()}]]>"; | ||||||
| break; | ||||||
|
|
||||||
| case 'html': | ||||||
| $html .= '<!DOCTYPE html>'; | ||||||
| break; | ||||||
| } | ||||||
|
|
||||||
| if ( '#tag' !== $token_type ) { | ||||||
| continue; | ||||||
| } | ||||||
|
|
||||||
| if ( $this->is_tag_closer() ) { | ||||||
| $html .= "</{$this->get_qualified_tag_name()}>"; | ||||||
| continue; | ||||||
| } | ||||||
|
|
||||||
| $tag_name = $this->get_tag(); | ||||||
| $qualitified_name = $this->get_qualified_tag_name(); | ||||||
| $in_html = 'html' === $this->get_namespace(); | ||||||
|
|
||||||
| $attribute_names = $this->get_attribute_names_with_prefix( '' ); | ||||||
| if ( ! isset( $attribute_names ) ) { | ||||||
|
westonruter marked this conversation as resolved.
Outdated
|
||||||
| $html .= "<{$qualitified_name}>"; | ||||||
| continue; | ||||||
| } | ||||||
|
|
||||||
| $html .= "<{$qualitified_name}"; | ||||||
| foreach ( $attribute_names as $attribute_name ) { | ||||||
| $html .= " {$this->get_qualified_attribute_name( $attribute_name )}"; | ||||||
| $value = $this->get_attribute( $attribute_name ); | ||||||
|
|
||||||
| if ( is_string( $value ) ) { | ||||||
| $html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . '"'; | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| if ( ! $in_html && $this->has_self_closing_flag() ) { | ||||||
| $html .= '/'; | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While not required, it seems a space is usually added here in the wild, right? (e.g. Prettier does this)
Suggested change
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. a good thought. with double-quoted attributes it's not relevant, but with unquoted attribute values it becomes relevant. we don't need that since we control quoting. maybe it's best to add it in anyway for the same of other tools. |
||||||
| } | ||||||
|
|
||||||
| $html .= '>'; | ||||||
|
|
||||||
| // Flush out self-contained elements. | ||||||
| if ( $in_html && in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) { | ||||||
| $text = $this->get_modifiable_text(); | ||||||
|
|
||||||
| switch ( $tag_name ) { | ||||||
| case 'IFRAME': | ||||||
| case 'NOEMBED': | ||||||
| case 'NOFRAMES': | ||||||
| $text = ''; | ||||||
| break; | ||||||
|
|
||||||
| case 'SCRIPT': | ||||||
| case 'STYLE': | ||||||
| break; | ||||||
|
|
||||||
| default: | ||||||
| $text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); | ||||||
| } | ||||||
|
|
||||||
| $html .= "{$text}</{$qualitified_name}>"; | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| if ( null !== $this->get_last_error() ) { | ||||||
| return null; | ||||||
| } | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similarly here, it would be helpful to know why it returned
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is a tougher question because it would conflate the basic suppose you know why this failed: what would you do in response?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It could also use Looking at where
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the exceptions thrown internally are caught and shut down parsing, but does not crash. unsupported content exceptions are returned via
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've called |
||||||
|
|
||||||
| return $html; | ||||||
| } | ||||||
|
|
||||||
| /** | ||||||
| * Parses next element in the 'initial' insertion mode. | ||||||
| * | ||||||
|
|
||||||

Uh oh!
There was an error while loading. Please reload this page.