1
0
mirror of https://github.com/GNOME/libxml2.git synced 2025-10-17 08:01:20 +08:00

html: Ignore U+0000 in body text

Align with HTML5. Fixes #908.
This commit is contained in:
Nick Wellnhofer
2025-05-07 14:32:42 +02:00
parent a1e83b2401
commit f3a080bc48
6 changed files with 29 additions and 9 deletions

View File

@@ -3226,8 +3226,25 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) {
case '\0':
skip = 1;
-repl = BAD_CAST "\xEF\xBF\xBD";
-replSize = 3;
+if (mode == 0) {
+/*
+* The HTML5 spec says that the tokenizer should
+* pass on U+0000 unmodified in normal data mode.
+* These characters should then be ignored in body
+* and other text, but should be replaced with
+* U+FFFD in foreign content.
+*
+* At least for now, we always strip U+0000 when
+* tokenizing.
+*/
+repl = BAD_CAST "";
+replSize = 0;
+} else {
+repl = BAD_CAST "\xEF\xBF\xBD";
+replSize = 3;
+}
goto next_chunk;
case '\n':