diff options
Diffstat (limited to '3rdparty/gumbo-parser/src/parser.c')
| -rw-r--r-- | 3rdparty/gumbo-parser/src/parser.c | 4192 |
1 files changed, 4192 insertions, 0 deletions
diff --git a/3rdparty/gumbo-parser/src/parser.c b/3rdparty/gumbo-parser/src/parser.c new file mode 100644 index 0000000..dc692b3 --- /dev/null +++ b/3rdparty/gumbo-parser/src/parser.c @@ -0,0 +1,4192 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jdtang@google.com (Jonathan Tang) + +#include <assert.h> +#include <ctype.h> +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> + +#include "attribute.h" +#include "error.h" +#include "gumbo.h" +#include "insertion_mode.h" +#include "parser.h" +#include "tokenizer.h" +#include "tokenizer_states.h" +#include "utf8.h" +#include "util.h" +#include "vector.h" + +#define AVOID_UNUSED_VARIABLE_WARNING(i) (void)(i) + +#define GUMBO_STRING(literal) \ + { literal, sizeof(literal) - 1 } +#define TERMINATOR \ + { "", 0 } + +typedef char gumbo_tagset[GUMBO_TAG_LAST]; +#define TAG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_HTML) +#define TAG_SVG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_SVG) +#define TAG_MATHML(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_MATHML) + +#define TAGSET_INCLUDES(tagset, namespace, tag) \ + (tag < GUMBO_TAG_LAST && tagset[(int) tag] == (1 << (int) namespace)) + +// selected forward declarations as it is getting hard to find +// an appropriate order +static bool node_html_tag_is(const GumboNode*, GumboTag); +static GumboInsertionMode get_current_template_insertion_mode( + const GumboParser*); +static bool handle_in_template(GumboParser*, GumboToken*); +static void destroy_node(GumboParser*, GumboNode*); + +static void* malloc_wrapper(void* unused, size_t size) { return malloc(size); } + +static void free_wrapper(void* unused, void* ptr) { free(ptr); } + +const GumboOptions kGumboDefaultOptions = {&malloc_wrapper, &free_wrapper, NULL, + 8, false, -1, GUMBO_TAG_LAST, GUMBO_NAMESPACE_HTML}; + +static const GumboStringPiece kDoctypeHtml = GUMBO_STRING("html"); +static const GumboStringPiece kPublicIdHtml4_0 = + GUMBO_STRING("-//W3C//DTD HTML 4.0//EN"); +static const GumboStringPiece kPublicIdHtml4_01 = + GUMBO_STRING("-//W3C//DTD HTML 4.01//EN"); +static const GumboStringPiece kPublicIdXhtml1_0 = + GUMBO_STRING("-//W3C//DTD XHTML 1.0 Strict//EN"); +static const GumboStringPiece kPublicIdXhtml1_1 = + GUMBO_STRING("-//W3C//DTD XHTML 1.1//EN"); +static const GumboStringPiece kSystemIdRecHtml4_0 = + GUMBO_STRING("http://www.w3.org/TR/REC-html40/strict.dtd"); +static const GumboStringPiece kSystemIdHtml4 = + GUMBO_STRING("http://www.w3.org/TR/html4/strict.dtd"); +static const GumboStringPiece kSystemIdXhtmlStrict1_1 = + GUMBO_STRING("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"); +static const GumboStringPiece kSystemIdXhtml1_1 = + GUMBO_STRING("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"); +static const GumboStringPiece kSystemIdLegacyCompat = + GUMBO_STRING("about:legacy-compat"); + +// The doctype arrays have an explicit terminator because we want to pass them +// to a helper function, and passing them as a pointer discards sizeof +// information. The SVG arrays are used only by one-off functions, and so loops +// over them use sizeof directly instead of a terminator. + +static const GumboStringPiece kQuirksModePublicIdPrefixes[] = { + GUMBO_STRING("+//Silmaril//dtd html Pro v0r11 19970101//"), + GUMBO_STRING("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"), + GUMBO_STRING("-//AS//DTD HTML 3.0 asWedit + extensions//"), + GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 1//"), + GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 2//"), + GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 1//"), + GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 2//"), + GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict//"), + GUMBO_STRING("-//IETF//DTD HTML 2.0//"), + GUMBO_STRING("-//IETF//DTD HTML 2.1E//"), + GUMBO_STRING("-//IETF//DTD HTML 3.0//"), + GUMBO_STRING("-//IETF//DTD HTML 3.2 Final//"), + GUMBO_STRING("-//IETF//DTD HTML 3.2//"), + GUMBO_STRING("-//IETF//DTD HTML 3//"), + GUMBO_STRING("-//IETF//DTD HTML Level 0//"), + GUMBO_STRING("-//IETF//DTD HTML Level 1//"), + GUMBO_STRING("-//IETF//DTD HTML Level 2//"), + GUMBO_STRING("-//IETF//DTD HTML Level 3//"), + GUMBO_STRING("-//IETF//DTD HTML Strict Level 0//"), + GUMBO_STRING("-//IETF//DTD HTML Strict Level 1//"), + GUMBO_STRING("-//IETF//DTD HTML Strict Level 2//"), + GUMBO_STRING("-//IETF//DTD HTML Strict Level 3//"), + GUMBO_STRING("-//IETF//DTD HTML Strict//"), + GUMBO_STRING("-//IETF//DTD HTML//"), + GUMBO_STRING("-//Metrius//DTD Metrius Presentational//"), + GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"), + GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML//"), + GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 Tables//"), + GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"), + GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML//"), + GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 Tables//"), + GUMBO_STRING("-//Netscape Comm. Corp.//DTD HTML//"), + GUMBO_STRING("-//Netscape Comm. Corp.//DTD Strict HTML//"), + GUMBO_STRING("-//O'Reilly and Associates//DTD HTML 2.0//"), + GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"), + GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"), + GUMBO_STRING( + "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)" + "extensions to HTML 4.0//"), + GUMBO_STRING( + "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::" + "extensions to HTML 4.0//"), + GUMBO_STRING("-//Spyglass//DTD HTML 2.0 Extended//"), + GUMBO_STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"), + GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"), + GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"), + GUMBO_STRING("-//W3C//DTD HTML 3 1995-03-24//"), + GUMBO_STRING("-//W3C//DTD HTML 3.2 Draft//"), + GUMBO_STRING("-//W3C//DTD HTML 3.2 Final//"), + GUMBO_STRING("-//W3C//DTD HTML 3.2//"), + GUMBO_STRING("-//W3C//DTD HTML 3.2S Draft//"), + GUMBO_STRING("-//W3C//DTD HTML 4.0 Frameset//"), + GUMBO_STRING("-//W3C//DTD HTML 4.0 Transitional//"), + GUMBO_STRING("-//W3C//DTD HTML Experimental 19960712//"), + GUMBO_STRING("-//W3C//DTD HTML Experimental 970421//"), + GUMBO_STRING("-//W3C//DTD W3 HTML//"), + GUMBO_STRING("-//W3O//DTD W3 HTML 3.0//"), + GUMBO_STRING("-//WebTechs//DTD Mozilla HTML 2.0//"), + GUMBO_STRING("-//WebTechs//DTD Mozilla HTML//"), TERMINATOR}; + +static const GumboStringPiece kQuirksModePublicIdExactMatches[] = { + GUMBO_STRING("-//W3O//DTD W3 HTML Strict 3.0//EN//"), + GUMBO_STRING("-/W3C/DTD HTML 4.0 Transitional/EN"), GUMBO_STRING("HTML"), + TERMINATOR}; + +static const GumboStringPiece kQuirksModeSystemIdExactMatches[] = { + GUMBO_STRING("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"), + TERMINATOR}; + +static const GumboStringPiece kLimitedQuirksPublicIdPrefixes[] = { + GUMBO_STRING("-//W3C//DTD XHTML 1.0 Frameset//"), + GUMBO_STRING("-//W3C//DTD XHTML 1.0 Transitional//"), TERMINATOR}; + +static const GumboStringPiece kLimitedQuirksRequiresSystemIdPublicIdPrefixes[] = + {GUMBO_STRING("-//W3C//DTD HTML 4.01 Frameset//"), + GUMBO_STRING("-//W3C//DTD HTML 4.01 Transitional//"), TERMINATOR}; + +// Indexed by GumboNamespaceEnum; keep in sync with that. +static const char* kLegalXmlns[] = {"http://www.w3.org/1999/xhtml", + "http://www.w3.org/2000/svg", "http://www.w3.org/1998/Math/MathML"}; + +typedef struct _ReplacementEntry { + const GumboStringPiece from; + const GumboStringPiece to; +} ReplacementEntry; + +#define REPLACEMENT_ENTRY(from, to) \ + { GUMBO_STRING(from), GUMBO_STRING(to) } + +// Static data for SVG attribute replacements. +// https://html.spec.whatwg.org/multipage/syntax.html#creating-and-inserting-nodes +static const ReplacementEntry kSvgAttributeReplacements[] = { + REPLACEMENT_ENTRY("attributename", "attributeName"), + REPLACEMENT_ENTRY("attributetype", "attributeType"), + REPLACEMENT_ENTRY("basefrequency", "baseFrequency"), + REPLACEMENT_ENTRY("baseprofile", "baseProfile"), + REPLACEMENT_ENTRY("calcmode", "calcMode"), + REPLACEMENT_ENTRY("clippathunits", "clipPathUnits"), + // REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"), + // REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"), + REPLACEMENT_ENTRY("diffuseconstant", "diffuseConstant"), + REPLACEMENT_ENTRY("edgemode", "edgeMode"), + // REPLACEMENT_ENTRY("externalresourcesrequired", + // "externalResourcesRequired"), + // REPLACEMENT_ENTRY("filterres", "filterRes"), + REPLACEMENT_ENTRY("filterunits", "filterUnits"), + REPLACEMENT_ENTRY("glyphref", "glyphRef"), + REPLACEMENT_ENTRY("gradienttransform", "gradientTransform"), + REPLACEMENT_ENTRY("gradientunits", "gradientUnits"), + REPLACEMENT_ENTRY("kernelmatrix", "kernelMatrix"), + REPLACEMENT_ENTRY("kernelunitlength", "kernelUnitLength"), + REPLACEMENT_ENTRY("keypoints", "keyPoints"), + REPLACEMENT_ENTRY("keysplines", "keySplines"), + REPLACEMENT_ENTRY("keytimes", "keyTimes"), + REPLACEMENT_ENTRY("lengthadjust", "lengthAdjust"), + REPLACEMENT_ENTRY("limitingconeangle", "limitingConeAngle"), + REPLACEMENT_ENTRY("markerheight", "markerHeight"), + REPLACEMENT_ENTRY("markerunits", "markerUnits"), + REPLACEMENT_ENTRY("markerwidth", "markerWidth"), + REPLACEMENT_ENTRY("maskcontentunits", "maskContentUnits"), + REPLACEMENT_ENTRY("maskunits", "maskUnits"), + REPLACEMENT_ENTRY("numoctaves", "numOctaves"), + REPLACEMENT_ENTRY("pathlength", "pathLength"), + REPLACEMENT_ENTRY("patterncontentunits", "patternContentUnits"), + REPLACEMENT_ENTRY("patterntransform", "patternTransform"), + REPLACEMENT_ENTRY("patternunits", "patternUnits"), + REPLACEMENT_ENTRY("pointsatx", "pointsAtX"), + REPLACEMENT_ENTRY("pointsaty", "pointsAtY"), + REPLACEMENT_ENTRY("pointsatz", "pointsAtZ"), + REPLACEMENT_ENTRY("preservealpha", "preserveAlpha"), + REPLACEMENT_ENTRY("preserveaspectratio", "preserveAspectRatio"), + REPLACEMENT_ENTRY("primitiveunits", "primitiveUnits"), + REPLACEMENT_ENTRY("refx", "refX"), REPLACEMENT_ENTRY("refy", "refY"), + REPLACEMENT_ENTRY("repeatcount", "repeatCount"), + REPLACEMENT_ENTRY("repeatdur", "repeatDur"), + REPLACEMENT_ENTRY("requiredextensions", "requiredExtensions"), + REPLACEMENT_ENTRY("requiredfeatures", "requiredFeatures"), + REPLACEMENT_ENTRY("specularconstant", "specularConstant"), + REPLACEMENT_ENTRY("specularexponent", "specularExponent"), + REPLACEMENT_ENTRY("spreadmethod", "spreadMethod"), + REPLACEMENT_ENTRY("startoffset", "startOffset"), + REPLACEMENT_ENTRY("stddeviation", "stdDeviation"), + REPLACEMENT_ENTRY("stitchtiles", "stitchTiles"), + REPLACEMENT_ENTRY("surfacescale", "surfaceScale"), + REPLACEMENT_ENTRY("systemlanguage", "systemLanguage"), + REPLACEMENT_ENTRY("tablevalues", "tableValues"), + REPLACEMENT_ENTRY("targetx", "targetX"), + REPLACEMENT_ENTRY("targety", "targetY"), + REPLACEMENT_ENTRY("textlength", "textLength"), + REPLACEMENT_ENTRY("viewbox", "viewBox"), + REPLACEMENT_ENTRY("viewtarget", "viewTarget"), + REPLACEMENT_ENTRY("xchannelselector", "xChannelSelector"), + REPLACEMENT_ENTRY("ychannelselector", "yChannelSelector"), + REPLACEMENT_ENTRY("zoomandpan", "zoomAndPan"), +}; + +static const ReplacementEntry kSvgTagReplacements[] = { + REPLACEMENT_ENTRY("altglyph", "altGlyph"), + REPLACEMENT_ENTRY("altglyphdef", "altGlyphDef"), + REPLACEMENT_ENTRY("altglyphitem", "altGlyphItem"), + REPLACEMENT_ENTRY("animatecolor", "animateColor"), + REPLACEMENT_ENTRY("animatemotion", "animateMotion"), + REPLACEMENT_ENTRY("animatetransform", "animateTransform"), + REPLACEMENT_ENTRY("clippath", "clipPath"), + REPLACEMENT_ENTRY("feblend", "feBlend"), + REPLACEMENT_ENTRY("fecolormatrix", "feColorMatrix"), + REPLACEMENT_ENTRY("fecomponenttransfer", "feComponentTransfer"), + REPLACEMENT_ENTRY("fecomposite", "feComposite"), + REPLACEMENT_ENTRY("feconvolvematrix", "feConvolveMatrix"), + REPLACEMENT_ENTRY("fediffuselighting", "feDiffuseLighting"), + REPLACEMENT_ENTRY("fedisplacementmap", "feDisplacementMap"), + REPLACEMENT_ENTRY("fedistantlight", "feDistantLight"), + REPLACEMENT_ENTRY("feflood", "feFlood"), + REPLACEMENT_ENTRY("fefunca", "feFuncA"), + REPLACEMENT_ENTRY("fefuncb", "feFuncB"), + REPLACEMENT_ENTRY("fefuncg", "feFuncG"), + REPLACEMENT_ENTRY("fefuncr", "feFuncR"), + REPLACEMENT_ENTRY("fegaussianblur", "feGaussianBlur"), + REPLACEMENT_ENTRY("feimage", "feImage"), + REPLACEMENT_ENTRY("femerge", "feMerge"), + REPLACEMENT_ENTRY("femergenode", "feMergeNode"), + REPLACEMENT_ENTRY("femorphology", "feMorphology"), + REPLACEMENT_ENTRY("feoffset", "feOffset"), + REPLACEMENT_ENTRY("fepointlight", "fePointLight"), + REPLACEMENT_ENTRY("fespecularlighting", "feSpecularLighting"), + REPLACEMENT_ENTRY("fespotlight", "feSpotLight"), + REPLACEMENT_ENTRY("fetile", "feTile"), + REPLACEMENT_ENTRY("feturbulence", "feTurbulence"), + REPLACEMENT_ENTRY("foreignobject", "foreignObject"), + REPLACEMENT_ENTRY("glyphref", "glyphRef"), + REPLACEMENT_ENTRY("lineargradient", "linearGradient"), + REPLACEMENT_ENTRY("radialgradient", "radialGradient"), + REPLACEMENT_ENTRY("textpath", "textPath"), +}; + +typedef struct _NamespacedAttributeReplacement { + const char* from; + const char* local_name; + const GumboAttributeNamespaceEnum attr_namespace; +} NamespacedAttributeReplacement; + +static const NamespacedAttributeReplacement kForeignAttributeReplacements[] = { + {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML}, + {"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML}, + {"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML}, + {"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS}, + {"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS}, +}; + +// The "scope marker" for the list of active formatting elements. We use a +// pointer to this as a generic marker element, since the particular element +// scope doesn't matter. +static const GumboNode kActiveFormattingScopeMarker; + +// The tag_is and tag_in function use true & false to denote start & end tags, +// but for readability, we define constants for them here. +static const bool kStartTag = true; +static const bool kEndTag = false; + +// Because GumboStringPieces are immutable, we can't insert a character directly +// into a text node. Instead, we accumulate all pending characters here and +// flush them out to a text node whenever a new element is inserted. +// +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-a-character +typedef struct _TextNodeBufferState { + // The accumulated text to be inserted into the current text node. + GumboStringBuffer _buffer; + + // A pointer to the original text represented by this text node. Note that + // because of foster parenting and other strange DOM manipulations, this may + // include other non-text HTML tags in it; it is defined as the span of + // original text from the first character in this text node to the last + // character in this text node. + const char* _start_original_text; + + // The source position of the start of this text node. + GumboSourcePosition _start_position; + + // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE). + GumboNodeType _type; +} TextNodeBufferState; + +typedef struct GumboInternalParserState { + // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode + GumboInsertionMode _insertion_mode; + + // Used for run_generic_parsing_algorithm, which needs to switch back to the + // original insertion mode at its conclusion. + GumboInsertionMode _original_insertion_mode; + + // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-stack-of-open-elements + GumboVector /*GumboNode*/ _open_elements; + + // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-list-of-active-formatting-elements + GumboVector /*GumboNode*/ _active_formatting_elements; + + // The stack of template insertion modes. + // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#the-insertion-mode + GumboVector /*InsertionMode*/ _template_insertion_modes; + + // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-element-pointers + GumboNode* _head_element; + GumboNode* _form_element; + + // The element used as fragment context when parsing in fragment mode + GumboNode* _fragment_ctx; + + // The flag for when the spec says "Reprocess the current token in..." + bool _reprocess_current_token; + + // The flag for "acknowledge the token's self-closing flag". + bool _self_closing_flag_acknowledged; + + // The "frameset-ok" flag from the spec. + bool _frameset_ok; + + // The flag for "If the next token is a LINE FEED, ignore that token...". + bool _ignore_next_linefeed; + + // The flag for "whenever a node would be inserted into the current node, it + // must instead be foster parented". This is used for misnested table + // content, which needs to be handled according to "in body" rules yet foster + // parented outside of the table. + // It would perhaps be more explicit to have this as a parameter to + // handle_in_body and insert_element, but given how special-purpose this is + // and the number of call-sites that would need to take the extra parameter, + // it's easier just to have a state flag. + bool _foster_parent_insertions; + + // The accumulated text node buffer state. + TextNodeBufferState _text_node; + + // The current token. + GumboToken* _current_token; + + // The way that the spec is written, the </body> and </html> tags are *always* + // implicit, because encountering one of those tokens merely switches the + // insertion mode out of "in body". So we have individual state flags for + // those end tags that are then inspected by pop_current_node when the <body> + // and <html> nodes are popped to set the GUMBO_INSERTION_IMPLICIT_END_TAG + // flag appropriately. + bool _closed_body_tag; + bool _closed_html_tag; +} GumboParserState; + +static bool token_has_attribute(const GumboToken* token, const char* name) { + assert(token->type == GUMBO_TOKEN_START_TAG); + return gumbo_get_attribute(&token->v.start_tag.attributes, name) != NULL; +} + +// Checks if the value of the specified attribute is a case-insensitive match +// for the specified string. +static bool attribute_matches( + const GumboVector* attributes, const char* name, const char* value) { + const GumboAttribute* attr = gumbo_get_attribute(attributes, name); + return attr ? strcasecmp(value, attr->value) == 0 : false; +} + +// Checks if the value of the specified attribute is a case-sensitive match +// for the specified string. +static bool attribute_matches_case_sensitive( + const GumboVector* attributes, const char* name, const char* value) { + const GumboAttribute* attr = gumbo_get_attribute(attributes, name); + return attr ? strcmp(value, attr->value) == 0 : false; +} + +// Checks if the specified attribute vectors are identical. +static bool all_attributes_match( + const GumboVector* attr1, const GumboVector* attr2) { + unsigned int num_unmatched_attr2_elements = attr2->length; + for (unsigned int i = 0; i < attr1->length; ++i) { + const GumboAttribute* attr = attr1->data[i]; + if (attribute_matches_case_sensitive(attr2, attr->name, attr->value)) { + --num_unmatched_attr2_elements; + } else { + return false; + } + } + return num_unmatched_attr2_elements == 0; +} + +static void set_frameset_not_ok(GumboParser* parser) { + gumbo_debug("Setting frameset_ok to false.\n"); + parser->_parser_state->_frameset_ok = false; +} + +static GumboNode* create_node(GumboParser* parser, GumboNodeType type) { + GumboNode* node = gumbo_parser_allocate(parser, sizeof(GumboNode)); + node->parent = NULL; + node->index_within_parent = -1; + node->type = type; + node->parse_flags = GUMBO_INSERTION_NORMAL; + return node; +} + +static GumboNode* new_document_node(GumboParser* parser) { + GumboNode* document_node = create_node(parser, GUMBO_NODE_DOCUMENT); + document_node->parse_flags = GUMBO_INSERTION_BY_PARSER; + gumbo_vector_init(parser, 1, &document_node->v.document.children); + + // Must be initialized explicitly, as there's no guarantee that we'll see a + // doc type token. + GumboDocument* document = &document_node->v.document; + document->has_doctype = false; + document->name = NULL; + document->public_identifier = NULL; + document->system_identifier = NULL; + return document_node; +} + +static void output_init(GumboParser* parser) { + GumboOutput* output = gumbo_parser_allocate(parser, sizeof(GumboOutput)); + output->root = NULL; + output->document = new_document_node(parser); + parser->_output = output; + gumbo_init_errors(parser); +} + +static void parser_state_init(GumboParser* parser) { + GumboParserState* parser_state = + gumbo_parser_allocate(parser, sizeof(GumboParserState)); + parser_state->_insertion_mode = GUMBO_INSERTION_MODE_INITIAL; + parser_state->_reprocess_current_token = false; + parser_state->_frameset_ok = true; + parser_state->_ignore_next_linefeed = false; + parser_state->_foster_parent_insertions = false; + parser_state->_text_node._type = GUMBO_NODE_WHITESPACE; + gumbo_string_buffer_init(parser, &parser_state->_text_node._buffer); + gumbo_vector_init(parser, 10, &parser_state->_open_elements); + gumbo_vector_init(parser, 5, &parser_state->_active_formatting_elements); + gumbo_vector_init(parser, 5, &parser_state->_template_insertion_modes); + parser_state->_head_element = NULL; + parser_state->_form_element = NULL; + parser_state->_fragment_ctx = NULL; + parser_state->_current_token = NULL; + parser_state->_closed_body_tag = false; + parser_state->_closed_html_tag = false; + parser->_parser_state = parser_state; +} + +static void parser_state_destroy(GumboParser* parser) { + GumboParserState* state = parser->_parser_state; + if (state->_fragment_ctx) { + destroy_node(parser, state->_fragment_ctx); + } + gumbo_vector_destroy(parser, &state->_active_formatting_elements); + gumbo_vector_destroy(parser, &state->_open_elements); + gumbo_vector_destroy(parser, &state->_template_insertion_modes); + gumbo_string_buffer_destroy(parser, &state->_text_node._buffer); + gumbo_parser_deallocate(parser, state); +} + +static GumboNode* get_document_node(GumboParser* parser) { + return parser->_output->document; +} + +static bool is_fragment_parser(const GumboParser* parser) { + return !!parser->_parser_state->_fragment_ctx; +} + +// Returns the node at the bottom of the stack of open elements, or NULL if no +// elements have been added yet. +static GumboNode* get_current_node(GumboParser* parser) { + GumboVector* open_elements = &parser->_parser_state->_open_elements; + if (open_elements->length == 0) { + assert(!parser->_output->root); + return NULL; + } + assert(open_elements->length > 0); + assert(open_elements->data != NULL); + return open_elements->data[open_elements->length - 1]; +} + +static GumboNode* get_adjusted_current_node(GumboParser* parser) { + GumboParserState* state = parser->_parser_state; + if (state->_open_elements.length == 1 && state->_fragment_ctx) { + return state->_fragment_ctx; + } + return get_current_node(parser); +} + +// Returns true if the given needle is in the given array of literal +// GumboStringPieces. If exact_match is true, this requires that they match +// exactly; otherwise, this performs a prefix match to check if any of the +// elements in haystack start with needle. This always performs a +// case-insensitive match. +static bool is_in_static_list( + const char* needle, const GumboStringPiece* haystack, bool exact_match) { + for (unsigned int i = 0; haystack[i].length > 0; ++i) { + if ((exact_match && !strcmp(needle, haystack[i].data)) || + (!exact_match && !strcasecmp(needle, haystack[i].data))) { + return true; + } + } + return false; +} + +static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) { + parser->_parser_state->_insertion_mode = mode; +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reset-the-insertion-mode-appropriately +// This is a helper function that returns the appropriate insertion mode instead +// of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to +// indicate that there is no appropriate insertion mode, and the loop should +// continue. +static GumboInsertionMode get_appropriate_insertion_mode( + const GumboParser* parser, int index) { + const GumboVector* open_elements = &parser->_parser_state->_open_elements; + const GumboNode* node = open_elements->data[index]; + const bool is_last = index == 0; + + if (is_last && is_fragment_parser(parser)) { + node = parser->_parser_state->_fragment_ctx; + } + + assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); + if (node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML) + return is_last ? + GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL; + + switch (node->v.element.tag) { + case GUMBO_TAG_SELECT: { + if (is_last) { + return GUMBO_INSERTION_MODE_IN_SELECT; + } + for (int i = index; i > 0; --i) { + const GumboNode* ancestor = open_elements->data[i]; + if (node_html_tag_is(ancestor, GUMBO_TAG_TEMPLATE)) { + return GUMBO_INSERTION_MODE_IN_SELECT; + } + if (node_html_tag_is(ancestor, GUMBO_TAG_TABLE)) { + return GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE; + } + } + return GUMBO_INSERTION_MODE_IN_SELECT; + } + case GUMBO_TAG_TD: + case GUMBO_TAG_TH: + if (!is_last) return GUMBO_INSERTION_MODE_IN_CELL; + break; + case GUMBO_TAG_TR: + return GUMBO_INSERTION_MODE_IN_ROW; + case GUMBO_TAG_TBODY: + case GUMBO_TAG_THEAD: + case GUMBO_TAG_TFOOT: + return GUMBO_INSERTION_MODE_IN_TABLE_BODY; + case GUMBO_TAG_CAPTION: + return GUMBO_INSERTION_MODE_IN_CAPTION; + case GUMBO_TAG_COLGROUP: + return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP; + case GUMBO_TAG_TABLE: + return GUMBO_INSERTION_MODE_IN_TABLE; + case GUMBO_TAG_TEMPLATE: + return get_current_template_insertion_mode(parser); + case GUMBO_TAG_HEAD: + if (!is_last) return GUMBO_INSERTION_MODE_IN_HEAD; + break; + case GUMBO_TAG_BODY: + return GUMBO_INSERTION_MODE_IN_BODY; + case GUMBO_TAG_FRAMESET: + return GUMBO_INSERTION_MODE_IN_FRAMESET; + case GUMBO_TAG_HTML: + return parser->_parser_state->_head_element + ? GUMBO_INSERTION_MODE_AFTER_HEAD + : GUMBO_INSERTION_MODE_BEFORE_HEAD; + default: + break; + } + return is_last ? GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL; +} + +// This performs the actual "reset the insertion mode" loop. +static void reset_insertion_mode_appropriately(GumboParser* parser) { + const GumboVector* open_elements = &parser->_parser_state->_open_elements; + for (int i = open_elements->length; --i >= 0;) { + GumboInsertionMode mode = get_appropriate_insertion_mode(parser, i); + if (mode != GUMBO_INSERTION_MODE_INITIAL) { + set_insertion_mode(parser, mode); + return; + } + } + // Should never get here, because is_last will be set on the last iteration + // and will force GUMBO_INSERTION_MODE_IN_BODY. + assert(0); +} + +static GumboError* parser_add_parse_error( + GumboParser* parser, const GumboToken* token) { + gumbo_debug("Adding parse error.\n"); + GumboError* error = gumbo_add_error(parser); + if (!error) { + return NULL; + } + error->type = GUMBO_ERR_PARSER; + error->position = token->position; + error->original_text = token->original_text.data; + GumboParserError* extra_data = &error->v.parser; + extra_data->input_type = token->type; + extra_data->input_tag = GUMBO_TAG_UNKNOWN; + if (token->type == GUMBO_TOKEN_START_TAG) { + extra_data->input_tag = token->v.start_tag.tag; + } else if (token->type == GUMBO_TOKEN_END_TAG) { + extra_data->input_tag = token->v.end_tag; + } + GumboParserState* state = parser->_parser_state; + extra_data->parser_state = state->_insertion_mode; + gumbo_vector_init( + parser, state->_open_elements.length, &extra_data->tag_stack); + for (unsigned int i = 0; i < state->_open_elements.length; ++i) { + const GumboNode* node = state->_open_elements.data[i]; + assert( + node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); + gumbo_vector_add( + parser, (void*) node->v.element.tag, &extra_data->tag_stack); + } + return error; +} + +// Returns true if the specified token is either a start or end tag (specified +// by is_start) with one of the tag types in the varargs list. Terminate the +// list with GUMBO_TAG_LAST; this functions as a sentinel since no portion of +// the spec references tags that are not in the spec. +static bool tag_in( + const GumboToken* token, bool is_start, const gumbo_tagset tags) { + GumboTag token_tag; + if (is_start && token->type == GUMBO_TOKEN_START_TAG) { + token_tag = token->v.start_tag.tag; + } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) { + token_tag = token->v.end_tag; + } else { + return false; + } + return (token_tag < GUMBO_TAG_LAST && tags[(int) token_tag] != 0); +} + +// Like tag_in, but for the single-tag case. +static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) { + if (is_start && token->type == GUMBO_TOKEN_START_TAG) { + return token->v.start_tag.tag == tag; + } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) { + return token->v.end_tag == tag; + } else { + return false; + } +} + +// Like tag_in, but checks for the tag of a node, rather than a token. +static bool node_tag_in_set(const GumboNode* node, const gumbo_tagset tags) { + assert(node != NULL); + if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) { + return false; + } + return TAGSET_INCLUDES( + tags, node->v.element.tag_namespace, node->v.element.tag); +} + +// Like node_tag_in, but for the single-tag case. +static bool node_qualified_tag_is( + const GumboNode* node, GumboNamespaceEnum ns, GumboTag tag) { + assert(node); + return (node->type == GUMBO_NODE_ELEMENT || + node->type == GUMBO_NODE_TEMPLATE) && + node->v.element.tag == tag && node->v.element.tag_namespace == ns; +} + +// Like node_tag_in, but for the single-tag case in the HTML namespace +static bool node_html_tag_is(const GumboNode* node, GumboTag tag) { + return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag); +} + +static void push_template_insertion_mode( + GumboParser* parser, GumboInsertionMode mode) { + gumbo_vector_add( + parser, (void*) mode, &parser->_parser_state->_template_insertion_modes); +} + +static void pop_template_insertion_mode(GumboParser* parser) { + gumbo_vector_pop(parser, &parser->_parser_state->_template_insertion_modes); +} + +// Returns the current template insertion mode. If the stack of template +// insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL. +static GumboInsertionMode get_current_template_insertion_mode( + const GumboParser* parser) { + GumboVector* template_insertion_modes = + &parser->_parser_state->_template_insertion_modes; + if (template_insertion_modes->length == 0) { + return GUMBO_INSERTION_MODE_INITIAL; + } + return (GumboInsertionMode) + template_insertion_modes->data[(template_insertion_modes->length - 1)]; +} + +// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point +static bool is_mathml_integration_point(const GumboNode* node) { + return node_tag_in_set( + node, (gumbo_tagset){TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), + TAG_MATHML(MS), TAG_MATHML(MTEXT)}); +} + +// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#html-integration-point +static bool is_html_integration_point(const GumboNode* node) { + return node_tag_in_set(node, (gumbo_tagset){TAG_SVG(FOREIGNOBJECT), + TAG_SVG(DESC), TAG_SVG(TITLE)}) || + (node_qualified_tag_is( + node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) && + (attribute_matches( + &node->v.element.attributes, "encoding", "text/html") || + attribute_matches(&node->v.element.attributes, "encoding", + "application/xhtml+xml"))); +} + +// This represents a place to insert a node, consisting of a target parent and a +// child index within that parent. If the node should be inserted at the end of +// the parent's child, index will be -1. +typedef struct { + GumboNode* target; + int index; +} InsertionLocation; + +InsertionLocation get_appropriate_insertion_location( + GumboParser* parser, GumboNode* override_target) { + InsertionLocation retval = {override_target, -1}; + if (retval.target == NULL) { + // No override target; default to the current node, but special-case the + // root node since get_current_node() assumes the stack of open elements is + // non-empty. + retval.target = parser->_output->root != NULL ? get_current_node(parser) + : get_document_node(parser); + } + if (!parser->_parser_state->_foster_parent_insertions || + !node_tag_in_set(retval.target, (gumbo_tagset){TAG(TABLE), TAG(TBODY), + TAG(TFOOT), TAG(THEAD), TAG(TR)})) { + return retval; + } + + // Foster-parenting case. + int last_template_index = -1; + int last_table_index = -1; + GumboVector* open_elements = &parser->_parser_state->_open_elements; + for (unsigned int i = 0; i < open_elements->length; ++i) { + if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TEMPLATE)) { + last_template_index = i; + } + if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TABLE)) { + last_table_index = i; + } + } + if (last_template_index != -1 && + (last_table_index == -1 || last_template_index > last_table_index)) { + retval.target = open_elements->data[last_template_index]; + return retval; + } + if (last_table_index == -1) { + retval.target = open_elements->data[0]; + return retval; + } + GumboNode* last_table = open_elements->data[last_table_index]; + if (last_table->parent != NULL) { + retval.target = last_table->parent; + retval.index = last_table->index_within_parent; + return retval; + } + + retval.target = open_elements->data[last_table_index - 1]; + return retval; +} + +// Appends a node to the end of its parent, setting the "parent" and +// "index_within_parent" fields appropriately. +static void append_node( + GumboParser* parser, GumboNode* parent, GumboNode* node) { + assert(node->parent == NULL); + assert(node->index_within_parent == -1); + GumboVector* children; + if (parent->type == GUMBO_NODE_ELEMENT || + parent->type == GUMBO_NODE_TEMPLATE) { + children = &parent->v.element.children; + } else { + assert(parent->type == GUMBO_NODE_DOCUMENT); + children = &parent->v.document.children; + } + node->parent = parent; + node->index_within_parent = children->length; + gumbo_vector_add(parser, (void*) node, children); + assert(node->index_within_parent < children->length); +} + +// Inserts a node at the specified InsertionLocation, updating the +// "parent" and "index_within_parent" fields of it and all its siblings. +// If the index of the location is -1, this calls append_node. +static void insert_node( + GumboParser* parser, GumboNode* node, InsertionLocation location) { + assert(node->parent == NULL); + assert(node->index_within_parent == -1); + GumboNode* parent = location.target; + int index = location.index; + if (index != -1) { + GumboVector* children = NULL; + if (parent->type == GUMBO_NODE_ELEMENT || + parent->type == GUMBO_NODE_TEMPLATE) { + children = &parent->v.element.children; + } else if (parent->type == GUMBO_NODE_DOCUMENT) { + children = &parent->v.document.children; + assert(children->length == 0); + } else { + assert(0); + } + + assert(index >= 0); + assert((unsigned int) index < children->length); + node->parent = parent; + node->index_within_parent = index; + gumbo_vector_insert_at(parser, (void*) node, index, children); + assert(node->index_within_parent < children->length); + for (unsigned int i = index + 1; i < children->length; ++i) { + GumboNode* sibling = children->data[i]; + sibling->index_within_parent = i; + assert(sibling->index_within_parent < children->length); + } + } else { + append_node(parser, parent, node); + } +} + +static void maybe_flush_text_node_buffer(GumboParser* parser) { + GumboParserState* state = parser->_parser_state; + TextNodeBufferState* buffer_state = &state->_text_node; + if (buffer_state->_buffer.length == 0) { + return; + } + + assert(buffer_state->_type == GUMBO_NODE_WHITESPACE || + buffer_state->_type == GUMBO_NODE_TEXT || + buffer_state->_type == GUMBO_NODE_CDATA); + GumboNode* text_node = create_node(parser, buffer_state->_type); + GumboText* text_node_data = &text_node->v.text; + text_node_data->text = + gumbo_string_buffer_to_string(parser, &buffer_state->_buffer); + text_node_data->original_text.data = buffer_state->_start_original_text; + text_node_data->original_text.length = + state->_current_token->original_text.data - + buffer_state->_start_original_text; + text_node_data->start_pos = buffer_state->_start_position; + + gumbo_debug("Flushing text node buffer of %.*s.\n", + (int) buffer_state->_buffer.length, buffer_state->_buffer.data); + + InsertionLocation location = get_appropriate_insertion_location(parser, NULL); + if (location.target->type == GUMBO_NODE_DOCUMENT) { + // The DOM does not allow Document nodes to have Text children, so per the + // spec, they are dropped on the floor. + destroy_node(parser, text_node); + } else { + insert_node(parser, text_node, location); + } + + gumbo_string_buffer_clear(parser, &buffer_state->_buffer); + buffer_state->_type = GUMBO_NODE_WHITESPACE; + assert(buffer_state->_buffer.length == 0); +} + +static void record_end_of_element( + GumboToken* current_token, GumboElement* element) { + element->end_pos = current_token->position; + element->original_end_tag = current_token->type == GUMBO_TOKEN_END_TAG + ? current_token->original_text + : kGumboEmptyString; +} + +static GumboNode* pop_current_node(GumboParser* parser) { + GumboParserState* state = parser->_parser_state; + maybe_flush_text_node_buffer(parser); + if (state->_open_elements.length > 0) { + assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML)); + gumbo_debug("Popping %s node.\n", + gumbo_normalized_tagname(get_current_node(parser)->v.element.tag)); + } + GumboNode* current_node = gumbo_vector_pop(parser, &state->_open_elements); + if (!current_node) { + assert(state->_open_elements.length == 0); + return NULL; + } + assert(current_node->type == GUMBO_NODE_ELEMENT || + current_node->type == GUMBO_NODE_TEMPLATE); + bool is_closed_body_or_html_tag = + (node_html_tag_is(current_node, GUMBO_TAG_BODY) && + state->_closed_body_tag) || + (node_html_tag_is(current_node, GUMBO_TAG_HTML) && + state->_closed_html_tag); + if ((state->_current_token->type != GUMBO_TOKEN_END_TAG || + !node_html_tag_is(current_node, state->_current_token->v.end_tag)) && + !is_closed_body_or_html_tag) { + current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG; + } + if (!is_closed_body_or_html_tag) { + record_end_of_element(state->_current_token, ¤t_node->v.element); + } + return current_node; +} + +static void append_comment_node( + GumboParser* parser, GumboNode* node, const GumboToken* token) { + maybe_flush_text_node_buffer(parser); + GumboNode* comment = create_node(parser, GUMBO_NODE_COMMENT); + comment->type = GUMBO_NODE_COMMENT; + comment->parse_flags = GUMBO_INSERTION_NORMAL; + comment->v.text.text = token->v.text; + comment->v.text.original_text = token->original_text; + comment->v.text.start_pos = token->position; + append_node(parser, node, comment); +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context +static void clear_stack_to_table_row_context(GumboParser* parser) { + while (!node_tag_in_set(get_current_node(parser), + (gumbo_tagset){TAG(HTML), TAG(TR), TAG(TEMPLATE)})) { + pop_current_node(parser); + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context +static void clear_stack_to_table_context(GumboParser* parser) { + while (!node_tag_in_set(get_current_node(parser), + (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)})) { + pop_current_node(parser); + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context +void clear_stack_to_table_body_context(GumboParser* parser) { + while (!node_tag_in_set(get_current_node(parser), + (gumbo_tagset){TAG(HTML), TAG(TBODY), TAG(TFOOT), TAG(THEAD), + TAG(TEMPLATE)})) { + pop_current_node(parser); + } +} + +// Creates a parser-inserted element in the HTML namespace and returns it. +static GumboNode* create_element(GumboParser* parser, GumboTag tag) { + GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT); + GumboElement* element = &node->v.element; + gumbo_vector_init(parser, 1, &element->children); + gumbo_vector_init(parser, 0, &element->attributes); + element->tag = tag; + element->tag_namespace = GUMBO_NAMESPACE_HTML; + element->original_tag = kGumboEmptyString; + element->original_end_tag = kGumboEmptyString; + element->start_pos = (parser->_parser_state->_current_token) + ? parser->_parser_state->_current_token->position + : kGumboEmptySourcePosition; + element->end_pos = kGumboEmptySourcePosition; + return node; +} + +// Constructs an element from the given start tag token. +static GumboNode* create_element_from_token( + GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) { + assert(token->type == GUMBO_TOKEN_START_TAG); + GumboTokenStartTag* start_tag = &token->v.start_tag; + + GumboNodeType type = (tag_namespace == GUMBO_NAMESPACE_HTML && + start_tag->tag == GUMBO_TAG_TEMPLATE) + ? GUMBO_NODE_TEMPLATE + : GUMBO_NODE_ELEMENT; + + GumboNode* node = create_node(parser, type); + GumboElement* element = &node->v.element; + gumbo_vector_init(parser, 1, &element->children); + element->attributes = start_tag->attributes; + element->tag = start_tag->tag; + element->tag_namespace = tag_namespace; + + assert(token->original_text.length >= 2); + assert(token->original_text.data[0] == '<'); + assert(token->original_text.data[token->original_text.length - 1] == '>'); + element->original_tag = token->original_text; + element->start_pos = token->position; + element->original_end_tag = kGumboEmptyString; + element->end_pos = kGumboEmptySourcePosition; + + // The element takes ownership of the attributes from the token, so any + // allocated-memory fields should be nulled out. + start_tag->attributes = kGumboEmptyVector; + return node; +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-an-html-element +static void insert_element(GumboParser* parser, GumboNode* node, + bool is_reconstructing_formatting_elements) { + GumboParserState* state = parser->_parser_state; + // NOTE(jdtang): The text node buffer must always be flushed before inserting + // a node, otherwise we're handling nodes in a different order than the spec + // mandated. However, one clause of the spec (character tokens in the body) + // requires that we reconstruct the active formatting elements *before* adding + // the character, and reconstructing the active formatting elements may itself + // result in the insertion of new elements (which should be pushed onto the + // stack of open elements before the buffer is flushed). We solve this (for + // the time being, the spec has been rewritten for <template> and the new + // version may be simpler here) with a boolean flag to this method. + if (!is_reconstructing_formatting_elements) { + maybe_flush_text_node_buffer(parser); + } + InsertionLocation location = get_appropriate_insertion_location(parser, NULL); + insert_node(parser, node, location); + gumbo_vector_add(parser, (void*) node, &state->_open_elements); +} + +// Convenience method that combines create_element_from_token and +// insert_element, inserting the generated element directly into the current +// node. Returns the node inserted. +static GumboNode* insert_element_from_token( + GumboParser* parser, GumboToken* token) { + GumboNode* element = + create_element_from_token(parser, token, GUMBO_NAMESPACE_HTML); + insert_element(parser, element, false); + gumbo_debug("Inserting <%s> element (@%x) from token.\n", + gumbo_normalized_tagname(element->v.element.tag), element); + return element; +} + +// Convenience method that combines create_element and insert_element, inserting +// a parser-generated element of a specific tag type. Returns the node +// inserted. +static GumboNode* insert_element_of_tag_type( + GumboParser* parser, GumboTag tag, GumboParseFlags reason) { + GumboNode* element = create_element(parser, tag); + element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason; + insert_element(parser, element, false); + gumbo_debug("Inserting %s element (@%x) from tag type.\n", + gumbo_normalized_tagname(tag), element); + return element; +} + +// Convenience method for creating foreign namespaced element. Returns the node +// inserted. +static GumboNode* insert_foreign_element( + GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) { + assert(token->type == GUMBO_TOKEN_START_TAG); + GumboNode* element = create_element_from_token(parser, token, tag_namespace); + insert_element(parser, element, false); + if (token_has_attribute(token, "xmlns") && + !attribute_matches_case_sensitive(&token->v.start_tag.attributes, "xmlns", + kLegalXmlns[tag_namespace])) { + // TODO(jdtang): Since there're multiple possible error codes here, we + // eventually need reason codes to differentiate them. + parser_add_parse_error(parser, token); + } + if (token_has_attribute(token, "xmlns:xlink") && + !attribute_matches_case_sensitive(&token->v.start_tag.attributes, + "xmlns:xlink", "http://www.w3.org/1999/xlink")) { + parser_add_parse_error(parser, token); + } + return element; +} + +static void insert_text_token(GumboParser* parser, GumboToken* token) { + assert(token->type == GUMBO_TOKEN_WHITESPACE || + token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_NULL || token->type == GUMBO_TOKEN_CDATA); + TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node; + if (buffer_state->_buffer.length == 0) { + // Initialize position fields. + buffer_state->_start_original_text = token->original_text.data; + buffer_state->_start_position = token->position; + } + gumbo_string_buffer_append_codepoint( + parser, token->v.character, &buffer_state->_buffer); + if (token->type == GUMBO_TOKEN_CHARACTER) { + buffer_state->_type = GUMBO_NODE_TEXT; + } else if (token->type == GUMBO_TOKEN_CDATA) { + buffer_state->_type = GUMBO_NODE_CDATA; + } + gumbo_debug("Inserting text token '%c'.\n", token->v.character); +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generic-rcdata-element-parsing-algorithm +static void run_generic_parsing_algorithm( + GumboParser* parser, GumboToken* token, GumboTokenizerEnum lexer_state) { + insert_element_from_token(parser, token); + gumbo_tokenizer_set_state(parser, lexer_state); + parser->_parser_state->_original_insertion_mode = + parser->_parser_state->_insertion_mode; + parser->_parser_state->_insertion_mode = GUMBO_INSERTION_MODE_TEXT; +} + +static void acknowledge_self_closing_tag(GumboParser* parser) { + parser->_parser_state->_self_closing_flag_acknowledged = true; +} + +// Returns true if there's an anchor tag in the list of active formatting +// elements, and fills in its index if so. +static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) { + GumboVector* elements = &parser->_parser_state->_active_formatting_elements; + for (int i = elements->length; --i >= 0;) { + GumboNode* node = elements->data[i]; + if (node == &kActiveFormattingScopeMarker) { + return false; + } + if (node_html_tag_is(node, GUMBO_TAG_A)) { + *anchor_index = i; + return true; + } + } + return false; +} + +// Counts the number of open formatting elements in the list of active +// formatting elements (after the last active scope marker) that have a specific +// tag. If this is > 0, then earliest_matching_index will be filled in with the +// index of the first such element. +static int count_formatting_elements_of_tag(GumboParser* parser, + const GumboNode* desired_node, int* earliest_matching_index) { + const GumboElement* desired_element = &desired_node->v.element; + GumboVector* elements = &parser->_parser_state->_active_formatting_elements; + int num_identical_elements = 0; + for (int i = elements->length; --i >= 0;) { + GumboNode* node = elements->data[i]; + if (node == &kActiveFormattingScopeMarker) { + break; + } + assert(node->type == GUMBO_NODE_ELEMENT); + if (node_qualified_tag_is( + node, desired_element->tag_namespace, desired_element->tag) && + all_attributes_match( + &node->v.element.attributes, &desired_element->attributes)) { + num_identical_elements++; + *earliest_matching_index = i; + } + } + return num_identical_elements; +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reconstruct-the-active-formatting-elements +static void add_formatting_element(GumboParser* parser, const GumboNode* node) { + assert(node == &kActiveFormattingScopeMarker || + node->type == GUMBO_NODE_ELEMENT); + GumboVector* elements = &parser->_parser_state->_active_formatting_elements; + if (node == &kActiveFormattingScopeMarker) { + gumbo_debug("Adding a scope marker.\n"); + } else { + gumbo_debug("Adding a formatting element.\n"); + } + + // Hunt for identical elements. + int earliest_identical_element = elements->length; + int num_identical_elements = count_formatting_elements_of_tag( + parser, node, &earliest_identical_element); + + // Noah's Ark clause: if there're at least 3, remove the earliest. + if (num_identical_elements >= 3) { + gumbo_debug("Noah's ark clause: removing element at %d.\n", + earliest_identical_element); + gumbo_vector_remove_at(parser, earliest_identical_element, elements); + } + + gumbo_vector_add(parser, (void*) node, elements); +} + +static bool is_open_element(GumboParser* parser, const GumboNode* node) { + GumboVector* open_elements = &parser->_parser_state->_open_elements; + for (unsigned int i = 0; i < open_elements->length; ++i) { + if (open_elements->data[i] == node) { + return true; + } + } + return false; +} + +// Clones attributes, tags, etc. of a node, but does not copy the content. The +// clone shares no structure with the original node: all owned strings and +// values are fresh copies. +GumboNode* clone_node( + GumboParser* parser, GumboNode* node, GumboParseFlags reason) { + assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); + GumboNode* new_node = gumbo_parser_allocate(parser, sizeof(GumboNode)); + *new_node = *node; + new_node->parent = NULL; + new_node->index_within_parent = -1; + // Clear the GUMBO_INSERTION_IMPLICIT_END_TAG flag, as the cloned node may + // have a separate end tag. + new_node->parse_flags &= ~GUMBO_INSERTION_IMPLICIT_END_TAG; + new_node->parse_flags |= reason | GUMBO_INSERTION_BY_PARSER; + GumboElement* element = &new_node->v.element; + gumbo_vector_init(parser, 1, &element->children); + + const GumboVector* old_attributes = &node->v.element.attributes; + gumbo_vector_init(parser, old_attributes->length, &element->attributes); + for (unsigned int i = 0; i < old_attributes->length; ++i) { + const GumboAttribute* old_attr = old_attributes->data[i]; + GumboAttribute* attr = + gumbo_parser_allocate(parser, sizeof(GumboAttribute)); + *attr = *old_attr; + attr->name = gumbo_copy_stringz(parser, old_attr->name); + attr->value = gumbo_copy_stringz(parser, old_attr->value); + gumbo_vector_add(parser, attr, &element->attributes); + } + return new_node; +} + +// "Reconstruct active formatting elements" part of the spec. +// This implementation is based on the html5lib translation from the mess of +// GOTOs in the spec to reasonably structured programming. +// http://code.google.com/p/html5lib/source/browse/python/html5lib/treebuilders/_base.py +static void reconstruct_active_formatting_elements(GumboParser* parser) { + GumboVector* elements = &parser->_parser_state->_active_formatting_elements; + // Step 1 + if (elements->length == 0) { + return; + } + + // Step 2 & 3 + unsigned int i = elements->length - 1; + GumboNode* element = elements->data[i]; + if (element == &kActiveFormattingScopeMarker || + is_open_element(parser, element)) { + return; + } + + // Step 6 + do { + if (i == 0) { + // Step 4 + i = -1; // Incremented to 0 below. + break; + } + // Step 5 + element = elements->data[--i]; + } while (element != &kActiveFormattingScopeMarker && + !is_open_element(parser, element)); + + ++i; + gumbo_debug("Reconstructing elements from %d on %s parent.\n", i, + gumbo_normalized_tagname(get_current_node(parser)->v.element.tag)); + for (; i < elements->length; ++i) { + // Step 7 & 8. + assert(elements->length > 0); + assert(i < elements->length); + element = elements->data[i]; + assert(element != &kActiveFormattingScopeMarker); + GumboNode* clone = clone_node( + parser, element, GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT); + // Step 9. + InsertionLocation location = + get_appropriate_insertion_location(parser, NULL); + insert_node(parser, clone, location); + gumbo_vector_add( + parser, (void*) clone, &parser->_parser_state->_open_elements); + + // Step 10. + elements->data[i] = clone; + gumbo_debug("Reconstructed %s element at %d.\n", + gumbo_normalized_tagname(clone->v.element.tag), i); + } +} + +static void clear_active_formatting_elements(GumboParser* parser) { + GumboVector* elements = &parser->_parser_state->_active_formatting_elements; + int num_elements_cleared = 0; + const GumboNode* node; + do { + node = gumbo_vector_pop(parser, elements); + ++num_elements_cleared; + } while (node && node != &kActiveFormattingScopeMarker); + gumbo_debug("Cleared %d elements from active formatting list.\n", + num_elements_cleared); +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-initial-insertion-mode +static GumboQuirksModeEnum compute_quirks_mode( + const GumboTokenDocType* doctype) { + if (doctype->force_quirks || strcmp(doctype->name, kDoctypeHtml.data) || + is_in_static_list( + doctype->public_identifier, kQuirksModePublicIdPrefixes, false) || + is_in_static_list( + doctype->public_identifier, kQuirksModePublicIdExactMatches, true) || + is_in_static_list( + doctype->system_identifier, kQuirksModeSystemIdExactMatches, true) || + (is_in_static_list(doctype->public_identifier, + kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false) && + !doctype->has_system_identifier)) { + return GUMBO_DOCTYPE_QUIRKS; + } else if (is_in_static_list(doctype->public_identifier, + kLimitedQuirksPublicIdPrefixes, false) || + (is_in_static_list(doctype->public_identifier, + kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false) && + doctype->has_system_identifier)) { + return GUMBO_DOCTYPE_LIMITED_QUIRKS; + } + return GUMBO_DOCTYPE_NO_QUIRKS; +} + +// The following functions are all defined by the "has an element in __ scope" +// sections of the HTML5 spec: +// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-the-specific-scope +// The basic idea behind them is that they check for an element of the given +// qualified name, contained within a scope formed by a set of other qualified +// names. For example, "has an element in list scope" looks for an element of +// the given qualified name within the nearest enclosing <ol> or <ul>, along +// with a bunch of generic element types that serve to "firewall" their content +// from the rest of the document. Note that because of the way the spec is +// written, +// all elements are expected to be in the HTML namespace +static bool has_an_element_in_specific_scope(GumboParser* parser, + int expected_size, const GumboTag* expected, bool negate, + const gumbo_tagset tags) { + GumboVector* open_elements = &parser->_parser_state->_open_elements; + for (int i = open_elements->length; --i >= 0;) { + const GumboNode* node = open_elements->data[i]; + if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) + continue; + + GumboTag node_tag = node->v.element.tag; + GumboNamespaceEnum node_ns = node->v.element.tag_namespace; + for (int j = 0; j < expected_size; ++j) { + if (node_tag == expected[j] && node_ns == GUMBO_NAMESPACE_HTML) + return true; + } + + bool found = TAGSET_INCLUDES(tags, node_ns, node_tag); + if (negate != found) return false; + } + return false; +} + +// Checks for the presence of an open element of the specified tag type. +static bool has_open_element(GumboParser* parser, GumboTag tag) { + return has_an_element_in_specific_scope( + parser, 1, &tag, false, (gumbo_tagset){TAG(HTML)}); +} + +// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-scope +static bool has_an_element_in_scope(GumboParser* parser, GumboTag tag) { + return has_an_element_in_specific_scope(parser, 1, &tag, false, + (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), + TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), + TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT), + TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), + TAG_SVG(TITLE)}); +} + +// Like "has an element in scope", but for the specific case of looking for a +// unique target node, not for any node with a given tag name. This duplicates +// much of the algorithm from has_an_element_in_specific_scope because the +// predicate is different when checking for an exact node, and it's easier & +// faster just to duplicate the code for this one case than to try and +// parameterize it. +static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) { + GumboVector* open_elements = &parser->_parser_state->_open_elements; + for (int i = open_elements->length; --i >= 0;) { + const GumboNode* current = open_elements->data[i]; + if (current == node) { + return true; + } + if (current->type != GUMBO_NODE_ELEMENT && + current->type != GUMBO_NODE_TEMPLATE) { + continue; + } + if (node_tag_in_set(current, + (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), + TAG(TD), TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), + TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), + TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML), + TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE)})) { + return false; + } + } + assert(false); + return false; +} + +// Like has_an_element_in_scope, but restricts the expected qualified name to a +// range of possible qualified names instead of just a single one. +static bool has_an_element_in_scope_with_tagname( + GumboParser* parser, int expected_len, const GumboTag expected[]) { + return has_an_element_in_specific_scope(parser, expected_len, expected, false, + (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), + TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), + TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT), + TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), + TAG_SVG(TITLE)}); +} + +// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-list-item-scope +static bool has_an_element_in_list_scope(GumboParser* parser, GumboTag tag) { + return has_an_element_in_specific_scope(parser, 1, &tag, false, + (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), + TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), + TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT), + TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), + TAG_SVG(TITLE), TAG(OL), TAG(UL)}); +} + +// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-button-scope +static bool has_an_element_in_button_scope(GumboParser* parser, GumboTag tag) { + return has_an_element_in_specific_scope(parser, 1, &tag, false, + (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), + TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), + TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT), + TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), + TAG_SVG(TITLE), TAG(BUTTON)}); +} + +// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-table-scope +static bool has_an_element_in_table_scope(GumboParser* parser, GumboTag tag) { + return has_an_element_in_specific_scope(parser, 1, &tag, false, + (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)}); +} + +// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-select-scope +static bool has_an_element_in_select_scope(GumboParser* parser, GumboTag tag) { + return has_an_element_in_specific_scope( + parser, 1, &tag, true, (gumbo_tagset){TAG(OPTGROUP), TAG(OPTION)}); +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generate-implied-end-tags +// "exception" is the "element to exclude from the process" listed in the spec. +// Pass GUMBO_TAG_LAST to not exclude any of them. +static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) { + for (; node_tag_in_set(get_current_node(parser), + (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(OPTION), + TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RB), TAG(RT), TAG(RTC)}) && + !node_html_tag_is(get_current_node(parser), exception); + pop_current_node(parser)) + ; +} + +// This is the "generate all implied end tags thoroughly" clause of the spec. +// https://html.spec.whatwg.org/multipage/syntax.html#closing-elements-that-have-implied-end-tags +static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) { + for ( + ; node_tag_in_set(get_current_node(parser), + (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), + TAG(OPTION), TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RT), TAG(RTC), + TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(HEAD), TAG(TR)}); + pop_current_node(parser)) + ; +} + +// This factors out the clauses relating to "act as if an end tag token with tag +// name "table" had been seen. Returns true if there's a table element in table +// scope which was successfully closed, false if not and the token should be +// ignored. Does not add parse errors; callers should handle that. +static bool close_table(GumboParser* parser) { + if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TABLE)) { + return false; + } + + GumboNode* node = pop_current_node(parser); + while (!node_html_tag_is(node, GUMBO_TAG_TABLE)) { + node = pop_current_node(parser); + } + reset_insertion_mode_appropriately(parser); + return true; +} + +// This factors out the clauses relating to "act as if an end tag token with tag +// name `cell_tag` had been seen". +static bool close_table_cell( + GumboParser* parser, const GumboToken* token, GumboTag cell_tag) { + bool result = true; + generate_implied_end_tags(parser, GUMBO_TAG_LAST); + const GumboNode* node = get_current_node(parser); + if (!node_html_tag_is(node, cell_tag)) { + parser_add_parse_error(parser, token); + result = false; + } + do { + node = pop_current_node(parser); + } while (!node_html_tag_is(node, cell_tag)); + + clear_active_formatting_elements(parser); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); + return result; +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#close-the-cell +// This holds the logic to determine whether we should close a <td> or a <th>. +static bool close_current_cell(GumboParser* parser, const GumboToken* token) { + if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) { + assert(!has_an_element_in_table_scope(parser, GUMBO_TAG_TH)); + return close_table_cell(parser, token, GUMBO_TAG_TD); + } else { + assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH)); + return close_table_cell(parser, token, GUMBO_TAG_TH); + } +} + +// This factors out the "act as if an end tag of tag name 'select' had been +// seen" clause of the spec, since it's referenced in several places. It pops +// all nodes from the stack until the current <select> has been closed, then +// resets the insertion mode appropriately. +static void close_current_select(GumboParser* parser) { + GumboNode* node = pop_current_node(parser); + while (!node_html_tag_is(node, GUMBO_TAG_SELECT)) { + node = pop_current_node(parser); + } + reset_insertion_mode_appropriately(parser); +} + +// The list of nodes in the "special" category: +// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#special +static bool is_special_node(const GumboNode* node) { + assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); + return node_tag_in_set(node, + (gumbo_tagset){TAG(ADDRESS), TAG(APPLET), TAG(AREA), TAG(ARTICLE), + TAG(ASIDE), TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(BLOCKQUOTE), + TAG(BODY), TAG(BR), TAG(BUTTON), TAG(CAPTION), TAG(CENTER), TAG(COL), + TAG(COLGROUP), TAG(MENUITEM), TAG(DD), TAG(DETAILS), TAG(DIR), + TAG(DIV), TAG(DL), TAG(DT), TAG(EMBED), TAG(FIELDSET), + TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(FORM), TAG(FRAME), + TAG(FRAMESET), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6), + TAG(HEAD), TAG(HEADER), TAG(HGROUP), TAG(HR), TAG(HTML), TAG(IFRAME), + TAG(IMG), TAG(INPUT), TAG(ISINDEX), TAG(LI), TAG(LINK), TAG(LISTING), + TAG(MARQUEE), TAG(MENU), TAG(META), TAG(NAV), TAG(NOEMBED), + TAG(NOFRAMES), TAG(NOSCRIPT), TAG(OBJECT), TAG(OL), TAG(P), + TAG(PARAM), TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SECTION), + TAG(SELECT), TAG(STYLE), TAG(SUMMARY), TAG(TABLE), TAG(TBODY), + TAG(TD), TAG(TEMPLATE), TAG(TEXTAREA), TAG(TFOOT), TAG(TH), + TAG(THEAD), TAG(TITLE), TAG(TR), TAG(UL), TAG(WBR), TAG(XMP), + + TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), + TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML), + + TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC)}); +} + +// Implicitly closes currently open elements until it reaches an element with +// the +// specified qualified name. If the elements closed are in the set handled by +// generate_implied_end_tags, this is normal operation and this function returns +// true. Otherwise, a parse error is recorded and this function returns false. +static bool implicitly_close_tags(GumboParser* parser, GumboToken* token, + GumboNamespaceEnum target_ns, GumboTag target) { + bool result = true; + generate_implied_end_tags(parser, target); + if (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) { + parser_add_parse_error(parser, token); + while ( + !node_qualified_tag_is(get_current_node(parser), target_ns, target)) { + pop_current_node(parser); + } + result = false; + } + assert(node_qualified_tag_is(get_current_node(parser), target_ns, target)); + pop_current_node(parser); + return result; +} + +// If the stack of open elements has a <p> tag in button scope, this acts as if +// a </p> tag was encountered, implicitly closing tags. Returns false if a +// parse error occurs. This is a convenience function because this particular +// clause appears several times in the spec. +static bool maybe_implicitly_close_p_tag( + GumboParser* parser, GumboToken* token) { + if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) { + return implicitly_close_tags( + parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P); + } + return true; +} + +// Convenience function to encapsulate the logic for closing <li> or <dd>/<dt> +// tags. Pass true to is_li for handling <li> tags, false for <dd> and <dt>. +static void maybe_implicitly_close_list_tag( + GumboParser* parser, GumboToken* token, bool is_li) { + GumboParserState* state = parser->_parser_state; + state->_frameset_ok = false; + for (int i = state->_open_elements.length; --i >= 0;) { + const GumboNode* node = state->_open_elements.data[i]; + bool is_list_tag = + is_li ? node_html_tag_is(node, GUMBO_TAG_LI) + : node_tag_in_set(node, (gumbo_tagset){TAG(DD), TAG(DT)}); + if (is_list_tag) { + implicitly_close_tags( + parser, token, node->v.element.tag_namespace, node->v.element.tag); + return; + } + if (is_special_node(node) && + !node_tag_in_set( + node, (gumbo_tagset){TAG(ADDRESS), TAG(DIV), TAG(P)})) { + return; + } + } +} + +static void merge_attributes( + GumboParser* parser, GumboToken* token, GumboNode* node) { + assert(token->type == GUMBO_TOKEN_START_TAG); + assert(node->type == GUMBO_NODE_ELEMENT); + const GumboVector* token_attr = &token->v.start_tag.attributes; + GumboVector* node_attr = &node->v.element.attributes; + + for (unsigned int i = 0; i < token_attr->length; ++i) { + GumboAttribute* attr = token_attr->data[i]; + if (!gumbo_get_attribute(node_attr, attr->name)) { + // Ownership of the attribute is transferred by this gumbo_vector_add, + // so it has to be nulled out of the original token so it doesn't get + // double-deleted. + gumbo_vector_add(parser, attr, node_attr); + token_attr->data[i] = NULL; + } + } + // When attributes are merged, it means the token has been ignored and merged + // with another token, so we need to free its memory. The attributes that are + // transferred need to be nulled-out in the vector above so that they aren't + // double-deleted. + gumbo_token_destroy(parser, token); + +#ifndef NDEBUG + // Mark this sentinel so the assertion in the main loop knows it's been + // destroyed. + token->v.start_tag.attributes = kGumboEmptyVector; +#endif +} + +const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tag) { + for (size_t i = 0; i < sizeof(kSvgTagReplacements) / sizeof(ReplacementEntry); + ++i) { + const ReplacementEntry* entry = &kSvgTagReplacements[i]; + if (gumbo_string_equals_ignore_case(tag, &entry->from)) { + return entry->to.data; + } + } + return NULL; +} + +// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes +// This destructively modifies any matching attributes on the token and sets the +// namespace appropriately. +static void adjust_foreign_attributes(GumboParser* parser, GumboToken* token) { + assert(token->type == GUMBO_TOKEN_START_TAG); + const GumboVector* attributes = &token->v.start_tag.attributes; + for (size_t i = 0; i < sizeof(kForeignAttributeReplacements) / + sizeof(NamespacedAttributeReplacement); + ++i) { + const NamespacedAttributeReplacement* entry = + &kForeignAttributeReplacements[i]; + GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from); + if (!attr) { + continue; + } + gumbo_parser_deallocate(parser, (void*) attr->name); + attr->attr_namespace = entry->attr_namespace; + attr->name = gumbo_copy_stringz(parser, entry->local_name); + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-svg-attributes +// This destructively modifies any matching attributes on the token. +static void adjust_svg_attributes(GumboParser* parser, GumboToken* token) { + assert(token->type == GUMBO_TOKEN_START_TAG); + const GumboVector* attributes = &token->v.start_tag.attributes; + for (size_t i = 0; + i < sizeof(kSvgAttributeReplacements) / sizeof(ReplacementEntry); ++i) { + const ReplacementEntry* entry = &kSvgAttributeReplacements[i]; + GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from.data); + if (!attr) { + continue; + } + gumbo_parser_deallocate(parser, (void*) attr->name); + attr->name = gumbo_copy_stringz(parser, entry->to.data); + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-mathml-attributes +// Note that this may destructively modify the token with the new attribute +// value. +static void adjust_mathml_attributes(GumboParser* parser, GumboToken* token) { + assert(token->type == GUMBO_TOKEN_START_TAG); + GumboAttribute* attr = + gumbo_get_attribute(&token->v.start_tag.attributes, "definitionurl"); + if (!attr) { + return; + } + gumbo_parser_deallocate(parser, (void*) attr->name); + attr->name = gumbo_copy_stringz(parser, "definitionURL"); +} + +static bool doctype_matches(const GumboTokenDocType* doctype, + const GumboStringPiece* public_id, const GumboStringPiece* system_id, + bool allow_missing_system_id) { + return !strcmp(doctype->public_identifier, public_id->data) && + (allow_missing_system_id || doctype->has_system_identifier) && + !strcmp(doctype->system_identifier, system_id->data); +} + +static bool maybe_add_doctype_error( + GumboParser* parser, const GumboToken* token) { + const GumboTokenDocType* doctype = &token->v.doc_type; + bool html_doctype = !strcmp(doctype->name, kDoctypeHtml.data); + if ((!html_doctype || doctype->has_public_identifier || + (doctype->has_system_identifier && + !strcmp( + doctype->system_identifier, kSystemIdLegacyCompat.data))) && + !(html_doctype && (doctype_matches(doctype, &kPublicIdHtml4_0, + &kSystemIdRecHtml4_0, true) || + doctype_matches(doctype, &kPublicIdHtml4_01, + &kSystemIdHtml4, true) || + doctype_matches(doctype, &kPublicIdXhtml1_0, + &kSystemIdXhtmlStrict1_1, false) || + doctype_matches(doctype, &kPublicIdXhtml1_1, + &kSystemIdXhtml1_1, false)))) { + parser_add_parse_error(parser, token); + return false; + } + return true; +} + +static void remove_from_parent(GumboParser* parser, GumboNode* node) { + if (!node->parent) { + // The node may not have a parent if, for example, it is a newly-cloned copy + // of an active formatting element. DOM manipulations continue with the + // orphaned fragment of the DOM tree until it's appended/foster-parented to + // the common ancestor at the end of the adoption agency algorithm. + return; + } + assert(node->parent->type == GUMBO_NODE_ELEMENT); + GumboVector* children = &node->parent->v.element.children; + int index = gumbo_vector_index_of(children, node); + assert(index != -1); + + gumbo_vector_remove_at(parser, index, children); + node->parent = NULL; + node->index_within_parent = -1; + for (unsigned int i = index; i < children->length; ++i) { + GumboNode* child = children->data[i]; + child->index_within_parent = i; + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser +// Also described in the "in body" handling for end formatting tags. +static bool adoption_agency_algorithm( + GumboParser* parser, GumboToken* token, GumboTag subject) { + GumboParserState* state = parser->_parser_state; + gumbo_debug("Entering adoption agency algorithm.\n"); + // Step 1. + GumboNode* current_node = get_current_node(parser); + if (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML && + current_node->v.element.tag == subject && + gumbo_vector_index_of( + &state->_active_formatting_elements, current_node) == -1) { + pop_current_node(parser); + return false; + } + // Steps 2-4 & 20: + for (unsigned int i = 0; i < 8; ++i) { + // Step 5. + GumboNode* formatting_node = NULL; + int formatting_node_in_open_elements = -1; + for (int j = state->_active_formatting_elements.length; --j >= 0;) { + GumboNode* current_node = state->_active_formatting_elements.data[j]; + if (current_node == &kActiveFormattingScopeMarker) { + gumbo_debug("Broke on scope marker; aborting.\n"); + // Last scope marker; abort the algorithm. + return false; + } + if (node_html_tag_is(current_node, subject)) { + // Found it. + formatting_node = current_node; + formatting_node_in_open_elements = + gumbo_vector_index_of(&state->_open_elements, formatting_node); + gumbo_debug("Formatting element of tag %s at %d.\n", + gumbo_normalized_tagname(subject), + formatting_node_in_open_elements); + break; + } + } + if (!formatting_node) { + // No matching tag; not a parse error outright, but fall through to the + // "any other end tag" clause (which may potentially add a parse error, + // but not always). + gumbo_debug("No active formatting elements; aborting.\n"); + return false; + } + + // Step 6 + if (formatting_node_in_open_elements == -1) { + gumbo_debug("Formatting node not on stack of open elements.\n"); + parser_add_parse_error(parser, token); + gumbo_vector_remove( + parser, formatting_node, &state->_active_formatting_elements); + return false; + } + + // Step 7 + if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) { + parser_add_parse_error(parser, token); + gumbo_debug("Element not in scope.\n"); + return false; + } + + // Step 8 + if (formatting_node != get_current_node(parser)) { + parser_add_parse_error(parser, token); // But continue onwards. + } + assert(formatting_node); + assert(!node_html_tag_is(formatting_node, GUMBO_TAG_HTML)); + assert(!node_html_tag_is(formatting_node, GUMBO_TAG_BODY)); + + // Step 9 & 10 + GumboNode* furthest_block = NULL; + for (unsigned int j = formatting_node_in_open_elements; + j < state->_open_elements.length; ++j) { + assert(j > 0); + GumboNode* current = state->_open_elements.data[j]; + if (is_special_node(current)) { + // Step 9. + furthest_block = current; + break; + } + } + if (!furthest_block) { + // Step 10. + while (get_current_node(parser) != formatting_node) { + pop_current_node(parser); + } + // And the formatting element itself. + pop_current_node(parser); + gumbo_vector_remove( + parser, formatting_node, &state->_active_formatting_elements); + return false; + } + assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML)); + assert(furthest_block); + + // Step 11. + // Elements may be moved and reparented by this algorithm, so + // common_ancestor is not necessarily the same as formatting_node->parent. + GumboNode* common_ancestor = + state->_open_elements.data[gumbo_vector_index_of(&state->_open_elements, + formatting_node) - + 1]; + gumbo_debug("Common ancestor tag = %s, furthest block tag = %s.\n", + gumbo_normalized_tagname(common_ancestor->v.element.tag), + gumbo_normalized_tagname(furthest_block->v.element.tag)); + + // Step 12. + int bookmark = gumbo_vector_index_of( + &state->_active_formatting_elements, formatting_node) + + 1; + gumbo_debug("Bookmark at %d.\n", bookmark); + // Step 13. + GumboNode* node = furthest_block; + GumboNode* last_node = furthest_block; + // Must be stored explicitly, in case node is removed from the stack of open + // elements, to handle step 9.4. + int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node); + assert(saved_node_index > 0); + // Step 13.1. + for (int j = 0;;) { + // Step 13.2. + ++j; + // Step 13.3. + int node_index = gumbo_vector_index_of(&state->_open_elements, node); + gumbo_debug( + "Current index: %d, last index: %d.\n", node_index, saved_node_index); + if (node_index == -1) { + node_index = saved_node_index; + } + saved_node_index = --node_index; + assert(node_index > 0); + assert((unsigned int) node_index < state->_open_elements.capacity); + node = state->_open_elements.data[node_index]; + assert(node->parent); + if (node == formatting_node) { + // Step 13.4. + break; + } + int formatting_index = + gumbo_vector_index_of(&state->_active_formatting_elements, node); + if (j > 3 && formatting_index != -1) { + // Step 13.5. + gumbo_debug("Removing formatting element at %d.\n", formatting_index); + gumbo_vector_remove_at( + parser, formatting_index, &state->_active_formatting_elements); + // Removing the element shifts all indices over by one, so we may need + // to move the bookmark. + if (formatting_index < bookmark) { + --bookmark; + gumbo_debug("Moving bookmark to %d.\n", bookmark); + } + continue; + } + if (formatting_index == -1) { + // Step 13.6. + gumbo_vector_remove_at(parser, node_index, &state->_open_elements); + continue; + } + // Step 13.7. + // "common ancestor as the intended parent" doesn't actually mean insert + // it into the common ancestor; that happens below. + node = clone_node(parser, node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED); + assert(formatting_index >= 0); + state->_active_formatting_elements.data[formatting_index] = node; + assert(node_index >= 0); + state->_open_elements.data[node_index] = node; + // Step 13.8. + if (last_node == furthest_block) { + bookmark = formatting_index + 1; + gumbo_debug("Bookmark moved to %d.\n", bookmark); + assert((unsigned int) bookmark <= state->_active_formatting_elements.length); + } + // Step 13.9. + last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED; + remove_from_parent(parser, last_node); + append_node(parser, node, last_node); + // Step 13.10. + last_node = node; + } // Step 13.11. + + // Step 14. + gumbo_debug("Removing %s node from parent ", + gumbo_normalized_tagname(last_node->v.element.tag)); + remove_from_parent(parser, last_node); + last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED; + InsertionLocation location = + get_appropriate_insertion_location(parser, common_ancestor); + gumbo_debug("and inserting it into %s.\n", + gumbo_normalized_tagname(location.target->v.element.tag)); + insert_node(parser, last_node, location); + + // Step 15. + GumboNode* new_formatting_node = clone_node( + parser, formatting_node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED); + formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG; + + // Step 16. Instead of appending nodes one-by-one, we swap the children + // vector of furthest_block with the empty children of new_formatting_node, + // reducing memory traffic and allocations. We still have to reset their + // parent pointers, though. + GumboVector temp = new_formatting_node->v.element.children; + new_formatting_node->v.element.children = + furthest_block->v.element.children; + furthest_block->v.element.children = temp; + + temp = new_formatting_node->v.element.children; + for (unsigned int i = 0; i < temp.length; ++i) { + GumboNode* child = temp.data[i]; + child->parent = new_formatting_node; + } + + // Step 17. + append_node(parser, furthest_block, new_formatting_node); + + // Step 18. + // If the formatting node was before the bookmark, it may shift over all + // indices after it, so we need to explicitly find the index and possibly + // adjust the bookmark. + int formatting_node_index = gumbo_vector_index_of( + &state->_active_formatting_elements, formatting_node); + assert(formatting_node_index != -1); + if (formatting_node_index < bookmark) { + gumbo_debug( + "Formatting node at %d is before bookmark at %d; decrementing.\n", + formatting_node_index, bookmark); + --bookmark; + } + gumbo_vector_remove_at( + parser, formatting_node_index, &state->_active_formatting_elements); + assert(bookmark >= 0); + assert((unsigned int) bookmark <= state->_active_formatting_elements.length); + gumbo_vector_insert_at(parser, new_formatting_node, bookmark, + &state->_active_formatting_elements); + + // Step 19. + gumbo_vector_remove(parser, formatting_node, &state->_open_elements); + int insert_at = + gumbo_vector_index_of(&state->_open_elements, furthest_block) + 1; + assert(insert_at >= 0); + assert((unsigned int) insert_at <= state->_open_elements.length); + gumbo_vector_insert_at( + parser, new_formatting_node, insert_at, &state->_open_elements); + } // Step 20. + return true; +} + +// This is here to clean up memory when the spec says "Ignore current token." +static void ignore_token(GumboParser* parser) { + GumboToken* token = parser->_parser_state->_current_token; + // Ownership of the token's internal buffers are normally transferred to the + // element, but if no element is emitted (as happens in non-verbatim-mode + // when a token is ignored), we need to free it here to prevent a memory + // leak. + gumbo_token_destroy(parser, token); +#ifndef NDEBUG + if (token->type == GUMBO_TOKEN_START_TAG) { + // Mark this sentinel so the assertion in the main loop knows it's been + // destroyed. + token->v.start_tag.attributes = kGumboEmptyVector; + } +#endif +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/the-end.html +static void finish_parsing(GumboParser* parser) { + gumbo_debug("Finishing parsing"); + maybe_flush_text_node_buffer(parser); + GumboParserState* state = parser->_parser_state; + for (GumboNode* node = pop_current_node(parser); node; + node = pop_current_node(parser)) { + if ((node_html_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag) || + (node_html_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag)) { + continue; + } + node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG; + } + while (pop_current_node(parser)) + ; // Pop them all. +} + +static bool handle_initial(GumboParser* parser, GumboToken* token) { + GumboDocument* document = &get_document_node(parser)->v.document; + if (token->type == GUMBO_TOKEN_WHITESPACE) { + ignore_token(parser); + return true; + } else if (token->type == GUMBO_TOKEN_COMMENT) { + append_comment_node(parser, get_document_node(parser), token); + return true; + } else if (token->type == GUMBO_TOKEN_DOCTYPE) { + document->has_doctype = true; + document->name = token->v.doc_type.name; + document->public_identifier = token->v.doc_type.public_identifier; + document->system_identifier = token->v.doc_type.system_identifier; + document->doc_type_quirks_mode = compute_quirks_mode(&token->v.doc_type); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML); + return maybe_add_doctype_error(parser, token); + } + parser_add_parse_error(parser, token); + document->doc_type_quirks_mode = GUMBO_DOCTYPE_QUIRKS; + set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML); + parser->_parser_state->_reprocess_current_token = true; + return true; +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-html-insertion-mode +static bool handle_before_html(GumboParser* parser, GumboToken* token) { + if (token->type == GUMBO_TOKEN_DOCTYPE) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (token->type == GUMBO_TOKEN_COMMENT) { + append_comment_node(parser, get_document_node(parser), token); + return true; + } else if (token->type == GUMBO_TOKEN_WHITESPACE) { + ignore_token(parser); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { + GumboNode* html_node = insert_element_from_token(parser, token); + parser->_output->root = html_node; + set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD); + return true; + } else if (token->type == GUMBO_TOKEN_END_TAG && + !tag_in(token, false, + (gumbo_tagset){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else { + GumboNode* html_node = insert_element_of_tag_type( + parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED); + assert(html_node); + parser->_output->root = html_node; + set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD); + parser->_parser_state->_reprocess_current_token = true; + return true; + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-head-insertion-mode +static bool handle_before_head(GumboParser* parser, GumboToken* token) { + if (token->type == GUMBO_TOKEN_DOCTYPE) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (token->type == GUMBO_TOKEN_COMMENT) { + append_comment_node(parser, get_current_node(parser), token); + return true; + } else if (token->type == GUMBO_TOKEN_WHITESPACE) { + ignore_token(parser); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) { + GumboNode* node = insert_element_from_token(parser, token); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD); + parser->_parser_state->_head_element = node; + return true; + } else if (token->type == GUMBO_TOKEN_END_TAG && + !tag_in(token, false, + (gumbo_tagset){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else { + GumboNode* node = insert_element_of_tag_type( + parser, GUMBO_TAG_HEAD, GUMBO_INSERTION_IMPLIED); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD); + parser->_parser_state->_head_element = node; + parser->_parser_state->_reprocess_current_token = true; + return true; + } +} + +// Forward declarations because of mutual dependencies. +static bool handle_token(GumboParser* parser, GumboToken* token); +static bool handle_in_body(GumboParser* parser, GumboToken* token); + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inhead +static bool handle_in_head(GumboParser* parser, GumboToken* token) { + if (token->type == GUMBO_TOKEN_WHITESPACE) { + insert_text_token(parser, token); + return true; + } else if (token->type == GUMBO_TOKEN_DOCTYPE) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (token->type == GUMBO_TOKEN_COMMENT) { + append_comment_node(parser, get_current_node(parser), token); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { + return handle_in_body(parser, token); + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), + TAG(MENUITEM), TAG(LINK)})) { + insert_element_from_token(parser, token); + pop_current_node(parser); + acknowledge_self_closing_tag(parser); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_META)) { + insert_element_from_token(parser, token); + pop_current_node(parser); + acknowledge_self_closing_tag(parser); + // NOTE(jdtang): Gumbo handles only UTF-8, so the encoding clause of the + // spec doesn't apply. If clients want to handle meta-tag re-encoding, they + // should specifically look for that string in the document and re-encode it + // before passing to Gumbo. + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) { + run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA); + return true; + } else if (tag_in( + token, kStartTag, (gumbo_tagset){TAG(NOFRAMES), TAG(STYLE)})) { + run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) { + insert_element_from_token(parser, token); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) { + run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT); + return true; + } else if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) { + GumboNode* head = pop_current_node(parser); + AVOID_UNUSED_VARIABLE_WARNING(head); + assert(node_html_tag_is(head, GUMBO_TAG_HEAD)); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD); + return true; + } else if (tag_in(token, kEndTag, + (gumbo_tagset){TAG(BODY), TAG(HTML), TAG(BR)})) { + pop_current_node(parser); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD); + parser->_parser_state->_reprocess_current_token = true; + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) { + insert_element_from_token(parser, token); + add_formatting_element(parser, &kActiveFormattingScopeMarker); + parser->_parser_state->_frameset_ok = false; + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE); + push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE); + return true; + } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { + if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + generate_all_implied_end_tags_thoroughly(parser); + bool success = true; + if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_TEMPLATE)) { + parser_add_parse_error(parser, token); + success = false; + } + while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE)) + ; + clear_active_formatting_elements(parser); + pop_template_insertion_mode(parser); + reset_insertion_mode_appropriately(parser); + return success; + } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) || + (token->type == GUMBO_TOKEN_END_TAG)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else { + pop_current_node(parser); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD); + parser->_parser_state->_reprocess_current_token = true; + return true; + } + return true; +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inheadnoscript +static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) { + if (token->type == GUMBO_TOKEN_DOCTYPE) { + parser_add_parse_error(parser, token); + return false; + } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { + return handle_in_body(parser, token); + } else if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) { + const GumboNode* node = pop_current_node(parser); + assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT)); + AVOID_UNUSED_VARIABLE_WARNING(node); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD); + return true; + } else if (token->type == GUMBO_TOKEN_WHITESPACE || + token->type == GUMBO_TOKEN_COMMENT || + tag_in(token, kStartTag, + (gumbo_tagset){TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), + TAG(META), TAG(NOFRAMES), TAG(STYLE)})) { + return handle_in_head(parser, token); + } else if (tag_in( + token, kStartTag, (gumbo_tagset){TAG(HEAD), TAG(NOSCRIPT)}) || + (token->type == GUMBO_TOKEN_END_TAG && + !tag_is(token, kEndTag, GUMBO_TAG_BR))) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else { + parser_add_parse_error(parser, token); + const GumboNode* node = pop_current_node(parser); + assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT)); + AVOID_UNUSED_VARIABLE_WARNING(node); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD); + parser->_parser_state->_reprocess_current_token = true; + return false; + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-head-insertion-mode +static bool handle_after_head(GumboParser* parser, GumboToken* token) { + GumboParserState* state = parser->_parser_state; + if (token->type == GUMBO_TOKEN_WHITESPACE) { + insert_text_token(parser, token); + return true; + } else if (token->type == GUMBO_TOKEN_DOCTYPE) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (token->type == GUMBO_TOKEN_COMMENT) { + append_comment_node(parser, get_current_node(parser), token); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { + return handle_in_body(parser, token); + } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) { + insert_element_from_token(parser, token); + state->_frameset_ok = false; + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) { + insert_element_from_token(parser, token); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET); + return true; + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), + TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT), + TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)})) { + parser_add_parse_error(parser, token); + assert(state->_head_element != NULL); + // This must be flushed before we push the head element on, as there may be + // pending character tokens that should be attached to the root. + maybe_flush_text_node_buffer(parser); + gumbo_vector_add(parser, state->_head_element, &state->_open_elements); + bool result = handle_in_head(parser, token); + gumbo_vector_remove(parser, state->_head_element, &state->_open_elements); + return result; + } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { + return handle_in_head(parser, token); + } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) || + (token->type == GUMBO_TOKEN_END_TAG && + !tag_in(token, kEndTag, + (gumbo_tagset){TAG(BODY), TAG(HTML), TAG(BR)}))) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else { + insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); + state->_reprocess_current_token = true; + return true; + } +} + +static void destroy_node(GumboParser* parser, GumboNode* node) { + switch (node->type) { + case GUMBO_NODE_DOCUMENT: { + GumboDocument* doc = &node->v.document; + for (unsigned int i = 0; i < doc->children.length; ++i) { + destroy_node(parser, doc->children.data[i]); + } + gumbo_parser_deallocate(parser, (void*) doc->children.data); + gumbo_parser_deallocate(parser, (void*) doc->name); + gumbo_parser_deallocate(parser, (void*) doc->public_identifier); + gumbo_parser_deallocate(parser, (void*) doc->system_identifier); + } break; + case GUMBO_NODE_TEMPLATE: + case GUMBO_NODE_ELEMENT: + for (unsigned int i = 0; i < node->v.element.attributes.length; ++i) { + gumbo_destroy_attribute(parser, node->v.element.attributes.data[i]); + } + gumbo_parser_deallocate(parser, node->v.element.attributes.data); + for (unsigned int i = 0; i < node->v.element.children.length; ++i) { + destroy_node(parser, node->v.element.children.data[i]); + } + gumbo_parser_deallocate(parser, node->v.element.children.data); + break; + case GUMBO_NODE_TEXT: + case GUMBO_NODE_CDATA: + case GUMBO_NODE_COMMENT: + case GUMBO_NODE_WHITESPACE: + gumbo_parser_deallocate(parser, (void*) node->v.text.text); + break; + } + gumbo_parser_deallocate(parser, node); +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inbody +static bool handle_in_body(GumboParser* parser, GumboToken* token) { + GumboParserState* state = parser->_parser_state; + assert(state->_open_elements.length > 0); + if (token->type == GUMBO_TOKEN_NULL) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (token->type == GUMBO_TOKEN_WHITESPACE) { + reconstruct_active_formatting_elements(parser); + insert_text_token(parser, token); + return true; + } else if (token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_CDATA) { + reconstruct_active_formatting_elements(parser); + insert_text_token(parser, token); + set_frameset_not_ok(parser); + return true; + } else if (token->type == GUMBO_TOKEN_COMMENT) { + append_comment_node(parser, get_current_node(parser), token); + return true; + } else if (token->type == GUMBO_TOKEN_DOCTYPE) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { + parser_add_parse_error(parser, token); + if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + ignore_token(parser); + return false; + } + assert(parser->_output->root != NULL); + assert(parser->_output->root->type == GUMBO_NODE_ELEMENT); + merge_attributes(parser, token, parser->_output->root); + return false; + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), + TAG(MENUITEM), TAG(LINK), TAG(META), TAG(NOFRAMES), + TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)}) || + tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { + return handle_in_head(parser, token); + } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) { + parser_add_parse_error(parser, token); + if (state->_open_elements.length < 2 || + !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) || + has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + ignore_token(parser); + return false; + } + state->_frameset_ok = false; + merge_attributes(parser, token, state->_open_elements.data[1]); + return false; + } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) { + parser_add_parse_error(parser, token); + if (state->_open_elements.length < 2 || + !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) || + !state->_frameset_ok) { + ignore_token(parser); + return false; + } + // Save the body node for later removal. + GumboNode* body_node = state->_open_elements.data[1]; + + // Pop all nodes except root HTML element. + GumboNode* node; + do { + node = pop_current_node(parser); + } while (node != state->_open_elements.data[1]); + + // Removing & destroying the body node is going to kill any nodes that have + // been added to the list of active formatting elements, and so we should + // clear it to prevent a use-after-free if the list of active formatting + // elements is reconstructed afterwards. This may happen if whitespace + // follows the </frameset>. + clear_active_formatting_elements(parser); + + // Remove the body node. We may want to factor this out into a generic + // helper, but right now this is the only code that needs to do this. + GumboVector* children = &parser->_output->root->v.element.children; + for (unsigned int i = 0; i < children->length; ++i) { + if (children->data[i] == body_node) { + gumbo_vector_remove_at(parser, i, children); + break; + } + } + destroy_node(parser, body_node); + + // Insert the <frameset>, and switch the insertion mode. + insert_element_from_token(parser, token); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET); + return true; + } else if (token->type == GUMBO_TOKEN_EOF) { + for (unsigned int i = 0; i < state->_open_elements.length; ++i) { + if (!node_tag_in_set(state->_open_elements.data[i], + (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(P), TAG(TBODY), + TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY), + TAG(HTML)})) { + parser_add_parse_error(parser, token); + } + } + if (get_current_template_insertion_mode(parser) != + GUMBO_INSERTION_MODE_INITIAL) { + return handle_in_template(parser, token); + } + return true; + } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(BODY), TAG(HTML)})) { + if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + bool success = true; + for (unsigned int i = 0; i < state->_open_elements.length; ++i) { + if (!node_tag_in_set(state->_open_elements.data[i], + (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), + TAG(OPTION), TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC), + TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR), + TAG(BODY), TAG(HTML)})) { + parser_add_parse_error(parser, token); + success = false; + break; + } + } + set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY); + if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) { + parser->_parser_state->_reprocess_current_token = true; + } else { + GumboNode* body = state->_open_elements.data[1]; + assert(node_html_tag_is(body, GUMBO_TAG_BODY)); + record_end_of_element(state->_current_token, &body->v.element); + } + return success; + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), + TAG(BLOCKQUOTE), TAG(CENTER), TAG(DETAILS), TAG(DIR), + TAG(DIV), TAG(DL), TAG(FIELDSET), TAG(FIGCAPTION), + TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP), + TAG(MENU), TAG(MAIN), TAG(NAV), TAG(OL), TAG(P), + TAG(SECTION), TAG(SUMMARY), TAG(UL)})) { + bool result = maybe_implicitly_close_p_tag(parser, token); + insert_element_from_token(parser, token); + return result; + } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3), + TAG(H4), TAG(H5), TAG(H6)})) { + bool result = maybe_implicitly_close_p_tag(parser, token); + if (node_tag_in_set( + get_current_node(parser), (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3), + TAG(H4), TAG(H5), TAG(H6)})) { + parser_add_parse_error(parser, token); + pop_current_node(parser); + result = false; + } + insert_element_from_token(parser, token); + return result; + } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(PRE), TAG(LISTING)})) { + bool result = maybe_implicitly_close_p_tag(parser, token); + insert_element_from_token(parser, token); + state->_ignore_next_linefeed = true; + state->_frameset_ok = false; + return result; + } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) { + if (state->_form_element != NULL && + !has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + gumbo_debug("Ignoring nested form.\n"); + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + bool result = maybe_implicitly_close_p_tag(parser, token); + GumboNode* form_element = insert_element_from_token(parser, token); + if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + state->_form_element = form_element; + } + return result; + } else if (tag_is(token, kStartTag, GUMBO_TAG_LI)) { + maybe_implicitly_close_list_tag(parser, token, true); + bool result = maybe_implicitly_close_p_tag(parser, token); + insert_element_from_token(parser, token); + return result; + } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(DD), TAG(DT)})) { + maybe_implicitly_close_list_tag(parser, token, false); + bool result = maybe_implicitly_close_p_tag(parser, token); + insert_element_from_token(parser, token); + return result; + } else if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) { + bool result = maybe_implicitly_close_p_tag(parser, token); + insert_element_from_token(parser, token); + gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT); + return result; + } else if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) { + if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) { + parser_add_parse_error(parser, token); + implicitly_close_tags( + parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_BUTTON); + state->_reprocess_current_token = true; + return false; + } + reconstruct_active_formatting_elements(parser); + insert_element_from_token(parser, token); + state->_frameset_ok = false; + return true; + } else if (tag_in(token, kEndTag, + (gumbo_tagset){TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), + TAG(BLOCKQUOTE), TAG(BUTTON), TAG(CENTER), TAG(DETAILS), + TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET), + TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER), + TAG(HGROUP), TAG(LISTING), TAG(MAIN), TAG(MENU), TAG(NAV), + TAG(OL), TAG(PRE), TAG(SECTION), TAG(SUMMARY), TAG(UL)})) { + GumboTag tag = token->v.end_tag; + if (!has_an_element_in_scope(parser, tag)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + implicitly_close_tags( + parser, token, GUMBO_NAMESPACE_HTML, token->v.end_tag); + return true; + } else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) { + if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + bool success = true; + generate_implied_end_tags(parser, GUMBO_TAG_LAST); + if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM)) { + parser_add_parse_error(parser, token); + return false; + } + while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM)) + ; + return success; + } else { + bool result = true; + const GumboNode* node = state->_form_element; + assert(!node || node->type == GUMBO_NODE_ELEMENT); + state->_form_element = NULL; + if (!node || !has_node_in_scope(parser, node)) { + gumbo_debug("Closing an unopened form.\n"); + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + // This differs from implicitly_close_tags because we remove *only* the + // <form> element; other nodes are left in scope. + generate_implied_end_tags(parser, GUMBO_TAG_LAST); + if (get_current_node(parser) != node) { + parser_add_parse_error(parser, token); + result = false; + } + + GumboVector* open_elements = &state->_open_elements; + int index = gumbo_vector_index_of(open_elements, node); + assert(index >= 0); + gumbo_vector_remove_at(parser, index, open_elements); + return result; + } + } else if (tag_is(token, kEndTag, GUMBO_TAG_P)) { + if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) { + parser_add_parse_error(parser, token); + // reconstruct_active_formatting_elements(parser); + insert_element_of_tag_type( + parser, GUMBO_TAG_P, GUMBO_INSERTION_CONVERTED_FROM_END_TAG); + state->_reprocess_current_token = true; + return false; + } + return implicitly_close_tags( + parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P); + } else if (tag_is(token, kEndTag, GUMBO_TAG_LI)) { + if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + return implicitly_close_tags( + parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_LI); + } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(DD), TAG(DT)})) { + assert(token->type == GUMBO_TOKEN_END_TAG); + GumboTag token_tag = token->v.end_tag; + if (!has_an_element_in_scope(parser, token_tag)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + return implicitly_close_tags( + parser, token, GUMBO_NAMESPACE_HTML, token_tag); + } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3), + TAG(H4), TAG(H5), TAG(H6)})) { + if (!has_an_element_in_scope_with_tagname( + parser, 6, (GumboTag[]){GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, + GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6})) { + // No heading open; ignore the token entirely. + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else { + generate_implied_end_tags(parser, GUMBO_TAG_LAST); + const GumboNode* current_node = get_current_node(parser); + bool success = node_html_tag_is(current_node, token->v.end_tag); + if (!success) { + // There're children of the heading currently open; close them below and + // record a parse error. + // TODO(jdtang): Add a way to distinguish this error case from the one + // above. + parser_add_parse_error(parser, token); + } + do { + current_node = pop_current_node(parser); + } while (!node_tag_in_set( + current_node, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3), + TAG(H4), TAG(H5), TAG(H6)})); + return success; + } + } else if (tag_is(token, kStartTag, GUMBO_TAG_A)) { + bool success = true; + int last_a; + int has_matching_a = find_last_anchor_index(parser, &last_a); + if (has_matching_a) { + assert(has_matching_a == 1); + parser_add_parse_error(parser, token); + adoption_agency_algorithm(parser, token, GUMBO_TAG_A); + // The adoption agency algorithm usually removes all instances of <a> + // from the list of active formatting elements, but in case it doesn't, + // we're supposed to do this. (The conditions where it might not are + // listed in the spec.) + if (find_last_anchor_index(parser, &last_a)) { + void* last_element = gumbo_vector_remove_at( + parser, last_a, &state->_active_formatting_elements); + gumbo_vector_remove(parser, last_element, &state->_open_elements); + } + success = false; + } + reconstruct_active_formatting_elements(parser); + add_formatting_element(parser, insert_element_from_token(parser, token)); + return success; + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), + TAG(I), TAG(S), TAG(SMALL), TAG(STRIKE), TAG(STRONG), + TAG(TT), TAG(U)})) { + reconstruct_active_formatting_elements(parser); + add_formatting_element(parser, insert_element_from_token(parser, token)); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) { + bool result = true; + reconstruct_active_formatting_elements(parser); + if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) { + result = false; + parser_add_parse_error(parser, token); + adoption_agency_algorithm(parser, token, GUMBO_TAG_NOBR); + reconstruct_active_formatting_elements(parser); + } + insert_element_from_token(parser, token); + add_formatting_element(parser, get_current_node(parser)); + return result; + } else if (tag_in(token, kEndTag, + (gumbo_tagset){TAG(A), TAG(B), TAG(BIG), TAG(CODE), TAG(EM), + TAG(FONT), TAG(I), TAG(NOBR), TAG(S), TAG(SMALL), + TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U)})) { + return adoption_agency_algorithm(parser, token, token->v.end_tag); + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})) { + reconstruct_active_formatting_elements(parser); + insert_element_from_token(parser, token); + add_formatting_element(parser, &kActiveFormattingScopeMarker); + set_frameset_not_ok(parser); + return true; + } else if (tag_in(token, kEndTag, + (gumbo_tagset){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})) { + GumboTag token_tag = token->v.end_tag; + if (!has_an_element_in_table_scope(parser, token_tag)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag); + clear_active_formatting_elements(parser); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) { + if (get_document_node(parser)->v.document.doc_type_quirks_mode != + GUMBO_DOCTYPE_QUIRKS) { + maybe_implicitly_close_p_tag(parser, token); + } + insert_element_from_token(parser, token); + set_frameset_not_ok(parser); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); + return true; + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(AREA), TAG(BR), TAG(EMBED), TAG(IMG), + TAG(IMAGE), TAG(KEYGEN), TAG(WBR)})) { + bool success = true; + if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) { + success = false; + parser_add_parse_error(parser, token); + token->v.start_tag.tag = GUMBO_TAG_IMG; + } + reconstruct_active_formatting_elements(parser); + GumboNode* node = insert_element_from_token(parser, token); + if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) { + success = false; + parser_add_parse_error(parser, token); + node->v.element.tag = GUMBO_TAG_IMG; + node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE; + } + pop_current_node(parser); + acknowledge_self_closing_tag(parser); + set_frameset_not_ok(parser); + return success; + } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) { + if (!attribute_matches(&token->v.start_tag.attributes, "type", "hidden")) { + // Must be before the element is inserted, as that takes ownership of the + // token's attribute vector. + set_frameset_not_ok(parser); + } + reconstruct_active_formatting_elements(parser); + insert_element_from_token(parser, token); + pop_current_node(parser); + acknowledge_self_closing_tag(parser); + return true; + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(PARAM), TAG(SOURCE), TAG(TRACK)})) { + insert_element_from_token(parser, token); + pop_current_node(parser); + acknowledge_self_closing_tag(parser); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_HR)) { + bool result = maybe_implicitly_close_p_tag(parser, token); + insert_element_from_token(parser, token); + pop_current_node(parser); + acknowledge_self_closing_tag(parser); + set_frameset_not_ok(parser); + return result; + } else if (tag_is(token, kStartTag, GUMBO_TAG_ISINDEX)) { + parser_add_parse_error(parser, token); + if (parser->_parser_state->_form_element != NULL && + !has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + ignore_token(parser); + return false; + } + acknowledge_self_closing_tag(parser); + maybe_implicitly_close_p_tag(parser, token); + set_frameset_not_ok(parser); + + GumboVector* token_attrs = &token->v.start_tag.attributes; + GumboAttribute* prompt_attr = gumbo_get_attribute(token_attrs, "prompt"); + GumboAttribute* action_attr = gumbo_get_attribute(token_attrs, "action"); + GumboAttribute* name_attr = gumbo_get_attribute(token_attrs, "name"); + + GumboNode* form = insert_element_of_tag_type( + parser, GUMBO_TAG_FORM, GUMBO_INSERTION_FROM_ISINDEX); + if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + parser->_parser_state->_form_element = form; + } + if (action_attr) { + gumbo_vector_add(parser, action_attr, &form->v.element.attributes); + } + insert_element_of_tag_type( + parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX); + pop_current_node(parser); // <hr> + + insert_element_of_tag_type( + parser, GUMBO_TAG_LABEL, GUMBO_INSERTION_FROM_ISINDEX); + TextNodeBufferState* text_state = &parser->_parser_state->_text_node; + text_state->_start_original_text = token->original_text.data; + text_state->_start_position = token->position; + text_state->_type = GUMBO_NODE_TEXT; + if (prompt_attr) { + int prompt_attr_length = strlen(prompt_attr->value); + gumbo_string_buffer_destroy(parser, &text_state->_buffer); + text_state->_buffer.data = gumbo_copy_stringz(parser, prompt_attr->value); + text_state->_buffer.length = prompt_attr_length; + text_state->_buffer.capacity = prompt_attr_length + 1; + gumbo_destroy_attribute(parser, prompt_attr); + } else { + GumboStringPiece prompt_text = + GUMBO_STRING("This is a searchable index. Enter search keywords: "); + gumbo_string_buffer_append_string( + parser, &prompt_text, &text_state->_buffer); + } + + GumboNode* input = insert_element_of_tag_type( + parser, GUMBO_TAG_INPUT, GUMBO_INSERTION_FROM_ISINDEX); + for (unsigned int i = 0; i < token_attrs->length; ++i) { + GumboAttribute* attr = token_attrs->data[i]; + if (attr != prompt_attr && attr != action_attr && attr != name_attr) { + gumbo_vector_add(parser, attr, &input->v.element.attributes); + } + token_attrs->data[i] = NULL; + } + + // All attributes have been successfully transferred and nulled out at this + // point, so the call to ignore_token will free the memory for it without + // touching the attributes. + ignore_token(parser); + + // The name attribute, if present, should be destroyed since it's ignored + // when copying over. The action attribute should be kept since it's moved + // to the form. + if (name_attr) { + gumbo_destroy_attribute(parser, name_attr); + } + + GumboAttribute* name = + gumbo_parser_allocate(parser, sizeof(GumboAttribute)); + GumboStringPiece name_str = GUMBO_STRING("name"); + GumboStringPiece isindex_str = GUMBO_STRING("isindex"); + name->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE; + name->name = gumbo_copy_stringz(parser, "name"); + name->value = gumbo_copy_stringz(parser, "isindex"); + name->original_name = name_str; + name->original_value = isindex_str; + name->name_start = kGumboEmptySourcePosition; + name->name_end = kGumboEmptySourcePosition; + name->value_start = kGumboEmptySourcePosition; + name->value_end = kGumboEmptySourcePosition; + gumbo_vector_add(parser, name, &input->v.element.attributes); + + pop_current_node(parser); // <input> + pop_current_node(parser); // <label> + insert_element_of_tag_type( + parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX); + pop_current_node(parser); // <hr> + pop_current_node(parser); // <form> + if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + parser->_parser_state->_form_element = NULL; + } + return false; + } else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) { + run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA); + parser->_parser_state->_ignore_next_linefeed = true; + set_frameset_not_ok(parser); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) { + bool result = maybe_implicitly_close_p_tag(parser, token); + reconstruct_active_formatting_elements(parser); + set_frameset_not_ok(parser); + run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); + return result; + } else if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) { + set_frameset_not_ok(parser); + run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) { + run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) { + reconstruct_active_formatting_elements(parser); + insert_element_from_token(parser, token); + set_frameset_not_ok(parser); + GumboInsertionMode state = parser->_parser_state->_insertion_mode; + if (state == GUMBO_INSERTION_MODE_IN_TABLE || + state == GUMBO_INSERTION_MODE_IN_CAPTION || + state == GUMBO_INSERTION_MODE_IN_TABLE_BODY || + state == GUMBO_INSERTION_MODE_IN_ROW || + state == GUMBO_INSERTION_MODE_IN_CELL) { + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE); + } else { + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT); + } + return true; + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(OPTION), TAG(OPTGROUP)})) { + if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) { + pop_current_node(parser); + } + reconstruct_active_formatting_elements(parser); + insert_element_from_token(parser, token); + return true; + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(RB), TAG(RP), TAG(RT), TAG(RTC)})) { + bool success = true; + GumboTag exception = + tag_in(token, kStartTag, (gumbo_tagset){TAG(RT), TAG(RP)}) + ? GUMBO_TAG_RTC + : GUMBO_TAG_LAST; + if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) { + generate_implied_end_tags(parser, exception); + } + if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY) && + !(exception == GUMBO_TAG_LAST || + node_html_tag_is(get_current_node(parser), GUMBO_TAG_RTC))) { + parser_add_parse_error(parser, token); + success = false; + } + insert_element_from_token(parser, token); + return success; + } else if (tag_is(token, kEndTag, GUMBO_TAG_BR)) { + parser_add_parse_error(parser, token); + reconstruct_active_formatting_elements(parser); + insert_element_of_tag_type( + parser, GUMBO_TAG_BR, GUMBO_INSERTION_CONVERTED_FROM_END_TAG); + pop_current_node(parser); + return false; + } else if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) { + reconstruct_active_formatting_elements(parser); + adjust_mathml_attributes(parser, token); + adjust_foreign_attributes(parser, token); + insert_foreign_element(parser, token, GUMBO_NAMESPACE_MATHML); + if (token->v.start_tag.is_self_closing) { + pop_current_node(parser); + acknowledge_self_closing_tag(parser); + } + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) { + reconstruct_active_formatting_elements(parser); + adjust_svg_attributes(parser, token); + adjust_foreign_attributes(parser, token); + insert_foreign_element(parser, token, GUMBO_NAMESPACE_SVG); + if (token->v.start_tag.is_self_closing) { + pop_current_node(parser); + acknowledge_self_closing_tag(parser); + } + return true; + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP), + TAG(FRAME), TAG(HEAD), TAG(TBODY), TAG(TD), TAG(TFOOT), + TAG(TH), TAG(THEAD), TAG(TR)})) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (token->type == GUMBO_TOKEN_START_TAG) { + reconstruct_active_formatting_elements(parser); + insert_element_from_token(parser, token); + return true; + } else { + assert(token->type == GUMBO_TOKEN_END_TAG); + GumboTag end_tag = token->v.end_tag; + assert(state->_open_elements.length > 0); + assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML)); + // Walk up the stack of open elements until we find one that either: + // a) Matches the tag name we saw + // b) Is in the "special" category. + // If we see a), implicitly close everything up to and including it. If we + // see b), then record a parse error, don't close anything (except the + // implied end tags) and ignore the end tag token. + for (int i = state->_open_elements.length; --i >= 0;) { + const GumboNode* node = state->_open_elements.data[i]; + if (node_html_tag_is(node, end_tag)) { + generate_implied_end_tags(parser, end_tag); + // TODO(jdtang): Do I need to add a parse error here? The condition in + // the spec seems like it's the inverse of the loop condition above, and + // so would never fire. + while (node != pop_current_node(parser)) + ; // Pop everything. + return true; + } else if (is_special_node(node)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + } + // <html> is in the special category, so we should never get here. + assert(0); + return false; + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incdata +static bool handle_text(GumboParser* parser, GumboToken* token) { + if (token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_WHITESPACE) { + insert_text_token(parser, token); + } else { + // We provide only bare-bones script handling that doesn't involve any of + // the parser-pause/already-started/script-nesting flags or re-entrant + // invocations of the tokenizer. Because the intended usage of this library + // is mostly for templating, refactoring, and static-analysis libraries, we + // provide the script body as a text-node child of the <script> element. + // This behavior doesn't support document.write of partial HTML elements, + // but should be adequate for almost all other scripting support. + if (token->type == GUMBO_TOKEN_EOF) { + parser_add_parse_error(parser, token); + parser->_parser_state->_reprocess_current_token = true; + } + pop_current_node(parser); + set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode); + } + return true; +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intable +static bool handle_in_table(GumboParser* parser, GumboToken* token) { + GumboParserState* state = parser->_parser_state; + if (token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_WHITESPACE) { + // The "pending table character tokens" list described in the spec is + // nothing more than the TextNodeBufferState. We accumulate text tokens as + // normal, except that when we go to flush them in the handle_in_table_text, + // we set _foster_parent_insertions if there're non-whitespace characters in + // the buffer. + assert(state->_text_node._buffer.length == 0); + state->_original_insertion_mode = state->_insertion_mode; + state->_reprocess_current_token = true; + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT); + return true; + } else if (token->type == GUMBO_TOKEN_DOCTYPE) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (token->type == GUMBO_TOKEN_COMMENT) { + append_comment_node(parser, get_current_node(parser), token); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) { + clear_stack_to_table_context(parser); + add_formatting_element(parser, &kActiveFormattingScopeMarker); + insert_element_from_token(parser, token); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) { + clear_stack_to_table_context(parser); + insert_element_from_token(parser, token); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) { + clear_stack_to_table_context(parser); + insert_element_of_tag_type( + parser, GUMBO_TAG_COLGROUP, GUMBO_INSERTION_IMPLIED); + parser->_parser_state->_reprocess_current_token = true; + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP); + return true; + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TD), + TAG(TH), TAG(TR)})) { + clear_stack_to_table_context(parser); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); + if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH), TAG(TR)})) { + insert_element_of_tag_type( + parser, GUMBO_TAG_TBODY, GUMBO_INSERTION_IMPLIED); + state->_reprocess_current_token = true; + } else { + insert_element_from_token(parser, token); + } + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) { + parser_add_parse_error(parser, token); + if (close_table(parser)) { + parser->_parser_state->_reprocess_current_token = true; + } else { + ignore_token(parser); + } + return false; + } else if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) { + if (!close_table(parser)) { + parser_add_parse_error(parser, token); + return false; + } + return true; + } else if (tag_in(token, kEndTag, + (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL), + TAG(COLGROUP), TAG(HTML), TAG(TBODY), TAG(TD), TAG(TFOOT), + TAG(TH), TAG(THEAD), TAG(TR)})) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE)}) || + (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE))) { + return handle_in_head(parser, token); + } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT) && + attribute_matches( + &token->v.start_tag.attributes, "type", "hidden")) { + parser_add_parse_error(parser, token); + insert_element_from_token(parser, token); + pop_current_node(parser); + return false; + } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) { + parser_add_parse_error(parser, token); + if (state->_form_element || has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + ignore_token(parser); + return false; + } + state->_form_element = insert_element_from_token(parser, token); + pop_current_node(parser); + return false; + } else if (token->type == GUMBO_TOKEN_EOF) { + return handle_in_body(parser, token); + } else { + parser_add_parse_error(parser, token); + state->_foster_parent_insertions = true; + bool result = handle_in_body(parser, token); + state->_foster_parent_insertions = false; + return result; + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intabletext +static bool handle_in_table_text(GumboParser* parser, GumboToken* token) { + if (token->type == GUMBO_TOKEN_NULL) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_WHITESPACE) { + insert_text_token(parser, token); + return true; + } else { + GumboParserState* state = parser->_parser_state; + GumboStringBuffer* buffer = &state->_text_node._buffer; + // Can't use strspn for this because GumboStringBuffers are not + // null-terminated. + // Note that TextNodeBuffer may contain UTF-8 characters, but the presence + // of any one byte that is not whitespace means we flip the flag, so this + // loop is still valid. + for (unsigned int i = 0; i < buffer->length; ++i) { + if (!isspace((unsigned char) buffer->data[i]) || + buffer->data[i] == '\v') { + state->_foster_parent_insertions = true; + reconstruct_active_formatting_elements(parser); + break; + } + } + maybe_flush_text_node_buffer(parser); + state->_foster_parent_insertions = false; + state->_reprocess_current_token = true; + state->_insertion_mode = state->_original_insertion_mode; + return true; + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incaption +static bool handle_in_caption(GumboParser* parser, GumboToken* token) { + if (tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) { + if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else { + generate_implied_end_tags(parser, GUMBO_TAG_LAST); + bool result = true; + if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) { + parser_add_parse_error(parser, token); + } + while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION)) + ; + clear_active_formatting_elements(parser); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); + return result; + } + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP), + TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), + TAG(TR)}) || + (tag_is(token, kEndTag, GUMBO_TAG_TABLE))) { + if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION)) + ; + clear_active_formatting_elements(parser); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); + parser->_parser_state->_reprocess_current_token = true; + return true; + } else if (tag_in(token, kEndTag, + (gumbo_tagset){TAG(BODY), TAG(COL), TAG(COLGROUP), TAG(HTML), + TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), + TAG(TR)})) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else { + return handle_in_body(parser, token); + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incolgroup +static bool handle_in_column_group(GumboParser* parser, GumboToken* token) { + if (token->type == GUMBO_TOKEN_WHITESPACE) { + insert_text_token(parser, token); + return true; + } else if (token->type == GUMBO_TOKEN_DOCTYPE) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (token->type == GUMBO_TOKEN_COMMENT) { + append_comment_node(parser, get_current_node(parser), token); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { + return handle_in_body(parser, token); + } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) { + insert_element_from_token(parser, token); + pop_current_node(parser); + acknowledge_self_closing_tag(parser); + return true; + } else if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) { + if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + pop_current_node(parser); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); + return false; + } else if (tag_is(token, kEndTag, GUMBO_TAG_COL)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE) || + tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { + return handle_in_head(parser, token); + } else if (token->type == GUMBO_TOKEN_EOF) { + return handle_in_body(parser, token); + } else { + if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + pop_current_node(parser); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); + parser->_parser_state->_reprocess_current_token = true; + return true; + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intbody +static bool handle_in_table_body(GumboParser* parser, GumboToken* token) { + if (tag_is(token, kStartTag, GUMBO_TAG_TR)) { + clear_stack_to_table_body_context(parser); + insert_element_from_token(parser, token); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); + return true; + } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH)})) { + parser_add_parse_error(parser, token); + clear_stack_to_table_body_context(parser); + insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED); + parser->_parser_state->_reprocess_current_token = true; + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); + return false; + } else if (tag_in(token, kEndTag, + (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) { + if (!has_an_element_in_table_scope(parser, token->v.end_tag)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + clear_stack_to_table_body_context(parser); + pop_current_node(parser); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); + return true; + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP), + TAG(TBODY), TAG(TFOOT), TAG(THEAD)}) || + tag_is(token, kEndTag, GUMBO_TAG_TABLE)) { + if (!(has_an_element_in_table_scope(parser, GUMBO_TAG_TBODY) || + has_an_element_in_table_scope(parser, GUMBO_TAG_THEAD) || + has_an_element_in_table_scope(parser, GUMBO_TAG_TFOOT))) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + clear_stack_to_table_body_context(parser); + pop_current_node(parser); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); + parser->_parser_state->_reprocess_current_token = true; + return true; + } else if (tag_in(token, kEndTag, + (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL), TAG(TR), + TAG(COLGROUP), TAG(HTML), TAG(TD), TAG(TH)})) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else { + return handle_in_table(parser, token); + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intr +static bool handle_in_row(GumboParser* parser, GumboToken* token) { + if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TH), TAG(TD)})) { + clear_stack_to_table_row_context(parser); + insert_element_from_token(parser, token); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL); + add_formatting_element(parser, &kActiveFormattingScopeMarker); + return true; + } else if (tag_is(token, kEndTag, GUMBO_TAG_TR)) { + if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else { + clear_stack_to_table_row_context(parser); + pop_current_node(parser); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); + return true; + } + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP), + TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)}) || + tag_is(token, kEndTag, GUMBO_TAG_TABLE)) { + if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else { + clear_stack_to_table_row_context(parser); + pop_current_node(parser); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); + parser->_parser_state->_reprocess_current_token = true; + return true; + } + } else if (tag_in(token, kEndTag, + (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) { + if (!has_an_element_in_table_scope(parser, token->v.end_tag) || + (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR))) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else { + clear_stack_to_table_row_context(parser); + pop_current_node(parser); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); + parser->_parser_state->_reprocess_current_token = true; + return true; + } + } else if (tag_in(token, kEndTag, + (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL), + TAG(COLGROUP), TAG(HTML), TAG(TD), TAG(TH)})) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else { + return handle_in_table(parser, token); + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intd +static bool handle_in_cell(GumboParser* parser, GumboToken* token) { + if (tag_in(token, kEndTag, (gumbo_tagset){TAG(TD), TAG(TH)})) { + GumboTag token_tag = token->v.end_tag; + if (!has_an_element_in_table_scope(parser, token_tag)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + return close_table_cell(parser, token, token_tag); + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP), + TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), + TAG(TR)})) { + gumbo_debug("Handling <td> in cell.\n"); + if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TH) && + !has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) { + gumbo_debug("Bailing out because there's no <td> or <th> in scope.\n"); + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + parser->_parser_state->_reprocess_current_token = true; + return close_current_cell(parser, token); + } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(BODY), TAG(CAPTION), + TAG(COL), TAG(COLGROUP), TAG(HTML)})) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(TABLE), TAG(TBODY), + TAG(TFOOT), TAG(THEAD), TAG(TR)})) { + if (!has_an_element_in_table_scope(parser, token->v.end_tag)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + parser->_parser_state->_reprocess_current_token = true; + return close_current_cell(parser, token); + } else { + return handle_in_body(parser, token); + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselect +static bool handle_in_select(GumboParser* parser, GumboToken* token) { + if (token->type == GUMBO_TOKEN_NULL) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_WHITESPACE) { + insert_text_token(parser, token); + return true; + } else if (token->type == GUMBO_TOKEN_DOCTYPE) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (token->type == GUMBO_TOKEN_COMMENT) { + append_comment_node(parser, get_current_node(parser), token); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { + return handle_in_body(parser, token); + } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) { + if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) { + pop_current_node(parser); + } + insert_element_from_token(parser, token); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) { + if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) { + pop_current_node(parser); + } + if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) { + pop_current_node(parser); + } + insert_element_from_token(parser, token); + return true; + } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) { + GumboVector* open_elements = &parser->_parser_state->_open_elements; + if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) && + node_html_tag_is(open_elements->data[open_elements->length - 2], + GUMBO_TAG_OPTGROUP)) { + pop_current_node(parser); + } + if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) { + pop_current_node(parser); + return true; + } else { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) { + if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) { + pop_current_node(parser); + return true; + } else { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + } else if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) { + if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + close_current_select(parser); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) { + close_current_select(parser); + } + return false; + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA)})) { + parser_add_parse_error(parser, token); + if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) { + ignore_token(parser); + } else { + close_current_select(parser); + parser->_parser_state->_reprocess_current_token = true; + } + return false; + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(SCRIPT), TAG(TEMPLATE)}) || + tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { + return handle_in_head(parser, token); + } else if (token->type == GUMBO_TOKEN_EOF) { + return handle_in_body(parser, token); + } else { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselectintable +static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) { + if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(CAPTION), TAG(TABLE), TAG(TBODY), TAG(TFOOT), + TAG(THEAD), TAG(TR), TAG(TD), TAG(TH)})) { + parser_add_parse_error(parser, token); + close_current_select(parser); + parser->_parser_state->_reprocess_current_token = true; + return false; + } else if (tag_in(token, kEndTag, + (gumbo_tagset){TAG(CAPTION), TAG(TABLE), TAG(TBODY), + TAG(TFOOT), TAG(THEAD), TAG(TR), TAG(TD), TAG(TH)})) { + parser_add_parse_error(parser, token); + if (!has_an_element_in_table_scope(parser, token->v.end_tag)) { + ignore_token(parser); + return false; + } else { + close_current_select(parser); + // close_current_select already does the + // reset_insertion_mode_appropriately + // reset_insertion_mode_appropriately(parser); + parser->_parser_state->_reprocess_current_token = true; + return false; + } + } else { + return handle_in_select(parser, token); + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intemplate +static bool handle_in_template(GumboParser* parser, GumboToken* token) { + GumboParserState* state = parser->_parser_state; + if (token->type == GUMBO_TOKEN_WHITESPACE || + token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_COMMENT || token->type == GUMBO_TOKEN_NULL || + token->type == GUMBO_TOKEN_DOCTYPE) { + return handle_in_body(parser, token); + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), + TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT), + TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)}) || + tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { + return handle_in_head(parser, token); + } else if (tag_in( + token, kStartTag, (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP), + TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) { + pop_template_insertion_mode(parser); + push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); + state->_reprocess_current_token = true; + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) { + pop_template_insertion_mode(parser); + push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP); + state->_reprocess_current_token = true; + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_TR)) { + pop_template_insertion_mode(parser); + push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); + state->_reprocess_current_token = true; + return true; + } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH)})) { + pop_template_insertion_mode(parser); + push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); + state->_reprocess_current_token = true; + return true; + } else if (token->type == GUMBO_TOKEN_START_TAG) { + pop_template_insertion_mode(parser); + push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); + state->_reprocess_current_token = true; + return true; + } else if (token->type == GUMBO_TOKEN_END_TAG) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (token->type == GUMBO_TOKEN_EOF) { + if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + // Stop parsing. + return true; + } + parser_add_parse_error(parser, token); + while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE)) + ; + clear_active_formatting_elements(parser); + pop_template_insertion_mode(parser); + reset_insertion_mode_appropriately(parser); + state->_reprocess_current_token = true; + return false; + } else { + assert(0); + return false; + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterbody +static bool handle_after_body(GumboParser* parser, GumboToken* token) { + if (token->type == GUMBO_TOKEN_WHITESPACE || + tag_is(token, kStartTag, GUMBO_TAG_HTML)) { + return handle_in_body(parser, token); + } else if (token->type == GUMBO_TOKEN_COMMENT) { + GumboNode* html_node = parser->_output->root; + assert(html_node != NULL); + append_comment_node(parser, html_node, token); + return true; + } else if (token->type == GUMBO_TOKEN_DOCTYPE) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) { + /* fragment case: ignore the closing HTML token */ + if (is_fragment_parser(parser)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY); + GumboNode* html = parser->_parser_state->_open_elements.data[0]; + assert(node_html_tag_is(html, GUMBO_TAG_HTML)); + record_end_of_element( + parser->_parser_state->_current_token, &html->v.element); + return true; + } else if (token->type == GUMBO_TOKEN_EOF) { + return true; + } else { + parser_add_parse_error(parser, token); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); + parser->_parser_state->_reprocess_current_token = true; + return false; + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inframeset +static bool handle_in_frameset(GumboParser* parser, GumboToken* token) { + if (token->type == GUMBO_TOKEN_WHITESPACE) { + insert_text_token(parser, token); + return true; + } else if (token->type == GUMBO_TOKEN_COMMENT) { + append_comment_node(parser, get_current_node(parser), token); + return true; + } else if (token->type == GUMBO_TOKEN_DOCTYPE) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { + return handle_in_body(parser, token); + } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) { + insert_element_from_token(parser, token); + return true; + } else if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) { + if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } + pop_current_node(parser); + if (!is_fragment_parser(parser) && + !node_html_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)) { + set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET); + } + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) { + insert_element_from_token(parser, token); + pop_current_node(parser); + acknowledge_self_closing_tag(parser); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) { + return handle_in_head(parser, token); + } else if (token->type == GUMBO_TOKEN_EOF) { + if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) { + parser_add_parse_error(parser, token); + return false; + } + return true; + } else { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterframeset +static bool handle_after_frameset(GumboParser* parser, GumboToken* token) { + if (token->type == GUMBO_TOKEN_WHITESPACE) { + insert_text_token(parser, token); + return true; + } else if (token->type == GUMBO_TOKEN_COMMENT) { + append_comment_node(parser, get_current_node(parser), token); + return true; + } else if (token->type == GUMBO_TOKEN_DOCTYPE) { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { + return handle_in_body(parser, token); + } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) { + GumboNode* html = parser->_parser_state->_open_elements.data[0]; + assert(node_html_tag_is(html, GUMBO_TAG_HTML)); + record_end_of_element( + parser->_parser_state->_current_token, &html->v.element); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET); + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) { + return handle_in_head(parser, token); + } else if (token->type == GUMBO_TOKEN_EOF) { + return true; + } else { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-body-insertion-mode +static bool handle_after_after_body(GumboParser* parser, GumboToken* token) { + if (token->type == GUMBO_TOKEN_COMMENT) { + append_comment_node(parser, get_document_node(parser), token); + return true; + } else if (token->type == GUMBO_TOKEN_DOCTYPE || + token->type == GUMBO_TOKEN_WHITESPACE || + tag_is(token, kStartTag, GUMBO_TAG_HTML)) { + return handle_in_body(parser, token); + } else if (token->type == GUMBO_TOKEN_EOF) { + return true; + } else { + parser_add_parse_error(parser, token); + set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); + parser->_parser_state->_reprocess_current_token = true; + return false; + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-frameset-insertion-mode +static bool handle_after_after_frameset( + GumboParser* parser, GumboToken* token) { + if (token->type == GUMBO_TOKEN_COMMENT) { + append_comment_node(parser, get_document_node(parser), token); + return true; + } else if (token->type == GUMBO_TOKEN_DOCTYPE || + token->type == GUMBO_TOKEN_WHITESPACE || + tag_is(token, kStartTag, GUMBO_TAG_HTML)) { + return handle_in_body(parser, token); + } else if (token->type == GUMBO_TOKEN_EOF) { + return true; + } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) { + return handle_in_head(parser, token); + } else { + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + } +} + +// Function pointers for each insertion mode. Keep in sync with +// insertion_mode.h. +typedef bool (*TokenHandler)(GumboParser* parser, GumboToken* token); +static const TokenHandler kTokenHandlers[] = {handle_initial, + handle_before_html, handle_before_head, handle_in_head, + handle_in_head_noscript, handle_after_head, handle_in_body, handle_text, + handle_in_table, handle_in_table_text, handle_in_caption, + handle_in_column_group, handle_in_table_body, handle_in_row, handle_in_cell, + handle_in_select, handle_in_select_in_table, handle_in_template, + handle_after_body, handle_in_frameset, handle_after_frameset, + handle_after_after_body, handle_after_after_frameset}; + +static bool handle_html_content(GumboParser* parser, GumboToken* token) { + return kTokenHandlers[(unsigned int) parser->_parser_state->_insertion_mode]( + parser, token); +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inforeign +static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) { + gumbo_debug("Handling foreign content"); + switch (token->type) { + case GUMBO_TOKEN_NULL: + parser_add_parse_error(parser, token); + token->v.character = kUtf8ReplacementChar; + insert_text_token(parser, token); + return false; + case GUMBO_TOKEN_WHITESPACE: + insert_text_token(parser, token); + return true; + case GUMBO_TOKEN_CDATA: + case GUMBO_TOKEN_CHARACTER: + insert_text_token(parser, token); + set_frameset_not_ok(parser); + return true; + case GUMBO_TOKEN_COMMENT: + append_comment_node(parser, get_current_node(parser), token); + return true; + case GUMBO_TOKEN_DOCTYPE: + parser_add_parse_error(parser, token); + ignore_token(parser); + return false; + default: + // Fall through to the if-statements below. + break; + } + // Order matters for these clauses. + if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(B), TAG(BIG), TAG(BLOCKQUOTE), TAG(BODY), TAG(BR), + TAG(CENTER), TAG(CODE), TAG(DD), TAG(DIV), TAG(DL), TAG(DT), + TAG(EM), TAG(EMBED), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), + TAG(H6), TAG(HEAD), TAG(HR), TAG(I), TAG(IMG), TAG(LI), + TAG(LISTING), TAG(MENU), TAG(META), TAG(NOBR), TAG(OL), TAG(P), + TAG(PRE), TAG(RUBY), TAG(S), TAG(SMALL), TAG(SPAN), TAG(STRONG), + TAG(STRIKE), TAG(SUB), TAG(SUP), TAG(TABLE), TAG(TT), TAG(U), + TAG(UL), TAG(VAR)}) || + (tag_is(token, kStartTag, GUMBO_TAG_FONT) && + (token_has_attribute(token, "color") || + token_has_attribute(token, "face") || + token_has_attribute(token, "size")))) { + /* Parse error */ + parser_add_parse_error(parser, token); + + /* + * Fragment case: If the parser was originally created for the HTML + * fragment parsing algorithm, then act as described in the "any other + * start tag" entry below. + */ + if (!is_fragment_parser(parser)) { + do { + pop_current_node(parser); + } while (!(is_mathml_integration_point(get_current_node(parser)) || + is_html_integration_point(get_current_node(parser)) || + get_current_node(parser)->v.element.tag_namespace == + GUMBO_NAMESPACE_HTML)); + parser->_parser_state->_reprocess_current_token = true; + return false; + } + + assert(token->type == GUMBO_TOKEN_START_TAG); + } + + if (token->type == GUMBO_TOKEN_START_TAG) { + const GumboNamespaceEnum current_namespace = + get_adjusted_current_node(parser)->v.element.tag_namespace; + if (current_namespace == GUMBO_NAMESPACE_MATHML) { + adjust_mathml_attributes(parser, token); + } + if (current_namespace == GUMBO_NAMESPACE_SVG) { + // Tag adjustment is left to the gumbo_normalize_svg_tagname helper + // function. + adjust_svg_attributes(parser, token); + } + adjust_foreign_attributes(parser, token); + insert_foreign_element(parser, token, current_namespace); + if (token->v.start_tag.is_self_closing) { + pop_current_node(parser); + acknowledge_self_closing_tag(parser); + } + return true; + // </script> tags are handled like any other end tag, putting the script's + // text into a text node child and closing the current node. + } else { + assert(token->type == GUMBO_TOKEN_END_TAG); + GumboNode* node = get_current_node(parser); + assert(node != NULL); + GumboStringPiece token_tagname = token->original_text; + GumboStringPiece node_tagname = node->v.element.original_tag; + gumbo_tag_from_original_text(&token_tagname); + gumbo_tag_from_original_text(&node_tagname); + + bool is_success = true; + if (!gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) { + parser_add_parse_error(parser, token); + is_success = false; + } + int i = parser->_parser_state->_open_elements.length; + for (--i; i > 0;) { + // Here we move up the stack until we find an HTML element (in which + // case we do nothing) or we find the element that we're about to + // close (in which case we pop everything we've seen until that + // point.) + gumbo_debug("Foreign %.*s node at %d.\n", node_tagname.length, + node_tagname.data, i); + if (gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) { + gumbo_debug("Matches.\n"); + while (pop_current_node(parser) != node) { + // Pop all the nodes below the current one. Node is guaranteed to + // be an element on the stack of open elements (set below), so + // this loop is guaranteed to terminate. + } + return is_success; + } + --i; + node = parser->_parser_state->_open_elements.data[i]; + if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) { + // Must break before gumbo_tag_from_original_text to avoid passing + // parser-inserted nodes through. + break; + } + node_tagname = node->v.element.original_tag; + gumbo_tag_from_original_text(&node_tagname); + } + assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML); + // We can't call handle_token directly because the current node is still in + // the SVG namespace, so it would re-enter this and result in infinite + // recursion. + return handle_html_content(parser, token) && is_success; + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#tree-construction +static bool handle_token(GumboParser* parser, GumboToken* token) { + if (parser->_parser_state->_ignore_next_linefeed && + token->type == GUMBO_TOKEN_WHITESPACE && token->v.character == '\n') { + parser->_parser_state->_ignore_next_linefeed = false; + ignore_token(parser); + return true; + } + // This needs to be reset both here and in the conditional above to catch both + // the case where the next token is not whitespace (so we don't ignore + // whitespace in the middle of <pre> tags) and where there are multiple + // whitespace tokens (so we don't ignore the second one). + parser->_parser_state->_ignore_next_linefeed = false; + + if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) { + parser->_parser_state->_closed_body_tag = true; + } + if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) { + parser->_parser_state->_closed_html_tag = true; + } + + const GumboNode* current_node = get_adjusted_current_node(parser); + assert(!current_node || current_node->type == GUMBO_NODE_ELEMENT || + current_node->type == GUMBO_NODE_TEMPLATE); + if (current_node) { + gumbo_debug("Current node: <%s>.\n", + gumbo_normalized_tagname(current_node->v.element.tag)); + } + if (!current_node || + current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML || + (is_mathml_integration_point(current_node) && + (token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_WHITESPACE || + token->type == GUMBO_TOKEN_NULL || + (token->type == GUMBO_TOKEN_START_TAG && + !tag_in(token, kStartTag, + (gumbo_tagset){TAG(MGLYPH), TAG(MALIGNMARK)})))) || + (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML && + node_qualified_tag_is( + current_node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) && + tag_is(token, kStartTag, GUMBO_TAG_SVG)) || + (is_html_integration_point(current_node) && + (token->type == GUMBO_TOKEN_START_TAG || + token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_NULL || + token->type == GUMBO_TOKEN_WHITESPACE)) || + token->type == GUMBO_TOKEN_EOF) { + return handle_html_content(parser, token); + } else { + return handle_in_foreign_content(parser, token); + } +} + +static void fragment_parser_init(GumboParser* parser, GumboTag fragment_ctx, + GumboNamespaceEnum fragment_namespace) { + GumboNode* root; + assert(fragment_ctx != GUMBO_TAG_LAST); + + // 3 + parser->_parser_state->_fragment_ctx = create_element(parser, fragment_ctx); + parser->_parser_state->_fragment_ctx->v.element.tag_namespace = + fragment_namespace; + + // 4 + if (fragment_namespace == GUMBO_NAMESPACE_HTML) { + // Non-HTML namespaces always start in the DATA state. + switch (fragment_ctx) { + case GUMBO_TAG_TITLE: + case GUMBO_TAG_TEXTAREA: + gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA); + break; + + case GUMBO_TAG_STYLE: + case GUMBO_TAG_XMP: + case GUMBO_TAG_IFRAME: + case GUMBO_TAG_NOEMBED: + case GUMBO_TAG_NOFRAMES: + gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT); + break; + + case GUMBO_TAG_SCRIPT: + gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT); + break; + + case GUMBO_TAG_NOSCRIPT: + /* scripting is disabled in Gumbo, so leave the tokenizer + * in the default data state */ + break; + + case GUMBO_TAG_PLAINTEXT: + gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT); + break; + + default: + /* default data state */ + break; + } + } + + // 5. 6. 7. + root = insert_element_of_tag_type( + parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED); + parser->_output->root = root; + + // 8. + if (fragment_ctx == GUMBO_TAG_TEMPLATE) { + push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE); + } + + // 10. + reset_insertion_mode_appropriately(parser); +} + +GumboOutput* gumbo_parse(const char* buffer) { + return gumbo_parse_with_options( + &kGumboDefaultOptions, buffer, strlen(buffer)); +} + +GumboOutput* gumbo_parse_with_options( + const GumboOptions* options, const char* buffer, size_t length) { + GumboParser parser; + parser._options = options; + output_init(&parser); + gumbo_tokenizer_state_init(&parser, buffer, length); + parser_state_init(&parser); + + if (options->fragment_context != GUMBO_TAG_LAST) { + fragment_parser_init( + &parser, options->fragment_context, options->fragment_namespace); + } + + GumboParserState* state = parser._parser_state; + gumbo_debug("Parsing %.*s.\n", length, buffer); + + // Sanity check so that infinite loops die with an assertion failure instead + // of hanging the process before we ever get an error. + int loop_count = 0; + + GumboToken token; + bool has_error = false; + + do { + if (state->_reprocess_current_token) { + state->_reprocess_current_token = false; + } else { + GumboNode* current_node = get_current_node(&parser); + gumbo_tokenizer_set_is_current_node_foreign(&parser, + current_node && + current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML); + has_error = !gumbo_lex(&parser, &token) || has_error; + } + const char* token_type = "text"; + switch (token.type) { + case GUMBO_TOKEN_DOCTYPE: + token_type = "doctype"; + break; + case GUMBO_TOKEN_START_TAG: + token_type = gumbo_normalized_tagname(token.v.start_tag.tag); + break; + case GUMBO_TOKEN_END_TAG: + token_type = gumbo_normalized_tagname(token.v.end_tag); + break; + case GUMBO_TOKEN_COMMENT: + token_type = "comment"; + break; + default: + break; + } + gumbo_debug("Handling %s token @%d:%d in state %d.\n", (char*) token_type, + token.position.line, token.position.column, state->_insertion_mode); + + state->_current_token = &token; + state->_self_closing_flag_acknowledged = + !(token.type == GUMBO_TOKEN_START_TAG && + token.v.start_tag.is_self_closing); + + has_error = !handle_token(&parser, &token) || has_error; + + // Check for memory leaks when ownership is transferred from start tag + // tokens to nodes. + assert(state->_reprocess_current_token || + token.type != GUMBO_TOKEN_START_TAG || + token.v.start_tag.attributes.data == NULL); + + if (!state->_self_closing_flag_acknowledged) { + GumboError* error = parser_add_parse_error(&parser, &token); + if (error) { + error->type = GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG; + } + } + + ++loop_count; + assert(loop_count < 1000000000); + + } while ((token.type != GUMBO_TOKEN_EOF || state->_reprocess_current_token) && + !(options->stop_on_first_error && has_error)); + + finish_parsing(&parser); + // For API uniformity reasons, if the doctype still has nulls, convert them to + // empty strings. + GumboDocument* doc_type = &parser._output->document->v.document; + if (doc_type->name == NULL) { + doc_type->name = gumbo_copy_stringz(&parser, ""); + } + if (doc_type->public_identifier == NULL) { + doc_type->public_identifier = gumbo_copy_stringz(&parser, ""); + } + if (doc_type->system_identifier == NULL) { + doc_type->system_identifier = gumbo_copy_stringz(&parser, ""); + } + + parser_state_destroy(&parser); + gumbo_tokenizer_state_destroy(&parser); + return parser._output; +} + +void gumbo_destroy_node(GumboOptions* options, GumboNode* node) { + // Need a dummy GumboParser because the allocator comes along with the + // options object. + GumboParser parser; + parser._options = options; + destroy_node(&parser, node); +} + +void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output) { + // Need a dummy GumboParser because the allocator comes along with the + // options object. + GumboParser parser; + parser._options = options; + destroy_node(&parser, output->document); + for (unsigned int i = 0; i < output->errors.length; ++i) { + gumbo_error_destroy(&parser, output->errors.data[i]); + } + gumbo_vector_destroy(&parser, &output->errors); + gumbo_parser_deallocate(&parser, output); +} |
