diff options
Diffstat (limited to '3rdparty/gumbo-parser/src/gumbo.h')
| -rw-r--r-- | 3rdparty/gumbo-parser/src/gumbo.h | 858 |
1 files changed, 858 insertions, 0 deletions
diff --git a/3rdparty/gumbo-parser/src/gumbo.h b/3rdparty/gumbo-parser/src/gumbo.h new file mode 100644 index 0000000..c2e52b4 --- /dev/null +++ b/3rdparty/gumbo-parser/src/gumbo.h @@ -0,0 +1,858 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jdtang@google.com (Jonathan Tang) +// +// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and +// GUMBO_ as a prefix for enum constants (static constants get the Google-style +// kGumbo prefix). + +/** + * @file + * @mainpage Gumbo HTML Parser + * + * This provides a conformant, no-dependencies implementation of the HTML5 + * parsing algorithm. It supports only UTF8; if you need to parse a different + * encoding, run a preprocessing step to convert to UTF8. It returns a parse + * tree made of the structs in this file. + * + * Example: + * @code + * GumboOutput* output = gumbo_parse(input); + * do_something_with_doctype(output->document); + * do_something_with_html_tree(output->root); + * gumbo_destroy_output(&options, output); + * @endcode + * HTML5 Spec: + * + * http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html + */ + +/* + + There are some changes to the original gumbo.h file to facilitate automatic + conversion to nim code. Most of the changes should be surrounded by + #ifdef C2NIM - #endif pair. + +*/ + +#ifdef C2NIM +#@ +const libgumbo_parser = "libgumbo_parser.a" + +template gumboLibPath(): string = + var path = currentSourcePath() + var idx = len(path) - 1 + while idx > 0: + if (path[idx] == '/') or (path[idx] == '\\'): + break + dec(idx) + path = path[0..idx] & libgumbo_parser + path + +{.passL:gumboLibPath().} +@# +#endif + +#ifndef GUMBO_GUMBO_H_ +#define GUMBO_GUMBO_H_ + +#ifndef C2NIM +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define fileno _fileno +#endif +#endif + +#include <stdbool.h> +#include <stddef.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * A struct representing a character position within the original text buffer. + * Line and column numbers are 1-based and offsets are 0-based, which matches + * how most editors and command-line tools work. Also, columns measure + * positions in terms of characters while offsets measure by bytes; this is + * because the offset field is often used to pull out a particular region of + * text (which in most languages that bind to C implies pointer arithmetic on a + * buffer of bytes), while the column field is often used to reference a + * particular column on a printable display, which nowadays is usually UTF-8. + */ +typedef struct { + unsigned int line; + unsigned int column; + unsigned int offset; +} GumboSourcePosition; + +/** + * A SourcePosition used for elements that have no source position, i.e. + * parser-inserted elements. + */ +extern const GumboSourcePosition kGumboEmptySourcePosition; + +/** + * A struct representing a string or part of a string. Strings within the + * parser are represented by a char* and a length; the char* points into + * an existing data buffer owned by some other code (often the original input). + * GumboStringPieces are assumed (by convention) to be immutable, because they + * may share data. Use GumboStringBuffer if you need to construct a string. + * Clients should assume that it is not NUL-terminated, and should always use + * explicit lengths when manipulating them. + */ +typedef struct { + /** A pointer to the beginning of the string. NULL iff length == 0. */ + const char* data; + + /** The length of the string fragment, in bytes. May be zero. */ + size_t length; +} GumboStringPiece; + +/** A constant to represent a 0-length null string. */ +extern const GumboStringPiece kGumboEmptyString; + +/** + * Compares two GumboStringPieces, and returns true if they're equal or false + * otherwise. + */ +bool gumbo_string_equals( + const GumboStringPiece* str1, const GumboStringPiece* str2); + +/** + * Compares two GumboStringPieces ignoring case, and returns true if they're + * equal or false otherwise. + */ +bool gumbo_string_equals_ignore_case( + const GumboStringPiece* str1, const GumboStringPiece* str2); + +/** + * A simple vector implementation. This stores a pointer to a data array and a + * length. All elements are stored as void*; client code must cast to the + * appropriate type. Overflows upon addition result in reallocation of the data + * array, with the size doubling to maintain O(1) amortized cost. There is no + * removal function, as this isn't needed for any of the operations within this + * library. Iteration can be done through inspecting the structure directly in + * a for-loop. + */ +typedef struct { + /** Data elements. This points to a dynamically-allocated array of capacity + * elements, each a void* to the element itself. + */ + void** data; + + /** Number of elements currently in the vector. */ + unsigned int length; + + /** Current array capacity. */ + unsigned int capacity; +} GumboVector; + +/** An empty (0-length, 0-capacity) GumboVector. */ +extern const GumboVector kGumboEmptyVector; + +/** + * Returns the first index at which an element appears in this vector (testing + * by pointer equality), or -1 if it never does. + */ +int gumbo_vector_index_of(GumboVector* vector, const void* element); + +/** + * An enum for all the tags defined in the HTML5 standard. These correspond to + * the tag names themselves. Enum constants exist only for tags which appear in + * the spec itself (or for tags with special handling in the SVG and MathML + * namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag + * name can be obtained through original_tag. + * + * This is mostly for API convenience, so that clients of this library don't + * need to perform a strcasecmp to find the normalized tag name. It also has + * efficiency benefits, by letting the parser work with enums instead of + * strings. + */ +typedef enum { +// Load all the tags from an external source, generated from tag.in. +// Generated via `gentags.py src/tag.in`. +// Do not edit; edit src/tag.in instead. +// clang-format off +GUMBO_TAG_HTML, +GUMBO_TAG_HEAD, +GUMBO_TAG_TITLE, +GUMBO_TAG_BASE, +GUMBO_TAG_LINK, +GUMBO_TAG_META, +GUMBO_TAG_STYLE, +GUMBO_TAG_SCRIPT, +GUMBO_TAG_NOSCRIPT, +GUMBO_TAG_TEMPLATE, +GUMBO_TAG_BODY, +GUMBO_TAG_ARTICLE, +GUMBO_TAG_SECTION, +GUMBO_TAG_NAV, +GUMBO_TAG_ASIDE, +GUMBO_TAG_H1, +GUMBO_TAG_H2, +GUMBO_TAG_H3, +GUMBO_TAG_H4, +GUMBO_TAG_H5, +GUMBO_TAG_H6, +GUMBO_TAG_HGROUP, +GUMBO_TAG_HEADER, +GUMBO_TAG_FOOTER, +GUMBO_TAG_ADDRESS, +GUMBO_TAG_P, +GUMBO_TAG_HR, +GUMBO_TAG_PRE, +GUMBO_TAG_BLOCKQUOTE, +GUMBO_TAG_OL, +GUMBO_TAG_UL, +GUMBO_TAG_LI, +GUMBO_TAG_DL, +GUMBO_TAG_DT, +GUMBO_TAG_DD, +GUMBO_TAG_FIGURE, +GUMBO_TAG_FIGCAPTION, +GUMBO_TAG_MAIN, +GUMBO_TAG_DIV, +GUMBO_TAG_A, +GUMBO_TAG_EM, +GUMBO_TAG_STRONG, +GUMBO_TAG_SMALL, +GUMBO_TAG_S, +GUMBO_TAG_CITE, +GUMBO_TAG_Q, +GUMBO_TAG_DFN, +GUMBO_TAG_ABBR, +GUMBO_TAG_DATA, +GUMBO_TAG_TIME, +GUMBO_TAG_CODE, +GUMBO_TAG_VAR, +GUMBO_TAG_SAMP, +GUMBO_TAG_KBD, +GUMBO_TAG_SUB, +GUMBO_TAG_SUP, +GUMBO_TAG_I, +GUMBO_TAG_B, +GUMBO_TAG_U, +GUMBO_TAG_MARK, +GUMBO_TAG_RUBY, +GUMBO_TAG_RT, +GUMBO_TAG_RP, +GUMBO_TAG_BDI, +GUMBO_TAG_BDO, +GUMBO_TAG_SPAN, +GUMBO_TAG_BR, +GUMBO_TAG_WBR, +GUMBO_TAG_INS, +GUMBO_TAG_DEL, +GUMBO_TAG_IMAGE, +GUMBO_TAG_IMG, +GUMBO_TAG_IFRAME, +GUMBO_TAG_EMBED, +GUMBO_TAG_OBJECT, +GUMBO_TAG_PARAM, +GUMBO_TAG_VIDEO, +GUMBO_TAG_AUDIO, +GUMBO_TAG_SOURCE, +GUMBO_TAG_TRACK, +GUMBO_TAG_CANVAS, +GUMBO_TAG_MAP, +GUMBO_TAG_AREA, +GUMBO_TAG_MATH, +GUMBO_TAG_MI, +GUMBO_TAG_MO, +GUMBO_TAG_MN, +GUMBO_TAG_MS, +GUMBO_TAG_MTEXT, +GUMBO_TAG_MGLYPH, +GUMBO_TAG_MALIGNMARK, +GUMBO_TAG_ANNOTATION_XML, +GUMBO_TAG_SVG, +GUMBO_TAG_FOREIGNOBJECT, +GUMBO_TAG_DESC, +GUMBO_TAG_TABLE, +GUMBO_TAG_CAPTION, +GUMBO_TAG_COLGROUP, +GUMBO_TAG_COL, +GUMBO_TAG_TBODY, +GUMBO_TAG_THEAD, +GUMBO_TAG_TFOOT, +GUMBO_TAG_TR, +GUMBO_TAG_TD, +GUMBO_TAG_TH, +GUMBO_TAG_FORM, +GUMBO_TAG_FIELDSET, +GUMBO_TAG_LEGEND, +GUMBO_TAG_LABEL, +GUMBO_TAG_INPUT, +GUMBO_TAG_BUTTON, +GUMBO_TAG_SELECT, +GUMBO_TAG_DATALIST, +GUMBO_TAG_OPTGROUP, +GUMBO_TAG_OPTION, +GUMBO_TAG_TEXTAREA, +GUMBO_TAG_KEYGEN, +GUMBO_TAG_OUTPUT, +GUMBO_TAG_PROGRESS, +GUMBO_TAG_METER, +GUMBO_TAG_DETAILS, +GUMBO_TAG_SUMMARY, +GUMBO_TAG_MENU, +GUMBO_TAG_MENUITEM, +GUMBO_TAG_APPLET, +GUMBO_TAG_ACRONYM, +GUMBO_TAG_BGSOUND, +GUMBO_TAG_DIR, +GUMBO_TAG_FRAME, +GUMBO_TAG_FRAMESET, +GUMBO_TAG_NOFRAMES, +GUMBO_TAG_ISINDEX, +GUMBO_TAG_LISTING, +GUMBO_TAG_XMP, +GUMBO_TAG_NEXTID, +GUMBO_TAG_NOEMBED, +GUMBO_TAG_PLAINTEXT, +GUMBO_TAG_RB, +GUMBO_TAG_STRIKE, +GUMBO_TAG_BASEFONT, +GUMBO_TAG_BIG, +GUMBO_TAG_BLINK, +GUMBO_TAG_CENTER, +GUMBO_TAG_FONT, +GUMBO_TAG_MARQUEE, +GUMBO_TAG_MULTICOL, +GUMBO_TAG_NOBR, +GUMBO_TAG_SPACER, +GUMBO_TAG_TT, +GUMBO_TAG_RTC, + + // Used for all tags that don't have special handling in HTML. Add new tags + // to the end of tag.in so as to preserve backwards-compatibility. + GUMBO_TAG_UNKNOWN, + // A marker value to indicate the end of the enum, for iterating over it. + // Also used as the terminator for varargs functions that take tags. + GUMBO_TAG_LAST, +} GumboTag; + +/** + * Returns the normalized (usually all-lowercased, except for foreign content) + * tag name for an GumboTag enum. Return value is static data owned by the + * library. + */ +const char* gumbo_normalized_tagname(GumboTag tag); + +/** + * Extracts the tag name from the original_text field of an element or token by + * stripping off </> characters and attributes and adjusting the passed-in + * GumboStringPiece appropriately. The tag name is in the original case and + * shares a buffer with the original text, to simplify memory management. + * Behavior is undefined if a string-piece that doesn't represent an HTML tag + * (<tagname> or </tagname>) is passed in. If the string piece is completely + * empty (NULL data pointer), then this function will exit successfully as a + * no-op. + */ +void gumbo_tag_from_original_text(GumboStringPiece* text); + +/** + * Fixes the case of SVG elements that are not all lowercase. + * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign + * This is not done at parse time because there's no place to store a mutated + * tag name. tag_name is an enum (which will be TAG_UNKNOWN for most SVG tags + * without special handling), while original_tag_name is a pointer into the + * original buffer. Instead, we provide this helper function that clients can + * use to rename SVG tags as appropriate. + * Returns the case-normalized SVG tagname if a replacement is found, or NULL if + * no normalization is called for. The return value is static data and owned by + * the library. + */ +const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname); + +/** + * Converts a tag name string (which may be in upper or mixed case) to a tag + * enum. The `tag` version expects `tagname` to be NULL-terminated + */ +GumboTag gumbo_tag_enum(const char* tagname); +GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length); + +/** + * Attribute namespaces. + * HTML includes special handling for XLink, XML, and XMLNS namespaces on + * attributes. Everything else goes in the generic "NONE" namespace. + */ +typedef enum { + GUMBO_ATTR_NAMESPACE_NONE, + GUMBO_ATTR_NAMESPACE_XLINK, + GUMBO_ATTR_NAMESPACE_XML, + GUMBO_ATTR_NAMESPACE_XMLNS, +} GumboAttributeNamespaceEnum; + +/** + * A struct representing a single attribute on an HTML tag. This is a + * name-value pair, but also includes information about source locations and + * original source text. + */ +typedef struct { + /** + * The namespace for the attribute. This will usually be + * GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special + * values, per: + * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes + */ + GumboAttributeNamespaceEnum attr_namespace; + + /** + * The name of the attribute. This is in a freshly-allocated buffer to deal + * with case-normalization, and is null-terminated. + */ + const char* name; + + /** + * The original text of the attribute name, as a pointer into the original + * source buffer. + */ + GumboStringPiece original_name; + + /** + * The value of the attribute. This is in a freshly-allocated buffer to deal + * with unescaping, and is null-terminated. It does not include any quotes + * that surround the attribute. If the attribute has no value (for example, + * 'selected' on a checkbox), this will be an empty string. + */ + const char* value; + + /** + * The original text of the value of the attribute. This points into the + * original source buffer. It includes any quotes that surround the + * attribute, and you can look at original_value.data[0] and + * original_value.data[original_value.length - 1] to determine what the quote + * characters were. If the attribute has no value, this will be a 0-length + * string. + */ + GumboStringPiece original_value; + + /** The starting position of the attribute name. */ + GumboSourcePosition name_start; + + /** + * The ending position of the attribute name. This is not always derivable + * from the starting position of the value because of the possibility of + * whitespace around the = sign. + */ + GumboSourcePosition name_end; + + /** The starting position of the attribute value. */ + GumboSourcePosition value_start; + + /** The ending position of the attribute value. */ + GumboSourcePosition value_end; +} GumboAttribute; + +/** + * Given a vector of GumboAttributes, look up the one with the specified name + * and return it, or NULL if no such attribute exists. This uses a + * case-insensitive match, as HTML is case-insensitive. + */ +GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name); + +/** + * Enum denoting the type of node. This determines the type of the node.v + * union. + */ +typedef enum { + /** Document node. v will be a GumboDocument. */ + GUMBO_NODE_DOCUMENT, + /** Element node. v will be a GumboElement. */ + GUMBO_NODE_ELEMENT, + /** Text node. v will be a GumboText. */ + GUMBO_NODE_TEXT, + /** CDATA node. v will be a GumboText. */ + GUMBO_NODE_CDATA, + /** Comment node. v will be a GumboText, excluding comment delimiters. */ + GUMBO_NODE_COMMENT, + /** Text node, where all contents is whitespace. v will be a GumboText. */ + GUMBO_NODE_WHITESPACE, + /** Template node. This is separate from GUMBO_NODE_ELEMENT because many + * client libraries will want to ignore the contents of template nodes, as + * the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing + * here, while clients that want to include template contents should also + * check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */ + GUMBO_NODE_TEMPLATE +} GumboNodeType; + +#ifndef C2NIM +/** + * Forward declaration of GumboNode so it can be used recursively in + * GumboNode.parent. + */ +typedef struct GumboInternalNode GumboNode; +#endif + +/** + * http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode + */ +typedef enum { + GUMBO_DOCTYPE_NO_QUIRKS, + GUMBO_DOCTYPE_QUIRKS, + GUMBO_DOCTYPE_LIMITED_QUIRKS +} GumboQuirksModeEnum; + +/** + * Namespaces. + * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. Rather, + * anything inside an <svg> tag is in the SVG namespace, anything inside the + * <math> tag is in the MathML namespace, and anything else is inside the HTML + * namespace. No other namespaces are supported, so this can be an enum only. + */ +typedef enum { + GUMBO_NAMESPACE_HTML, + GUMBO_NAMESPACE_SVG, + GUMBO_NAMESPACE_MATHML +} GumboNamespaceEnum; + +/** + * Parse flags. + * We track the reasons for parser insertion of nodes and store them in a + * bitvector in the node itself. This lets client code optimize out nodes that + * are implied by the HTML structure of the document, or flag constructs that + * may not be allowed by a style guide, or track the prevalence of incorrect or + * tricky HTML code. + */ +typedef enum { + /** + * A normal node - both start and end tags appear in the source, nothing has + * been reparented. + */ + GUMBO_INSERTION_NORMAL = 0, + + /** + * A node inserted by the parser to fulfill some implicit insertion rule. + * This is usually set in addition to some other flag giving a more specific + * insertion reason; it's a generic catch-all term meaning "The start tag for + * this node did not appear in the document source". + */ + GUMBO_INSERTION_BY_PARSER = 1 << 0, + + /** + * A flag indicating that the end tag for this node did not appear in the + * document source. Note that in some cases, you can still have + * parser-inserted nodes with an explicit end tag: for example, "Text</html>" + * has GUMBO_INSERTED_BY_PARSER set on the <html> node, but + * GUMBO_INSERTED_END_TAG_IMPLICITLY is unset, as the </html> tag actually + * exists. This flag will be set only if the end tag is completely missing; + * in some cases, the end tag may be misplaced (eg. a </body> tag with text + * afterwards), which will leave this flag unset and require clients to + * inspect the parse errors for that case. + */ + GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1, + + // Value 1 << 2 was for a flag that has since been removed. + + /** + * A flag for nodes that are inserted because their presence is implied by + * other tags, eg. <html>, <head>, <body>, <tbody>, etc. + */ + GUMBO_INSERTION_IMPLIED = 1 << 3, + + /** + * A flag for nodes that are converted from their end tag equivalents. For + * example, </p> when no paragraph is open implies that the parser should + * create a <p> tag and immediately close it, while </br> means the same thing + * as <br>. + */ + GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4, + + /** A flag for nodes that are converted from the parse of an <isindex> tag. */ + GUMBO_INSERTION_FROM_ISINDEX = 1 << 5, + + /** A flag for <image> tags that are rewritten as <img>. */ + GUMBO_INSERTION_FROM_IMAGE = 1 << 6, + + /** + * A flag for nodes that are cloned as a result of the reconstruction of + * active formatting elements. This is set only on the clone; the initial + * portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG. + */ + GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7, + + /** A flag for nodes that are cloned by the adoption agency algorithm. */ + GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8, + + /** A flag for nodes that are moved by the adoption agency algorithm. */ + GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9, + + /** + * A flag for nodes that have been foster-parented out of a table (or + * should've been foster-parented, if verbatim mode is set). + */ + GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10, +} GumboParseFlags; + +/** + * Information specific to document nodes. + */ +typedef struct { + /** + * An array of GumboNodes, containing the children of this element. This will + * normally consist of the <html> element and any comment nodes found. + * Pointers are owned. + */ + GumboVector /* GumboNode* */ children; + + // True if there was an explicit doctype token as opposed to it being omitted. + bool has_doctype; + + // Fields from the doctype token, copied verbatim. + const char* name; + const char* public_identifier; + const char* system_identifier; + + /** + * Whether or not the document is in QuirksMode, as determined by the values + * in the GumboTokenDocType template. + */ + GumboQuirksModeEnum doc_type_quirks_mode; +} GumboDocument; + +/** + * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements. + * This contains just a block of text and its position. + */ +typedef struct { + /** + * The text of this node, after entities have been parsed and decoded. For + * comment/cdata nodes, this does not include the comment delimiters. + */ + const char* text; + + /** + * The original text of this node, as a pointer into the original buffer. For + * comment/cdata nodes, this includes the comment delimiters. + */ + GumboStringPiece original_text; + + /** + * The starting position of this node. This corresponds to the position of + * original_text, before entities are decoded. + * */ + GumboSourcePosition start_pos; +} GumboText; + +/** + * The struct used to represent all HTML elements. This contains information + * about the tag, attributes, and child nodes. + */ +typedef struct { + /** + * An array of GumboNodes, containing the children of this element. Pointers + * are owned. + */ + GumboVector /* GumboNode* */ children; + + /** The GumboTag enum for this element. */ + GumboTag tag; + + /** The GumboNamespaceEnum for this element. */ + GumboNamespaceEnum tag_namespace; + + /** + * A GumboStringPiece pointing to the original tag text for this element, + * pointing directly into the source buffer. If the tag was inserted + * algorithmically (for example, <head> or <tbody> insertion), this will be a + * zero-length string. + */ + GumboStringPiece original_tag; + + /** + * A GumboStringPiece pointing to the original end tag text for this element. + * If the end tag was inserted algorithmically, (for example, closing a + * self-closing tag), this will be a zero-length string. + */ + GumboStringPiece original_end_tag; + + /** The source position for the start of the start tag. */ + GumboSourcePosition start_pos; + + /** The source position for the start of the end tag. */ + GumboSourcePosition end_pos; + + /** + * An array of GumboAttributes, containing the attributes for this tag in the + * order that they were parsed. Pointers are owned. + */ + GumboVector /* GumboAttribute* */ attributes; +} GumboElement; + +/** + * A supertype for GumboElement and GumboText, so that we can include one + * generic type in lists of children and cast as necessary to subtypes. + */ +#ifdef C2NIM +#def GumboInternalNode GumboNode +#endif + +struct GumboInternalNode { + /** The type of node that this is. */ + GumboNodeType type; + + /** Pointer back to parent node. Not owned. */ + GumboNode* parent; + + /** The index within the parent's children vector of this node. */ + size_t index_within_parent; + + /** + * A bitvector of flags containing information about why this element was + * inserted into the parse tree, including a variety of special parse + * situations. + */ + GumboParseFlags parse_flags; + + /** The actual node data. */ + union { + GumboDocument document; // For GUMBO_NODE_DOCUMENT. + GumboElement element; // For GUMBO_NODE_ELEMENT. + GumboText text; // For everything else. + } v; +}; + +/** + * The type for an allocator function. Takes the 'userdata' member of the + * GumboParser struct as its first argument. Semantics should be the same as + * malloc, i.e. return a block of size_t bytes on success or NULL on failure. + * Allocating a block of 0 bytes behaves as per malloc. + */ +// TODO(jdtang): Add checks throughout the codebase for out-of-memory condition. +typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size); + +/** + * The type for a deallocator function. Takes the 'userdata' member of the + * GumboParser struct as its first argument. + */ +typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr); + +/** + * Input struct containing configuration options for the parser. + * These let you specify alternate memory managers, provide different error + * handling, etc. + * Use kGumboDefaultOptions for sensible defaults, and only set what you need. + */ +typedef struct GumboInternalOptions { + /** A memory allocator function. Default: malloc. */ + GumboAllocatorFunction allocator; + + /** A memory deallocator function. Default: free. */ + GumboDeallocatorFunction deallocator; + + /** + * An opaque object that's passed in as the first argument to all callbacks + * used by this library. Default: NULL. + */ + void* userdata; + + /** + * The tab-stop size, for computing positions in source code that uses tabs. + * Default: 8. + */ + int tab_stop; + + /** + * Whether or not to stop parsing when the first error is encountered. + * Default: false. + */ + bool stop_on_first_error; + + /** + * The maximum number of errors before the parser stops recording them. This + * is provided so that if the page is totally borked, we don't completely fill + * up the errors vector and exhaust memory with useless redundant errors. Set + * to -1 to disable the limit. + * Default: -1 + */ + int max_errors; + + /** + * The fragment context for parsing: + * https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments + * + * If GUMBO_TAG_LAST is passed here, it is assumed to be "no fragment", i.e. + * the regular parsing algorithm. Otherwise, pass the tag enum for the + * intended parent of the parsed fragment. We use just the tag enum rather + * than a full node because that's enough to set all the parsing context we + * need, and it provides some additional flexibility for client code to act as + * if parsing a fragment even when a full HTML tree isn't available. + * + * Default: GUMBO_TAG_LAST + */ + GumboTag fragment_context; + + /** + * The namespace for the fragment context. This lets client code + * differentiate between, say, parsing a <title> tag in SVG vs. parsing it in + * HTML. + * Default: GUMBO_NAMESPACE_HTML + */ + GumboNamespaceEnum fragment_namespace; +} GumboOptions; + +/** Default options struct; use this with gumbo_parse_with_options. */ +extern const GumboOptions kGumboDefaultOptions; + +/** The output struct containing the results of the parse. */ +typedef struct GumboInternalOutput { + /** + * Pointer to the document node. This is a GumboNode of type NODE_DOCUMENT + * that contains the entire document as its child. + */ + GumboNode* document; + + /** + * Pointer to the root node. This the <html> tag that forms the root of the + * document. + */ + GumboNode* root; + + /** + * A list of errors that occurred during the parse. + * NOTE: In version 1.0 of this library, the API for errors hasn't been fully + * fleshed out and may change in the future. For this reason, the GumboError + * header isn't part of the public API. Contact us if you need errors + * reported so we can work out something appropriate for your use-case. + */ + GumboVector /* GumboError */ errors; +} GumboOutput; + +/** + * Parses a buffer of UTF8 text into an GumboNode parse tree. The buffer must + * live at least as long as the parse tree, as some fields (eg. original_text) + * point directly into the original buffer. + * + * This doesn't support buffers longer than 4 gigabytes. + */ +GumboOutput* gumbo_parse(const char* buffer); + +/** + * Extended version of gumbo_parse that takes an explicit options structure, + * buffer, and length. + */ +GumboOutput* gumbo_parse_with_options( + const GumboOptions* options, const char* buffer, size_t buffer_length); + +/** Release the memory used for the parse tree & parse errors. */ +void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output); + +#ifdef __cplusplus +} +#endif + +#endif // GUMBO_GUMBO_H_ |
