/* Copyright (c) 2024-2025 Luke Philipsen

   Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby
   granted.

   THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL
   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
   INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
   AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
   PERFORMANCE OF THIS SOFTWARE.
*/

/* Usage

   Do this:
     #define HOXML_IMPLEMENTATION
   before you include this file in *one* C or C++ file to create the implementation.

   You can define HOXML_DECL with
     #define HOXML_DECL static
   or
     #define HOXML_DECL extern
   to specify hoxml function declarations as static or extern, respectively.
   The default specifier is extern.
*/

#ifndef HOXML_H
#define HOXML_H

#include <stdlib.h> /* strtoul() */
#include <string.h> /* memcpy(), memset(), NULL, size_t */

#ifndef HOXML_DECL
#define HOXML_DECL
#endif /* HOXML_DECL */

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

/***************/
/* Definitions */

/**
 * Error and token codes returned after parsing.
 * Negative values are errors and share their numeric values with the internal HOXML_STATE_ERROR_* parser states.
 * Zero means the document is complete and positive values report a token.
 */
typedef enum {
    HOXML_ERROR_INVALID_INPUT = -9, /**< One or more parameter passed to hoxml was unacceptable. */
    HOXML_ERROR_INTERNAL = -8, /**< There's a bug in hoxml and parsing must halt. */
    HOXML_ERROR_INSUFFICIENT_MEMORY = -7, /**< Initialization or continued parsing require more memory. */
    HOXML_ERROR_UNEXPECTED_EOF = -6, /**< Reached the end of the XML content before the end of the document. */
    HOXML_ERROR_SYNTAX = -5, /**< Syntax error (malformed markup). */
    HOXML_ERROR_ENCODING = -4, /**< Character encoding error. */
    HOXML_ERROR_TAG_MISMATCH = -3, /**< An end tag's name does not match the open tag (e.g. "<a>" then "</b>"). */
    HOXML_ERROR_INVALID_DOCUMENT_TYPE_DECLARATION = -2, /**< <!DOCTYPE> declaration not before the root element. */
    HOXML_ERROR_INVALID_DOCUMENT_DECLARATION = -1, /**< <?xml?> declaration not before the root element. */
    HOXML_END_OF_DOCUMENT = 0, /**< The root element has been closed, parsing is done. */
    HOXML_ELEMENT_BEGIN, /**< A new element/tag began and its name is available. */
    HOXML_ELEMENT_END, /**< An element was closed, by an end tag or as an empty element, and its name and content are
                            available. */
    HOXML_ATTRIBUTE, /**< An attribute's value, its name, and its element are available. */
    HOXML_PROCESSING_INSTRUCTION_BEGIN, /**< A processing instruction began and its target is available. */
    HOXML_PROCESSING_INSTRUCTION_END /**< A processing instruction ended and its content is available. */
} hoxml_code_t;

/**
 * Holds context and state information needed by hoxml. Some of this information is public and holds the data parsed
 * from XML content (element names, attribute names and values, etc.) but some is private and only makes sense to hoxml.
 */
typedef struct {
    /* Public */
    char* tag; /**< Holds the name of the open or just-closed tag, or processing instruction target. */
    char* attribute; /**< Holds the current attribute's name. */
    char* value; /**< Holds the current attribute's value. */
    char* content; /**< Holds the current element's content. This means all character data found, including spaces. */
    int line; /**< The line currently being parsed. Lines are determined by line feeds and carriage returns. */
    int column; /**< The column, on the current line, of the character last parsed. */
    int depth; /**< The nested level of elements. Assigned with the level in which the element was found. */

    /* Private (for internal use) */
    int is_initialized; /* Set to 1, or true, by hoxml_init() and indicates this context is safe to use */
    const char* xml; /* XML content to be parsed */
    size_t xml_length; /* Length of the XML content to parse */
    int encoding; /* Character encoding of the XML content */
    const char* iterator; /* Pointer to the character in the XML content being parsed */
    char* buffer; /* Memory allocated for hoxml to use */
    size_t buffer_length; /* Amount of memory allocated for hoxml */
    char* reference_start; /* Pointer to a location on the stack where a reference entity string (e.g "&lt;") began */
    char* stack; /* Pointer to the current node in the stack-like structure of elements */
    int state; /* Current parsing state, determines which characters are acceptable and when to return */
    int post_state; /* When not "none" this indicates a post-state that has a cleanup step */
    int return_state; /* State to return to after the processing of a comment or reference has finished */
    int error_return_state; /* State to return to after recovering from an error */
    unsigned long stream; /* Holds the current character, whole or partial. May contain bytes from different strings. */
    size_t stream_length; /* Length of the 'stream' variable in bytes */
    unsigned newline_character; /* The character used to increment the 'line' variable, \r or \n */
} hoxml_context_t;

/**
 * Sets up the hoxml context object to begin parsing. Following this, call hoxml_parse() until
 * HOXML_END_OF_DOCUMENT or one of the error values is returned.
 * The context and the whole buffer are zero-filled. If 'context' or 'buffer' is NULL, or 'buffer_length' is zero, the
 * call does nothing and the context is left uninitialized.
 *
 * @param context Pointer to an allocated hoxml context object. This instance will be modified.
 * @param buffer A pointer to some contiguous block of memory for hoxml to use. This will also be modified, frequently.
 * @param buffer_length The length, in bytes, of the buffer handed to hoxml as the 'buffer' parameter.
 */
HOXML_DECL void hoxml_init(hoxml_context_t* context, void* buffer, size_t buffer_length);

/**
 * Instruct hoxml to use a new buffer.
This maintains the current state of parsing meaning that the next call to
 * hoxml_parse() will continue none the wiser.
 * The buffer must have a length greater than the current buffer and both buffers must be allocated at the time this
 * function is called. Once it returns, the original buffer may and should be freed.
 * The call is silently ignored, leaving the context untouched, when 'context' is NULL or uninitialized, 'buffer' is
 * NULL, or 'buffer_length' is not strictly greater than the current buffer's length.
 * If parsing had stopped with HOXML_ERROR_INSUFFICIENT_MEMORY, the parsing state saved when the error occurred is
 * restored so that the next hoxml_parse() call resumes.
 *
 * @param context An initialized hoxml context object.
 * @param buffer A pointer to a new, contiguous block of memory for hoxml to use.
 * @param buffer_length The length, in bytes, of the buffer handed to hoxml as the 'buffer' parameter.
 */
HOXML_DECL void hoxml_realloc(hoxml_context_t* context, void* buffer, size_t buffer_length);

/**
 * Begin or continue parsing the given XML content string.
 * The XML content string does not need to contain the content in its entirety. If hoxml finds a null terminator or
 * parses up to the indicated length of the content, HOXML_ERROR_UNEXPECTED_EOF is returned and parsing will cease.
 * However, this error is recoverable and parsing will continue if the next call to hoxml_parse() passes a new XML
 * content string, using the same pointer or not.
 * HOXML_ERROR_INVALID_INPUT is returned, without touching the context, when 'context' is NULL or uninitialized, 'xml'
 * is NULL, or 'xml_length' is zero.
 *
 * @param context An initialized hoxml context object. This should be treated as read-only until parsing is done.
 * @param xml XML content as an encoded string. Supported character encodings include ASCII, UTF-8, and UTF-16(BE|LE).
 * @param xml_length Length of the XML content in bytes.
 * @return A code indicating what information from the XML content is available or an error.
*/ HOXML_DECL hoxml_code_t hoxml_parse(hoxml_context_t* context, const char* xml, size_t xml_length); #ifdef __cplusplus } #endif /* __cplusplus */ #ifdef HOXML_IMPLEMENTATION /******************/ /* Implementation */ enum { /* Current parser states */ HOXML_STATE_ERROR_INTERNAL = -8, HOXML_STATE_ERROR_INSUFFICIENT_MEMORY = -7, HOXML_STATE_ERROR_UNEXPECTED_EOF = -6, HOXML_STATE_ERROR_SYNTAX = -5, HOXML_STATE_ERROR_ENCODING = -4, HOXML_STATE_ERROR_TAG_MISMATCH = -3, HOXML_STATE_ERROR_INVALID_DOCUMENT_TYPE_DECLARATION = -2, HOXML_STATE_ERROR_INVALID_DOCUMENT_DECLARATION = -1, HOXML_STATE_NONE = 0, HOXML_STATE_UTF8_BOM1, HOXML_STATE_UTF8_BOM2, HOXML_STATE_UTF16BE_BOM, HOXML_STATE_UTF16LE_BOM, HOXML_STATE_TAG_BEGIN, HOXML_STATE_TAG_END, HOXML_STATE_ELEMENT_NAME1, HOXML_STATE_ELEMENT_NAME2, HOXML_STATE_ATTRIBUTE_NAME1, HOXML_STATE_ATTRIBUTE_NAME2, HOXML_STATE_ATTRIBUTE_ASSIGNMENT, HOXML_STATE_ATTRIBUTE_VALUE, HOXML_STATE_OPEN_TAG, HOXML_STATE_COMMENT_CDATA_OR_DTD_BEGIN, HOXML_STATE_COMMENT_BEGIN, HOXML_STATE_COMMENT, HOXML_STATE_COMMENT_END1, HOXML_STATE_COMMENT_END2, HOXML_STATE_CDATA_BEGIN1, HOXML_STATE_CDATA_BEGIN2, HOXML_STATE_CDATA_BEGIN3, HOXML_STATE_CDATA_BEGIN4, HOXML_STATE_CDATA_BEGIN5, HOXML_STATE_CDATA_BEGIN6, HOXML_STATE_CDATA_CONTENT, HOXML_STATE_CDATA_END1, HOXML_STATE_CDATA_END2, HOXML_STATE_REFERENCE_BEGIN, HOXML_STATE_REFERENCE_ENTITY, HOXML_STATE_REFERENCE_NUMERIC, HOXML_STATE_REFERENCE_HEX, HOXML_STATE_PROCESSING_INSTRUCTION_BEGIN, HOXML_STATE_PROCESSING_INSTRUCTION_TARGET1, HOXML_STATE_PROCESSING_INSTRUCTION_TARGET2, HOXML_STATE_PROCESSING_INSTRUCTION_CONTENT, HOXML_STATE_PROCESSING_INSTRUCTION_END, HOXML_STATE_DTD_BEGIN1, HOXML_STATE_DTD_BEGIN2, HOXML_STATE_DTD_BEGIN3, HOXML_STATE_DTD_BEGIN4, HOXML_STATE_DTD_BEGIN5, HOXML_STATE_DTD_BEGIN6, HOXML_STATE_DTD_BEGIN7, HOXML_STATE_DTD_BEGIN8, HOXML_STATE_DTD_NAME, HOXML_STATE_DTD_CONTENT, HOXML_STATE_DTD_OPEN_BRACKET, HOXML_STATE_DONE, /* Post (i.e. 
after) parser states indicating actions to take on the next call to hoxml_parse() */ HOXML_POST_STATE_TAG_END, HOXML_POST_STATE_ATTRIBUTE_END, }; enum { HOXML_FLAG_END_TAG = 1, /* The node is a dedicated end tag (not an empty element) */ HOXML_FLAG_EMPTY_ELEMENT = 2, /* The node is an empty element */ HOXML_FLAG_PROCESSING_INSTRUCTION = 4, /* The node is a processing instruction */ HOXML_FLAG_DOUBLE_QUOTE = 8, /* The value string being parsed was opened with a double quote (") */ HOXML_FLAG_TERMINATED = 16, /* The node's current string (tag, attribute, etc.) is null terminated */ HOXML_FLAG_BEGUN = 32, /* The "element begun" code was already returned for this node */ HOXML_FLAG_INCREMENT_DEPTH = 64, /* Context object's depth value should increase by one next hoxml_parse() */ HOXML_FLAG_DECREMENT_DEPTH = 128 /* Context object's depth value should decrease by one next hoxml_parse() */ }; enum { HOXML_ENC_UNKNOWN = 0, /* The character encoding is unknown. UTF-8 is assumed. */ HOXML_ENC_UTF_8, /* Variable-length encoding (8, 16, 24, or 32 bits) compatible with ASCII */ HOXML_ENC_UTF_16_LE, /* Variable-length encoding (16 or 32 bits), little-endian variant */ HOXML_ENC_UTF_16_BE /* Variable-lenght encoding (16 or 32 bits), big-endian variant */ }; enum { HOXML_CASE_SENSITIVE = 0, /* Cases must match. 'A' == 'a' -> false. */ HOXML_CASE_INSENSITIVE /* Cases need not match. 'A' == 'a' -> true. */ }; enum { HOXML_REF_TYPE_ENTITY = 0, /* Predefined strings representing known, problematic characters (e.g. 
'<') */ HOXML_REF_TYPE_NUMERIC, /* A value of a character given as a decimal number */ HOXML_REF_TYPE_HEX /* A value of a character given as a hexadecimal number */ }; struct _hoxml_node_t; typedef struct _hoxml_node_t { struct _hoxml_node_t* parent; /* Points to the parent node, or NULL if this is the root */ char* end; /* Points to the last byte of this node's data */ int flags; /* May contain any number of the flags defined in hoxml_node_flags */ char tag; /* Where the tag string will be stored in the buffer, must be defined last */ } hoxml_node_t; typedef struct { unsigned encoded; /* Character as it appeared in the content. In other words, the original, encoded character. */ unsigned codepoint; /* Unicode codepoint of the character. In other words, the decoded character. */ size_t bytes; /* Number of eight-bit bytes of the encoded character, in the [1, 4] range */ } hoxml_character_t; #ifndef UINT32_MAX /* Defined in stdint.h with later revisions of C and C++ but not for some earlier ones */ #define UINT32_MAX (0xffffffff) #endif #define HOXML_STACK ((hoxml_node_t*)context->stack) #define HOXML_TO_LOWER(c) (c >= 'A' && c <= 'Z' ? 
c + 32 : c) #define HOXML_IS_NEW_LINE(c) (c == 0x0A || c == 0x0D) #define HOXML_IS_WHITESPACE(c) (c == 0x20 || c == 0x09 || HOXML_IS_NEW_LINE(c)) #define HOXML_IS_ASCII_CHAR(c) (c >= 0x21 && c <= 0x7F) #define HOXML_IS_CHAR_DATA(c) (c != '<' && c != '&') #define HOXML_IS_ALPHA(c) ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) #define HOXML_IS_NUMERIC(c) (c >= '0' && c <= '9') #define HOXML_IS_NAME_START_CHAR(c) (HOXML_IS_ALPHA(c) || c == ':' || c == '_' || (c >= 0xC0 && c <= 0xD6) || \ (c >= 0xD8 && c <= 0xF6) || c >= 0xF8) #define HOXML_IS_NAME_CHAR(c) (HOXML_IS_NAME_START_CHAR(c) || c == '-' || c == '.'|| HOXML_IS_NUMERIC(c)) #define HOXML_IS_HEX_CHAR(c) (HOXML_IS_NUMERIC(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) #define HOXML_IS_VALUE_CHAR_DATA(f, c) (HOXML_IS_CHAR_DATA(c) && ((f & HOXML_FLAG_DOUBLE_QUOTE && c != '"') || \ c != '\'')) void hoxml_push_stack(hoxml_context_t* context); void hoxml_pop_stack(hoxml_context_t* context); void hoxml_append_character(hoxml_context_t* context, hoxml_character_t c); void hoxml_append_terminator(hoxml_context_t* context); void hoxml_end_reference(hoxml_context_t* context, int type); void hoxml_begin_tag(hoxml_context_t* context); hoxml_code_t hoxml_end_tag(hoxml_context_t* context); int hoxml_post_state_cleanup(hoxml_context_t* context); hoxml_character_t hoxml_decode_character(const char* str, size_t str_length, int encoding); hoxml_character_t hoxml_encode_character(unsigned codepoint, int encoding); char* hoxml_to_ascii(const char* str, int encoding); size_t hoxml_strlen(const char* str, int encoding); int hoxml_strcmp(const char* str1, int encoding1, const char* str2, int encoding2, int sensitivity); const char* hoxml_strstr(const char* haystack, int haystack_encoding, const char* needle, int needle_encoding, int sensitivity); #ifdef HOXML_DEBUG #include /* printf() */ #define HOXML_LOG_STATE(s) printf("%s\n", s); #else #define HOXML_LOG_STATE(s) #endif HOXML_DECL void hoxml_init(hoxml_context_t* 
context, void* buffer, size_t buffer_length) { if (context == NULL || buffer == NULL || buffer_length <= 0) return; memset(context, 0, sizeof(hoxml_context_t)); /* Assign all values of the context to zero */ context->buffer = (char*)buffer; /* Use the provided buffer */ context->buffer_length = buffer_length; /* Remember the length of the provided buffer */ context->line = 1; /* This is meant to be human-readable and humans begin counting at one */ context->is_initialized = 1; memset(buffer, 0, buffer_length); /* Fill the buffer with zeroes */ } HOXML_DECL void hoxml_realloc(hoxml_context_t* context, void* buffer, size_t buffer_length) { if (context == NULL || context->is_initialized == 0 || buffer == NULL || buffer_length <= context->buffer_length) return; /* Reassign the end and parent pointers of each node, beginning at the tail and iterate to the head */ hoxml_node_t* node = HOXML_STACK; while (node != NULL) { hoxml_node_t* parent = node->parent; node->end = (char*)buffer + (node->end - context->buffer); if (node->parent != NULL) node->parent = (hoxml_node_t*)((char*)buffer + ((char*)node->parent - context->buffer)); node = parent; } /* Use offsets from the original buffer pointer to reassign pointers such that they now point to the new buffer */ if (context->tag != NULL) context->tag = (char*)buffer + (context->tag - context->buffer); if (context->attribute != NULL) context->attribute = (char*)buffer + (context->attribute - context->buffer); if (context->value != NULL) context->value = (char*)buffer + (context->value - context->buffer); if (context->content != NULL) context->content = (char*)buffer + (context->content - context->buffer); if (context->reference_start != NULL) context->reference_start = (char*)buffer + (context->reference_start - context->buffer); if (context->stack != NULL) context->stack = (char*)buffer + (context->stack - context->buffer); memset(buffer, 0, buffer_length); /* Fill the new buffer with zeroes */ memcpy(buffer, context->buffer, 
context->buffer_length); /* Copy the entire, current buffer to the new buffer */ context->buffer = (char*)buffer; context->buffer_length = buffer_length; if (context->state == HOXML_STATE_ERROR_INSUFFICIENT_MEMORY) { context->state = context->error_return_state; context->error_return_state = HOXML_STATE_NONE; } } HOXML_DECL hoxml_code_t hoxml_parse(hoxml_context_t* context, const char* xml, const size_t xml_length) { if (context == NULL || context->is_initialized == 0 || xml == NULL || xml_length == 0) return HOXML_ERROR_INVALID_INPUT; if (HOXML_STACK != NULL) { if (HOXML_STACK->flags & HOXML_FLAG_INCREMENT_DEPTH) { /* If an element began, increasing nesting */ context->depth += 1; HOXML_STACK->flags &= ~HOXML_FLAG_INCREMENT_DEPTH; /* Clear the flag */ } if (HOXML_STACK->flags & HOXML_FLAG_DECREMENT_DEPTH) { /* If an element ended, decreasing nesting */ context->depth -= 1; HOXML_STACK->flags &= ~HOXML_FLAG_DECREMENT_DEPTH; /* Clear the flag */ } } switch (context->state) { /* Two errors are recoverable: HOXML_ERROR_INSUFFICIENT_MEMORY and HOXML_ERROR_UNEXPECTED_EOF. The former can */ /* be recovered by assigning a new buffer with hoxml_realloc(). The latter can be recovered by passing a new */ /* XML content string to hoxml_parse() so we'll check for one before concluding we're still in error. */ case HOXML_STATE_ERROR_UNEXPECTED_EOF: { /* Try to decode a character, or remainder of a character, at the beginning of this hopefully-new string */ unsigned long stream = context->stream; /* Calculate the number of bytes to copy into the 'stream' variable from the hopefully-new string. We */ /* want 4 bytes, or whatever is available. 
*/ size_t bytes_to_copy = 4; if (bytes_to_copy > xml_length) bytes_to_copy = xml_length; if (context->stream_length > 0) { /* Adjust the number of bytes to copy to account for possible bytes from a previous string */ bytes_to_copy -= context->stream_length; /* Append the new bytes to the previous one(s) */ memcpy((char*)&stream + context->stream_length, xml, bytes_to_copy); } else { /* Copy to the 'stream' under the assumption that all of it can be overwritten */ memcpy(&stream, xml, bytes_to_copy); } hoxml_character_t c = hoxml_decode_character((const char*)&stream, xml_length, context->encoding); /* If the character is the equivalent of a null terminator or there was not enough data */ if (c.codepoint == 0 || c.codepoint == UINT32_MAX) return HOXML_ERROR_UNEXPECTED_EOF; context->state = context->error_return_state; context->error_return_state = HOXML_STATE_NONE; /* Note: there is a check for a change in the input pointer a little further down */ } break; case HOXML_STATE_DONE: return HOXML_END_OF_DOCUMENT; case HOXML_STATE_ERROR_INTERNAL: return HOXML_ERROR_INTERNAL; case HOXML_STATE_ERROR_INSUFFICIENT_MEMORY: return HOXML_ERROR_INSUFFICIENT_MEMORY; case HOXML_STATE_ERROR_SYNTAX: return HOXML_ERROR_SYNTAX; case HOXML_STATE_ERROR_ENCODING: return HOXML_ERROR_ENCODING; case HOXML_STATE_ERROR_TAG_MISMATCH: return HOXML_ERROR_TAG_MISMATCH; case HOXML_STATE_ERROR_INVALID_DOCUMENT_DECLARATION: return HOXML_ERROR_INVALID_DOCUMENT_DECLARATION; } /* A handful of cases leave the context in an intermediary state. This allows the caller to have access to things */ /* like the tag's name, an attribute's value, etc. but that old data may now need to be cleaned up. 
*/ if (hoxml_post_state_cleanup(context)) /* If the cleanup process found the document ended */ return HOXML_END_OF_DOCUMENT; /* If the pointer to the XML content string has changed */ if (context->xml != xml) { /* A few variables are now invalid: the pointer to the content, its length, and the iterator */ context->xml = xml; context->xml_length = xml_length; context->iterator = xml; } /* Remember some context variables in case we hit an unexpected EoF and need to undo an iteration */ const char* previous_iterator = context->iterator; size_t previous_stream_length = context->stream_length; while (context->state >= HOXML_STATE_NONE && context->state <= HOXML_STATE_DONE) { /* About half of the parsing states assume the stack is non-null. */ /* If parsing is currently in one of those states and the stack (head) pointer is null. */ if (((context->state >= HOXML_STATE_TAG_BEGIN && context->state <= HOXML_STATE_OPEN_TAG) || (context->state >= HOXML_STATE_REFERENCE_BEGIN && context->state <= HOXML_STATE_REFERENCE_HEX)) && context->stack == NULL) { /* Some unforseen bug has led us to a state in which continuing would cause an illegal memory access. */ /* Parsing must halt. There is no way to recover. */ context->state = HOXML_STATE_ERROR_INTERNAL; return HOXML_ERROR_INTERNAL; } /* Calculate the number of bytes remaining in the current XML content string */ size_t bytes_remaining = (size_t)(context->xml_length - (context->iterator - context->xml)); /* Calculate the number of bytes to copy into the 'stream' variable. We want 4 bytes, or whatever is left. */ size_t bytes_to_copy = 4; if (bytes_to_copy > bytes_remaining) bytes_to_copy = bytes_remaining; if (context->stream_length > 0) { /* Adjust the number of bytes to copy to account for possible bytes from a previous XML content string. */ /* This will be non-zero in the rare case where content is being given in parts. 
*/ bytes_to_copy -= context->stream_length; /* Append the new bytes to the previous one(s) */ memcpy((char*)&(context->stream) + context->stream_length, context->iterator, bytes_to_copy); } else { /* Copy to the 'stream' under the assumption that all of it can be overwritten */ memcpy(&(context->stream), context->iterator, bytes_to_copy); } hoxml_character_t c = hoxml_decode_character((const char*)&(context->stream), bytes_remaining, context->encoding); /* If the character is the equivalent of a null terminator or there was not enough data to decode the value */ if (c.codepoint == 0 || c.codepoint == UINT32_MAX) { context->stream_length = bytes_to_copy; context->error_return_state = context->state; context->state = HOXML_STATE_ERROR_UNEXPECTED_EOF; return HOXML_ERROR_UNEXPECTED_EOF; } else if (HOXML_IS_NEW_LINE(c.codepoint)) { if (context->newline_character == 0) /* If this is the first newline */ context->newline_character = c.codepoint; /* Remember this as the character to use for increments */ if (c.codepoint == context->newline_character) /* Avoid incrementing twice for files with \r\n endings */ context->line++; context->column = 0; } else context->column++; /* Iterate up to four bytes into the XML content string. The idea is to jump forward by the number of bytes */ /* that were just decoded as a single character. The number of bytes varies from one to four bytes depending */ /* on the character encoding and character's codepoint. We also need to consider the case in which some of */ /* this character's bytes were carried over from a previous XML content string. Those bytes would have been */ /* stashed in the context's 'stream' variable where 'stream_length' tells us the number of said bytes. */ previous_iterator = context->iterator; previous_stream_length = context->stream_length; context->iterator += c.bytes - context->stream_length; context->stream_length = 0; #ifdef HOXML_DEBUG char debugCodepoint = HOXML_IS_NEW_LINE(c.codepoint) ? 
' ' : c.codepoint; printf(" %c [%08X] [L%02dC%02d] -> ", debugCodepoint, c.codepoint, context->line, context->column); #endif switch(context->state) { case HOXML_STATE_NONE: /* The first state immediately following initialization, or a document declaration */ HOXML_LOG_STATE("HOXML_STATE_NONE") if (c.codepoint == '<') hoxml_begin_tag(context); else if (c.encoded == 0xEF) { /* UTF-8 Byte Order Marker (BOM) is [EF] BB BF, as hex bytes */ context->state = HOXML_STATE_UTF8_BOM1; context->column--; /* Don't count this as a column */ } else if (c.encoded == 0xFE) { /* UTF-16BE BOM is [FE] FF, as hex bytes */ context->state = HOXML_STATE_UTF16BE_BOM; context->column--; /* Don't count this as a column */ } else if (c.encoded == 0xFF) { /* UTF-16LE BOM is [FF] FE, as hex bytes */ context->state = HOXML_STATE_UTF16LE_BOM; context->column--; /* Don't count this as a column */ } else if (!HOXML_IS_WHITESPACE(c.codepoint)) context->state = HOXML_STATE_ERROR_SYNTAX; break; case HOXML_STATE_UTF8_BOM1: /* The first byte of a UTF-8 byte order marker was found */ HOXML_LOG_STATE("HOXML_STATE_UTF8_BOM1") context->column--; /* Don't count this as a column */ if (c.encoded == 0xBB) /* UTF-8 BOM is EF [BB] BF, as hex bytes */ context->state = HOXML_STATE_UTF8_BOM2; else context->state = HOXML_STATE_ERROR_SYNTAX; break; case HOXML_STATE_UTF8_BOM2: /* The second byte of a UTF-8 byte order marker was found */ HOXML_LOG_STATE("HOXML_STATE_UTF8_BOM2") context->column--; /* Don't count this as a column */ if (c.encoded == 0xBF) { /* UTF-8 BOM is EF BB [BF], as hex bytes */ context->state = HOXML_STATE_NONE; context->encoding = HOXML_ENC_UTF_8; } else context->state = HOXML_STATE_ERROR_SYNTAX; break; case HOXML_STATE_UTF16BE_BOM: /* The first byte of a UTF-16BE byte order marker was found */ HOXML_LOG_STATE("HOXML_STATE_UTF16BE_BOM") context->column--; /* Don't count this as a column */ if (c.encoded == 0xFF) { /* UTF-16BE BOM is FE [FF], as hex bytes */ context->state = HOXML_STATE_NONE; 
context->encoding = HOXML_ENC_UTF_16_BE; } else context->state = HOXML_STATE_ERROR_SYNTAX; break; case HOXML_STATE_UTF16LE_BOM: /* The first byte of a UTF-16LE byte order marker was found */ HOXML_LOG_STATE("HOXML_STATE_UTF16LE_BOM") context->column--; /* Don't count this as a column */ if (c.encoded == 0xFE) { /* UTF-16LE BOM is FF [FE], as hex bytes */ context->state = HOXML_STATE_NONE; context->encoding = HOXML_ENC_UTF_16_LE; } else context->state = HOXML_STATE_ERROR_SYNTAX; break; case HOXML_STATE_TAG_BEGIN: /* A new tag was started (a '<' was found) and a new node has been pushed */ HOXML_LOG_STATE("HOXML_STATE_TAG_BEGIN") if (c.codepoint == '?') { /* "state = HOXML_STATE_PROCESSING_INSTRUCTION_BEGIN; HOXML_STACK->flags |= HOXML_FLAG_PROCESSING_INSTRUCTION; /* Apply the PI flag to this node */ } else if (c.codepoint == '/') /* "flags |= HOXML_FLAG_END_TAG; /* Apply the end tag flag to this node */ else if (c.codepoint == '!') /* "