feat(core): Implement XML tokenization with the yxml parser

This commit is contained in:
Julius Pfrommer 2025-01-02 11:35:18 +01:00 committed by Julius Pfrommer
parent 329c4e0fb8
commit ffd5330245
2 changed files with 150 additions and 2 deletions

View File

@ -12,6 +12,7 @@
#include "../deps/base64.h"
#include "../deps/libc_time.h"
#include "../deps/dtoa.h"
#include "../deps/yxml.h"
#include <libxml/parser.h>
@ -31,6 +32,117 @@
# define NAN ((UA_Double)(INFINITY-INFINITY))
#endif
/* Replicate yxml_isNameStart and yxml_isName from yxml */
static UA_String
backtrackName(const char *xml, unsigned end) {
unsigned pos = end;
for(; pos > 0; pos--) {
unsigned char c = (unsigned char)xml[pos-1];
if(c >= 'a' && c <= 'z') continue; /* isAlpha */
if(c >= 'A' && c <= 'Z') continue; /* isAlpha */
if(c >= '0' && c <= '9') continue; /* isNum */
if(c == ':' || c == '_' || c >= 128 || c == '-'|| c == '.') continue;
break;
}
UA_String s = {end - pos, (UA_Byte*)(uintptr_t)xml + pos};
return s;
}
/* Tokenize XML input with the yxml parser.
 *
 * The input is fed to yxml byte by byte. Every element start and every
 * attribute start produces one xml_token. The first max_tokens tokens are
 * written to the tokens array. Further tokens are only counted (they are
 * parked in a small per-nesting-level scratch array so that the bookkeeping
 * keeps working). Names and content are not copied, they point into xml.
 *
 * @param xml        The XML input (does not need to be NUL-terminated)
 * @param len        Number of bytes in xml
 * @param tokens     Output array for the tokens
 * @param max_tokens Number of entries available in tokens
 * @return The result. num_tokens is the number of tokens found in the input,
 *         which can be larger than max_tokens. In that case the error is
 *         XML_ERROR_OVERFLOW and the caller can retry with a buffer of
 *         num_tokens entries. On the error exit path error_pos holds the input
 *         position where tokenization stopped and num_tokens is left at 0. */
xml_result
xml_tokenize(const char *xml, unsigned int len,
             xml_token *tokens, unsigned int max_tokens) {
    /* Maximum nesting depth (elements + the attribute currently parsed) */
    enum { XML_TOKENIZE_MAX_DEPTH = 32 };

    xml_result res;
    memset(&res, 0, sizeof(xml_result));
    res.tokens = tokens;

    yxml_t ctx;
    char buf[512];
    yxml_init(&ctx, buf, sizeof(buf));

    unsigned char top = 0;
    unsigned tokenPos = 0;
    xml_token *stack[XML_TOKENIZE_MAX_DEPTH];
    xml_token backup_tokens[XML_TOKENIZE_MAX_DEPTH]; /* Used when the tokens run out */

    /* Level 0 is a dummy root that collects the counters of the top-level
     * element. It is never written to the output array. */
    stack[top] = &backup_tokens[top];
    memset(stack[top], 0, sizeof(xml_token));

    unsigned val_begin = 0; /* Start of the current content/value, 0 = none */
    unsigned pos = 0;
    for(; pos < len; pos++) {
        yxml_ret_t status = yxml_parse(&ctx, xml[pos]);
        switch(status) {
        case YXML_EEOF:
        case YXML_EREF:
        case YXML_ECLOSE:
        case YXML_ESTACK:
        case YXML_ESYN:
        default:
            goto errout;

        case YXML_OK:
            continue;

        case YXML_ELEMSTART:
        case YXML_ATTRSTART: {
            /* Account for the new token in its parent */
            if(status == YXML_ELEMSTART)
                stack[top]->children++;
            else
                stack[top]->attributes++;

            top++;
            if(top >= XML_TOKENIZE_MAX_DEPTH)
                goto errout; /* nesting too deep */

            /* Use the output array while there is space left */
            stack[top] = (tokenPos < max_tokens) ?
                &tokens[tokenPos] : &backup_tokens[top];
            memset(stack[top], 0, sizeof(xml_token));
            stack[top]->type = (status == YXML_ELEMSTART) ?
                XML_TOKEN_ELEMENT : XML_TOKEN_ATTRIBUTE;

            /* The event is reported on the character after the name */
            stack[top]->name = backtrackName(xml, pos);

            /* An element starts at its opening '<'. Never walk in front of
             * the input buffer. */
            const char *start = xml + pos;
            if(status == YXML_ELEMSTART) {
                while(start > xml && *start != '<')
                    start--;
            }
            stack[top]->start = (unsigned)(start - xml);
            tokenPos++;
            break;
        }

        case YXML_CONTENT:
        case YXML_ATTRVAL:
            if(val_begin == 0)
                val_begin = pos;
            stack[top]->end = pos; /* Last content character seen so far */
            break;

        case YXML_ELEMEND:
        case YXML_ATTREND:
            /* Closing the dummy root would wrap the stack index around */
            if(top == 0) {
                res.error = XML_ERROR_INVALID;
                goto errout;
            }

            if(val_begin > 0) {
                stack[top]->content.data = (UA_Byte*)(uintptr_t)xml + val_begin;
                stack[top]->content.length = stack[top]->end + 1 - val_begin;
            }

            stack[top]->end = pos;
            if(status == YXML_ELEMEND) {
                /* The end of an element can be reported before the closing
                 * '>' (e.g. on whitespace in "</a >"). Forward to the '>'
                 * but stay inside the input buffer. */
                unsigned end = pos;
                while(end < len && xml[end] != '>')
                    end++;
                if(end >= len) {
                    res.error = XML_ERROR_INVALID;
                    goto errout;
                }
                stack[top]->end = end + 1; /* Position after the '>' */
            }

            val_begin = 0;
            top--;
            break;

        case YXML_PISTART:
        case YXML_PICONTENT:
        case YXML_PIEND:
            continue; /* Ignore processing instructions */
        }
    }

    /* Only report an overflow if tokens were actually dropped. An exactly
     * filled buffer is complete, so a retry with num_tokens entries succeeds. */
    res.num_tokens = tokenPos;
    if(tokenPos > max_tokens)
        res.error = XML_ERROR_OVERFLOW;
    return res;

 errout:
    res.error_pos = pos;
    /* NOTE(review): for errors reported by the parser the error code is only
     * set when yxml_eof() also fails. This is kept from the original
     * implementation -- confirm whether the error should be unconditional. */
    if(yxml_eof(&ctx) != YXML_OK)
        res.error = XML_ERROR_INVALID;
    return res;
}
/* Map for decoding a XML complex object type. An array of this is passed to the
* decodeXmlFields function. If the xml element with name "fieldName" is found
* in the xml complex object (mark as found) decode the value with the "function"
@ -535,12 +647,12 @@ ENCODE_XML(ExtensionObject) {
static status
Array_encodeXml(CtxXml *ctx, const void *ptr, size_t length,
const UA_DataType *type) {
char* arrName[128];
char arrName[128];
size_t arrNameLen = strlen("ListOf") + strlen(type->typeName);
if(arrNameLen >= 128)
return UA_STATUSCODE_BADENCODINGERROR;
memcpy(arrName, "ListOf", strlen("ListOf"));
memcpy(arrName + strlen("ListOf"), type-typeName, strlen(type->typeName));
memcpy(arrName + strlen("ListOf"), type->typeName, strlen(type->typeName));
arrName[arrNameLen] = '\0';
status ret = writeXmlElemNameBegin(ctx, arrName);

View File

@ -14,6 +14,42 @@ _UA_BEGIN_DECLS
#define UA_XML_MAXMEMBERSCOUNT 256
#define UA_XML_ENCODING_MAX_RECURSION 100
/* XML input gets parsed into a sequence of tokens first.
* Processing instructions, etc. get ignored. */
/* Kind of a token produced by xml_tokenize. One token is created per element
 * start and per attribute start reported by the parser. */
typedef enum {
/* An XML element */
XML_TOKEN_ELEMENT = 0,
/* An attribute of an XML element */
XML_TOKEN_ATTRIBUTE,
} xml_token_type;
/* A single token (element or attribute) found by xml_tokenize. The name and
 * content strings are not copied, they point into the tokenized xml input. */
typedef struct {
xml_token_type type;
// Element or attribute name
UA_String name;
// Element text content or attribute value. Left empty when the parser
// reported no content/value for the token.
UA_String content;
unsigned attributes; // For elements only: the number of attributes
unsigned children; // For elements only: the number of child elements
// Elements: position of the opening '<' in the xml. Attributes: position
// at which the parser reported the start of the attribute.
unsigned start; // First character of the token in the xml
// Elements: position after the closing '>'. Attributes: position at which
// the parser reported the end of the attribute.
unsigned end; // Position after the token ends
} xml_token;
/* Error status reported in the xml_result returned by xml_tokenize */
typedef enum {
// Tokenization finished without an error
XML_ERROR_NONE = 0,
XML_ERROR_INVALID, // Invalid character/syntax
XML_ERROR_OVERFLOW // Token buffer overflow
} xml_error_code;
/* Return value of xml_tokenize */
typedef struct {
// XML_ERROR_NONE on success
xml_error_code error;
// Position in the xml input where tokenization stopped. Only set when
// tokenization was aborted, otherwise zero.
unsigned int error_pos;
// Number of tokens found in the input. Can be larger than the size of
// the token buffer, the surplus tokens are counted but not stored.
unsigned int num_tokens;
// The token buffer that was passed to xml_tokenize
const xml_token *tokens;
} xml_result;
/* Parse XML input into a token sequence.
 *
 * xml/len:     The XML input and its length in bytes.
 * tokens:      Caller-provided array that receives the tokens. The token
 *              strings point into xml, so xml has to outlive the tokens.
 * max_tokens:  Number of entries available in the tokens array.
 *
 * Returns the tokenization result. If the tokens do not fit into the array,
 * the error is XML_ERROR_OVERFLOW and num_tokens holds the number of tokens
 * that were counted in the input. */
xml_result
xml_tokenize(const char *xml, unsigned int len,
xml_token *tokens, unsigned int max_tokens);
/* XML schema type definitions */
typedef struct {
const char* xmlEncTypeDef;