/*
 * File: libxml/tests/tests_HTMLparser_htmlValidateUtf8.c
 * Provenance: uploaded by AryaWu using huggingface_hub (revision 6baed57, verified).
 */
#include "unity/unity.h"
#include <libxml/HTMLparser.h>
#include <libxml/parserInternals.h> /* for xmlNewInputStream, xmlPushInput */
#include <libxml/parser.h> /* for xmlInitParser, xmlCleanupParser */
#include <stdlib.h>
#include <string.h>
/*
 * Wrapper provided in the module to access the static function
 * htmlValidateUtf8 from this test file.
 *
 * ctxt    - parser context; the tests below inspect ctxt->input->flags
 *           after each call.
 * str     - bytes to validate (the tests pass a single UTF-8 sequence).
 * len     - number of bytes available at str.
 * partial - the tests pass 1 when an incomplete trailing sequence should be
 *           tolerated and 0 otherwise.
 *
 * As exercised by the tests in this file, the expected results are: the byte
 * length of the sequence when it is valid, -1 (with a flag set on the input)
 * when it is invalid or incomplete with partial == 0, and 0 (no flag change)
 * when it is incomplete with partial == 1.
 * NOTE(review): this contract is taken from the test expectations only; the
 * implementation is not visible here -- confirm against HTMLparser.c.
 */
int test_htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len, int partial);
/*
 * Helpers to create and free a clean parser context with a valid input.
 *
 * make_ctxt() allocates an HTML parser context, creates a new input stream
 * for it, pushes that stream onto the context and resets the input's flags
 * to 0 so every test starts from a predictable state.
 *
 * Returns the context viewed as an xmlParserCtxtPtr; callers release it with
 * free_ctxt(). If any setup step fails, the context is freed first and the
 * current Unity test is then failed, so a broken setup does not leak the
 * context.
 */
static xmlParserCtxtPtr make_ctxt(void) {
    htmlParserCtxtPtr hctxt = htmlNewParserCtxt();
    TEST_ASSERT_NOT_NULL(hctxt);

    xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr)hctxt;
    xmlParserInputPtr input = xmlNewInputStream(ctxt);
    if (input == NULL) {
        /* Unity aborts the test on failure, so clean up before reporting. */
        htmlFreeParserCtxt(hctxt);
        TEST_FAIL_MESSAGE("xmlNewInputStream returned NULL");
    }

    /*
     * xmlPushInput reports failure with a negative value. The input is not
     * freed here on failure: libxml2 is expected to release it itself
     * (NOTE(review): confirm for the libxml2 version in use).
     */
    if (xmlPushInput(ctxt, input) < 0 || ctxt->input == NULL) {
        htmlFreeParserCtxt(hctxt);
        TEST_FAIL_MESSAGE("xmlPushInput failed to install the input stream");
    }

    /* Ensure flags start at 0 for predictable testing */
    ctxt->input->flags = 0;
    TEST_ASSERT_EQUAL_UINT(0u, ctxt->input->flags);
    return ctxt;
}
/*
 * Release a context obtained from make_ctxt().
 * A NULL pointer is accepted and ignored.
 */
static void free_ctxt(xmlParserCtxtPtr ctxt) {
    if (ctxt == NULL) {
        return;
    }
    htmlFreeParserCtxt((htmlParserCtxtPtr)ctxt);
}
/*
 * Unity per-test setup hook.
 * Intentionally empty: each test builds its own context via make_ctxt().
 */
void setUp(void)
{
}
/*
 * Unity per-test teardown hook.
 * Intentionally empty: each test frees its own context via free_ctxt().
 */
void tearDown(void)
{
}
/* ---------- Tests ---------- */
/*
 * A well-formed 2-byte sequence (U+00A2) is expected to yield 2 and to leave
 * the input flags at 0.
 */
void test_htmlValidateUtf8_valid_2byte_returns_size_and_no_error_flag(void) {
    const unsigned char cent_sign[] = { 0xC2, 0xA2 }; /* U+00A2 */
    xmlParserCtxtPtr parser = make_ctxt();

    int consumed = test_htmlValidateUtf8(parser, (const xmlChar *)cent_sign,
                                         sizeof cent_sign, 0);

    TEST_ASSERT_EQUAL_INT(2, consumed);
    TEST_ASSERT_EQUAL_UINT(0u, parser->input->flags);

    free_ctxt(parser);
}
/*
 * A lead byte below 0xC2 (here 0xC1) is expected to be rejected with -1 and
 * to change the input flags relative to their value before the call.
 */
void test_htmlValidateUtf8_invalid_start_byte_below_C2_sets_flag_and_returns_minus1(void) {
    /* Overlong lead byte -> invalid */
    const unsigned char bad_lead[] = { 0xC1, 0x80 };
    xmlParserCtxtPtr parser = make_ctxt();
    unsigned int flags_before = parser->input->flags;

    int status = test_htmlValidateUtf8(parser, (const xmlChar *)bad_lead,
                                       sizeof bad_lead, 0);

    TEST_ASSERT_EQUAL_INT(-1, status);
    TEST_ASSERT_TRUE(parser->input->flags != flags_before);

    free_ctxt(parser);
}
/*
 * A 2-byte lead followed by a byte that is not of the form 10xxxxxx is
 * expected to return -1 and leave the input flags non-zero.
 */
void test_htmlValidateUtf8_invalid_2byte_continuation_sets_flag(void) {
    /* Bad continuation (not 10xxxxxx) */
    const unsigned char bad_tail[] = { 0xC2, 0x20 };
    xmlParserCtxtPtr parser = make_ctxt();

    int status = test_htmlValidateUtf8(parser, (const xmlChar *)bad_tail,
                                       sizeof bad_tail, 0);

    TEST_ASSERT_EQUAL_INT(-1, status);
    TEST_ASSERT_NOT_EQUAL(0u, parser->input->flags);

    free_ctxt(parser);
}
/*
 * With partial == 1, a lone 2-byte lead is expected to return 0 and leave
 * the input flags at 0.
 */
void test_htmlValidateUtf8_incomplete_2byte_partial_returns_zero_no_flag(void) {
    const unsigned char lone_lead[] = { 0xC2 }; /* Incomplete */
    xmlParserCtxtPtr parser = make_ctxt();

    int status = test_htmlValidateUtf8(parser, (const xmlChar *)lone_lead,
                                       sizeof lone_lead, 1);

    TEST_ASSERT_EQUAL_INT(0, status);
    TEST_ASSERT_EQUAL_UINT(0u, parser->input->flags);

    free_ctxt(parser);
}
/*
 * With partial == 0, a lone 2-byte lead is expected to return -1 and leave
 * the input flags non-zero.
 */
void test_htmlValidateUtf8_incomplete_2byte_nonpartial_sets_flag_and_minus1(void) {
    const unsigned char lone_lead[] = { 0xC2 }; /* Incomplete */
    xmlParserCtxtPtr parser = make_ctxt();

    int status = test_htmlValidateUtf8(parser, (const xmlChar *)lone_lead,
                                       sizeof lone_lead, 0);

    TEST_ASSERT_EQUAL_INT(-1, status);
    TEST_ASSERT_NOT_EQUAL(0u, parser->input->flags);

    free_ctxt(parser);
}
/*
 * Three well-formed 3-byte sequences are validated in order on the same
 * context; each is expected to yield 3 and leave the input flags at 0.
 */
void test_htmlValidateUtf8_valid_3byte_general_and_minimum(void) {
    static const unsigned char valid_seqs[3][3] = {
        { 0xE2, 0x82, 0xAC }, /* U+20AC */
        { 0xE0, 0xA0, 0x80 }, /* U+0800 minimal 3-byte */
        /* 0xED with first cont < 0xA0 should be valid (below surrogates) */
        { 0xED, 0x9F, 0xBF }  /* U+D7FF */
    };
    xmlParserCtxtPtr parser = make_ctxt();

    for (size_t i = 0; i < sizeof valid_seqs / sizeof valid_seqs[0]; i++) {
        int consumed = test_htmlValidateUtf8(parser,
                                             (const xmlChar *)valid_seqs[i],
                                             sizeof valid_seqs[i], 0);
        TEST_ASSERT_EQUAL_INT(3, consumed);
        TEST_ASSERT_EQUAL_UINT(0u, parser->input->flags);
    }

    free_ctxt(parser);
}
/*
 * Two malformed 3-byte sequences, each on a fresh context, are expected to
 * return -1 and leave the input flags non-zero.
 */
void test_htmlValidateUtf8_invalid_3byte_overlong_and_surrogate(void) {
    const unsigned char overlong_e0[] = { 0xE0, 0x80, 0x80 };   /* Overlong */
    const unsigned char surrogate_ed[] = { 0xED, 0xA0, 0x80 };  /* U+D800 -> invalid */
    xmlParserCtxtPtr parser;
    int status;

    /* Overlong: E0 80 80 */
    parser = make_ctxt();
    status = test_htmlValidateUtf8(parser, (const xmlChar *)overlong_e0,
                                   sizeof overlong_e0, 0);
    TEST_ASSERT_EQUAL_INT(-1, status);
    TEST_ASSERT_NOT_EQUAL(0u, parser->input->flags);
    free_ctxt(parser);

    /* Surrogate range: ED A0 80 */
    parser = make_ctxt();
    status = test_htmlValidateUtf8(parser, (const xmlChar *)surrogate_ed,
                                   sizeof surrogate_ed, 0);
    TEST_ASSERT_EQUAL_INT(-1, status);
    TEST_ASSERT_NOT_EQUAL(0u, parser->input->flags);
    free_ctxt(parser);
}
/*
 * The lowest and highest 4-byte sequences used here (U+10000 and U+10FFFF)
 * are validated in order on the same context; each is expected to yield 4
 * and leave the input flags at 0.
 */
void test_htmlValidateUtf8_valid_4byte_boundaries(void) {
    static const unsigned char boundary_seqs[2][4] = {
        { 0xF0, 0x90, 0x80, 0x80 }, /* U+10000 */
        { 0xF4, 0x8F, 0xBF, 0xBF }  /* U+10FFFF */
    };
    xmlParserCtxtPtr parser = make_ctxt();

    for (size_t i = 0; i < sizeof boundary_seqs / sizeof boundary_seqs[0]; i++) {
        int consumed = test_htmlValidateUtf8(parser,
                                             (const xmlChar *)boundary_seqs[i],
                                             sizeof boundary_seqs[i], 0);
        TEST_ASSERT_EQUAL_INT(4, consumed);
        TEST_ASSERT_EQUAL_UINT(0u, parser->input->flags);
    }

    free_ctxt(parser);
}
/*
 * Two malformed 4-byte sequences, each checked on its own fresh context, are
 * expected to return -1 and leave the input flags non-zero.
 */
void test_htmlValidateUtf8_invalid_4byte_overlong_and_above_max(void) {
    static const unsigned char rejected_seqs[2][4] = {
        /* Overlong for F0: second byte too low */
        { 0xF0, 0x80, 0x80, 0x80 },
        /* Above Unicode max: F4 90 80 80 */
        { 0xF4, 0x90, 0x80, 0x80 }
    };

    for (size_t i = 0; i < sizeof rejected_seqs / sizeof rejected_seqs[0]; i++) {
        xmlParserCtxtPtr parser = make_ctxt();
        int status = test_htmlValidateUtf8(parser,
                                           (const xmlChar *)rejected_seqs[i],
                                           sizeof rejected_seqs[i], 0);
        TEST_ASSERT_EQUAL_INT(-1, status);
        TEST_ASSERT_NOT_EQUAL(0u, parser->input->flags);
        free_ctxt(parser);
    }
}
/*
 * Truncated 3- and 4-byte sequences, each on a fresh context:
 *   - partial == 1 is expected to return 0 and leave the flags at 0;
 *   - partial == 0 is expected to return -1 and leave the flags non-zero.
 */
void test_htmlValidateUtf8_incomplete_3_and_4_byte_partial_behavior(void) {
    const unsigned char short3[] = { 0xE2, 0x82 };       /* Missing 3rd byte */
    const unsigned char short4[] = { 0xF0, 0x90, 0x80 }; /* Missing 4th byte */
    xmlParserCtxtPtr parser;
    int status;

    /* Incomplete 3-byte, partial=1 -> 0, no flag */
    parser = make_ctxt();
    status = test_htmlValidateUtf8(parser, (const xmlChar *)short3,
                                   sizeof short3, 1);
    TEST_ASSERT_EQUAL_INT(0, status);
    TEST_ASSERT_EQUAL_UINT(0u, parser->input->flags);
    free_ctxt(parser);

    /* Incomplete 4-byte, partial=1 -> 0, no flag */
    parser = make_ctxt();
    status = test_htmlValidateUtf8(parser, (const xmlChar *)short4,
                                   sizeof short4, 1);
    TEST_ASSERT_EQUAL_INT(0, status);
    TEST_ASSERT_EQUAL_UINT(0u, parser->input->flags);
    free_ctxt(parser);

    /* Incomplete 4-byte, partial=0 -> -1, sets flag */
    parser = make_ctxt();
    status = test_htmlValidateUtf8(parser, (const xmlChar *)short4,
                                   sizeof short4, 0);
    TEST_ASSERT_EQUAL_INT(-1, status);
    TEST_ASSERT_NOT_EQUAL(0u, parser->input->flags);
    free_ctxt(parser);
}
/*
 * Two different invalid sequences are validated on the same context. Both
 * calls are expected to return -1, and the input flags observed after the
 * second call must equal those observed after the first.
 */
void test_htmlValidateUtf8_error_flag_only_set_once_per_context(void) {
    const unsigned char first_bad[] = { 0xC1, 0x80 };        /* invalid */
    const unsigned char second_bad[] = { 0xE0, 0x80, 0x80 }; /* invalid */
    xmlParserCtxtPtr parser = make_ctxt();

    int first_status = test_htmlValidateUtf8(parser, (const xmlChar *)first_bad,
                                             sizeof first_bad, 0);
    TEST_ASSERT_EQUAL_INT(-1, first_status);
    unsigned int flags_first = parser->input->flags;
    TEST_ASSERT_NOT_EQUAL(0u, flags_first);

    int second_status = test_htmlValidateUtf8(parser, (const xmlChar *)second_bad,
                                              sizeof second_bad, 0);
    TEST_ASSERT_EQUAL_INT(-1, second_status);
    unsigned int flags_second = parser->input->flags;

    /* Should not clear or change the encoding error flag after the first error */
    TEST_ASSERT_EQUAL_UINT(flags_first, flags_second);

    free_ctxt(parser);
}
/*
 * Test runner entry point: initialises the libxml2 parser, runs every test
 * in this file under Unity, cleans up the parser and returns the value
 * produced by UNITY_END().
 */
int main(void) {
    int unity_result;

    xmlInitParser();
    UNITY_BEGIN();

    /* 2-byte sequences */
    RUN_TEST(test_htmlValidateUtf8_valid_2byte_returns_size_and_no_error_flag);
    RUN_TEST(test_htmlValidateUtf8_invalid_start_byte_below_C2_sets_flag_and_returns_minus1);
    RUN_TEST(test_htmlValidateUtf8_invalid_2byte_continuation_sets_flag);
    RUN_TEST(test_htmlValidateUtf8_incomplete_2byte_partial_returns_zero_no_flag);
    RUN_TEST(test_htmlValidateUtf8_incomplete_2byte_nonpartial_sets_flag_and_minus1);

    /* 3-byte sequences */
    RUN_TEST(test_htmlValidateUtf8_valid_3byte_general_and_minimum);
    RUN_TEST(test_htmlValidateUtf8_invalid_3byte_overlong_and_surrogate);

    /* 4-byte sequences */
    RUN_TEST(test_htmlValidateUtf8_valid_4byte_boundaries);
    RUN_TEST(test_htmlValidateUtf8_invalid_4byte_overlong_and_above_max);

    /* Truncated input and flag persistence */
    RUN_TEST(test_htmlValidateUtf8_incomplete_3_and_4_byte_partial_behavior);
    RUN_TEST(test_htmlValidateUtf8_error_flag_only_set_once_per_context);

    unity_result = UNITY_END();
    xmlCleanupParser();
    return unity_result;
}