diff --git a/fuzz/README.md b/fuzz/README.md new file mode 100644 index 00000000..ea8c7499 --- /dev/null +++ b/fuzz/README.md @@ -0,0 +1,134 @@ +# libyaml fuzzing harnesses + +This directory contains experimental libFuzzer harnesses for several libyaml APIs. + +## Targets + +fuzz_scan.cpp +Targets yaml_parser_scan(). +Exercises the lower-level scanner/tokenization layer. + +fuzz_parse.cpp +Targets yaml_parser_parse(). +Exercises the parser event pipeline and reaches code in reader.c, scanner.c, and parser.c. + +fuzz_load.cpp +Targets yaml_parser_load(). +Extends parser-side exploration into document composition and loader logic, including loader.c. + +fuzz_emit.cpp +Targets yaml_emitter_emit() using simple valid event streams. +This is the initial emitter-side harness. + +fuzz_emit_nested.cpp +Targets yaml_emitter_emit() using nested valid event streams. +This improves emitter-side exploration by generating more structured mappings and sequences. + +fuzz_roundtrip.cpp +Targets a parse → emit round-trip workflow. +The harness parses fuzz input into events using yaml_parser_parse() and then emits those events using yaml_emitter_emit(). +This helps explore parser/emitter interaction and significantly improves emitter coverage. 
+ +## Example build + +Build libyaml first, then compile a harness like: + +clang++ fuzz/fuzz_parse.cpp src/.libs/libyaml.a -I include -fsanitize=fuzzer,address,undefined -g -O1 -fno-omit-frame-pointer -o fuzz_parse + +Other examples: + +clang++ fuzz/fuzz_scan.cpp src/.libs/libyaml.a -I include -fsanitize=fuzzer,address,undefined -g -O1 -fno-omit-frame-pointer -o fuzz_scan +clang++ fuzz/fuzz_load.cpp src/.libs/libyaml.a -I include -fsanitize=fuzzer,address,undefined -g -O1 -fno-omit-frame-pointer -o fuzz_load +clang++ fuzz/fuzz_emit.cpp src/.libs/libyaml.a -I include -fsanitize=fuzzer,address,undefined -g -O1 -fno-omit-frame-pointer -o fuzz_emit +clang++ fuzz/fuzz_emit_nested.cpp src/.libs/libyaml.a -I include -fsanitize=fuzzer,address,undefined -g -O1 -fno-omit-frame-pointer -o fuzz_emit_nested +clang++ fuzz/fuzz_roundtrip.cpp src/.libs/libyaml.a -I include -fsanitize=fuzzer,address,undefined -g -O1 -fno-omit-frame-pointer -o fuzz_roundtrip_parse_emit + +## Example run + +./fuzz_parse corpus_dir + +Other examples: + +./fuzz_scan corpus_dir +./fuzz_load corpus_dir +./fuzz_emit corpus_dir +./fuzz_emit_nested corpus_dir +./fuzz_roundtrip_parse_emit corpus_dir + +## Coverage observations + +Local coverage measurements with llvm-cov showed the following. 
+ +Parser-side coverage + +yaml_parser_parse() +- parser.c: 92.90% line coverage +- reader.c: 94.85% line coverage +- scanner.c: 94.01% line coverage + +Overall parse coverage binary +- total line coverage: 77.07% +- total branch coverage: 72.13% +- total region coverage: 78.25% + +yaml_parser_load() +- loader.c: 89.55% line coverage +- parser.c: 91.85% line coverage +- reader.c: 94.85% line coverage +- scanner.c: 94.01% line coverage + +Overall load coverage binary +- total line coverage: 77.56% +- total branch coverage: 71.80% +- total region coverage: 78.34% + +Emitter-side coverage + +Initial yaml_emitter_emit() harness +- emitter.c: 59.31% line coverage +- writer.c: 35.44% line coverage + +Overall emit coverage binary +- total line coverage: 50.75% +- total branch coverage: 44.94% +- total region coverage: 50.69% + +Nested emitter harness +- emitter.c: 68.65% line coverage +- writer.c: 25.32% line coverage + +Overall nested emitter coverage binary +- total line coverage: 58.43% +- total branch coverage: 52.12% +- total region coverage: 58.67% + +Parse → emit round-trip harness +- emitter.c: 85.35% line coverage +- writer.c: 25.32% line coverage +- parser.c: 92.90% line coverage +- reader.c: 94.85% line coverage +- scanner.c: 94.01% line coverage + +Overall round-trip coverage binary +- total line coverage: 79.64% +- total branch coverage: 71.51% +- total region coverage: 79.55% + +## Corpus + +During local fuzzing runs, seed corpora were generated and minimized using libFuzzer's merge mode. + +These corpora are not included in this repository because they are large and mostly machine-generated. The harnesses are designed to work with any YAML seed corpus. + +## Summary + +The strongest parser-side harnesses are fuzz_parse.cpp and fuzz_load.cpp. +The strongest emitter-side harness is fuzz_roundtrip.cpp. + +In local testing, the round-trip harness substantially improved emitter.c coverage compared with the initial standalone emitter harness. 
+ +## Notes + +During local experimentation, an earlier round-trip harness using a fixed-size emitter output buffer triggered a double-free candidate during cleanup. After replacing that output path with a sink callback, the issue no longer reproduced, so it is not treated as a confirmed libyaml vulnerability. + +These harnesses are intended as a starting point for continued fuzzing and may also be useful for future OSS-Fuzz-style integration. diff --git a/fuzz/fuzz_emit.cpp b/fuzz/fuzz_emit.cpp new file mode 100644 index 00000000..d1933ea6 --- /dev/null +++ b/fuzz/fuzz_emit.cpp @@ -0,0 +1,160 @@ +#include <stddef.h> +#include <stdint.h> + +#include <yaml.h> + +static bool EmitOwned(yaml_emitter_t *emitter, yaml_event_t *event) { + // yaml_emitter_emit() takes ownership of the event whether it succeeds or + // fails: a rejected event is either destroyed inside the call or left in + // the emitter's queue, where yaml_emitter_delete() frees it. Deleting it + // here as well would free its buffers twice, so only report the result. + return yaml_emitter_emit(emitter, event) != 0; +} + +static bool EmitScalar(yaml_emitter_t *emitter, + const uint8_t *data, + size_t size, + yaml_scalar_style_t style) { + yaml_event_t event; + static yaml_char_t tag[] = "tag:yaml.org,2002:str"; + + if (!yaml_scalar_event_initialize( + &event, + nullptr, // anchor + tag, // tag + (yaml_char_t *)data, // value + (int)size, // length + 1, // plain_implicit + 1, // quoted_implicit + style)) { + return false; + } + + return EmitOwned(emitter, &event); +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + yaml_emitter_t emitter; + yaml_event_t event; + + unsigned char output[8192]; + size_t written = 0; + + // Declare these before any goto target crossing. + uint8_t selector = (size > 0) ? data[0] : 0; + const uint8_t *payload = (size > 0) ? data + 1 : data; + size_t payload_size = (size > 0) ? size - 1 : 0; + + if (!yaml_emitter_initialize(&emitter)) { + return 0; + } + + yaml_emitter_set_output_string(&emitter, output, sizeof(output), &written); + yaml_emitter_set_unicode(&emitter, 1); + yaml_emitter_set_encoding(&emitter, YAML_UTF8_ENCODING); + + // 1. 
Stream start + if (!yaml_stream_start_event_initialize(&event, YAML_UTF8_ENCODING) || + !EmitOwned(&emitter, &event)) { + goto done; + } + + // 2. Document start + if (!yaml_document_start_event_initialize(&event, nullptr, nullptr, nullptr, 1) || + !EmitOwned(&emitter, &event)) { + goto done; + } + + yaml_scalar_style_t scalar_style; + switch ((selector >> 2) % 3) { + case 0: + scalar_style = YAML_PLAIN_SCALAR_STYLE; + break; + case 1: + scalar_style = YAML_SINGLE_QUOTED_SCALAR_STYLE; + break; + default: + scalar_style = YAML_DOUBLE_QUOTED_SCALAR_STYLE; + break; + } + + switch (selector % 3) { + case 0: { + // Single scalar document + if (!EmitScalar(&emitter, payload, payload_size, scalar_style)) { + goto done; + } + break; + } + + case 1: { + // Sequence of exactly two scalars: the payload split at its midpoint + yaml_sequence_style_t seq_style = + ((selector >> 4) & 1) ? YAML_FLOW_SEQUENCE_STYLE + : YAML_BLOCK_SEQUENCE_STYLE; + + if (!yaml_sequence_start_event_initialize(&event, nullptr, nullptr, 1, seq_style) || + !EmitOwned(&emitter, &event)) { + goto done; + } + + size_t mid = payload_size / 2; + + if (!EmitScalar(&emitter, payload, mid, scalar_style)) { + goto done; + } + if (!EmitScalar(&emitter, payload + mid, payload_size - mid, scalar_style)) { + goto done; + } + + if (!yaml_sequence_end_event_initialize(&event) || + !EmitOwned(&emitter, &event)) { + goto done; + } + break; + } + + case 2: { + // Mapping with one key/value pair + yaml_mapping_style_t map_style = + ((selector >> 4) & 1) ? 
YAML_FLOW_MAPPING_STYLE + : YAML_BLOCK_MAPPING_STYLE; + + if (!yaml_mapping_start_event_initialize(&event, nullptr, nullptr, 1, map_style) || + !EmitOwned(&emitter, &event)) { + goto done; + } + + size_t mid = payload_size / 2; + + if (!EmitScalar(&emitter, payload, mid, YAML_PLAIN_SCALAR_STYLE)) { + goto done; + } + if (!EmitScalar(&emitter, payload + mid, payload_size - mid, scalar_style)) { + goto done; + } + + if (!yaml_mapping_end_event_initialize(&event) || + !EmitOwned(&emitter, &event)) { + goto done; + } + break; + } + } + + // 3. Document end + if (!yaml_document_end_event_initialize(&event, 1) || + !EmitOwned(&emitter, &event)) { + goto done; + } + + // 4. Stream end + if (!yaml_stream_end_event_initialize(&event) || + !EmitOwned(&emitter, &event)) { + goto done; + } + +done: + yaml_emitter_delete(&emitter); + return 0; +} diff --git a/fuzz/fuzz_emit_nested.cpp b/fuzz/fuzz_emit_nested.cpp new file mode 100644 index 00000000..33f1b73e --- /dev/null +++ b/fuzz/fuzz_emit_nested.cpp @@ -0,0 +1,290 @@ +#include <stddef.h> +#include <stdint.h> + +#include <yaml.h> + +struct FuzzCursor { + const uint8_t *data; + size_t size; + size_t pos; + + uint8_t NextByte() { + if (pos >= size) { + return 0; + } + return data[pos++]; + } + + bool NextBool() { + return (NextByte() & 1) != 0; + } + + size_t NextRange(size_t n) { + if (n == 0) { + return 0; + } + return static_cast<size_t>(NextByte()) % n; + } + + const uint8_t *RemainingData() const { + if (pos >= size) { + return data + size; + } + return data + pos; + } + + size_t RemainingSize() const { + if (pos >= size) { + return 0; + } + return size - pos; + } +}; + +static bool EmitOwned(yaml_emitter_t *emitter, yaml_event_t *event) { + // yaml_emitter_emit() takes ownership of the event whether it succeeds or + // fails: a rejected event is either destroyed inside the call or left in + // the emitter's queue, where yaml_emitter_delete() frees it. Deleting it + // here as well would free its buffers twice, so only report the result. + return yaml_emitter_emit(emitter, event) != 0; +} + +static yaml_scalar_style_t ChooseScalarStyle(FuzzCursor &cur) { + switch (cur.NextRange(4)) { + case 0: + return YAML_PLAIN_SCALAR_STYLE; + case 1: + 
return YAML_SINGLE_QUOTED_SCALAR_STYLE; + case 2: + return YAML_DOUBLE_QUOTED_SCALAR_STYLE; + default: + return YAML_LITERAL_SCALAR_STYLE; + } +} + +static yaml_sequence_style_t ChooseSequenceStyle(FuzzCursor &cur) { + return cur.NextBool() ? YAML_FLOW_SEQUENCE_STYLE : YAML_BLOCK_SEQUENCE_STYLE; +} + +static yaml_mapping_style_t ChooseMappingStyle(FuzzCursor &cur) { + return cur.NextBool() ? YAML_FLOW_MAPPING_STYLE : YAML_BLOCK_MAPPING_STYLE; +} + +static bool EmitAlias(yaml_emitter_t *emitter, yaml_char_t *anchor) { + yaml_event_t event; + if (!yaml_alias_event_initialize(&event, anchor)) { + return false; + } + return EmitOwned(emitter, &event); +} + +static bool EmitScalar(yaml_emitter_t *emitter, + FuzzCursor &cur, + yaml_char_t *anchor, + bool allow_tag) { + yaml_event_t event; + + static yaml_char_t kStrTag[] = "tag:yaml.org,2002:str"; + yaml_char_t *tag = nullptr; + + if (allow_tag && cur.NextBool()) { + tag = kStrTag; + } + + const uint8_t *payload = cur.RemainingData(); + size_t payload_size = cur.RemainingSize(); + + // Pick the slice length from one fuzz byte: values vary, stay at most 255 bytes, and never exceed the remaining input. 
+ size_t len = cur.NextRange(payload_size + 1); + yaml_scalar_style_t style = ChooseScalarStyle(cur); + + if (!yaml_scalar_event_initialize( + &event, + anchor, + tag, + const_cast(reinterpret_cast(payload)), + static_cast(len), + 1, // plain_implicit + 1, // quoted_implicit + style)) { + return false; + } + + return EmitOwned(emitter, &event); +} + +static bool EmitNode(yaml_emitter_t *emitter, + FuzzCursor &cur, + int depth, + yaml_char_t *anchor_a, + yaml_char_t *anchor_b); + +static bool EmitSequence(yaml_emitter_t *emitter, + FuzzCursor &cur, + int depth, + yaml_char_t *anchor, + yaml_char_t *anchor_a, + yaml_char_t *anchor_b) { + yaml_event_t event; + yaml_char_t *tag = nullptr; + + static yaml_char_t kSeqTag[] = "tag:yaml.org,2002:seq"; + if (cur.NextBool()) { + tag = kSeqTag; + } + + if (!yaml_sequence_start_event_initialize( + &event, + anchor, + tag, + 1, + ChooseSequenceStyle(cur))) { + return false; + } + + if (!EmitOwned(emitter, &event)) { + return false; + } + + size_t count = 1 + cur.NextRange(3); // 1..3 items + for (size_t i = 0; i < count; i++) { + if (!EmitNode(emitter, cur, depth + 1, anchor_a, anchor_b)) { + return false; + } + } + + if (!yaml_sequence_end_event_initialize(&event)) { + return false; + } + return EmitOwned(emitter, &event); +} + +static bool EmitMapping(yaml_emitter_t *emitter, + FuzzCursor &cur, + int depth, + yaml_char_t *anchor, + yaml_char_t *anchor_a, + yaml_char_t *anchor_b) { + yaml_event_t event; + yaml_char_t *tag = nullptr; + + static yaml_char_t kMapTag[] = "tag:yaml.org,2002:map"; + if (cur.NextBool()) { + tag = kMapTag; + } + + if (!yaml_mapping_start_event_initialize( + &event, + anchor, + tag, + 1, + ChooseMappingStyle(cur))) { + return false; + } + + if (!EmitOwned(emitter, &event)) { + return false; + } + + size_t pairs = 1 + cur.NextRange(3); // 1..3 pairs + for (size_t i = 0; i < pairs; i++) { + // Keep keys simpler to avoid pathological invalid structures. 
+ if (!EmitScalar(emitter, cur, nullptr, true)) { + return false; + } + if (!EmitNode(emitter, cur, depth + 1, anchor_a, anchor_b)) { + return false; + } + } + + if (!yaml_mapping_end_event_initialize(&event)) { + return false; + } + return EmitOwned(emitter, &event); +} + +static bool EmitNode(yaml_emitter_t *emitter, + FuzzCursor &cur, + int depth, + yaml_char_t *anchor_a, + yaml_char_t *anchor_b) { + // From depth 3 onward only scalars are emitted, which bounds the recursion. + bool force_scalar = depth >= 3; + + // Occasionally emit an alias to one of the two anchor names; nothing here checks that the anchor was attached to an earlier node. + if (!force_scalar && cur.NextRange(10) == 0) { + if (cur.NextBool()) { + return EmitAlias(emitter, anchor_a); + } + return EmitAlias(emitter, anchor_b); + } + + // Occasionally attach an anchor to this node. + yaml_char_t *anchor = nullptr; + if (!force_scalar && cur.NextRange(6) == 0) { + anchor = cur.NextBool() ? anchor_a : anchor_b; + } + + size_t choice = force_scalar ? 0 : cur.NextRange(3); + + switch (choice) { + case 0: + return EmitScalar(emitter, cur, anchor, true); + case 1: + return EmitSequence(emitter, cur, depth, anchor, anchor_a, anchor_b); + case 2: + default: + return EmitMapping(emitter, cur, depth, anchor, anchor_a, anchor_b); + } +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + yaml_emitter_t emitter; + yaml_event_t event; + + unsigned char output[16384]; + size_t written = 0; + + if (!yaml_emitter_initialize(&emitter)) { + return 0; + } + + yaml_emitter_set_output_string(&emitter, output, sizeof(output), &written); + yaml_emitter_set_unicode(&emitter, 1); + yaml_emitter_set_encoding(&emitter, YAML_UTF8_ENCODING); + + FuzzCursor cur{data, size, 0}; + + static yaml_char_t anchor_a[] = "a1"; + static yaml_char_t anchor_b[] = "b2"; + + // Stream start + if (!yaml_stream_start_event_initialize(&event, YAML_UTF8_ENCODING) || + !EmitOwned(&emitter, &event)) { + goto done; + } + + // Single document + if (!yaml_document_start_event_initialize(&event, nullptr, 
nullptr, nullptr, 1) || + !EmitOwned(&emitter, &event)) { + goto done; + } + + if (!EmitNode(&emitter, cur, 0, anchor_a, anchor_b)) { + goto done; + } + + if (!yaml_document_end_event_initialize(&event, 1) || + !EmitOwned(&emitter, &event)) { + goto done; + } + + if (!yaml_stream_end_event_initialize(&event) || + !EmitOwned(&emitter, &event)) { + goto done; + } + +done: + yaml_emitter_delete(&emitter); + return 0; +} diff --git a/fuzz/fuzz_load.cpp b/fuzz/fuzz_load.cpp new file mode 100644 index 00000000..abaa2bd6 --- /dev/null +++ b/fuzz/fuzz_load.cpp @@ -0,0 +1,28 @@ +#include <stddef.h> +#include <stdint.h> + +#include <yaml.h> + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + yaml_parser_t parser; + yaml_document_t document; + + if (!yaml_parser_initialize(&parser)) { + return 0; + } + + yaml_parser_set_input_string(&parser, data, size); + + // Load every document in the stream, not just the first one. A document + // without a root node signals that the end of the stream was reached. + while (yaml_parser_load(&parser, &document)) { + bool at_end = (yaml_document_get_root_node(&document) == nullptr); + yaml_document_delete(&document); + if (at_end) { + break; + } + } + + yaml_parser_delete(&parser); + return 0; +} diff --git a/fuzz/fuzz_parse.cpp b/fuzz/fuzz_parse.cpp new file mode 100644 index 00000000..d8ee7ec6 --- /dev/null +++ b/fuzz/fuzz_parse.cpp @@ -0,0 +1,31 @@ +#include <stddef.h> +#include <stdint.h> + +#include <yaml.h> + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + yaml_parser_t parser; + yaml_event_t event; + + if (!yaml_parser_initialize(&parser)) { + return 0; + } + + yaml_parser_set_input_string(&parser, data, size); + + while (true) { + if (!yaml_parser_parse(&parser, &event)) { + break; + } + + yaml_event_type_t type = event.type; + yaml_event_delete(&event); + + if (type == YAML_STREAM_END_EVENT) { + break; + } + } + + yaml_parser_delete(&parser); + return 0; +} diff --git a/fuzz/fuzz_roundtrip.cpp b/fuzz/fuzz_roundtrip.cpp new file mode 100644 index 00000000..f17080e3 --- /dev/null +++ b/fuzz/fuzz_roundtrip.cpp @@ -0,0 +1,52 @@ +#include <stddef.h> +#include <stdint.h> + +#include <yaml.h> + +static int DiscardWriter(void *data, unsigned char *buffer, size_t size) { + (void)data; + (void)buffer; + (void)size; + 
return 1; // pretend all output was written successfully +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + yaml_parser_t parser; + yaml_emitter_t emitter; + yaml_event_t event; + + if (!yaml_parser_initialize(&parser)) { + return 0; + } + + if (!yaml_emitter_initialize(&emitter)) { + yaml_parser_delete(&parser); + return 0; + } + + yaml_parser_set_input_string(&parser, data, size); + + yaml_emitter_set_output(&emitter, DiscardWriter, nullptr); + yaml_emitter_set_unicode(&emitter, 1); + yaml_emitter_set_encoding(&emitter, YAML_UTF8_ENCODING); + + while (true) { + if (!yaml_parser_parse(&parser, &event)) { + break; + } + + yaml_event_type_t type = event.type; + + if (!yaml_emitter_emit(&emitter, &event)) { + break; // the emitter owns the event even on failure, so it must not be deleted here + } + + if (type == YAML_STREAM_END_EVENT) { + break; + } + } + + yaml_emitter_delete(&emitter); + yaml_parser_delete(&parser); + return 0; +} diff --git a/fuzz/fuzz_scan.cpp b/fuzz/fuzz_scan.cpp new file mode 100644 index 00000000..e04d737c --- /dev/null +++ b/fuzz/fuzz_scan.cpp @@ -0,0 +1,31 @@ +#include <stddef.h> +#include <stdint.h> + +#include <yaml.h> + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + yaml_parser_t parser; + yaml_token_t token; + + if (!yaml_parser_initialize(&parser)) { + return 0; + } + + yaml_parser_set_input_string(&parser, data, size); + + while (true) { + if (!yaml_parser_scan(&parser, &token)) { + break; + } + + yaml_token_type_t type = token.type; + yaml_token_delete(&token); + + if (type == YAML_STREAM_END_TOKEN) { + break; + } + } + + yaml_parser_delete(&parser); + return 0; +} diff --git a/fuzz/roundtrip_edgecase.yaml b/fuzz/roundtrip_edgecase.yaml new file mode 100644 index 00000000..37cd9a48 --- /dev/null +++ b/fuzz/roundtrip_edgecase.yaml @@ -0,0 +1 @@ +[[[[[?[[?[[?]]:[[?[[?[?]]]]:[[[[[[[[[?p]]]:[[?[?]]:[[?]]]]:[[?[[[[[?[?]]:[[?[[?