From 92c3e76f4b068279d97ba4b3289a185e0eebdffe Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Sat, 23 Mar 2019 20:49:10 -0500 Subject: [PATCH 01/18] breaking: big unicode overhaul Allow utf-8 in character literals Char Unicode escape syntax. Validate zig as UTF-8. overhaul std.unicode Fully implemented in stage2. Closes: #2097 Closes: #2129 --- About the UTF-8 validation in stage1: This implementation is quite slow, but the state automaton it claims to represent is correct, and it has two features faster validators don't that would make the code in stage1 more complicated: * They don't provide the code point * They don't provide the index of the error (although this could be hacked in, but at more cost) I don't want to put that much optimization effort into stage1 and C code. --- doc/langref.html.in | 16 +- src-self-hosted/ir.zig | 96 ++++++++++- src-self-hosted/value.zig | 21 +++ src/tokenizer.cpp | 89 +++++++---- src/utf8/iszig.h | 32 ++++ src/utf8/naive.c | 89 +++++++++++ src/utf8/range2-neon.c | 147 +++++++++++++++++ src/utf8/range2-sse.c | 170 ++++++++++++++++++++ src/utf8/utf8-lookup.h | 42 +++++ src/utf8/utf8.h | 19 +++ std/unicode.zig | 265 ++++++++++++++++++------------- std/zig/parse_string_literal.zig | 108 +++++++++++-- std/zig/parser_test.zig | 16 +- std/zig/tokenizer.zig | 79 +-------- test/stage1/behavior/misc.zig | 9 +- 15 files changed, 961 insertions(+), 237 deletions(-) create mode 100644 src/utf8/iszig.h create mode 100644 src/utf8/naive.c create mode 100644 src/utf8/range2-neon.c create mode 100644 src/utf8/range2-sse.c create mode 100644 src/utf8/utf8-lookup.h create mode 100644 src/utf8/utf8.h diff --git a/doc/langref.html.in b/doc/langref.html.in index 1d80c73a3e50..1698bedcb1f1 100644 --- a/doc/langref.html.in +++ b/doc/langref.html.in @@ -555,7 +555,8 @@ test "string literals" { assert(normal_bytes.len == 5); assert(normal_bytes[1] == 'e'); assert('e' == '\x65'); - assert('\U01f4a9' == 128169); + assert('\u{01f4a9}' == 128169); +
assert('💩' == 128169); assert(mem.eql(u8, "hello", "h\x65llo")); // A C string literal is a null terminated pointer. @@ -605,11 +606,15 @@ test "string literals" { hexadecimal 8-bit character code (2 digits) - \uNNNN + \u{NN} + hexadecimal 8-bit Unicode character code UTF-8 encoded (2 digits) + + + \u{NNNN} hexadecimal 16-bit Unicode character code UTF-8 encoded (4 digits) - \UNNNNNN + \u{NNNNNN} hexadecimal 24-bit Unicode character code UTF-8 encoded (6 digits) @@ -9674,8 +9679,9 @@ eof <- !. hex <- [0-9a-fA-F] char_escape <- "\\x" hex hex - / "\\u" hex hex hex hex - / "\\U" hex hex hex hex hex hex + / "\\u" { hex hex } + / "\\u" { hex hex hex hex } + / "\\u" { hex hex hex hex hex hex } / "\\" [nr\\t'"] char_char <- char_escape diff --git a/src-self-hosted/ir.zig b/src-self-hosted/ir.zig index 8cdac92326b2..802985c4453e 100644 --- a/src-self-hosted/ir.zig +++ b/src-self-hosted/ir.zig @@ -1147,7 +1147,10 @@ pub const Builder = struct { return irb.lvalWrap(scope, inst, lval); }, ast.Node.Id.MultilineStringLiteral => return error.Unimplemented, - ast.Node.Id.CharLiteral => return error.Unimplemented, + ast.Node.Id.CharLiteral => { + const char_lit = @fieldParentPtr(ast.Node.CharLiteral, "base", node); + return irb.lvalWrap(scope, try irb.genCharLit(char_lit, scope), lval); + }, ast.Node.Id.BoolLiteral => return error.Unimplemented, ast.Node.Id.NullLiteral => return error.Unimplemented, ast.Node.Id.UndefinedLiteral => return error.Unimplemented, @@ -1333,8 +1336,7 @@ pub const Builder = struct { ) catch |err| switch (err) { error.OutOfMemory => return error.OutOfMemory, error.InvalidBase => unreachable, - error.InvalidCharForDigit => unreachable, - error.DigitTooLargeForBase => unreachable, + error.InvalidCharacter => unreachable, }; errdefer int_val.base.deref(irb.comp); @@ -1343,18 +1345,100 @@ pub const Builder = struct { return inst; } + pub fn genCharLit(irb: *Builder, char_lit: *ast.Node.CharLiteral, scope: *Scope) !*Inst { + const char_token =
irb.code.tree_scope.tree.tokenSlice(char_lit.token); + + var char: u21 = undefined; + got_char: { + if (char_token[1] == '\\') { + char = switch (char_token[2]) { + 'x' => { + const hi = charToDigit(char_token[off], 16) catch unreachable; + const lo = charToDigit(char_token[off + 1], 16) catch unreachable; + char |= ((hi << 4) | lo) << ((hex_escape_byes - 1) * 8); + break :got_char; + }, + 'u' => { + // char_token[3] == '{'; + if (char_token[6] == '}') { + hex_escape_bytes = 1; + } else if (char_token[8] == '}') { + hex_escape_bytes = 2; + } else if (char_token[10] == '}') { + hex_escape_bytes = 3; + } else { + unreachable; + } + var off: u8 = 4; + while (hex_escape_bytes > 0) : (hex_escape_bytes -= 1) { + const hi = charToDigit(char_token[off], 16) catch unreachable; + const lo = charToDigit(char_token[off + 1], 16) catch unreachable; + char |= ((hi << 4) | lo) << ((hex_escape_byes - 1) * 8); + off += 2; + } + break :got_char; + }, + 'n' => '\n', + 'r' => '\r', + '\\' => '\\', + '\t' => '\t', + '\'' => '\'', + '\"' => '\"', + else => unreachable, + }; + break :got_char; + } + // This could read one byte past the end of the file, except + // this guarantees to not read past the first character, and we + // have already validated the file as UTF-8. 
+ _ = utf8Decode(char_token[1..4], &char); + break :got_char; + } + + const comptime_int_type = Type.ComptimeInt.get(irb.comp); + defer comptime_int_type.base.base.deref(irb.comp); + + const int_val = Value.Int.createFromCharLiteral( + irb.comp, + &comptime_int_type.base, + rest, + ) catch |err| switch (err) { + error.OutOfMemory => return error.OutOfMemory, + }; + errdefer int_val.base.deref(irb.comp); + + const inst = try irb.build(Inst.Const, scope, Span.token(char_lit.token), Inst.Const.Params{}); + inst.val = IrVal{ .KnownValue = &int_val.base }; + return inst; + } + pub async fn genStrLit(irb: *Builder, str_lit: *ast.Node.StringLiteral, scope: *Scope) !*Inst { const str_token = irb.code.tree_scope.tree.tokenSlice(str_lit.token); const src_span = Span.token(str_lit.token); var bad_index: usize = undefined; var buf = std.zig.parseStringLiteral(irb.comp.gpa(), str_token, &bad_index) catch |err| switch (err) { - error.OutOfMemory => return error.OutOfMemory, - error.InvalidCharacter => { + .OutOfMemory => return error.OutOfMemory, + .UnicodeSurrogateHalf, .UnicodeCodepointTooLarge => { + var hex_string = if (mem.indexOfScalar(u8, str_token, '}')) |i| str_token[2..i] else str_token[2..str_token.len]; + try irb.comp.addCompileError( + irb.code.tree_scope, + src_span, + "Unicode codepoint U+{} cannot be represented in UTF-16 and is invalid", + hex_string, + ); + return error.SemanticAnalysisFailed; + }, + .ExpectXDigit, .ExpectLCurly, .ExpectRCurly => { try irb.comp.addCompileError( irb.code.tree_scope, src_span, - "invalid character in string literal: '{c}'", + "expected {}, got '{c}'", + switch (err) { + .ExpectXDigit => "hexidecimal digit", + .ExpectLCurly => "left curly bracket '{'", + .ExpectRCurly => "right curly bracket '}'", + }, str_token[bad_index], ); return error.SemanticAnalysisFailed; diff --git a/src-self-hosted/value.zig b/src-self-hosted/value.zig index d8c0f7b5c87c..0a78395ecd9b 100644 --- a/src-self-hosted/value.zig +++ b/src-self-hosted/value.zig 
@@ -534,6 +534,27 @@ pub const Value = struct { return self; } + pub fn createFromCharLiteral(comp: *Compilation, typ: *Type, value: u21) !*Int { + const self = try comp.gpa().create(Value.Int); + self.* = Value.Int{ + .base = Value{ + .id = Value.Id.Int, + .typ = typ, + .ref_count = std.atomic.Int(usize).init(1), + }, + .big_int = undefined, + }; + typ.base.ref(); + errdefer comp.gpa().destroy(self); + + self.big_int = try std.math.big.Int.init(comp.gpa()); + errdefer self.big_int.deinit(); + + try self.big_int.set(value); + + return self; + } + pub fn getLlvmConst(self: *Int, ofile: *ObjectFile) !?*llvm.Value { switch (self.base.typ.id) { Type.Id.Int => { diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 53554d1096d0..33df21a66fa5 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -8,6 +8,10 @@ #include "tokenizer.hpp" #include "util.hpp" +#include "utf8/utf8-lookup.h" +#include "utf8/utf8.h" +#include "utf8/iszig.h" + #include #include #include @@ -219,6 +223,7 @@ enum TokenizeState { TokenizeStateSawDotDot, TokenizeStateSawAtSign, TokenizeStateCharCode, + TokenizeStateCharCodeStart, TokenizeStateError, TokenizeStateLBracket, TokenizeStateLBracketStar, @@ -238,10 +243,10 @@ struct Tokenize { uint32_t radix; int32_t exp_add_amt; bool is_exp_negative; - size_t char_code_index; - size_t char_code_end; + size_t xdigits_seen; bool unicode; uint32_t char_code; + uint32_t utf8_validator_state; // http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ int exponent_in_bin_or_dec; BigInt specified_exponent; BigInt significand; @@ -404,6 +409,19 @@ void tokenize(Buf *buf, Tokenization *out) { t.tokens = out->tokens = allocate>(1); t.buf = buf; + for (size_t i=0;iline_offsets = allocate>(1); out->line_offsets->append(0); @@ -1050,24 +1068,14 @@ void tokenize(Buf *buf, Tokenization *out) { t.state = TokenizeStateCharCode; t.radix = 16; t.char_code = 0; - t.char_code_index = 0; - t.char_code_end = 2; + t.xdigits_seen = 0; t.unicode = false; break; case 'u': - t.state = 
TokenizeStateCharCode; + t.state = TokenizeStateCharCodeStart; t.radix = 16; t.char_code = 0; - t.char_code_index = 0; - t.char_code_end = 4; - t.unicode = true; - break; - case 'U': - t.state = TokenizeStateCharCode; - t.radix = 16; - t.char_code = 0; - t.char_code_index = 0; - t.char_code_end = 6; + t.xdigits_seen = 0; t.unicode = true; break; case 'n': @@ -1092,20 +1100,35 @@ void tokenize(Buf *buf, Tokenization *out) { invalid_char_error(&t, c); } break; + case TokenizeStateCharCodeStart: + if (c != '{') + tokenize_error(&t, "expected {: '%c'", c); + t.state = TokenizeStateCharCode; + break; case TokenizeStateCharCode: { - uint32_t digit_value = get_digit_value(c); - if (digit_value >= t.radix) { - tokenize_error(&t, "invalid digit: '%c'", c); - } - t.char_code *= t.radix; - t.char_code += digit_value; - t.char_code_index += 1; + if (c != '}') { + uint32_t digit_value = get_digit_value(c); + if (digit_value >= t.radix) { + tokenize_error(&t, "invalid digit: '%c'", c); + } + t.char_code *= t.radix; + t.char_code += digit_value; + t.xdigits_seen += 1; + + if (t.xdigits_seen > 6) + tokenize_error(&t, "expected }: '%c'", c); + } else + if (t.xdigits_seen % 2 != 0) + tokenize_error(&t, "expected hex digit: '%c'", c); - if (t.char_code_index >= t.char_code_end) { + if (c == '}' || (!t.unicode && t.xdigits_seen == 2)) { if (t.unicode) { - if (t.char_code > 0x10ffff) { - tokenize_error(&t, "unicode value out of range: %x", t.char_code); + if (t.char_code > 0xD7FF && + t.char_code < 0xE000) { + tokenize_error(&t, "unicode surrogate: 0x%x", t.char_code); + } else if (t.char_code > 0x10ffff) { + tokenize_error(&t, "unicode value out of range: 0x%x", t.char_code); } if (t.cur_tok->id == TokenIdCharLiteral) { t.cur_tok->data.char_lit.c = t.char_code; @@ -1149,9 +1172,20 @@ void tokenize(Buf *buf, Tokenization *out) { case '\\': t.state = TokenizeStateStringEscape; break; + case '\n': + tokenize_error(&t, "newline not allowed in character literal"); default: - 
t.cur_tok->data.char_lit.c = c; - t.state = TokenizeStateCharLiteralEnd; + if (c < 128) { + t.cur_tok->data.char_lit.c = c; + t.state = TokenizeStateCharLiteralEnd; + } else { + // http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + // Returns 0 when character complete. We already know the file is valid UTF8. + if (!utf8_decode(&t.utf8_validator_state, &t.char_code, c)) { + t.cur_tok->data.char_lit.c = t.char_code; + t.state = TokenizeStateCharLiteralEnd; + } + } break; } break; @@ -1387,6 +1421,7 @@ void tokenize(Buf *buf, Tokenization *out) { break; case TokenizeStateStringEscape: case TokenizeStateCharCode: + case TokenizeStateCharCodeStart: if (t.cur_tok->id == TokenIdStringLiteral) { tokenize_error(&t, "unterminated string"); } else if (t.cur_tok->id == TokenIdCharLiteral) { diff --git a/src/utf8/iszig.h b/src/utf8/iszig.h new file mode 100644 index 000000000000..f492ffa344c7 --- /dev/null +++ b/src/utf8/iszig.h @@ -0,0 +1,32 @@ +#include +#include + +// From std/ascii.zig + +static const uint8_t zig[] = { +// 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, // '\n' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, // DEL + + // utf8 continuation characters + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xC0, 0xC1: overlong 2-byte lead bytes + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 21-bit limit +}; + +inline bool is_zig(uint8_t
c) { + return zig[c]; +} diff --git a/src/utf8/naive.c b/src/utf8/naive.c new file mode 100644 index 000000000000..b2663756f4ba --- /dev/null +++ b/src/utf8/naive.c @@ -0,0 +1,89 @@ +#include + +/* + * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 + * + * Table 3-7. Well-Formed UTF-8 Byte Sequences + * + * +--------------------+------------+-------------+------------+-------------+ + * | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte | + * +--------------------+------------+-------------+------------+-------------+ + * | U+0000..U+007F | 00..7F | | | | + * +--------------------+------------+-------------+------------+-------------+ + * | U+0080..U+07FF | C2..DF | 80..BF | | | + * +--------------------+------------+-------------+------------+-------------+ + * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | + * +--------------------+------------+-------------+------------+-------------+ + * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | + * +--------------------+------------+-------------+------------+-------------+ + * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | + * +--------------------+------------+-------------+------------+-------------+ + * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | + * +--------------------+------------+-------------+------------+-------------+ + * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | + * +--------------------+------------+-------------+------------+-------------+ + * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | + * +--------------------+------------+-------------+------------+-------------+ + * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | + * +--------------------+------------+-------------+------------+-------------+ + */ + +/* return 0-invalid, 1-valid */ +int utf8_naive(const unsigned char *data, int len) +{ + while (len) { + int bytes; + const unsigned char byte1 = data[0]; + + /* 00..7F */ + if (byte1 <= 0x7F) { + bytes = 1; + /* C2..DF, 80..BF */ + } else if 
(len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF && + (signed char)data[1] <= (signed char)0xBF) { + bytes = 2; + } else if (len >= 3) { + const unsigned char byte2 = data[1]; + + /* Is byte2, byte3 between 0x80 ~ 0xBF */ + const int byte2_ok = (signed char)byte2 <= (signed char)0xBF; + const int byte3_ok = (signed char)data[2] <= (signed char)0xBF; + + if (byte2_ok && byte3_ok && + /* E0, A0..BF, 80..BF */ + ((byte1 == 0xE0 && byte2 >= 0xA0) || + /* E1..EC, 80..BF, 80..BF */ + (byte1 >= 0xE1 && byte1 <= 0xEC) || + /* ED, 80..9F, 80..BF */ + (byte1 == 0xED && byte2 <= 0x9F) || + /* EE..EF, 80..BF, 80..BF */ + (byte1 >= 0xEE && byte1 <= 0xEF))) { + bytes = 3; + } else if (len >= 4) { + /* Is byte4 between 0x80 ~ 0xBF */ + const int byte4_ok = (signed char)data[3] <= (signed char)0xBF; + + if (byte2_ok && byte3_ok && byte4_ok && + /* F0, 90..BF, 80..BF, 80..BF */ + ((byte1 == 0xF0 && byte2 >= 0x90) || + /* F1..F3, 80..BF, 80..BF, 80..BF */ + (byte1 >= 0xF1 && byte1 <= 0xF3) || + /* F4, 80..8F, 80..BF, 80..BF */ + (byte1 == 0xF4 && byte2 <= 0x8F))) { + bytes = 4; + } else { + return 0; + } + } else { + return 0; + } + } else { + return 0; + } + + len -= bytes; + data += bytes; + } + + return 1; +} diff --git a/src/utf8/range2-neon.c b/src/utf8/range2-neon.c new file mode 100644 index 000000000000..c1610cd0dbec --- /dev/null +++ b/src/utf8/range2-neon.c @@ -0,0 +1,147 @@ +/* + * Process 2x16 bytes in each iteration. + * Comments removed for brevity. See range-neon.c for details. 
+ */ +#ifdef __aarch64__ + +#include +#include +#include + +int utf8_naive(const unsigned char *data, int len); + +static const uint8_t _first_len_tbl[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3, +}; + +static const uint8_t _first_range_tbl[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, +}; + +static const uint8_t _range_min_tbl[] = { + 0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80, + 0xC2, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, +}; +static const uint8_t _range_max_tbl[] = { + 0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F, + 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const uint8_t _range_adjust_tbl[] = { + 2, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, +}; + +int utf8_range2(const unsigned char *data, int len) +{ + if (len >= 32) { + uint8x16_t prev_input = vdupq_n_u8(0); + uint8x16_t prev_first_len = vdupq_n_u8(0); + + const uint8x16_t first_len_tbl = vld1q_u8(_first_len_tbl); + const uint8x16_t first_range_tbl = vld1q_u8(_first_range_tbl); + const uint8x16_t range_min_tbl = vld1q_u8(_range_min_tbl); + const uint8x16_t range_max_tbl = vld1q_u8(_range_max_tbl); + const uint8x16x2_t range_adjust_tbl = vld2q_u8(_range_adjust_tbl); + + const uint8x16_t const_1 = vdupq_n_u8(1); + const uint8x16_t const_2 = vdupq_n_u8(2); + const uint8x16_t const_e0 = vdupq_n_u8(0xE0); + + uint8x16_t error = vdupq_n_u8(0); + + while (len >= 32) { + /*************************** block 1 *****************************/ + const uint8x16_t input = vld1q_u8(data); + + uint8x16_t high_nibbles = vshrq_n_u8(input, 4); + + const uint8x16_t first_len = + vqtbl1q_u8(first_len_tbl, high_nibbles); + + uint8x16_t range = vqtbl1q_u8(first_range_tbl, high_nibbles); + + range = + vorrq_u8(range, vextq_u8(prev_first_len, first_len, 15)); + + uint8x16_t tmp1, tmp2; + tmp1 = vqsubq_u8(first_len, const_1); + tmp2 = vqsubq_u8(prev_first_len, const_1); + range = vorrq_u8(range, vextq_u8(tmp2, tmp1, 14)); + + tmp1 = 
vqsubq_u8(first_len, const_2); + tmp2 = vqsubq_u8(prev_first_len, const_2); + range = vorrq_u8(range, vextq_u8(tmp2, tmp1, 13)); + + uint8x16_t shift1 = vextq_u8(prev_input, input, 15); + uint8x16_t pos = vsubq_u8(shift1, const_e0); + range = vaddq_u8(range, vqtbl2q_u8(range_adjust_tbl, pos)); + + uint8x16_t minv = vqtbl1q_u8(range_min_tbl, range); + uint8x16_t maxv = vqtbl1q_u8(range_max_tbl, range); + + error = vorrq_u8(error, vcltq_u8(input, minv)); + error = vorrq_u8(error, vcgtq_u8(input, maxv)); + + /*************************** block 2 *****************************/ + const uint8x16_t _input = vld1q_u8(data+16); + + high_nibbles = vshrq_n_u8(_input, 4); + + const uint8x16_t _first_len = + vqtbl1q_u8(first_len_tbl, high_nibbles); + + uint8x16_t _range = vqtbl1q_u8(first_range_tbl, high_nibbles); + + _range = + vorrq_u8(_range, vextq_u8(first_len, _first_len, 15)); + + tmp1 = vqsubq_u8(_first_len, const_1); + tmp2 = vqsubq_u8(first_len, const_1); + _range = vorrq_u8(_range, vextq_u8(tmp2, tmp1, 14)); + + tmp1 = vqsubq_u8(_first_len, const_2); + tmp2 = vqsubq_u8(first_len, const_2); + _range = vorrq_u8(_range, vextq_u8(tmp2, tmp1, 13)); + + shift1 = vextq_u8(input, _input, 15); + pos = vsubq_u8(shift1, const_e0); + _range = vaddq_u8(_range, vqtbl2q_u8(range_adjust_tbl, pos)); + + minv = vqtbl1q_u8(range_min_tbl, _range); + maxv = vqtbl1q_u8(range_max_tbl, _range); + + error = vorrq_u8(error, vcltq_u8(_input, minv)); + error = vorrq_u8(error, vcgtq_u8(_input, maxv)); + + /************************ next iteration *************************/ + prev_input = _input; + prev_first_len = _first_len; + + data += 32; + len -= 32; + } + + if (vmaxvq_u8(error)) + return 0; + + uint32_t token4; + vst1q_lane_u32(&token4, vreinterpretq_u32_u8(prev_input), 3); + + const int8_t *token = (const int8_t *)&token4; + int lookahead = 0; + if (token[3] > (int8_t)0xBF) + lookahead = 1; + else if (token[2] > (int8_t)0xBF) + lookahead = 2; + else if (token[1] > (int8_t)0xBF) + lookahead = 
3; + + data -= lookahead; + len += lookahead; + } + + return utf8_naive(data, len); +} + +#endif diff --git a/src/utf8/range2-sse.c b/src/utf8/range2-sse.c new file mode 100644 index 000000000000..2369b4621d05 --- /dev/null +++ b/src/utf8/range2-sse.c @@ -0,0 +1,170 @@ +/* + * Process 2x16 bytes in each iteration. + * Comments removed for brevity. See range-sse.c for details. + */ + +#pragma GCC diagnostic ignored "-Wnarrowing" + +#ifdef __linux__ // because of use of IFUNC +#ifdef __x86_64__ + +#include +#include +#include + +int utf8_naive(const unsigned char *data, int len); + +static const int8_t _first_len_tbl[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3, +}; + +static const int8_t _first_range_tbl[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, +}; + +static const int8_t _range_min_tbl[] = { + 0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80, + 0xC2, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, +}; +static const int8_t _range_max_tbl[] = { + 0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F, + 0xF4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +}; + +static const int8_t _df_ee_tbl[] = { + 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, +}; +static const int8_t _ef_fe_tbl[] = { + 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +__attribute__((__target__ ("sse4.1"))) +int utf8_range2(const unsigned char *data, int len) +{ + if (len >= 32) { + __m128i prev_input = _mm_set1_epi8(0); + __m128i prev_first_len = _mm_set1_epi8(0); + + const __m128i first_len_tbl = + _mm_lddqu_si128((const __m128i *)_first_len_tbl); + const __m128i first_range_tbl = + _mm_lddqu_si128((const __m128i *)_first_range_tbl); + const __m128i range_min_tbl = + _mm_lddqu_si128((const __m128i *)_range_min_tbl); + const __m128i range_max_tbl = + _mm_lddqu_si128((const __m128i *)_range_max_tbl); + const __m128i df_ee_tbl = + _mm_lddqu_si128((const __m128i *)_df_ee_tbl); + const __m128i ef_fe_tbl = + _mm_lddqu_si128((const __m128i *)_ef_fe_tbl); + + __m128i error = _mm_set1_epi8(0); + 
+ while (len >= 32) { + /***************************** block 1 ****************************/ + const __m128i input = _mm_lddqu_si128((const __m128i *)data); + + __m128i high_nibbles = + _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0F)); + + __m128i first_len = _mm_shuffle_epi8(first_len_tbl, high_nibbles); + + __m128i range = _mm_shuffle_epi8(first_range_tbl, high_nibbles); + + range = _mm_or_si128( + range, _mm_alignr_epi8(first_len, prev_first_len, 15)); + + __m128i tmp1, tmp2; + tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(1)); + tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(1)); + range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 14)); + + tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(2)); + tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(2)); + range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 13)); + + __m128i shift1, pos, range2; + shift1 = _mm_alignr_epi8(input, prev_input, 15); + pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF)); + tmp1 = _mm_subs_epu8(pos, _mm_set1_epi8(240)); + range2 = _mm_shuffle_epi8(df_ee_tbl, tmp1); + tmp2 = _mm_adds_epu8(pos, _mm_set1_epi8(112)); + range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_tbl, tmp2)); + + range = _mm_add_epi8(range, range2); + + __m128i minv = _mm_shuffle_epi8(range_min_tbl, range); + __m128i maxv = _mm_shuffle_epi8(range_max_tbl, range); + + error = _mm_or_si128(error, _mm_cmplt_epi8(input, minv)); + error = _mm_or_si128(error, _mm_cmpgt_epi8(input, maxv)); + + /***************************** block 2 ****************************/ + const __m128i _input = _mm_lddqu_si128((const __m128i *)(data+16)); + + high_nibbles = + _mm_and_si128(_mm_srli_epi16(_input, 4), _mm_set1_epi8(0x0F)); + + __m128i _first_len = _mm_shuffle_epi8(first_len_tbl, high_nibbles); + + __m128i _range = _mm_shuffle_epi8(first_range_tbl, high_nibbles); + + _range = _mm_or_si128( + _range, _mm_alignr_epi8(_first_len, first_len, 15)); + + tmp1 = _mm_subs_epu8(_first_len, _mm_set1_epi8(1)); + tmp2 = 
_mm_subs_epu8(first_len, _mm_set1_epi8(1)); + _range = _mm_or_si128(_range, _mm_alignr_epi8(tmp1, tmp2, 14)); + + tmp1 = _mm_subs_epu8(_first_len, _mm_set1_epi8(2)); + tmp2 = _mm_subs_epu8(first_len, _mm_set1_epi8(2)); + _range = _mm_or_si128(_range, _mm_alignr_epi8(tmp1, tmp2, 13)); + + __m128i _range2; + shift1 = _mm_alignr_epi8(_input, input, 15); + pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF)); + tmp1 = _mm_subs_epu8(pos, _mm_set1_epi8(240)); + _range2 = _mm_shuffle_epi8(df_ee_tbl, tmp1); + tmp2 = _mm_adds_epu8(pos, _mm_set1_epi8(112)); + _range2 = _mm_add_epi8(_range2, _mm_shuffle_epi8(ef_fe_tbl, tmp2)); + + _range = _mm_add_epi8(_range, _range2); + + minv = _mm_shuffle_epi8(range_min_tbl, _range); + maxv = _mm_shuffle_epi8(range_max_tbl, _range); + + error = _mm_or_si128(error, _mm_cmplt_epi8(_input, minv)); + error = _mm_or_si128(error, _mm_cmpgt_epi8(_input, maxv)); + + /************************ next iteration **************************/ + prev_input = _input; + prev_first_len = _first_len; + + data += 32; + len -= 32; + } + + int error_reduced = + _mm_movemask_epi8(_mm_cmpeq_epi8(error, _mm_set1_epi8(0))); + if (error_reduced != 0xFFFF) + return 0; + + int32_t token4 = _mm_extract_epi32(prev_input, 3); + const int8_t *token = (const int8_t *)&token4; + int lookahead = 0; + if (token[3] > (int8_t)0xBF) + lookahead = 1; + else if (token[2] > (int8_t)0xBF) + lookahead = 2; + else if (token[1] > (int8_t)0xBF) + lookahead = 3; + + data -= lookahead; + len += lookahead; + } + + return utf8_naive(data, len); +} + +#endif +#endif diff --git a/src/utf8/utf8-lookup.h b/src/utf8/utf8-lookup.h new file mode 100644 index 000000000000..02f70297e4be --- /dev/null +++ b/src/utf8/utf8-lookup.h @@ -0,0 +1,42 @@ +// Copyright (c) 2008-2009 Bjoern Hoehrmann +// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. 
+ +//Copyright (c) 2008-2009 Bjoern Hoehrmann + +//Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +//The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +//THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 1 + +static const uint8_t utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; + +uint32_t inline +utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) { + uint32_t type = utf8d[byte]; + + *codep = (*state != UTF8_ACCEPT) ? 
+ (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = utf8d[256 + *state*16 + type]; + return *state; +} diff --git a/src/utf8/utf8.h b/src/utf8/utf8.h new file mode 100644 index 000000000000..0c5812ef3c15 --- /dev/null +++ b/src/utf8/utf8.h @@ -0,0 +1,19 @@ +#pragma once + +// These are here because I hate most build systems (meson is OK) +#include "range2-neon.c" +#include "range2-sse.c" +#include "naive.c" + +int utf8_naive(const unsigned char *data, int len); +int utf8_range2(const unsigned char *data, int len); + +#ifdef __linux__ +#ifdef __x86_64__ +__attribute__ ((__target__ ("default"))) +#endif +#endif +int utf8_range2(const unsigned char *data, int len) +{ + return utf8_naive(data, len); +} diff --git a/std/unicode.zig b/std/unicode.zig index 37a73d75004b..67274191b607 100644 --- a/std/unicode.zig +++ b/std/unicode.zig @@ -4,25 +4,74 @@ const assert = std.debug.assert; const testing = std.testing; const mem = std.mem; +pub const Utf8Error = UnicodeError || error{ + Utf8ShortChar, + Utf8OverlongEncoding, + Utf8InvalidStartByte, +}; + +pub const UnicodeError = error{ + UnicodeSurrogateHalf, + UnicodeCodepointTooLarge, +}; + +// http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 +// +// Table 3-7. 
Well-Formed UTF-8 Byte Sequences +// +// +--------------------+------------+-------------+------------+-------------+ +// | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte | +// +--------------------+------------+-------------+------------+-------------+ +// | U+0000..U+007F | 00..7F | | | | +// +--------------------+------------+-------------+------------+-------------+ +// | U+0080..U+07FF | C2..DF | 80..BF | | | +// +--------------------+------------+-------------+------------+-------------+ +// | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | +// +--------------------+------------+-------------+------------+-------------+ +// | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | +// +--------------------+------------+-------------+------------+-------------+ +// | U+D000..U+D7FF | ED | 80..9F | 80..BF | | +// +--------------------+------------+-------------+------------+-------------+ +// | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | +// +--------------------+------------+-------------+------------+-------------+ +// | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | +// +--------------------+------------+-------------+------------+-------------+ +// | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | +// +--------------------+------------+-------------+------------+-------------+ +// | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | +// +--------------------+------------+-------------+------------+-------------+ + +// This accepts u32 instead of u21 on purpose +pub fn isValidUnicode(c: u32) UnicodeError!void { + switch (c) { + 0x0000...0xd7ff => {}, + 0xd800...0xdfff => return error.UnicodeSurrogateHalf, + 0xe000...0x10ffff => {}, + 0x110000...0xffffffff => return error.UnicodeCodepointTooLarge, + } +} + /// Returns how many bytes the UTF-8 representation would require /// for the given codepoint. 
-pub fn utf8CodepointSequenceLength(c: u32) !u3 { +pub fn utf8CodepointSequenceLength(c: u32) Utf8Error!u3 { if (c < 0x80) return u3(1); if (c < 0x800) return u3(2); if (c < 0x10000) return u3(3); if (c < 0x110000) return u3(4); - return error.CodepointTooLarge; + return error.UnicodeCodepointTooLarge; } /// Given the first byte of a UTF-8 codepoint, /// returns a number 1-4 indicating the total length of the codepoint in bytes. /// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte. -pub fn utf8ByteSequenceLength(first_byte: u8) !u3 { - if (first_byte < 0b10000000) return u3(1); - if (first_byte & 0b11100000 == 0b11000000) return u3(2); - if (first_byte & 0b11110000 == 0b11100000) return u3(3); - if (first_byte & 0b11111000 == 0b11110000) return u3(4); - return error.Utf8InvalidStartByte; +pub fn utf8ByteSequenceLength(first_byte: u8) Utf8Error!u3 { + const INVALID = 0; + const swtch = []u8{1, INVALID, 2, 3, 4, INVALID, INVALID, INVALID, INVALID}; + var len = swtch[@clz(~first_byte)]; + if (len == INVALID) { + return error.Utf8InvalidStartByte; + } + return @intCast(u3, len); } /// Encodes the given codepoint into a UTF-8 byte sequence. @@ -30,7 +79,7 @@ pub fn utf8ByteSequenceLength(first_byte: u8) !u3 { /// out: the out buffer to write to. Must have a len >= utf8CodepointSequenceLength(c). /// Errors: if c cannot be encoded in UTF-8. /// Returns: the number of bytes written to out. 
-pub fn utf8Encode(c: u32, out: []u8) !u3 { +pub fn utf8Encode(c: u32, out: []u8) Utf8Error!u3 { const length = try utf8CodepointSequenceLength(c); assert(out.len >= length); switch (length) { @@ -44,7 +93,7 @@ pub fn utf8Encode(c: u32, out: []u8) !u3 { out[1] = @intCast(u8, 0b10000000 | (c & 0b111111)); }, 3 => { - if (0xd800 <= c and c <= 0xdfff) return error.Utf8CannotEncodeSurrogateHalf; + if (0xd800 <= c and c <= 0xdfff) return error.UnicodeSurrogateHalf; out[0] = @intCast(u8, 0b11100000 | (c >> 12)); out[1] = @intCast(u8, 0b10000000 | ((c >> 6) & 0b111111)); out[2] = @intCast(u8, 0b10000000 | (c & 0b111111)); @@ -60,32 +109,36 @@ pub fn utf8Encode(c: u32, out: []u8) !u3 { return length; } -const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error; - -/// Decodes the UTF-8 codepoint encoded in the given slice of bytes. -/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable. -/// If you already know the length at comptime, you can call one of -/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function. -pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u32 { - return switch (bytes.len) { +/// Decodes the UTF-8 codepoint encoded in the given slice of bytes and returns +/// then length of the character decoded. +/// +/// Guaranteed to not read bytes past this character. +/// +/// "ret" cannot be *u21 because when casting to *u32 it would have differn't +/// behavior on Little-Endian and Big-Endian machines, which is too much to ask +/// of our callers. 
+/// https://github.com/ziglang/zig/issues/2136 +pub fn utf8Decode(bytes: []const u8, ret: *align(4) u32) Utf8Error!u3 { + var len = try utf8ByteSequenceLength(bytes[0]); + if (bytes.len < len) { + return error.Utf8ShortChar; + } + ret.* = switch (len) { 1 => u32(bytes[0]), - 2 => utf8Decode2(bytes), - 3 => utf8Decode3(bytes), - 4 => utf8Decode4(bytes), + 2 => try utf8Decode2(bytes[0..2]), + 3 => try utf8Decode3(bytes[0..3]), + 4 => try utf8Decode4(bytes[0..4]), else => unreachable, }; + return len; } -const Utf8Decode2Error = error{ - Utf8ExpectedContinuation, - Utf8OverlongEncoding, -}; -pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u32 { +pub fn utf8Decode2(bytes: []const u8) Utf8Error!u32 { assert(bytes.len == 2); assert(bytes[0] & 0b11100000 == 0b11000000); var value: u32 = bytes[0] & 0b00011111; - if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; + if (@clz(~bytes[1]) != 1) return error.Utf8ShortChar; value <<= 6; value |= bytes[1] & 0b00111111; @@ -94,74 +147,67 @@ pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u32 { return value; } -const Utf8Decode3Error = error{ - Utf8ExpectedContinuation, - Utf8OverlongEncoding, - Utf8EncodesSurrogateHalf, -}; -pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u32 { +pub fn utf8Decode3(bytes: []const u8) Utf8Error!u32 { assert(bytes.len == 3); assert(bytes[0] & 0b11110000 == 0b11100000); var value: u32 = bytes[0] & 0b00001111; - if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; + if (@clz(~bytes[1]) != 1) return error.Utf8ShortChar; value <<= 6; value |= bytes[1] & 0b00111111; - if (bytes[2] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; + if (@clz(~bytes[2]) != 1) return error.Utf8ShortChar; value <<= 6; value |= bytes[2] & 0b00111111; if (value < 0x800) return error.Utf8OverlongEncoding; - if (0xd800 <= value and value <= 0xdfff) return error.Utf8EncodesSurrogateHalf; + if (0xd800 <= value and value <= 0xdfff) return 
error.UnicodeSurrogateHalf; return value; } -const Utf8Decode4Error = error{ - Utf8ExpectedContinuation, - Utf8OverlongEncoding, - Utf8CodepointTooLarge, -}; -pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u32 { +pub fn utf8Decode4(bytes: []const u8) Utf8Error!u32 { assert(bytes.len == 4); assert(bytes[0] & 0b11111000 == 0b11110000); var value: u32 = bytes[0] & 0b00000111; - if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; + if (@clz(~bytes[2]) != 1) return error.Utf8ShortChar; value <<= 6; value |= bytes[1] & 0b00111111; - if (bytes[2] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; + if (@clz(~bytes[2]) != 1) return error.Utf8ShortChar; value <<= 6; value |= bytes[2] & 0b00111111; - if (bytes[3] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; + if (@clz(~bytes[2]) != 1) return error.Utf8ShortChar; value <<= 6; value |= bytes[3] & 0b00111111; if (value < 0x10000) return error.Utf8OverlongEncoding; - if (value > 0x10FFFF) return error.Utf8CodepointTooLarge; + if (value > 0x10FFFF) return error.UnicodeCodepointTooLarge; return value; } -pub fn utf8ValidateSlice(s: []const u8) bool { +// TODO replace with something faster: +// https://github.com/cyb70289/utf8/ +// https://lemire.me/blog/2018/10/19/validating-utf-8-bytes-using-only-0-45-cycles-per-byte-avx-edition/ +pub fn utf8ValidateSliceWithLoc(s: []const u8, ret_invalid_maybe: ?*usize) Utf8Error!void { var i: usize = 0; while (i < s.len) { - if (utf8ByteSequenceLength(s[i])) |cp_len| { - if (i + cp_len > s.len) { - return false; + var c: u32 = undefined; + i += utf8Decode(s[i..], &c) catch |err| { + if (ret_invalid_maybe) |ret_invalid| { + ret_invalid.* = i; } - - if (utf8Decode(s[i .. 
i + cp_len])) |_| {} else |_| { - return false; - } - i += cp_len; - } else |err| { - return false; - } + return err; + }; } + return; +} + +pub fn utf8ValidateSlice(s: []const u8) bool { + utf8ValidateSliceWithLoc(s, null) catch return false; return true; } @@ -177,11 +223,9 @@ pub const Utf8View = struct { bytes: []const u8, pub fn init(s: []const u8) !Utf8View { - if (!utf8ValidateSlice(s)) { - return error.InvalidUtf8; - } - - return initUnchecked(s); + if (utf8ValidateSlice(s)) { + return initUnchecked(s); + } else return error.InvalidUtf8; } pub fn initUnchecked(s: []const u8) Utf8View { @@ -192,11 +236,9 @@ pub const Utf8View = struct { pub fn initComptime(comptime s: []const u8) Utf8View { if (comptime init(s)) |r| { return r; - } else |err| switch (err) { - error.InvalidUtf8 => { - @compileError("invalid utf8"); - unreachable; - }, + } else |err| { + @compileError("invalid utf8"); + unreachable; } } @@ -217,21 +259,19 @@ pub const Utf8Iterator = struct { return null; } - const cp_len = utf8ByteSequenceLength(it.bytes[it.i]) catch unreachable; + const cp_len = utf8ByteSequenceLength(it.bytes[it.i]) catch null; it.i += cp_len; return it.bytes[it.i - cp_len .. 
it.i]; } - pub fn nextCodepoint(it: *Utf8Iterator) ?u32 { - const slice = it.nextCodepointSlice() orelse return null; - - switch (slice.len) { - 1 => return u32(slice[0]), - 2 => return utf8Decode2(slice) catch unreachable, - 3 => return utf8Decode3(slice) catch unreachable, - 4 => return utf8Decode4(slice) catch unreachable, - else => unreachable, + pub fn nextCodepoint(it: *Utf8Iterator) ?u21 { + if (it.i >= it.bytes.len) { + return null; } + + var c: u32 = undefined; + it.i += utf8Decode(it.bytes[it.i..], &c) catch return null; + return @intCast(u21, c); } }; @@ -246,7 +286,7 @@ pub const Utf16LeIterator = struct { }; } - pub fn nextCodepoint(it: *Utf16LeIterator) !?u32 { + pub fn nextCodepoint(it: *Utf16LeIterator) !?u21 { assert(it.i <= it.bytes.len); if (it.i == it.bytes.len) return null; const c0: u32 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]); @@ -257,12 +297,12 @@ pub const Utf16LeIterator = struct { const c1: u32 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]); if (c1 & ~u32(0x03ff) != 0xdc00) return error.ExpectedSecondSurrogateHalf; it.i += 2; - return 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff)); + return @truncate(u21, 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff))); } else if (c0 & ~u32(0x03ff) == 0xdc00) { return error.UnexpectedSecondSurrogateHalf; } else { it.i += 2; - return c0; + return @truncate(u21, c0); } } }; @@ -274,19 +314,19 @@ test "utf8 encode" { fn testUtf8Encode() !void { // A few taken from wikipedia a few taken elsewhere var array: [4]u8 = undefined; - testing.expect((try utf8Encode(try utf8Decode("€"), array[0..])) == 3); + testing.expect((try utf8Encode('€', array[0..])) == 3); testing.expect(array[0] == 0b11100010); testing.expect(array[1] == 0b10000010); testing.expect(array[2] == 0b10101100); - testing.expect((try utf8Encode(try utf8Decode("$"), array[0..])) == 1); + testing.expect((try utf8Encode('$', array[0..])) == 1); testing.expect(array[0] == 0b00100100); - testing.expect((try 
utf8Encode(try utf8Decode("¢"), array[0..])) == 2); + testing.expect((try utf8Encode('¢', array[0..])) == 2); testing.expect(array[0] == 0b11000010); testing.expect(array[1] == 0b10100010); - testing.expect((try utf8Encode(try utf8Decode("𐍈"), array[0..])) == 4); + testing.expect((try utf8Encode('𐍈', array[0..])) == 4); testing.expect(array[0] == 0b11110000); testing.expect(array[1] == 0b10010000); testing.expect(array[2] == 0b10001101); @@ -299,13 +339,12 @@ test "utf8 encode error" { } fn testUtf8EncodeError() void { var array: [4]u8 = undefined; - testErrorEncode(0xd800, array[0..], error.Utf8CannotEncodeSurrogateHalf); - testErrorEncode(0xdfff, array[0..], error.Utf8CannotEncodeSurrogateHalf); - testErrorEncode(0x110000, array[0..], error.CodepointTooLarge); - testErrorEncode(0xffffffff, array[0..], error.CodepointTooLarge); + testErrorEncode(0xd800, array[0..], error.UnicodeSurrogateHalf); + testErrorEncode(0xdfff, array[0..], error.UnicodeSurrogateHalf); + testErrorEncode(0x110000, array[0..], error.UnicodeCodepointTooLarge); } -fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: anyerror) void { +fn testErrorEncode(codePoint: u21, array: []u8, expectedErr: anyerror) void { testing.expectError(expectedErr, utf8Encode(codePoint, array)); } @@ -401,24 +440,24 @@ fn testInvalidUtf8ContinuationBytes() void { testError("\xf8", error.Utf8InvalidStartByte); testError("\xff", error.Utf8InvalidStartByte); // expected continuation for 2 byte sequences - testError("\xc2", error.UnexpectedEof); - testError("\xc2\x00", error.Utf8ExpectedContinuation); - testError("\xc2\xc0", error.Utf8ExpectedContinuation); + testError("\xc2", error.Utf8ShortChar); + testError("\xc2\x00", error.Utf8ShortChar); + testError("\xc2\xc0", error.Utf8ShortChar); // expected continuation for 3 byte sequences - testError("\xe0", error.UnexpectedEof); - testError("\xe0\x00", error.UnexpectedEof); - testError("\xe0\xc0", error.UnexpectedEof); - testError("\xe0\xa0", error.UnexpectedEof); - 
testError("\xe0\xa0\x00", error.Utf8ExpectedContinuation); - testError("\xe0\xa0\xc0", error.Utf8ExpectedContinuation); + testError("\xe0", error.Utf8ShortChar); + testError("\xe0\x00", error.Utf8ShortChar); + testError("\xe0\xc0", error.Utf8ShortChar); + testError("\xe0\xa0", error.Utf8ShortChar); + testError("\xe0\xa0\x00", error.Utf8ShortChar); + testError("\xe0\xa0\xc0", error.Utf8ShortChar); // expected continuation for 4 byte sequences - testError("\xf0", error.UnexpectedEof); - testError("\xf0\x00", error.UnexpectedEof); - testError("\xf0\xc0", error.UnexpectedEof); - testError("\xf0\x90\x00", error.UnexpectedEof); - testError("\xf0\x90\xc0", error.UnexpectedEof); - testError("\xf0\x90\x80\x00", error.Utf8ExpectedContinuation); - testError("\xf0\x90\x80\xc0", error.Utf8ExpectedContinuation); + testError("\xf0", error.Utf8ShortChar); + testError("\xf0\x00", error.Utf8ShortChar); + testError("\xf0\xc0", error.Utf8ShortChar); + testError("\xf0\x90\x00", error.Utf8ShortChar); + testError("\xf0\x90\xc0", error.Utf8ShortChar); + testError("\xf0\x90\x80\x00", error.Utf8ShortChar); + testError("\xf0\x90\x80\xc0", error.Utf8ShortChar); } test "overlong utf8 codepoint" { @@ -440,12 +479,12 @@ test "misc invalid utf8" { } fn testMiscInvalidUtf8() void { // codepoint out of bounds - testError("\xf4\x90\x80\x80", error.Utf8CodepointTooLarge); - testError("\xf7\xbf\xbf\xbf", error.Utf8CodepointTooLarge); + testError("\xf4\x90\x80\x80", error.UnicodeCodepointTooLarge); + testError("\xf7\xbf\xbf\xbf", error.UnicodeCodepointTooLarge); // surrogate halves testValid("\xed\x9f\xbf", 0xd7ff); - testError("\xed\xa0\x80", error.Utf8EncodesSurrogateHalf); - testError("\xed\xbf\xbf", error.Utf8EncodesSurrogateHalf); + testError("\xed\xa0\x80", error.UnicodeSurrogateHalf); + testError("\xed\xbf\xbf", error.UnicodeSurrogateHalf); testValid("\xee\x80\x80", 0xe000); } @@ -459,9 +498,11 @@ fn testValid(bytes: []const u8, expected_codepoint: u32) void { fn testDecode(bytes: []const u8) 
!u32 { const length = try utf8ByteSequenceLength(bytes[0]); - if (bytes.len < length) return error.UnexpectedEof; + if (bytes.len < length) return error.Utf8ShortChar; testing.expect(bytes.len == length); - return utf8Decode(bytes); + var c: u32 = undefined; + _ = try utf8Decode(bytes, &c); + return c; } /// Caller must free returned memory. diff --git a/std/zig/parse_string_literal.zig b/std/zig/parse_string_literal.zig index acae0b64c79c..16bfa4c66ef3 100644 --- a/std/zig/parse_string_literal.zig +++ b/std/zig/parse_string_literal.zig @@ -1,16 +1,94 @@ -const std = @import("../std.zig"); +const std = @import("std");//("../std.zig"); const assert = std.debug.assert; +const mem = std.mem; +const fmt = std.fmt; +const unicode = std.unicode; + +pub const ParseEscapeError = std.unicode.UnicodeError || error{ + ExpectXDigit, + ExpectLCurly, + ExpectRCurly, +}; +inline fn parseEscape(escape_sequence: []const u8, ret_len: *u4) ParseEscapeError!u21 { + var ret: u21 = undefined; + var it = mem.byteIterator(escape_sequence); + errdefer ret_len.* = @intCast(u4, it.i); + got_escape: { switch (it.n()) { + 'x' => { + var hi = fmt.charToDigit(it.n(), 16) catch return error.ExpectXDigit; + var lo = fmt.charToDigit(it.n(), 16) catch return error.ExpectXDigit; + ret_len.* = 3; + return u21(((hi << 4) | lo)); + }, + 'u' => { + if (it.n() != '{') return error.ExpectLCurly; + var hi = fmt.charToDigit(it.n(), 16) catch return error.ExpectXDigit; + var lo = fmt.charToDigit(it.n(), 16) catch return error.ExpectXDigit; + ret_len.* = 4; + ret = (u21(hi) << 4) | u21(lo); + hi = fmt.charToDigit(it.n(), 16) catch { + if (it.n() != '}') return error.ExpectRCurly; + ret_len.* = 5; + break :got_escape; + }; + lo = fmt.charToDigit(it.n(), 16) catch return error.ExpectXDigit; + ret_len.* = 6; + ret |= ((u21(hi) << 4) | u21(lo)) << 8; + hi = fmt.charToDigit(it.n(), 16) catch { + if (it.n() != '}') return error.ExpectRCurly; + ret_len.* = 7; + break :got_escape; + }; + lo = fmt.charToDigit(it.n(), 
16) catch return error.ExpectXDigit; + ret_len.* = 8; + ret |= ((u21(hi) << 4) | u21(lo)) << 16; + if (it.n() != '}') return error.ExpectRCurly; + ret_len.* = 9; + }, + else => unreachable, + }} + unicode.isValidUnicode(ret) catch |err| return err; + return ret; +} + +pub const ParseCharLiteralError = ParseEscapeError || unicode.Utf8Error || error{ + ExpectSQuote, +}; +pub fn parseCharLiteral(char_token: []const u8) ParseCharLiteralError!u21 { + var char: u21 = undefined; + if (char_token[1] == '\\') { + var len: u4 = undefined; + char = switch (char_token[2]) { + 'x', 'u' => try parseEscape(char_token[2..], &len), + 'n' => '\n', + 'r' => '\r', + '\\' => '\\', + '\t' => '\t', + '\'' => '\'', + '\"' => '\"', + else => unreachable, + }; + if (char_token[2 + len] != '}') return error.ExpectRCurly; + } + var len = try unicode.utf8Decode(char_token[1..], @ptrCast(*u32, &char)); // TODO: will this cast fail on Big-Endian? + if (char_token[1 + len] != '\'') return error.ExpectSQuote; + + return char; +} + +test "zig.parseCharLiteral" { + const expect = std.testing.expect; + expect(parseCharLiteral("\'0\'") catch unreachable == '0'); + expect(parseCharLiteral("\'\x20\'") catch unreachable == ' '); +} const State = enum { Start, Backslash, }; -pub const ParseStringLiteralError = error{ +pub const ParseStringLiteralError = ParseEscapeError || error{ OutOfMemory, - - /// When this is returned, index will be the position of the character. 
- InvalidCharacter, }; /// caller owns returned memory @@ -29,7 +107,9 @@ pub fn parseStringLiteral( try list.ensureCapacity(slice.len - 1); var state = State.Start; - for (slice) |b, index| { + var index: usize = 0; + while (index < slice.len) : (index += 1) { + var b = slice[index]; switch (state) { State.Start => switch (b) { '\\' => state = State.Backslash, @@ -41,9 +121,15 @@ pub fn parseStringLiteral( else => try list.append(b), }, State.Backslash => switch (b) { - 'x' => @panic("TODO"), - 'u' => @panic("TODO"), - 'U' => @panic("TODO"), + 'x', 'u' => { + var encoded: [4]u8 = undefined; + var len: u3 = undefined; + bad_index.* = index; + len = unicode.utf8Encode(try parseEscape(bytes[2..], &len), encoded[0..]) catch unreachable; + try list.appendSlice(encoded[0..len]); + index += len; + state = State.Start; + }, 'n' => { try list.append('\n'); state = State.Start; @@ -64,6 +150,10 @@ pub fn parseStringLiteral( try list.append('"'); state = State.Start; }, + '\'' => { + try list.append('\''); + state = State.Start; + }, else => { bad_index.* = index; return error.InvalidCharacter; diff --git a/std/zig/parser_test.zig b/std/zig/parser_test.zig index 43496994822d..0f2e82be6577 100644 --- a/std/zig/parser_test.zig +++ b/std/zig/parser_test.zig @@ -12,9 +12,21 @@ test "zig fmt: enum literal" { ); } -test "zig fmt: character literal larger than u8" { +test "zig fmt: character literals" { try testCanonical( - \\const x = '\U01f4a9'; + \\const x = '\x80'; + \\ + ); + try testCanonical( + \\const x = '\u{80}'; + \\ + ); + try testCanonical( + \\const x = '\u{01f4}'; + \\ + ); + try testCanonical( + \\const x = '\u{01f4a9}'; \\ ); } diff --git a/std/zig/tokenizer.zig b/std/zig/tokenizer.zig index 2ace430a15fd..0a5f489fa654 100644 --- a/std/zig/tokenizer.zig +++ b/std/zig/tokenizer.zig @@ -1,5 +1,6 @@ const std = @import("../std.zig"); const mem = std.mem; +const unicode = std.unicode; pub const Token = struct { id: Id, @@ -234,12 +235,8 @@ pub const Tokenizer = struct { 
Builtin, C, StringLiteral, - StringLiteralBackslash, MultilineStringLiteralLine, CharLiteral, - CharLiteralBackslash, - CharLiteralHexEscape, - CharLiteralEnd, Backslash, Equal, Bang, @@ -619,90 +616,28 @@ pub const Tokenizer = struct { else => break, }, State.StringLiteral => switch (c) { - '\\' => { - state = State.StringLiteralBackslash; - }, '"' => { self.index += 1; break; }, - '\n' => break, // Look for this error later. - else => self.checkLiteralCharacter(), - }, - - State.StringLiteralBackslash => switch (c) { - '\n' => break, // Look for this error later. - else => { - state = State.StringLiteral; - }, - }, - - State.CharLiteral => switch (c) { - '\\' => { - state = State.CharLiteralBackslash; - }, - '\'' => { - result.id = Token.Id.Invalid; - break; - }, - else => { - if (c < 0x20 or c == 0x7f) { - result.id = Token.Id.Invalid; - break; - } - - state = State.CharLiteralEnd; - }, - }, - - State.CharLiteralBackslash => switch (c) { '\n' => { result.id = Token.Id.Invalid; break; }, - 'x' => { - state = State.CharLiteralHexEscape; - seen_escape_digits = 0; - expected_escape_digits = 2; - }, - 'u' => { - state = State.CharLiteralHexEscape; - seen_escape_digits = 0; - expected_escape_digits = 4; - }, - 'U' => { - state = State.CharLiteralHexEscape; - seen_escape_digits = 0; - expected_escape_digits = 6; - }, - else => { - state = State.CharLiteralEnd; - }, - }, - - State.CharLiteralHexEscape => switch (c) { - '0'...'9', 'a'...'z', 'A'...'F' => { - seen_escape_digits += 1; - if (seen_escape_digits == expected_escape_digits) { - state = State.CharLiteralEnd; - } - }, - else => { - result.id = Token.Id.Invalid; - break; - }, + else => self.checkLiteralCharacter(), }, - State.CharLiteralEnd => switch (c) { + State.CharLiteral => switch (c) { '\'' => { result.id = Token.Id.CharLiteral; self.index += 1; break; }, - else => { + '\n' => { result.id = Token.Id.Invalid; break; }, + else => self.checkLiteralCharacter(), }, State.MultilineStringLiteralLine => switch (c) { 
@@ -1052,10 +987,6 @@ pub const Tokenizer = struct { State.SawAtSign, State.Backslash, State.CharLiteral, - State.CharLiteralBackslash, - State.CharLiteralHexEscape, - State.CharLiteralEnd, - State.StringLiteralBackslash, State.LBracketStar, State.LBracketStarC, => { diff --git a/test/stage1/behavior/misc.zig b/test/stage1/behavior/misc.zig index 4cc401a008e5..4b030fdc03a4 100644 --- a/test/stage1/behavior/misc.zig +++ b/test/stage1/behavior/misc.zig @@ -190,7 +190,7 @@ test "string escapes" { expect(mem.eql(u8, "\r", "\x0d")); expect(mem.eql(u8, "\t", "\x09")); expect(mem.eql(u8, "\\", "\x5c")); - expect(mem.eql(u8, "\u1234\u0069", "\xe1\x88\xb4\x69")); + expect(mem.eql(u8, "\u{1234}\u{0069}", "\xe1\x88\xb4\x69")); } test "multiline string" { @@ -696,6 +696,11 @@ test "thread local variable" { } test "unicode escape in character literal" { - var a: u24 = '\U01f4a9'; + var a: u24 = '\u{01f4a9}'; + expect(a == 128169); +} + +test "utf-8 in character literal" { + var a: u24 = '💩'; expect(a == 128169); } From e469a7e111f51070a69b27087851cf1616e4f9c4 Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Wed, 3 Apr 2019 08:59:48 -0500 Subject: [PATCH 02/18] build.zig: add test-parser target --- build.zig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/build.zig b/build.zig index 2dc9c671ec64..49a7f3340a14 100644 --- a/build.zig +++ b/build.zig @@ -113,6 +113,8 @@ pub fn build(b: *Builder) !void { const fmt_step = b.step("test-fmt", "Run zig fmt against build.zig to make sure it works"); fmt_step.dependOn(&fmt_build_zig.step); + test_step.dependOn(tests.addPkgTests(b, test_filter, "std/zig/parser_test.zig", "parser", "Run the parser tests", modes)); + test_step.dependOn(tests.addPkgTests(b, test_filter, "test/stage1/behavior.zig", "behavior", "Run the behavior tests", modes)); test_step.dependOn(tests.addPkgTests(b, test_filter, "std/std.zig", "std", "Run the standard library tests", modes)); From 63080edcf554083d0769b6262d280c990ad712cb Mon Sep 17 00:00:00 2001 From: 
Shawn Landden Date: Tue, 2 Apr 2019 14:52:05 -0500 Subject: [PATCH 03/18] optimize fmt.charToDigit benchmarks are here https://github.com/ziglang/zig/issues/2128#issuecomment-477877639 --- std/fmt.zig | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/std/fmt.zig b/std/fmt.zig index 640227156305..d965ae7da1e5 100644 --- a/std/fmt.zig +++ b/std/fmt.zig @@ -866,17 +866,39 @@ test "fmt.parseFloat" { _ = @import("fmt/parse_float.zig"); } -pub fn charToDigit(c: u8, radix: u8) (error{InvalidCharacter}!u8) { - const value = switch (c) { - '0'...'9' => c - '0', - 'A'...'Z' => c - 'A' + 10, - 'a'...'z' => c - 'a' + 10, - else => return error.InvalidCharacter, - }; +// TODO This is not inside charToDigit() due to a bug https://github.com/ziglang/zig/issues/2128#issuecomment-477877639 +const NOT = 0xff; +const swtch = []u8{ +// All XDigit code points in this table are in their place in this ASCII+128 table. +// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, + NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, + NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, NOT, NOT, NOT, NOT, NOT, NOT, + + NOT, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, NOT, NOT, NOT, NOT, NOT, + NOT, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, NOT, NOT, NOT, NOT, NOT, + + NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, + NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, + NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, + NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, + + NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, 
NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, + NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, + NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, + NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, NOT, +}; + +pub fn charToDigit(c: u8, radix: u8) (error{InvalidCharacter}!u6) { + @import("std").debug.assert(radix <= 36); + const value = swtch[c]; if (value >= radix) return error.InvalidCharacter; - return value; + return @intCast(u6, value); } fn digitToChar(digit: u8, uppercase: bool) u8 { @@ -1431,7 +1453,7 @@ pub fn hexToBytes(out: []u8, input: []const u8) !void { while (in_i != input.len) : (in_i += 2) { const hi = try charToDigit(input[in_i], 16); const lo = try charToDigit(input[in_i + 1], 16); - out[in_i / 2] = (hi << 4) | lo; + out[in_i / 2] = (u8(hi) << 4) | u8(lo); } } From d44abc4459a099ec15fd46ceae9593f9e956f9df Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Fri, 5 Apr 2019 21:05:41 -0500 Subject: [PATCH 04/18] use optimized charToDigit in bigint code unless I am missing something it appears that the self-hosted compiler was not compliant as it did not take upper case hex digits --- std/math/big/int.zig | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/std/math/big/int.zig b/std/math/big/int.zig index 8800c2c7a959..29b0f7c906bc 100644 --- a/std/math/big/int.zig +++ b/std/math/big/int.zig @@ -4,6 +4,7 @@ const debug = std.debug; const testing = std.testing; const math = std.math; const mem = std.mem; +const fmt = std.fmt; const Allocator = mem.Allocator; const ArrayList = std.ArrayList; const maxInt = std.math.maxInt; @@ -281,16 +282,6 @@ pub const Int = struct { } } - fn charToDigit(ch: u8, base: u8) !u8 { - const d = switch (ch) { - '0'...'9' => ch - '0', - 'a'...'f' => (ch - 'a') + 0xa, - else => return error.InvalidCharForDigit, - }; - - return if (d < base) d else return error.DigitTooLargeForBase; - } - fn digitToChar(d: u8, base: 
u8) !u8 { if (d >= base) { return error.DigitTooLargeForBase; @@ -326,7 +317,7 @@ pub const Int = struct { try self.set(0); for (value[i..]) |ch| { - const d = try charToDigit(ch, base); + const d = try fmt.charToDigit(ch, base); d_fba.end_index = 0; const d_ap = try Int.initSet(d_al, d); @@ -423,7 +414,7 @@ pub const Int = struct { /// TODO make this non-allocating pub fn format( self: Int, - comptime fmt: []const u8, + comptime fmtstr: []const u8, context: var, comptime FmtError: type, output: fn (@typeOf(context), []const u8) FmtError!void, From b808baf132b363a5c68e80df28a6bdd5fd450b48 Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Sat, 30 Mar 2019 16:50:24 -0500 Subject: [PATCH 05/18] std.mem: add ByteIterator --- std/mem.zig | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/std/mem.zig b/std/mem.zig index 46cfda2d9487..67136a5c3af1 100644 --- a/std/mem.zig +++ b/std/mem.zig @@ -961,6 +961,32 @@ pub const SplitIterator = struct { } }; +// It would be nice to have type interence in structs, such that this could be iterator/Iterator +// This is useful because of the lack of a ++ operator in zig. +pub fn byteIterator(slice: []const u8) ByteIterator { + return ByteIterator{ + .buf = slice, + .i = 0, + }; +} + +pub const ByteIterator = struct { + buf: []const u8, + i: usize, + + pub fn next(self: *ByteIterator) ?u8 { + if (self.i > self.buf.len) return null; + self.i += 1; + return self.buf[self.i - 1]; + } + /// Unsafe version + pub fn n(self: *ByteIterator) u8 { + assert(self.i <= self.buf.len); + self.i += 1; + return self.buf[self.i - 1]; + } +}; + /// Naively combines a series of slices with a separator. /// Allocates memory for the result, which must be freed by the caller. 
pub fn join(allocator: *Allocator, separator: []const u8, slices: []const []const u8) ![]u8 { From 05f76d767bec7507f7c2446ae2b40db002048d78 Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Sat, 6 Apr 2019 15:52:32 -0500 Subject: [PATCH 06/18] finally move past annoying crash (workaround for #2203) --- src-self-hosted/ir.zig | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src-self-hosted/ir.zig b/src-self-hosted/ir.zig index 802985c4453e..c7d4fd12f47e 100644 --- a/src-self-hosted/ir.zig +++ b/src-self-hosted/ir.zig @@ -1418,8 +1418,8 @@ pub const Builder = struct { var bad_index: usize = undefined; var buf = std.zig.parseStringLiteral(irb.comp.gpa(), str_token, &bad_index) catch |err| switch (err) { - .OutOfMemory => return error.OutOfMemory, - .UnicodeSurrogateHalf, .UnicodeCodepointTooLarge => { + error.OutOfMemory => return error.OutOfMemory, + error.UnicodeSurrogateHalf, error.UnicodeCodepointTooLarge => { var hex_string = if (mem.indexOfScalar(u8, str_token, '}')) |i| str_token[2..i] else str_token[2..str_token.len]; try irb.comp.addCompileError( irb.code.tree_scope, @@ -1429,15 +1429,15 @@ pub const Builder = struct { ); return error.SemanticAnalysisFailed; }, - .ExpectXDigit, .ExpectLCurly, .ExpectRCurly => { + error.ExpectXDigit, error.ExpectLCurly, error.ExpectRCurly => { try irb.comp.addCompileError( irb.code.tree_scope, src_span, "expected {}, got '{c}'", switch (err) { - .ExpectXDigit => "hexidecimal digit", - .ExpectLCurly => "left curly bracket '{'", - .ExpectRCurly => "right curly bracket '}'", + error.ExpectXDigit => "hexidecimal digit", + error.ExpectLCurly => "left curly bracket '{'", + error.ExpectRCurly => "right curly bracket '}'", }, str_token[bad_index], ); From ec0f04a7966bdf2c9553ba6b322926f9bca2d04c Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Sun, 31 Mar 2019 11:19:40 -0500 Subject: [PATCH 07/18] self-hosted: pull the utf8 and char validation out of the tokenizer --- src-self-hosted/compilation.zig |
3 +- src-self-hosted/main.zig | 5 +-- src/utf8/naive.c | 32 ++++++++++++++++++ src/utf8/range2-neon.c | 2 ++ src/utf8/range2-sse.c | 2 ++ src/utf8/utf8-lookup.h | 24 +++++++++++--- std/zig/bench.zig | 2 +- std/zig/parse.zig | 27 ++++++++++++++- std/zig/parser_test.zig | 2 +- std/zig/tokenizer.zig | 58 +++------------------------------ 10 files changed, 92 insertions(+), 65 deletions(-) diff --git a/src-self-hosted/compilation.zig b/src-self-hosted/compilation.zig index 478edce02001..8f31ef132b1a 100644 --- a/src-self-hosted/compilation.zig +++ b/src-self-hosted/compilation.zig @@ -842,7 +842,8 @@ pub const Compilation = struct { errdefer self.gpa().free(source_code); const tree = try self.gpa().create(ast.Tree); - tree.* = try std.zig.parse(self.gpa(), source_code); + var ret_err: usize = undefined; + tree.* = try std.zig.parse(self.gpa(), source_code, &ret_err); errdefer { tree.deinit(); self.gpa().destroy(tree); diff --git a/src-self-hosted/main.zig b/src-self-hosted/main.zig index 4c3edf6d5df5..450133a4d76d 100644 --- a/src-self-hosted/main.zig +++ b/src-self-hosted/main.zig @@ -625,8 +625,9 @@ fn cmdFmt(allocator: *Allocator, args: []const []const u8) !void { const source_code = try stdin.stream.readAllAlloc(allocator, max_src_size); defer allocator.free(source_code); - var tree = std.zig.parse(allocator, source_code) catch |err| { - try stderr.print("error parsing stdin: {}\n", err); + var ret_err: usize = undefined; + var tree = std.zig.parse(allocator, source_code, &ret_err) catch |err| { + try stderr.print("error parsing stdin at character {}: {}\n", ret_err, err); os.exit(1); }; defer tree.deinit(); diff --git a/src/utf8/naive.c b/src/utf8/naive.c index b2663756f4ba..36c234c00736 100644 --- a/src/utf8/naive.c +++ b/src/utf8/naive.c @@ -1,3 +1,35 @@ +/* +range2-neon.c +range2-sse.c +naive.c + +From: https://github.com/cyb70289/utf8 + +MIT License + +Copyright (c) 2019 Yibo Cai + +Permission is hereby granted, free of charge, to any person obtaining a copy 
+of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +// Copyright (c) 2019 Yibo Cai + #include /* diff --git a/src/utf8/range2-neon.c b/src/utf8/range2-neon.c index c1610cd0dbec..e626e54db2c0 100644 --- a/src/utf8/range2-neon.c +++ b/src/utf8/range2-neon.c @@ -1,3 +1,5 @@ +// Copyright (c) 2019 Yibo Cai +// see naive.c for license /* * Process 2x16 bytes in each iteration. * Comments removed for brevity. See range-neon.c for details. diff --git a/src/utf8/range2-sse.c b/src/utf8/range2-sse.c index 2369b4621d05..3e9f5bca43e1 100644 --- a/src/utf8/range2-sse.c +++ b/src/utf8/range2-sse.c @@ -1,3 +1,5 @@ +// Copyright (c) 2019 Yibo Cai +// see naive.c for license /* * Process 2x16 bytes in each iteration. * Comments removed for brevity. See range-sse.c for details. 
diff --git a/src/utf8/utf8-lookup.h b/src/utf8/utf8-lookup.h index 02f70297e4be..07eb83d6cf94 100644 --- a/src/utf8/utf8-lookup.h +++ b/src/utf8/utf8-lookup.h @@ -3,11 +3,25 @@ //Copyright (c) 2008-2009 Bjoern Hoehrmann -//Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -//The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -//THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +/* +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ #define UTF8_ACCEPT 0 #define UTF8_REJECT 1 diff --git a/std/zig/bench.zig b/std/zig/bench.zig index ed6ae9a128b3..7474d4f28ab2 100644 --- a/std/zig/bench.zig +++ b/std/zig/bench.zig @@ -31,6 +31,6 @@ pub fn main() !void { fn testOnce() usize { var fixed_buf_alloc = std.heap.FixedBufferAllocator.init(fixed_buffer_mem[0..]); var allocator = &fixed_buf_alloc.allocator; - _ = std.zig.parse(allocator, source) catch @panic("parse failure"); + _ = std.zig.parse(allocator, source, null) catch @panic("parse failure"); return fixed_buf_alloc.end_index; } diff --git a/std/zig/parse.zig b/std/zig/parse.zig index 96aec714abcf..94e3442cfac2 100644 --- a/std/zig/parse.zig +++ b/std/zig/parse.zig @@ -1,6 +1,8 @@ const std = @import("../std.zig"); const assert = std.debug.assert; const mem = std.mem; +const ascii = std.ascii; +const unicode = std.unicode; const ast = std.zig.ast; const Tokenizer = std.zig.Tokenizer; const Token = std.zig.Token; @@ -9,7 +11,7 @@ const Error = ast.Error; /// Result should be freed with tree.deinit() when there are /// no more references to any of the tokens or nodes. 
-pub fn parse(allocator: *mem.Allocator, source: []const u8) !ast.Tree { +pub fn parse(allocator: *mem.Allocator, source: []const u8, ret_err_off: ?*usize) !ast.Tree { var tree_arena = std.heap.ArenaAllocator.init(allocator); errdefer tree_arena.deinit(); @@ -27,6 +29,29 @@ pub fn parse(allocator: *mem.Allocator, source: []const u8) !ast.Tree { .eof_token = undefined, }; + // TODO Do it in one pass by streaming through these two tests to the tokenizer. + for (source) |c, i| { + if (!ascii.isZig(c)) { + if (ret_err_off) |err_off| { + err_off.* = i; + } + return error.InvalidCharacter; + } + } + // TODO we use to ban certain Unicode characters, but this wasn't documented. + // Should we still ban them? + // U+0085 (NEL) + // U+2028 (LS) + // U+2029 (PS) + // If so, it would be fastest to ban them in their utf-8 representations, + // (because the faster utf8 validators do not get the code-points) + // but it would still take a whole additional streaming check. + // But if we do it here are other characters to ban: + // --Investigate anything else that might virtically effect the rendering + // (so not RTL scripts). 
+ // U+fffe and U+ffff (BOMs) + try unicode.utf8ValidateSliceWithLoc(source, ret_err_off); + var tree = ast.Tree{ .source = source, .root_node = root_node, diff --git a/std/zig/parser_test.zig b/std/zig/parser_test.zig index 0f2e82be6577..7cc4b83fd3a8 100644 --- a/std/zig/parser_test.zig +++ b/std/zig/parser_test.zig @@ -2142,7 +2142,7 @@ fn testParse(source: []const u8, allocator: *mem.Allocator, anything_changed: *b var stderr_file = try io.getStdErr(); var stderr = &stderr_file.outStream().stream; - var tree = try std.zig.parse(allocator, source); + var tree = try std.zig.parse(allocator, source, null); defer tree.deinit(); var error_it = tree.errors.iterator(0); diff --git a/std/zig/tokenizer.zig b/std/zig/tokenizer.zig index 0a5f489fa654..51e7cb150aca 100644 --- a/std/zig/tokenizer.zig +++ b/std/zig/tokenizer.zig @@ -624,7 +624,7 @@ pub const Tokenizer = struct { result.id = Token.Id.Invalid; break; }, - else => self.checkLiteralCharacter(), + else => {} }, State.CharLiteral => switch (c) { @@ -637,7 +637,7 @@ pub const Tokenizer = struct { result.id = Token.Id.Invalid; break; }, - else => self.checkLiteralCharacter(), + else => {}, }, State.MultilineStringLiteralLine => switch (c) { @@ -645,7 +645,7 @@ pub const Tokenizer = struct { self.index += 1; break; }, - else => self.checkLiteralCharacter(), + else => {}, }, State.Bang => switch (c) { @@ -824,7 +824,6 @@ pub const Tokenizer = struct { '\n' => break, else => { state = State.LineComment; - self.checkLiteralCharacter(); }, }, State.DocCommentStart => switch (c) { @@ -838,12 +837,11 @@ pub const Tokenizer = struct { else => { state = State.DocComment; result.id = Token.Id.DocComment; - self.checkLiteralCharacter(); }, }, State.LineComment, State.DocComment => switch (c) { '\n' => break, - else => self.checkLiteralCharacter(), + else => {}, }, State.Zero => switch (c) { 'b', 'o' => { @@ -1069,54 +1067,6 @@ pub const Tokenizer = struct { result.end = self.index; return result; } - - fn 
checkLiteralCharacter(self: *Tokenizer) void { - if (self.pending_invalid_token != null) return; - const invalid_length = self.getInvalidCharacterLength(); - if (invalid_length == 0) return; - self.pending_invalid_token = Token{ - .id = Token.Id.Invalid, - .start = self.index, - .end = self.index + invalid_length, - }; - } - - fn getInvalidCharacterLength(self: *Tokenizer) u3 { - const c0 = self.buffer[self.index]; - if (c0 < 0x80) { - if (c0 < 0x20 or c0 == 0x7f) { - // ascii control codes are never allowed - // (note that \n was checked before we got here) - return 1; - } - // looks fine to me. - return 0; - } else { - // check utf8-encoded character. - const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1; - if (self.index + length > self.buffer.len) { - return @intCast(u3, self.buffer.len - self.index); - } - const bytes = self.buffer[self.index .. self.index + length]; - switch (length) { - 2 => { - const value = std.unicode.utf8Decode2(bytes) catch return length; - if (value == 0x85) return length; // U+0085 (NEL) - }, - 3 => { - const value = std.unicode.utf8Decode3(bytes) catch return length; - if (value == 0x2028) return length; // U+2028 (LS) - if (value == 0x2029) return length; // U+2029 (PS) - }, - 4 => { - _ = std.unicode.utf8Decode4(bytes) catch return length; - }, - else => unreachable, - } - self.index += length - 1; - return 0; - } - } }; test "tokenizer" { From 11fef207928fa8e3d3ca664cf215d60d8518771e Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Mon, 1 Apr 2019 08:23:24 -0500 Subject: [PATCH 08/18] remove tests of things the tokenizer no longer handles --- std/zig/tokenizer.zig | 76 +------------------------------------------ 1 file changed, 1 insertion(+), 75 deletions(-) diff --git a/std/zig/tokenizer.zig b/std/zig/tokenizer.zig index 51e7cb150aca..f8d07d396940 100644 --- a/std/zig/tokenizer.zig +++ b/std/zig/tokenizer.zig @@ -1118,26 +1118,7 @@ test "tokenizer - invalid token characters" { testTokenize("`", 
[]Token.Id{Token.Id.Invalid}); testTokenize("'c", []Token.Id{Token.Id.Invalid}); testTokenize("'", []Token.Id{Token.Id.Invalid}); - testTokenize("''", []Token.Id{ Token.Id.Invalid, Token.Id.Invalid }); -} - -test "tokenizer - invalid literal/comment characters" { - testTokenize("\"\x00\"", []Token.Id{ - Token.Id.StringLiteral, - Token.Id.Invalid, - }); - testTokenize("//\x00", []Token.Id{ - Token.Id.LineComment, - Token.Id.Invalid, - }); - testTokenize("//\x1f", []Token.Id{ - Token.Id.LineComment, - Token.Id.Invalid, - }); - testTokenize("//\x7f", []Token.Id{ - Token.Id.LineComment, - Token.Id.Invalid, - }); + //testTokenize("''", []Token.Id{ Token.Id.Invalid, Token.Id.Invalid }); Catch this in the parser. } test "tokenizer - utf8" { @@ -1145,61 +1126,6 @@ test "tokenizer - utf8" { testTokenize("//\xf4\x8f\xbf\xbf", []Token.Id{Token.Id.LineComment}); } -test "tokenizer - invalid utf8" { - testTokenize("//\x80", []Token.Id{ - Token.Id.LineComment, - Token.Id.Invalid, - }); - testTokenize("//\xbf", []Token.Id{ - Token.Id.LineComment, - Token.Id.Invalid, - }); - testTokenize("//\xf8", []Token.Id{ - Token.Id.LineComment, - Token.Id.Invalid, - }); - testTokenize("//\xff", []Token.Id{ - Token.Id.LineComment, - Token.Id.Invalid, - }); - testTokenize("//\xc2\xc0", []Token.Id{ - Token.Id.LineComment, - Token.Id.Invalid, - }); - testTokenize("//\xe0", []Token.Id{ - Token.Id.LineComment, - Token.Id.Invalid, - }); - testTokenize("//\xf0", []Token.Id{ - Token.Id.LineComment, - Token.Id.Invalid, - }); - testTokenize("//\xf0\x90\x80\xc0", []Token.Id{ - Token.Id.LineComment, - Token.Id.Invalid, - }); -} - -test "tokenizer - illegal unicode codepoints" { - // unicode newline characters.U+0085, U+2028, U+2029 - testTokenize("//\xc2\x84", []Token.Id{Token.Id.LineComment}); - testTokenize("//\xc2\x85", []Token.Id{ - Token.Id.LineComment, - Token.Id.Invalid, - }); - testTokenize("//\xc2\x86", []Token.Id{Token.Id.LineComment}); - testTokenize("//\xe2\x80\xa7", 
[]Token.Id{Token.Id.LineComment}); - testTokenize("//\xe2\x80\xa8", []Token.Id{ - Token.Id.LineComment, - Token.Id.Invalid, - }); - testTokenize("//\xe2\x80\xa9", []Token.Id{ - Token.Id.LineComment, - Token.Id.Invalid, - }); - testTokenize("//\xe2\x80\xaa", []Token.Id{Token.Id.LineComment}); -} - test "tokenizer - string identifier and builtin fns" { testTokenize( \\const @"if" = @import("std"); From 4f8be8324a085347716dac4ba9d452403d2ae955 Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Tue, 2 Apr 2019 09:50:57 -0500 Subject: [PATCH 09/18] Check for rejected UTF-8 characters again --- std/zig/parse.zig | 51 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/std/zig/parse.zig b/std/zig/parse.zig index 94e3442cfac2..2180f3242527 100644 --- a/std/zig/parse.zig +++ b/std/zig/parse.zig @@ -29,27 +29,46 @@ pub fn parse(allocator: *mem.Allocator, source: []const u8, ret_err_off: ?*usize .eof_token = undefined, }; - // TODO Do it in one pass by streaming through these two tests to the tokenizer. + // TODO Do it in one pass by streaming through these three tests to the tokenizer. + var prev2: u8 = ' '; + var prev: u8 = ' '; for (source) |c, i| { if (!ascii.isZig(c)) { - if (ret_err_off) |err_off| { - err_off.* = i; - } + if (ret_err_off) |err_off| err_off.* = i; + return error.InvalidCharacter; + } + // Ban certain Unicode characters + // + // These first three were first banned in the tokenizer + // U+0085 (NEL) C2 85 -- Looks like a large > in gedit. + // U+2028 (LS) E2 80 A8 -- Causes a line break in gedit even when wrap is off! + // U+2029 (PS) E2 80 A9 -- Same!
 + // + // UTF-16 byte-order-marks + // U+FFFE EF BF BE + // U+FFFF EF BF BF + prev2 = prev; + prev = c; + switch (u16(prev2) << 8 | prev) { + 0xc285 => { // Doesn't catch this character if it is the last, but that isn't a big deal. + if (ret_err_off) |err_off| err_off.* = i - 2; return error.InvalidCharacter; + }, + 0xe280 => { + if (c == 0xa8 or c == 0xa9) { + if (ret_err_off) |err_off| err_off.* = i - 2; + return error.InvalidCharacter; + } + }, + 0xefbf => { + if (c == 0xbe or c == 0xbf) { + if (ret_err_off) |err_off| err_off.* = i - 2; + return error.InvalidCharacter; + } + }, + else => {}, } } - // TODO we use to ban certain Unicode characters, but this wasn't documented. - // Should we still ban them? - // U+0085 (NEL) - // U+2028 (LS) - // U+2029 (PS) - // If so, it would be fastest to ban them in their utf-8 representations, - // (because the faster utf8 validators do not get the code-points) - // but it would still take a whole additional streaming check. - // But if we do it here are other characters to ban: - // --Investigate anything else that might virtically effect the rendering - // (so not RTL scripts). - // U+fffe and U+ffff (BOMs) try unicode.utf8ValidateSliceWithLoc(source, ret_err_off); var tree = ast.Tree{ From 467511f12a0001a9ac10a7545097936318200ab9 Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Sat, 6 Apr 2019 17:51:33 -0500 Subject: [PATCH 10/18] remove banning of UTF-16 BOMs --- std/zig/parse.zig | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/std/zig/parse.zig b/std/zig/parse.zig index 2180f3242527..454c25889321 100644 --- a/std/zig/parse.zig +++ b/std/zig/parse.zig @@ -39,18 +39,15 @@ pub fn parse(allocator: *mem.Allocator, source: []const u8, ret_err_off: ?*usize } // Ban certain Unicode characters // - // These first three were first banned in the tokenizer - // U+0085 (NEL) C2 85 -- Looks like a large > in gedit. 
- // U+2028 (LS) E2 80 A8 -- Causes a line break in gedit even when wrap is off! - // U+2029 (PS) E2 80 A9 -- Same!
 + // All three of these are line-endings. + // U+0085 (NEL) C2 85 + // U+2028 (LS) E2 80 A8 + // U+2029 (PS) E2 80 A9 // - // UTF-16 byte-order-marks - // U+FFFE EF BF BE - // U+FFFF EF BF BF prev2 = prev; prev = c; switch (u16(prev2) << 8 | prev) { - 0xc285 => { // Doesn't catch this character if it is the last, but that isn't a big deal. + 0xc285 => { // Doesn't catch this character if it is the last character, but that is OK because it is the last line. if (ret_err_off) |err_off| err_off.* = i - 2; return error.InvalidCharacter; }, @@ -60,12 +57,6 @@ pub fn parse(allocator: *mem.Allocator, source: []const u8, ret_err_off: ?*usize return error.InvalidCharacter; } }, - 0xefbf => { - if (c == 0xbe or c == 0xbf) { - if (ret_err_off) |err_off| err_off.* = i - 2; - return error.InvalidCharacter; - } - }, else => {}, } } From ebeeab2c7dd6c7141f49821ef5d7f4179b9531ba Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Tue, 2 Apr 2019 09:56:14 -0500 Subject: [PATCH 11/18] expand std.ascii, add std.ascii.isZig() Tested against glibc. --- std/ascii.zig | 123 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 105 insertions(+), 18 deletions(-) diff --git a/std/ascii.zig b/std/ascii.zig index 47449c94c132..faa84e7ab301 100644 --- a/std/ascii.zig +++ b/std/ascii.zig @@ -1,5 +1,4 @@ // Does NOT look at the locale the way C89's toupper(3), isspace() et cetera does. -// I could have taken only a u7 to make this clear, but it would be slower // It is my opinion that encodings other than UTF-8 should not be supported. // // (and 128 bytes is not much to pay). @@ -7,23 +6,26 @@ // // https://upload.wikimedia.org/wikipedia/commons/thumb/c/cf/USASCII_code_chart.png/1200px-USASCII_code_chart.png -const tIndex = enum(u3) { - Alpha, - Hex, - Space, - Digit, - Lower, - Upper, - // Ctrl, < 0x20 || == DEL - // Print, = Graph || == ' '. 
NOT '\t' et cetera - Punct, +const tIndex = enum(u4) { + Alpha, // Lower or Upper + Hex, // Digit or 'a'...'f' or 'A'...'F' + Space, // ' ', Form-feed, '\n', '\r', '\t', '\v' Vertical Tab + Digit, // '0'...'9' + Lower, // 'a'...'z' + Upper, // 'A'...'Z' + Punct, // ASCII and !DEL and !AlNum Graph, + // AlNum Alpha or Digit + // Table 2 + Cntrl,// Ctrl, < 0x20 or == DEL + Print,// Print, = Graph or == ' '. NOT '\t' et cetera. Same as if (Ascii) !Cntrl else false + Blank, //isBlank, == ' ' or == '\t' Horizontal Tab + Zig, // !Cntrl or '\n' or UTF8 //ASCII, | ~0b01111111 - //isBlank, == ' ' || == '\x09' }; -const combinedTable = init: { - comptime var table: [256]u8 = undefined; +const combinedTable: [512]u8 = init: { + comptime var table: [512]u8 = undefined; const std = @import("std"); const mem = std.mem; @@ -125,6 +127,68 @@ const combinedTable = init: { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, }; + const cntrl = []u1{ + // 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + }; + const print = []u1{ + // 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + }; + const blank = []u1{ + // 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 
0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + // https://ziglang.org/documentation/master/#Source-Encoding + // or doc/langref.html.in + const zig = []u1{ + // 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, // '\n' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, // DEL + + // utf8 continuation characters + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // Surrogate pairs + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 21-bit limit + }; + comptime var i = 0; inline while (i < 128) : (i += 1) { table[i] = @@ -138,11 +202,30 @@ const combinedTable = init: { u8(graph[i]) << @enumToInt(tIndex.Graph); } mem.set(u8, table[128..256], 0); + i = 0; + inline while (i < 128) : (i += 1) { + table[i + 256] = + u8(cntrl[i]) << @truncate(u3, @enumToInt(tIndex.Cntrl) % 8) | + u8(print[i]) << @truncate(u3, @enumToInt(tIndex.Print) % 8) | + u8(blank[i]) << @truncate(u3, @enumToInt(tIndex.Blank) % 8); + } + mem.set(u8, table[256 + 128..], 0); + i = 0; + inline while (i < 256) : (i += 1) { + table[i + 256] |= + u8(zig[i]) 
<< @truncate(u3, @enumToInt(tIndex.Zig) % 8); + } break :init table; }; fn inTable(c: u8, t: tIndex) bool { - return (combinedTable[c] & (u8(1) << @enumToInt(t))) != 0; + var index = @enumToInt(t); + if (index <= 7) { + return (combinedTable[c] & (u8(1) << @truncate(u3, (index)))) != 0; + } else if (index <= 15) { + index %= 8; + return (combinedTable[u9(c) + 256] & (u8(1) << @truncate(u3, index % 8))) != 0; + } else unreachable; } pub fn isAlNum(c: u8) bool { @@ -155,7 +238,7 @@ pub fn isAlpha(c: u8) bool { } pub fn isCntrl(c: u8) bool { - return c < 0x20 or c == 127; //DEL + return inTable(c, tIndex.Cntrl); } pub fn isDigit(c: u8) bool { @@ -171,7 +254,7 @@ pub fn isLower(c: u8) bool { } pub fn isPrint(c: u8) bool { - return inTable(c, tIndex.Graph) or c == ' '; + return iGraph(c) or c == ' '; } pub fn isPunct(c: u8) bool { @@ -195,7 +278,11 @@ pub fn isASCII(c: u8) bool { } pub fn isBlank(c: u8) bool { - return (c == ' ') or (c == '\x09'); + return inTable(c, tIndex.Blank); +} + +pub fn isZig(c: u8) bool { + return inTable(c, tIndex.Zig); } pub fn toUpper(c: u8) u8 { From 7eba643829f1c899186eb81291fa08cadcc697cf Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Mon, 8 Apr 2019 07:05:36 -0500 Subject: [PATCH 12/18] clarify docs: do not suggest character literals are utf-8 encoded --- doc/langref.html.in | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/langref.html.in b/doc/langref.html.in index 1698bedcb1f1..317877cec898 100644 --- a/doc/langref.html.in +++ b/doc/langref.html.in @@ -603,19 +603,19 @@ test "string literals" { \xNN - hexadecimal 8-bit character code (2 digits) + hexadecimal 8-bit character code (2 digits), in strings encoded as a single byte \u{NN} - hexadecimal 16-bit Unicode character code UTF-8 encoded (2 digits) + hexadecimal Unicode character code, in strings UTF-8 encoded \u{NNNN} - hexadecimal 16-bit Unicode character code UTF-8 encoded (4 digits) + hexadecimal Unicode character code, in strings UTF-8 encoded 
\u{NNNNNN} - hexadecimal 24-bit Unicode character code UTF-8 encoded (6 digits) + hexadecimal Unicode character code, in strings UTF-8 encoded From b4960b0c3622add5254407defd2082e996259630 Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Mon, 8 Apr 2019 10:11:55 -0500 Subject: [PATCH 13/18] Iterate UTF-8 code again --- src-self-hosted/compilation.zig | 4 +- src-self-hosted/ir.zig | 99 ++++++++++++------------ src-self-hosted/main.zig | 3 +- std/math/big/int.zig | 2 +- std/os.zig | 47 ++++++------ std/os/path.zig | 7 +- std/os/windows/util.zig | 9 +-- std/special/fmt_runner.zig | 10 ++- std/unicode.zig | 126 +++++++++++++++---------------- std/zig.zig | 2 +- std/zig/parse_string_literal.zig | 30 +++++--- 11 files changed, 175 insertions(+), 164 deletions(-) diff --git a/src-self-hosted/compilation.zig b/src-self-hosted/compilation.zig index 8f31ef132b1a..b5187f6a1b51 100644 --- a/src-self-hosted/compilation.zig +++ b/src-self-hosted/compilation.zig @@ -255,7 +255,8 @@ pub const Compilation = struct { const CompileErrList = std.ArrayList(*Msg); // TODO handle some of these earlier and report them in a way other than error codes - pub const BuildError = error{ + pub const BuildError = std.unicode.Utf8Error || error{ + InvalidCharacter, // !ascii.isZig() or unicode newline OutOfMemory, EndOfStream, IsDir, @@ -299,7 +300,6 @@ pub const Compilation = struct { InvalidDarwinVersionString, UnsupportedLinkArchitecture, UserResourceLimitReached, - InvalidUtf8, BadPathName, DeviceBusy, }; diff --git a/src-self-hosted/ir.zig b/src-self-hosted/ir.zig index c7d4fd12f47e..fdc5b4174839 100644 --- a/src-self-hosted/ir.zig +++ b/src-self-hosted/ir.zig @@ -1347,53 +1347,39 @@ pub const Builder = struct { pub fn genCharLit(irb: *Builder, char_lit: *ast.Node.CharLiteral, scope: *Scope) !*Inst { const char_token = irb.code.tree_scope.tree.tokenSlice(char_lit.token); + const src_span = Span.token(char_lit.token); - var char: u21 = undefined; - got_char: { - if (char_token[1] == '\\') { 
- char = switch (char_token[2]) { - 'x' => { - const hi = charToDigit(char_token[off], 16) catch unreachable; - const lo = charToDigit(char_token[off + 1], 16) catch unreachable; - char |= ((hi << 4) | lo) << ((hex_escape_byes - 1) * 8); - break :got_char; - }, - 'u' => { - // char_token[3] == '{'; - if (char_token[6] == '}') { - hex_escape_bytes = 1; - } else if (char_token[8] == '}') { - hex_escape_bytes = 2; - } else if (char_token[10] == '}') { - hex_escape_bytes = 3; - } else { - unreachable; - } - var off: u8 = 4; - while (hex_escape_bytes > 0) : (hex_escape_bytes -= 1) { - const hi = charToDigit(char_token[off], 16) catch unreachable; - const lo = charToDigit(char_token[off + 1], 16) catch unreachable; - char |= ((hi << 4) | lo) << ((hex_escape_byes - 1) * 8); - off += 2; - } - break :got_char; - }, - 'n' => '\n', - 'r' => '\r', - '\\' => '\\', - '\t' => '\t', - '\'' => '\'', - '\"' => '\"', - else => unreachable, - }; - break :got_char; - } - // This could read one byte past the end of the file, except - // this guarantees to not read past the first character, and we - // have already validated the file as UTF-8. 
- _ = utf8Decode(char_token[1..4], &char); - break :got_char; - } + var bad_index: usize = undefined; + var char = std.zig.parseCharLiteral(char_token, &bad_index) catch |err| switch (err) { + error.UnicodeSurrogateHalf, error.UnicodeCodepointTooLarge => { + var hex_string = if (mem.indexOfScalar(u8, char_token, '}')) |i| char_token[2..i] else char_token[2..char_token.len]; + try irb.comp.addCompileError( + irb.code.tree_scope, + src_span, + "Unicode codepoint U+{} cannot be represented in UTF-16 and is invalid", + hex_string, + ); + return error.SemanticAnalysisFailed; + }, + error.ExpectXDigit, error.ExpectLCurly, error.ExpectRCurly, error.ExpectSQuote => { + try irb.comp.addCompileError( + irb.code.tree_scope, + src_span, + "expected {}, got '{c}'", + switch (err) { + error.ExpectXDigit => "hexidecimal digit", + error.ExpectLCurly => "left curly bracket '{'", + error.ExpectRCurly => "right curly bracket '}'", + error.ExpectSQuote => "single quote '''", + else => unreachable, + }, + char_token[bad_index], + ); + return error.SemanticAnalysisFailed; + }, + // File has already been validated as UTF8 + error.Utf8ShortChar, error.Utf8OverlongEncoding, error.Utf8InvalidStartByte => unreachable, + }; const comptime_int_type = Type.ComptimeInt.get(irb.comp); defer comptime_int_type.base.base.deref(irb.comp); @@ -1401,7 +1387,7 @@ pub const Builder = struct { const int_val = Value.Int.createFromCharLiteral( irb.comp, &comptime_int_type.base, - rest, + char, ) catch |err| switch (err) { error.OutOfMemory => return error.OutOfMemory, }; @@ -1438,11 +1424,30 @@ pub const Builder = struct { error.ExpectXDigit => "hexidecimal digit", error.ExpectLCurly => "left curly bracket '{'", error.ExpectRCurly => "right curly bracket '}'", + else => unreachable, }, str_token[bad_index], ); return error.SemanticAnalysisFailed; }, + error.InvalidCharacter => { + assert(str_token[bad_index] == '\n'); + try irb.comp.addCompileError( + irb.code.tree_scope, + src_span, + "expected '\"' before 
newline", + ); + return error.SemanticAnalysisFailed; + }, + error.InvalidEscape => { + try irb.comp.addCompileError( + irb.code.tree_scope, + src_span, + "invalid escape: '\\{c}'", + str_token[bad_index], + ); + return error.SemanticAnalysisFailed; + }, }; var buf_cleaned = false; errdefer if (!buf_cleaned) irb.comp.gpa().free(buf); diff --git a/src-self-hosted/main.zig b/src-self-hosted/main.zig index 450133a4d76d..0038fdcb04fc 100644 --- a/src-self-hosted/main.zig +++ b/src-self-hosted/main.zig @@ -769,7 +769,8 @@ async fn fmtPath(fmt: *Fmt, file_path_ref: []const u8, check_mode: bool) FmtErro }; defer fmt.loop.allocator.free(source_code); - var tree = std.zig.parse(fmt.loop.allocator, source_code) catch |err| { + var err_loc: usize = undefined; + var tree = std.zig.parse(fmt.loop.allocator, source_code, &err_loc) catch |err| { try stderr.print("error parsing file '{}': {}\n", file_path, err); fmt.any_error = true; return; diff --git a/std/math/big/int.zig b/std/math/big/int.zig index 29b0f7c906bc..0cd69b1e84c2 100644 --- a/std/math/big/int.zig +++ b/std/math/big/int.zig @@ -1275,7 +1275,7 @@ test "big.int string negative" { test "big.int string set bad char error" { var a = try Int.init(al); - testing.expectError(error.InvalidCharForDigit, a.setString(10, "x")); + testing.expectError(error.InvalidCharacter, a.setString(10, "x")); } test "big.int string set bad base error" { diff --git a/std/os.zig b/std/os.zig index d641cf29c970..b9f73ae69e5b 100644 --- a/std/os.zig +++ b/std/os.zig @@ -792,8 +792,7 @@ pub const GetEnvVarOwnedError = error{ EnvironmentVariableNotFound, /// See https://github.com/ziglang/zig/issues/1774 - InvalidUtf8, -}; +} || std.unicode.Utf8Error; /// Caller must free returned memory. 
/// TODO make this go through libc when we have it @@ -825,12 +824,7 @@ pub fn getEnvVarOwned(allocator: *mem.Allocator, key: []const u8) GetEnvVarOwned continue; } - return std.unicode.utf16leToUtf8Alloc(allocator, buf) catch |err| switch (err) { - error.DanglingSurrogateHalf => return error.InvalidUtf8, - error.ExpectedSecondSurrogateHalf => return error.InvalidUtf8, - error.UnexpectedSecondSurrogateHalf => return error.InvalidUtf8, - error.OutOfMemory => return error.OutOfMemory, - }; + return try std.unicode.utf16leToUtf8Alloc(allocator, buf); } } else { const result = getEnvPosix(key) orelse return error.EnvironmentVariableNotFound; @@ -902,12 +896,11 @@ pub fn symLink(existing_path: []const u8, new_path: []const u8) SymLinkError!voi pub const WindowsSymLinkError = error{ NameTooLong, - InvalidUtf8, BadPathName, /// See https://github.com/ziglang/zig/issues/1396 Unexpected, -}; +} || std.unicode.Utf8Error; pub fn symLinkW(existing_path_w: [*]const u16, new_path_w: [*]const u16) WindowsSymLinkError!void { if (windows.CreateSymbolicLinkW(existing_path_w, new_path_w, 0) == 0) { @@ -1013,16 +1006,15 @@ pub const DeleteFileError = error{ SystemResources, ReadOnlyFileSystem, - /// On Windows, file paths must be valid Unicode. - InvalidUtf8, - /// On Windows, file paths cannot contain these characters: /// '/', '*', '?', '"', '<', '>', '|' BadPathName, /// See https://github.com/ziglang/zig/issues/1396 Unexpected, -}; + + /// On Windows, file paths must be valid Unicode. 
+} || std.unicode.Utf8Error; pub fn deleteFile(file_path: []const u8) DeleteFileError!void { if (builtin.os == Os.windows) { @@ -1337,12 +1329,11 @@ pub const DeleteDirError = error{ NotDir, DirNotEmpty, ReadOnlyFileSystem, - InvalidUtf8, BadPathName, /// See https://github.com/ziglang/zig/issues/1396 Unexpected, -}; +} || std.unicode.Utf8Error; pub fn deleteDirC(dir_path: [*]const u8) DeleteDirError!void { switch (builtin.os) { @@ -1425,16 +1416,15 @@ const DeleteTreeError = error{ DirNotEmpty, DeviceBusy, - /// On Windows, file paths must be valid Unicode. - InvalidUtf8, - /// On Windows, file paths cannot contain these characters: /// '/', '*', '?', '"', '<', '>', '|' BadPathName, /// See https://github.com/ziglang/zig/issues/1396 Unexpected, -}; + + /// On Windows, file paths must be valid Unicode. +} || std.unicode.Utf8Error; /// TODO determine if we can remove the allocator requirement pub fn deleteTree(allocator: *Allocator, full_path: []const u8) DeleteTreeError!void { @@ -1448,7 +1438,11 @@ pub fn deleteTree(allocator: *Allocator, full_path: []const u8) DeleteTreeError! error.IsDir => {}, error.AccessDenied => got_access_denied = true, - error.InvalidUtf8, + error.Utf8ShortChar, + error.Utf8OverlongEncoding, + error.Utf8InvalidStartByte, + error.UnicodeSurrogateHalf, + error.UnicodeCodepointTooLarge, error.SymLinkLoop, error.NameTooLong, error.SystemResources, @@ -1483,7 +1477,11 @@ pub fn deleteTree(allocator: *Allocator, full_path: []const u8) DeleteTreeError! 
error.NoSpaceLeft, error.PathAlreadyExists, error.Unexpected, - error.InvalidUtf8, + error.Utf8ShortChar, + error.Utf8OverlongEncoding, + error.Utf8InvalidStartByte, + error.UnicodeSurrogateHalf, + error.UnicodeCodepointTooLarge, error.BadPathName, error.DeviceBusy, => return err, @@ -1566,13 +1564,14 @@ pub const Dir = struct { NoSpaceLeft, PathAlreadyExists, OutOfMemory, - InvalidUtf8, BadPathName, DeviceBusy, /// See https://github.com/ziglang/zig/issues/1396 Unexpected, - }; + + /// On Windows, pathnames must be valid UTF-8 + } || std.unicode.Utf8Error; /// TODO remove the allocator requirement from this API pub fn open(allocator: *Allocator, dir_path: []const u8) OpenError!Dir { diff --git a/std/os/path.zig b/std/os/path.zig index fa8bb282eb9e..eb53b80d589e 100644 --- a/std/os/path.zig +++ b/std/os/path.zig @@ -1159,15 +1159,14 @@ pub const RealError = error{ BadPathName, DeviceBusy, - /// On Windows, file paths must be valid Unicode. - InvalidUtf8, - /// TODO remove this possibility PathAlreadyExists, /// TODO remove this possibility Unexpected, -}; + + /// On Windows, file paths must be valid Unicode. +} || std.unicode.Utf8Error; /// Call from Windows-specific code if you already have a UTF-16LE encoded, null terminated string. /// Otherwise use `real` or `realC`. diff --git a/std/os/windows/util.zig b/std/os/windows/util.zig index 72c84502e369..6001ed5065e8 100644 --- a/std/os/windows/util.zig +++ b/std/os/windows/util.zig @@ -115,16 +115,15 @@ pub const OpenError = error{ PipeBusy, NameTooLong, - /// On Windows, file paths must be valid Unicode. - InvalidUtf8, - /// On Windows, file paths cannot contain these characters: /// '/', '*', '?', '"', '<', '>', '|' BadPathName, /// See https://github.com/ziglang/zig/issues/1396 Unexpected, -}; + + /// On Windows, file paths must be valid Unicode. 
+} || unicode.Utf8Error; pub fn windowsOpenW( file_path_w: [*]const u16, @@ -308,7 +307,7 @@ pub fn sliceToPrefixedSuffixedFileW(s: []const u8, comptime suffix: []const u16) mem.copy(u16, result[0..], prefix); break :blk prefix.len; }; - const end_index = start_index + try std.unicode.utf8ToUtf16Le(result[start_index..], s); + const end_index = start_index + (try std.unicode.utf8ToUtf16Le(result[start_index..], s)); assert(end_index <= result.len); if (end_index + suffix.len > result.len) return error.NameTooLong; mem.copy(u16, result[end_index..], suffix); diff --git a/std/special/fmt_runner.zig b/std/special/fmt_runner.zig index f0ed6704edba..98841a85933f 100644 --- a/std/special/fmt_runner.zig +++ b/std/special/fmt_runner.zig @@ -71,8 +71,9 @@ pub fn main() !void { const source_code = try stdin.stream.readAllAlloc(allocator, self_hosted_main.max_src_size); defer allocator.free(source_code); - var tree = std.zig.parse(allocator, source_code) catch |err| { - try stderr.print("error parsing stdin: {}\n", err); + var err_loc: usize = undefined; + var tree = std.zig.parse(allocator, source_code, &err_loc) catch |err| { + try stderr.print("error parsing stdin at byte {}: {}\n", err_loc, err); os.exit(1); }; defer tree.deinit(); @@ -166,8 +167,9 @@ fn fmtPath(fmt: *Fmt, file_path_ref: []const u8, check_mode: bool) FmtError!void }; defer fmt.allocator.free(source_code); - var tree = std.zig.parse(fmt.allocator, source_code) catch |err| { - try stderr.print("error parsing file '{}': {}\n", file_path, err); + var err_loc: usize = undefined; + var tree = std.zig.parse(fmt.allocator, source_code, &err_loc) catch |err| { + try stderr.print("error parsing file '{}' at byte {}: {}\n", file_path, err_loc, err); fmt.any_error = true; return; }; diff --git a/std/unicode.zig b/std/unicode.zig index 67274191b607..148562c02c9a 100644 --- a/std/unicode.zig +++ b/std/unicode.zig @@ -135,7 +135,7 @@ pub fn utf8Decode(bytes: []const u8, ret: *align(4) u32) Utf8Error!u3 { pub fn 
utf8Decode2(bytes: []const u8) Utf8Error!u32 { assert(bytes.len == 2); - assert(bytes[0] & 0b11100000 == 0b11000000); + assert(@clz(~bytes[0]) == 2); var value: u32 = bytes[0] & 0b00011111; if (@clz(~bytes[1]) != 1) return error.Utf8ShortChar; @@ -149,7 +149,7 @@ pub fn utf8Decode2(bytes: []const u8) Utf8Error!u32 { pub fn utf8Decode3(bytes: []const u8) Utf8Error!u32 { assert(bytes.len == 3); - assert(bytes[0] & 0b11110000 == 0b11100000); + assert(@clz(~bytes[0]) == 3); var value: u32 = bytes[0] & 0b00001111; if (@clz(~bytes[1]) != 1) return error.Utf8ShortChar; @@ -168,10 +168,10 @@ pub fn utf8Decode3(bytes: []const u8) Utf8Error!u32 { pub fn utf8Decode4(bytes: []const u8) Utf8Error!u32 { assert(bytes.len == 4); - assert(bytes[0] & 0b11111000 == 0b11110000); + assert(@clz(~bytes[0]) == 4); var value: u32 = bytes[0] & 0b00000111; - if (@clz(~bytes[2]) != 1) return error.Utf8ShortChar; + if (@clz(~bytes[1]) != 1) return error.Utf8ShortChar; value <<= 6; value |= bytes[1] & 0b00111111; @@ -179,7 +179,7 @@ pub fn utf8Decode4(bytes: []const u8) Utf8Error!u32 { value <<= 6; value |= bytes[2] & 0b00111111; - if (@clz(~bytes[2]) != 1) return error.Utf8ShortChar; + if (@clz(~bytes[3]) != 1) return error.Utf8ShortChar; value <<= 6; value |= bytes[3] & 0b00111111; @@ -223,9 +223,8 @@ pub const Utf8View = struct { bytes: []const u8, pub fn init(s: []const u8) !Utf8View { - if (utf8ValidateSlice(s)) { - return initUnchecked(s); - } else return error.InvalidUtf8; + try utf8ValidateSliceWithLoc(s, null); + return initUnchecked(s); } pub fn initUnchecked(s: []const u8) Utf8View { @@ -254,23 +253,23 @@ pub const Utf8Iterator = struct { bytes: []const u8, i: usize, - pub fn nextCodepointSlice(it: *Utf8Iterator) ?[]const u8 { + pub fn nextCodepointSlice(it: *Utf8Iterator) !?[]const u8 { if (it.i >= it.bytes.len) { return null; } - const cp_len = utf8ByteSequenceLength(it.bytes[it.i]) catch null; + const cp_len = try utf8ByteSequenceLength(it.bytes[it.i]); it.i += cp_len; return 
it.bytes[it.i - cp_len .. it.i]; } - pub fn nextCodepoint(it: *Utf8Iterator) ?u21 { + pub fn nextCodepoint(it: *Utf8Iterator) !?u21 { if (it.i >= it.bytes.len) { return null; } var c: u32 = undefined; - it.i += utf8Decode(it.bytes[it.i..], &c) catch return null; + it.i += try utf8Decode(it.bytes[it.i..], &c); return @intCast(u21, c); } }; @@ -307,32 +306,6 @@ pub const Utf16LeIterator = struct { } }; -test "utf8 encode" { - comptime testUtf8Encode() catch unreachable; - try testUtf8Encode(); -} -fn testUtf8Encode() !void { - // A few taken from wikipedia a few taken elsewhere - var array: [4]u8 = undefined; - testing.expect((try utf8Encode('€', array[0..])) == 3); - testing.expect(array[0] == 0b11100010); - testing.expect(array[1] == 0b10000010); - testing.expect(array[2] == 0b10101100); - - testing.expect((try utf8Encode('$', array[0..])) == 1); - testing.expect(array[0] == 0b00100100); - - testing.expect((try utf8Encode('¢', array[0..])) == 2); - testing.expect(array[0] == 0b11000010); - testing.expect(array[1] == 0b10100010); - - testing.expect((try utf8Encode('𐍈', array[0..])) == 4); - testing.expect(array[0] == 0b11110000); - testing.expect(array[1] == 0b10010000); - testing.expect(array[2] == 0b10001101); - testing.expect(array[3] == 0b10001000); -} - test "utf8 encode error" { comptime testUtf8EncodeError(); testUtf8EncodeError(); @@ -349,23 +322,23 @@ fn testErrorEncode(codePoint: u21, array: []u8, expectedErr: anyerror) void { } test "utf8 iterator on ascii" { - comptime testUtf8IteratorOnAscii(); - testUtf8IteratorOnAscii(); + try comptime testUtf8IteratorOnAscii(); + try testUtf8IteratorOnAscii(); } -fn testUtf8IteratorOnAscii() void { +fn testUtf8IteratorOnAscii() !void { const s = Utf8View.initComptime("abc"); var it1 = s.iterator(); - testing.expect(std.mem.eql(u8, "a", it1.nextCodepointSlice().?)); - testing.expect(std.mem.eql(u8, "b", it1.nextCodepointSlice().?)); - testing.expect(std.mem.eql(u8, "c", it1.nextCodepointSlice().?)); - 
testing.expect(it1.nextCodepointSlice() == null); + testing.expect(std.mem.eql(u8, "a", (try it1.nextCodepointSlice()).?)); + testing.expect(std.mem.eql(u8, "b", (try it1.nextCodepointSlice()).?)); + testing.expect(std.mem.eql(u8, "c", (try it1.nextCodepointSlice()).?)); + testing.expect((try it1.nextCodepointSlice()) == null); var it2 = s.iterator(); - testing.expect(it2.nextCodepoint().? == 'a'); - testing.expect(it2.nextCodepoint().? == 'b'); - testing.expect(it2.nextCodepoint().? == 'c'); - testing.expect(it2.nextCodepoint() == null); + testing.expect((try it2.nextCodepoint()).? == 'a'); + testing.expect((try it2.nextCodepoint()).? == 'b'); + testing.expect((try it2.nextCodepoint()).? == 'c'); + testing.expect((try it2.nextCodepoint()) == null); } test "utf8 view bad" { @@ -375,27 +348,27 @@ test "utf8 view bad" { fn testUtf8ViewBad() void { // Compile-time error. // const s3 = Utf8View.initComptime("\xfe\xf2"); - testing.expectError(error.InvalidUtf8, Utf8View.init("hel\xadlo")); + testing.expectError(error.Utf8InvalidStartByte, Utf8View.init("hel\xadlo")); } test "utf8 view ok" { - comptime testUtf8ViewOk(); - testUtf8ViewOk(); + try comptime testUtf8ViewOk(); + try testUtf8ViewOk(); } -fn testUtf8ViewOk() void { +fn testUtf8ViewOk() !void { const s = Utf8View.initComptime("東京市"); var it1 = s.iterator(); - testing.expect(std.mem.eql(u8, "東", it1.nextCodepointSlice().?)); - testing.expect(std.mem.eql(u8, "京", it1.nextCodepointSlice().?)); - testing.expect(std.mem.eql(u8, "市", it1.nextCodepointSlice().?)); - testing.expect(it1.nextCodepointSlice() == null); + testing.expect(std.mem.eql(u8, "東", (try it1.nextCodepointSlice()).?)); + testing.expect(std.mem.eql(u8, "京", (try it1.nextCodepointSlice()).?)); + testing.expect(std.mem.eql(u8, "市", (try it1.nextCodepointSlice()).?)); + testing.expect((try it1.nextCodepointSlice()) == null); var it2 = s.iterator(); - testing.expect(it2.nextCodepoint().? == 0x6771); - testing.expect(it2.nextCodepoint().? 
== 0x4eac); - testing.expect(it2.nextCodepoint().? == 0x5e02); - testing.expect(it2.nextCodepoint() == null); + testing.expect((try it2.nextCodepoint()).? == 0x6771); + testing.expect((try it2.nextCodepoint()).? == 0x4eac); + testing.expect((try it2.nextCodepoint()).? == 0x5e02); + testing.expect((try it2.nextCodepoint()) == null); } test "bad utf8 slice" { @@ -592,7 +565,7 @@ pub fn utf8ToUtf16LeWithNull(allocator: *mem.Allocator, utf8: []const u8) ![]u16 const view = try Utf8View.init(utf8); var it = view.iterator(); - while (it.nextCodepoint()) |codepoint| { + while (try it.nextCodepoint()) |codepoint| { try result.append(@intCast(u16, codepoint)); // TODO surrogate pairs } @@ -608,7 +581,7 @@ pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize { var end_index: usize = 0; var it = (try Utf8View.init(utf8)).iterator(); - while (it.nextCodepoint()) |codepoint| { + while (try it.nextCodepoint()) |codepoint| { if (end_index == utf16le_as_bytes.len) return (end_index / 2) + 1; // TODO surrogate pairs mem.writeIntSliceLittle(u16, utf16le_as_bytes[end_index..], @intCast(u16, codepoint)); @@ -616,3 +589,30 @@ pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize { } return end_index / 2; } + +test "utf8 encode" { + comptime testUtf8Encode() catch unreachable; + try testUtf8Encode(); +} +fn testUtf8Encode() !void { + // A few taken from wikipedia a few taken elsewhere + var array: [4]u8 = undefined; + testing.expect((try utf8Encode('€', array[0..])) == 3); + testing.expect(array[0] == 0b11100010); + testing.expect(array[1] == 0b10000010); + testing.expect(array[2] == 0b10101100); + + testing.expect((try utf8Encode('$', array[0..])) == 1); + testing.expect(array[0] == 0b00100100); + + testing.expect((try utf8Encode('¢', array[0..])) == 2); + testing.expect(array[0] == 0b11000010); + testing.expect(array[1] == 0b10100010); + + testing.expect((try utf8Encode('𐍈', array[0..])) == 4); + testing.expect(array[0] == 0b11110000); + testing.expect(array[1] == 
0b10010000); + testing.expect(array[2] == 0b10001101); + testing.expect(array[3] == 0b10001000); +} + diff --git a/std/zig.zig b/std/zig.zig index 2d4978a4aec8..50d2a4fb63a2 100644 --- a/std/zig.zig +++ b/std/zig.zig @@ -2,7 +2,7 @@ const tokenizer = @import("zig/tokenizer.zig"); pub const Token = tokenizer.Token; pub const Tokenizer = tokenizer.Tokenizer; pub const parse = @import("zig/parse.zig").parse; -pub const parseStringLiteral = @import("zig/parse_string_literal.zig").parseStringLiteral; +use @import("zig/parse_string_literal.zig"); pub const render = @import("zig/render.zig").render; pub const ast = @import("zig/ast.zig"); diff --git a/std/zig/parse_string_literal.zig b/std/zig/parse_string_literal.zig index 16bfa4c66ef3..0938d90d4a0e 100644 --- a/std/zig/parse_string_literal.zig +++ b/std/zig/parse_string_literal.zig @@ -4,7 +4,7 @@ const mem = std.mem; const fmt = std.fmt; const unicode = std.unicode; -pub const ParseEscapeError = std.unicode.UnicodeError || error{ +const ParseEscapeError = std.unicode.UnicodeError || error{ ExpectXDigit, ExpectLCurly, ExpectRCurly, @@ -47,14 +47,14 @@ inline fn parseEscape(escape_sequence: []const u8, ret_len: *u4) ParseEscapeErro }, else => unreachable, }} - unicode.isValidUnicode(ret) catch |err| return err; + try unicode.isValidUnicode(ret); return ret; } pub const ParseCharLiteralError = ParseEscapeError || unicode.Utf8Error || error{ ExpectSQuote, }; -pub fn parseCharLiteral(char_token: []const u8) ParseCharLiteralError!u21 { +pub fn parseCharLiteral(char_token: []const u8, maybe_ret_err: ?*usize) ParseCharLiteralError!u21 { var char: u21 = undefined; if (char_token[1] == '\\') { var len: u4 = undefined; @@ -78,8 +78,8 @@ pub fn parseCharLiteral(char_token: []const u8) ParseCharLiteralError!u21 { test "zig.parseCharLiteral" { const expect = std.testing.expect; - expect(parseCharLiteral("\'0\'") catch unreachable == '0'); - expect(parseCharLiteral("\'\x20\'") catch unreachable == ' '); + 
expect(parseCharLiteral("\'0\'", null) catch unreachable == '0'); + expect(parseCharLiteral("\'\x20\'", null) catch unreachable == ' '); } const State = enum { @@ -89,13 +89,15 @@ const State = enum { pub const ParseStringLiteralError = ParseEscapeError || error{ OutOfMemory, + InvalidEscape, + InvalidCharacter, }; /// caller owns returned memory pub fn parseStringLiteral( allocator: *std.mem.Allocator, bytes: []const u8, - bad_index: *usize, // populated if error.InvalidCharacter is returned + maybe_ret_bad_index: ?*usize, // populated if error.InvalidCharacter is returned ) ParseStringLiteralError![]u8 { const first_index = if (bytes[0] == 'c') usize(2) else usize(1); assert(bytes[bytes.len - 1] == '"'); @@ -114,7 +116,7 @@ pub fn parseStringLiteral( State.Start => switch (b) { '\\' => state = State.Backslash, '\n' => { - bad_index.* = index; + if (maybe_ret_bad_index) |i| i.* = index; return error.InvalidCharacter; }, '"' => return list.toOwnedSlice(), @@ -123,9 +125,13 @@ pub fn parseStringLiteral( State.Backslash => switch (b) { 'x', 'u' => { var encoded: [4]u8 = undefined; - var len: u3 = undefined; - bad_index.* = index; - len = unicode.utf8Encode(try parseEscape(bytes[2..], &len), encoded[0..]) catch unreachable; + var len: u4 = undefined; + len = unicode.utf8Encode(parseEscape(bytes[2..], &len) catch |err| { + if (maybe_ret_bad_index) |i| { + i.* = index + len; + } + return err; + }, encoded[0..]) catch unreachable; try list.appendSlice(encoded[0..len]); index += len; state = State.Start; @@ -155,8 +161,8 @@ pub fn parseStringLiteral( state = State.Start; }, else => { - bad_index.* = index; - return error.InvalidCharacter; + if (maybe_ret_bad_index) |i| i.* = index; + return error.InvalidEscape; }, }, else => unreachable, From 91fdc138578460aca689c2eb5ced7ac08a6a8f9a Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Mon, 8 Apr 2019 12:35:50 -0500 Subject: [PATCH 14/18] remove unused enum entry You can just use git grep to see if this stuff is used. 
--- src/all_types.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/all_types.hpp b/src/all_types.hpp index 92faad1e03aa..3bd3cec45fbc 100644 --- a/src/all_types.hpp +++ b/src/all_types.hpp @@ -266,7 +266,6 @@ enum RuntimeHintErrorUnion { enum RuntimeHintOptional { RuntimeHintOptionalUnknown, - RuntimeHintOptionalNull, // TODO is this value even possible? if this is the case it might mean the const value is compile time known. RuntimeHintOptionalNonNull, }; From a2d9b9e0b0228d62ef41f1a6fc93bd2e27f877c5 Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Mon, 8 Apr 2019 12:48:42 -0500 Subject: [PATCH 15/18] stage1: const_values_equal for Error Unions My unicode.zig stuff actually used this stuff and found out it wasn't complete --- src/analyze.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/analyze.cpp b/src/analyze.cpp index 394364c68fc7..ed0bbbc2974a 100644 --- a/src/analyze.cpp +++ b/src/analyze.cpp @@ -5140,6 +5140,12 @@ static bool const_values_equal_array(CodeGen *g, ConstExprValue *a, ConstExprVal } bool const_values_equal(CodeGen *g, ConstExprValue *a, ConstExprValue *b) { + if (a == nullptr || b == nullptr) { + if (a == nullptr && b == nullptr) + return true; + else + return false; + } assert(a->type->id == b->type->id); assert(a->special == ConstValSpecialStatic); assert(b->special == ConstValSpecialStatic); @@ -5223,7 +5229,8 @@ bool const_values_equal(CodeGen *g, ConstExprValue *a, ConstExprValue *b) { return const_values_equal(g, a->data.x_optional, b->data.x_optional); } case ZigTypeIdErrorUnion: - zig_panic("TODO"); + return const_values_equal(g, a->data.x_err_union.payload, b->data.x_err_union.payload) && + const_values_equal(g, a->data.x_err_union.error_set, b->data.x_err_union.error_set); case ZigTypeIdArgTuple: return a->data.x_arg_tuple.start_index == b->data.x_arg_tuple.start_index && a->data.x_arg_tuple.end_index == b->data.x_arg_tuple.end_index; From 6912bdfc5b0c063079b293cedb890ee47bc8d616 Mon Sep 17 
00:00:00 2001 From: Shawn Landden Date: Tue, 9 Apr 2019 10:55:36 -0500 Subject: [PATCH 16/18] stage1: track filename in IR --- src/all_types.hpp | 1 + src/analyze.cpp | 2 +- src/parser.cpp | 8 ++++++++ src/tokenizer.cpp | 6 +++++- src/tokenizer.hpp | 3 ++- 5 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/all_types.hpp b/src/all_types.hpp index 3bd3cec45fbc..5fdef666a1b9 100644 --- a/src/all_types.hpp +++ b/src/all_types.hpp @@ -939,6 +939,7 @@ struct AstNode { enum NodeType type; size_t line; size_t column; + char *filename; ZigType *owner; union { AstNodeFnDef fn_def; diff --git a/src/analyze.cpp b/src/analyze.cpp index ed0bbbc2974a..0ac6cd5458f1 100644 --- a/src/analyze.cpp +++ b/src/analyze.cpp @@ -3838,7 +3838,7 @@ ZigType *add_source_file(CodeGen *g, ZigPackage *package, Buf *resolved_path, Bu } Tokenization tokenization = {0}; - tokenize(source_code, &tokenization); + tokenize(source_code, &tokenization, buf_ptr(resolved_path)); if (tokenization.err) { ErrorMsg *err = err_msg_create_with_line(resolved_path, tokenization.err_line, tokenization.err_column, diff --git a/src/parser.cpp b/src/parser.cpp index 9172e21b9244..d943e2bf7772 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -165,6 +165,7 @@ static AstNode *ast_create_node(ParseContext *pc, NodeType type, Token *first_to AstNode *node = ast_create_node_no_line_info(pc, type); node->line = first_token->start_line; node->column = first_token->start_column; + node->filename = first_token->filename; return node; } @@ -596,6 +597,7 @@ static AstNode *ast_parse_top_level_decl(ParseContext *pc, VisibMod visib_mod) { assert(var_decl->type == NodeTypeVariableDeclaration); var_decl->line = first->start_line; var_decl->column = first->start_column; + var_decl->filename = first->filename; var_decl->data.variable_declaration.visib_mod = visib_mod; var_decl->data.variable_declaration.is_extern = first->id == TokenIdKeywordExtern; var_decl->data.variable_declaration.is_export = first->id == 
TokenIdKeywordExport; @@ -613,6 +615,7 @@ static AstNode *ast_parse_top_level_decl(ParseContext *pc, VisibMod visib_mod) { assert(fn_proto->type == NodeTypeFnProto); fn_proto->line = first->start_line; fn_proto->column = first->start_column; + fn_proto->filename = first->filename; fn_proto->data.fn_proto.visib_mod = visib_mod; fn_proto->data.fn_proto.is_extern = first->id == TokenIdKeywordExtern; fn_proto->data.fn_proto.is_export = first->id == TokenIdKeywordExport; @@ -1547,6 +1550,7 @@ static AstNode *ast_parse_primary_type_expr(ParseContext *pc) { assert(res->type == NodeTypeFnCallExpr); res->line = at_sign->start_line; res->column = at_sign->start_column; + res->filename = at_sign->filename; res->data.fn_call_expr.fn_ref_expr = name_sym; res->data.fn_call_expr.is_builtin = true; return res; @@ -1683,6 +1687,7 @@ static AstNode *ast_parse_container_decl(ParseContext *pc) { assert(res->type == NodeTypeContainerDecl); res->line = extern_token->start_line; res->column = extern_token->start_column; + res->filename = extern_token->filename; res->data.container_decl.layout = ContainerLayoutExtern; return res; } @@ -1693,6 +1698,7 @@ static AstNode *ast_parse_container_decl(ParseContext *pc) { assert(res->type == NodeTypeContainerDecl); res->line = packed_token->start_line; res->column = packed_token->start_column; + res->filename = packed_token->filename; res->data.container_decl.layout = ContainerLayoutPacked; return res; } @@ -1831,6 +1837,7 @@ static AstNode *ast_parse_asm_expr(ParseContext *pc) { res->line = asm_token->start_line; res->column = asm_token->start_column; + res->filename = asm_token->filename; res->data.asm_expr.volatile_token = volatile_token; res->data.asm_expr.asm_template = asm_template; return res; @@ -2069,6 +2076,7 @@ static AstNode *ast_parse_param_decl(ParseContext *pc) { assert(res->type == NodeTypeParamDecl); res->line = first->start_line; res->column = first->start_column; + res->filename = first->filename; res->data.param_decl.name = 
token_buf(name); res->data.param_decl.is_noalias = first->id == TokenIdKeywordNoAlias; res->data.param_decl.is_inline = first->id == TokenIdKeywordCompTime; diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 33df21a66fa5..c9f70048ff58 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -238,6 +238,8 @@ struct Tokenize { ZigList *tokens; int line; int column; + // TODO use a lookup table, so that this can go from 64-bits to maybe 12-bits for every instruction + char *filename; Token *cur_tok; Tokenization *out; uint32_t radix; @@ -286,6 +288,7 @@ static void begin_token(Tokenize *t, TokenId id) { Token *token = &t->tokens->last(); token->start_line = t->line; token->start_column = t->column; + token->filename = t->filename; token->start_pos = t->pos; set_token_id(t, token, id); @@ -403,11 +406,12 @@ static void invalid_char_error(Tokenize *t, uint8_t c) { tokenize_error(t, "invalid character: '\\x%02x'", c); } -void tokenize(Buf *buf, Tokenization *out) { +void tokenize(Buf *buf, Tokenization *out, char *filename) { Tokenize t = {0}; t.out = out; t.tokens = out->tokens = allocate>(1); t.buf = buf; + t.filename = filename; for (size_t i=0;i *tokens); From d998fbe1fc33ebf9b453c084f37d0de495e982e6 Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Tue, 9 Apr 2019 15:00:53 -0500 Subject: [PATCH 17/18] stage1: fix nasty casting bug That produced garbage. 
--- src/ir.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ir.cpp b/src/ir.cpp index de4543df4e61..acf157ca52bf 100644 --- a/src/ir.cpp +++ b/src/ir.cpp @@ -18129,7 +18129,7 @@ static Error ir_make_type_info_defs(IrAnalyze *ira, IrInstruction *source_instr, return ErrorSemanticAnalyzeFail; } - AstNodeFnProto *fn_node = (AstNodeFnProto *)(fn_entry->proto_node); + AstNodeFnProto *fn_node = &fn_entry->proto_node->data.fn_proto; ConstExprValue *fn_def_val = create_const_vals(1); fn_def_val->special = ConstValSpecialStatic; From fa59ab09e88609adac30b65f31f733401db6a4a9 Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Tue, 9 Apr 2019 19:16:51 -0500 Subject: [PATCH 18/18] remove Shebang (#!) support Closes: #2165 --- src/analyze.cpp | 2 +- src/cache_hash.cpp | 2 +- src/codegen.cpp | 6 +++--- src/libc_installation.cpp | 2 +- src/main.cpp | 4 ++-- src/os.cpp | 35 +++++------------------------------ src/os.hpp | 4 ++-- std/zig/ast.zig | 3 --- std/zig/parse.zig | 10 ---------- std/zig/parser_test.zig | 8 -------- std/zig/render.zig | 5 ----- 11 files changed, 15 insertions(+), 66 deletions(-) diff --git a/src/analyze.cpp b/src/analyze.cpp index 0ac6cd5458f1..efc5809478e8 100644 --- a/src/analyze.cpp +++ b/src/analyze.cpp @@ -6077,7 +6077,7 @@ Error file_fetch(CodeGen *g, Buf *resolved_path, Buf *contents) { if (g->enable_cache) { return cache_add_file_fetch(&g->cache_hash, resolved_path, contents); } else { - return os_fetch_file_path(resolved_path, contents, false); + return os_fetch_file_path(resolved_path, contents); } } diff --git a/src/cache_hash.cpp b/src/cache_hash.cpp index 1f25a9982e14..2da52dd82120 100644 --- a/src/cache_hash.cpp +++ b/src/cache_hash.cpp @@ -469,7 +469,7 @@ Error cache_add_file(CacheHash *ch, Buf *path) { Error cache_add_dep_file(CacheHash *ch, Buf *dep_file_path, bool verbose) { Error err; Buf *contents = buf_alloc(); - if ((err = os_fetch_file_path(dep_file_path, contents, false))) { + if ((err = 
os_fetch_file_path(dep_file_path, contents))) { if (verbose) { fprintf(stderr, "unable to read .d file: %s\n", err_str(err)); } diff --git a/src/codegen.cpp b/src/codegen.cpp index 568344fc099d..2dffb1eaac72 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -7814,7 +7814,7 @@ static Error define_builtin_compile_vars(CodeGen *g) { Buf *contents; if (hit) { contents = buf_alloc(); - if ((err = os_fetch_file_path(builtin_zig_path, contents, false))) { + if ((err = os_fetch_file_path(builtin_zig_path, contents))) { fprintf(stderr, "Unable to open '%s': %s\n", buf_ptr(builtin_zig_path), err_str(err)); exit(1); } @@ -8233,7 +8233,7 @@ static void gen_root_source(CodeGen *g) { Error err; // No need for using the caching system for this file fetch because it is handled // separately. - if ((err = os_fetch_file_path(resolved_path, source_code, true))) { + if ((err = os_fetch_file_path(resolved_path, source_code))) { fprintf(stderr, "unable to open '%s': %s\n", buf_ptr(resolved_path), err_str(err)); exit(1); } @@ -8308,7 +8308,7 @@ static void gen_global_asm(CodeGen *g) { Buf *asm_file = g->assembly_files.at(i); // No need to use the caching system for these fetches because they // are handled separately. 
- if ((err = os_fetch_file_path(asm_file, &contents, false))) { + if ((err = os_fetch_file_path(asm_file, &contents))) { zig_panic("Unable to read %s: %s", buf_ptr(asm_file), err_str(err)); } buf_append_buf(&g->global_asm, &contents); diff --git a/src/libc_installation.cpp b/src/libc_installation.cpp index 3ea17f1bdc52..3e5f8b0d662b 100644 --- a/src/libc_installation.cpp +++ b/src/libc_installation.cpp @@ -45,7 +45,7 @@ Error zig_libc_parse(ZigLibCInstallation *libc, Buf *libc_file, const ZigTarget bool found_keys[array_length(zig_libc_keys)] = {}; Buf *contents = buf_alloc(); - if ((err = os_fetch_file_path(libc_file, contents, false))) { + if ((err = os_fetch_file_path(libc_file, contents))) { if (err != ErrorFileNotFound && verbose) { fprintf(stderr, "Unable to read '%s': %s\n", buf_ptr(libc_file), err_str(err)); } diff --git a/src/main.cpp b/src/main.cpp index bd3d57495600..ad56b086ff99 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -341,7 +341,7 @@ int main(int argc, char **argv) { os_path_split(cwd, nullptr, cwd_basename); Buf *build_zig_contents = buf_alloc(); - if ((err = os_fetch_file_path(build_zig_path, build_zig_contents, false))) { + if ((err = os_fetch_file_path(build_zig_path, build_zig_contents))) { fprintf(stderr, "Unable to read %s: %s\n", buf_ptr(build_zig_path), err_str(err)); return EXIT_FAILURE; } @@ -356,7 +356,7 @@ int main(int argc, char **argv) { } Buf *main_zig_contents = buf_alloc(); - if ((err = os_fetch_file_path(main_zig_path, main_zig_contents, false))) { + if ((err = os_fetch_file_path(main_zig_path, main_zig_contents))) { fprintf(stderr, "Unable to read %s: %s\n", buf_ptr(main_zig_path), err_str(err)); return EXIT_FAILURE; } diff --git a/src/os.cpp b/src/os.cpp index 470d2223072f..7779f3396f13 100644 --- a/src/os.cpp +++ b/src/os.cpp @@ -751,39 +751,15 @@ Buf os_path_resolve(Buf **paths_ptr, size_t paths_len) { #endif } -Error os_fetch_file(FILE *f, Buf *out_buf, bool skip_shebang) { +Error os_fetch_file(FILE *f, Buf *out_buf) { 
static const ssize_t buf_size = 0x2000; buf_resize(out_buf, buf_size); ssize_t actual_buf_len = 0; - bool first_read = true; - for (;;) { size_t amt_read = fread(buf_ptr(out_buf) + actual_buf_len, 1, buf_size, f); actual_buf_len += amt_read; - if (skip_shebang && first_read && buf_starts_with_str(out_buf, "#!")) { - size_t i = 0; - while (true) { - if (i > buf_len(out_buf)) { - zig_panic("shebang line exceeded %zd characters", buf_size); - } - - size_t current_pos = i; - i += 1; - - if (out_buf->list.at(current_pos) == '\n') { - break; - } - } - - ZigList *list = &out_buf->list; - memmove(list->items, list->items + i, list->length - i); - list->length -= i; - - actual_buf_len -= i; - } - if (amt_read != buf_size) { if (feof(f)) { buf_resize(out_buf, actual_buf_len); @@ -794,7 +770,6 @@ Error os_fetch_file(FILE *f, Buf *out_buf, bool skip_shebang) { } buf_resize(out_buf, actual_buf_len + buf_size); - first_read = false; } zig_unreachable(); } @@ -864,8 +839,8 @@ static Error os_exec_process_posix(const char *exe, ZigList &args, FILE *stdout_f = fdopen(stdout_pipe[0], "rb"); FILE *stderr_f = fdopen(stderr_pipe[0], "rb"); - Error err1 = os_fetch_file(stdout_f, out_stdout, false); - Error err2 = os_fetch_file(stderr_f, out_stderr, false); + Error err1 = os_fetch_file(stdout_f, out_stdout); + Error err2 = os_fetch_file(stderr_f, out_stderr); fclose(stdout_f); fclose(stderr_f); @@ -1097,7 +1072,7 @@ Error os_copy_file(Buf *src_path, Buf *dest_path) { } } -Error os_fetch_file_path(Buf *full_path, Buf *out_contents, bool skip_shebang) { +Error os_fetch_file_path(Buf *full_path, Buf *out_contents) { FILE *f = fopen(buf_ptr(full_path), "rb"); if (!f) { switch (errno) { @@ -1116,7 +1091,7 @@ Error os_fetch_file_path(Buf *full_path, Buf *out_contents, bool skip_shebang) { return ErrorFileSystem; } } - Error result = os_fetch_file(f, out_contents, skip_shebang); + Error result = os_fetch_file(f, out_contents); fclose(f); return result; } diff --git a/src/os.hpp b/src/os.hpp 
index 5064a6444c2e..b79870718f01 100644 --- a/src/os.hpp +++ b/src/os.hpp @@ -126,8 +126,8 @@ void os_file_close(OsFile file); Error ATTRIBUTE_MUST_USE os_write_file(Buf *full_path, Buf *contents); Error ATTRIBUTE_MUST_USE os_copy_file(Buf *src_path, Buf *dest_path); -Error ATTRIBUTE_MUST_USE os_fetch_file(FILE *file, Buf *out_contents, bool skip_shebang); -Error ATTRIBUTE_MUST_USE os_fetch_file_path(Buf *full_path, Buf *out_contents, bool skip_shebang); +Error ATTRIBUTE_MUST_USE os_fetch_file(FILE *file, Buf *out_contents); +Error ATTRIBUTE_MUST_USE os_fetch_file_path(Buf *full_path, Buf *out_contents); Error ATTRIBUTE_MUST_USE os_get_cwd(Buf *out_cwd); diff --git a/std/zig/ast.zig b/std/zig/ast.zig index 9aba59f77cda..7024f988a22a 100644 --- a/std/zig/ast.zig +++ b/std/zig/ast.zig @@ -479,7 +479,6 @@ pub const Node = struct { doc_comments: ?*DocComment, decls: DeclList, eof_token: TokenIndex, - shebang: ?TokenIndex, pub const DeclList = SegmentedList(*Node, 4); @@ -491,7 +490,6 @@ pub const Node = struct { } pub fn firstToken(self: *const Root) TokenIndex { - if (self.shebang) |shebang| return shebang; return if (self.decls.len == 0) self.eof_token else (self.decls.at(0).*).firstToken(); } @@ -2235,7 +2233,6 @@ test "iterate" { .doc_comments = null, .decls = Node.Root.DeclList.init(std.debug.global_allocator), .eof_token = 0, - .shebang = null, }; var base = &root.base; testing.expect(base.iterate(0) == null); diff --git a/std/zig/parse.zig b/std/zig/parse.zig index 454c25889321..e14ef3aa9654 100644 --- a/std/zig/parse.zig +++ b/std/zig/parse.zig @@ -24,7 +24,6 @@ pub fn parse(allocator: *mem.Allocator, source: []const u8, ret_err_off: ?*usize .base = ast.Node{ .id = ast.Node.Id.Root }, .decls = ast.Node.Root.DeclList.init(arena), .doc_comments = null, - .shebang = null, // initialized when we get the eof token .eof_token = undefined, }; @@ -78,15 +77,6 @@ pub fn parse(allocator: *mem.Allocator, source: []const u8, ret_err_off: ?*usize } var tok_it = 
tree.tokens.iterator(0); - // skip over shebang line - shebang: { - const shebang_tok_index = tok_it.index; - const shebang_tok_ptr = tok_it.peek() orelse break :shebang; - if (shebang_tok_ptr.id != Token.Id.ShebangLine) break :shebang; - root_node.shebang = shebang_tok_index; - _ = tok_it.next(); - } - // skip over line comments at the top of the file while (true) { const next_tok = tok_it.peek() orelse break; diff --git a/std/zig/parser_test.zig b/std/zig/parser_test.zig index 7cc4b83fd3a8..51320c06d5bb 100644 --- a/std/zig/parser_test.zig +++ b/std/zig/parser_test.zig @@ -62,14 +62,6 @@ test "zig fmt: linksection" { ); } -test "zig fmt: shebang line" { - try testCanonical( - \\#!/usr/bin/env zig - \\pub fn main() void {} - \\ - ); -} - test "zig fmt: correctly move doc comments on struct fields" { try testTransform( \\pub const section_64 = extern struct { diff --git a/std/zig/render.zig b/std/zig/render.zig index f1fe23c2a8c1..74c1e2acfc20 100644 --- a/std/zig/render.zig +++ b/std/zig/render.zig @@ -73,11 +73,6 @@ fn renderRoot( ) (@typeOf(stream).Child.Error || Error)!void { var tok_it = tree.tokens.iterator(0); - // render the shebang line - if (tree.root_node.shebang) |shebang| { - try stream.write(tree.tokenSlice(shebang)); - } - // render all the line comments at the beginning of the file while (tok_it.next()) |token| { if (token.id != Token.Id.LineComment) break;