-
-
Notifications
You must be signed in to change notification settings - Fork 3.1k
Unicode escapes: support u{N...} #2823
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -190,6 +190,7 @@ enum TokenizeState { | |
| TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5" | ||
| TokenizeStateString, | ||
| TokenizeStateStringEscape, | ||
| TokenizeStateStringEscapeUnicodeStart, | ||
| TokenizeStateCharLiteral, | ||
| TokenizeStateCharLiteralEnd, | ||
| TokenizeStateSawStar, | ||
|
|
@@ -241,7 +242,6 @@ struct Tokenize { | |
| int32_t exp_add_amt; | ||
| bool is_exp_negative; | ||
| size_t char_code_index; | ||
| size_t char_code_end; | ||
| bool unicode; | ||
| uint32_t char_code; | ||
| int exponent_in_bin_or_dec; | ||
|
|
@@ -1071,24 +1071,10 @@ void tokenize(Buf *buf, Tokenization *out) { | |
| t.radix = 16; | ||
| t.char_code = 0; | ||
| t.char_code_index = 0; | ||
| t.char_code_end = 2; | ||
| t.unicode = false; | ||
| break; | ||
| case 'u': | ||
| t.state = TokenizeStateCharCode; | ||
| t.radix = 16; | ||
| t.char_code = 0; | ||
| t.char_code_index = 0; | ||
| t.char_code_end = 4; | ||
| t.unicode = true; | ||
| break; | ||
| case 'U': | ||
| t.state = TokenizeStateCharCode; | ||
| t.radix = 16; | ||
| t.char_code = 0; | ||
| t.char_code_index = 0; | ||
| t.char_code_end = 6; | ||
| t.unicode = true; | ||
| t.state = TokenizeStateStringEscapeUnicodeStart; | ||
| break; | ||
| case 'n': | ||
| handle_string_escape(&t, '\n'); | ||
|
|
@@ -1112,8 +1098,63 @@ void tokenize(Buf *buf, Tokenization *out) { | |
| invalid_char_error(&t, c); | ||
| } | ||
| break; | ||
| case TokenizeStateStringEscapeUnicodeStart: | ||
| switch (c) { | ||
| case '{': | ||
| t.state = TokenizeStateCharCode; | ||
| t.radix = 16; | ||
| t.char_code = 0; | ||
| t.char_code_index = 0; | ||
| t.unicode = true; | ||
| break; | ||
| default: | ||
| invalid_char_error(&t, c); | ||
| } | ||
| break; | ||
| case TokenizeStateCharCode: | ||
| { | ||
| if (t.unicode && c == '}') { | ||
| if (t.char_code_index == 0) { | ||
| tokenize_error(&t, "empty unicode escape sequence"); | ||
| break; | ||
| } | ||
| if (t.char_code > 0x10ffff) { | ||
| tokenize_error(&t, "unicode value out of range: %x", t.char_code); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Move this down to the |
||
| break; | ||
| } | ||
| if (t.cur_tok->id == TokenIdCharLiteral) { | ||
| t.cur_tok->data.char_lit.c = t.char_code; | ||
| t.state = TokenizeStateCharLiteralEnd; | ||
| } else if (t.char_code <= 0x7f) { | ||
| // 00000000 00000000 00000000 0xxxxxxx | ||
| handle_string_escape(&t, (uint8_t)t.char_code); | ||
| } else if (t.char_code <= 0x7ff) { | ||
| // 00000000 00000000 00000xxx xx000000 | ||
| handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6))); | ||
| // 00000000 00000000 00000000 00xxxxxx | ||
| handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); | ||
| } else if (t.char_code <= 0xffff) { | ||
| // 00000000 00000000 xxxx0000 00000000 | ||
| handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12))); | ||
| // 00000000 00000000 0000xxxx xx000000 | ||
| handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f))); | ||
| // 00000000 00000000 00000000 00xxxxxx | ||
| handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); | ||
| } else if (t.char_code <= 0x10ffff) { | ||
| // 00000000 000xxx00 00000000 00000000 | ||
| handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18))); | ||
| // 00000000 000000xx xxxx0000 00000000 | ||
| handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f))); | ||
| // 00000000 00000000 0000xxxx xx000000 | ||
| handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f))); | ||
| // 00000000 00000000 00000000 00xxxxxx | ||
| handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); | ||
| } else { | ||
| zig_unreachable(); | ||
| } | ||
| break; | ||
| } | ||
|
|
||
| uint32_t digit_value = get_digit_value(c); | ||
| if (digit_value >= t.radix) { | ||
| tokenize_error(&t, "invalid digit: '%c'", c); | ||
|
|
@@ -1123,44 +1164,9 @@ void tokenize(Buf *buf, Tokenization *out) { | |
| t.char_code += digit_value; | ||
| t.char_code_index += 1; | ||
|
|
||
| if (t.char_code_index >= t.char_code_end) { | ||
| if (t.unicode) { | ||
| if (t.char_code > 0x10ffff) { | ||
| tokenize_error(&t, "unicode value out of range: %x", t.char_code); | ||
| break; | ||
| } | ||
| if (t.cur_tok->id == TokenIdCharLiteral) { | ||
| t.cur_tok->data.char_lit.c = t.char_code; | ||
| t.state = TokenizeStateCharLiteralEnd; | ||
| } else if (t.char_code <= 0x7f) { | ||
| // 00000000 00000000 00000000 0xxxxxxx | ||
| handle_string_escape(&t, (uint8_t)t.char_code); | ||
| } else if (t.char_code <= 0x7ff) { | ||
| // 00000000 00000000 00000xxx xx000000 | ||
| handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6))); | ||
| // 00000000 00000000 00000000 00xxxxxx | ||
| handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); | ||
| } else if (t.char_code <= 0xffff) { | ||
| // 00000000 00000000 xxxx0000 00000000 | ||
| handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12))); | ||
| // 00000000 00000000 0000xxxx xx000000 | ||
| handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f))); | ||
| // 00000000 00000000 00000000 00xxxxxx | ||
| handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); | ||
| } else if (t.char_code <= 0x10ffff) { | ||
| // 00000000 000xxx00 00000000 00000000 | ||
| handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18))); | ||
| // 00000000 000000xx xxxx0000 00000000 | ||
| handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f))); | ||
| // 00000000 00000000 0000xxxx xx000000 | ||
| handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f))); | ||
| // 00000000 00000000 00000000 00xxxxxx | ||
| handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); | ||
| } | ||
| } else { | ||
| assert(t.char_code <= 255); | ||
| handle_string_escape(&t, (uint8_t)t.char_code); | ||
| } | ||
| if (!t.unicode && t.char_code_index >= 2) { | ||
| assert(t.char_code <= 255); | ||
| handle_string_escape(&t, (uint8_t)t.char_code); | ||
| } | ||
| } | ||
| break; | ||
|
|
@@ -1409,6 +1415,7 @@ void tokenize(Buf *buf, Tokenization *out) { | |
| tokenize_error(&t, "unterminated string"); | ||
| break; | ||
| case TokenizeStateStringEscape: | ||
| case TokenizeStateStringEscapeUnicodeStart: | ||
| case TokenizeStateCharCode: | ||
| if (t.cur_tok->id == TokenIdStringLiteral) { | ||
| tokenize_error(&t, "unterminated string"); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -240,6 +240,9 @@ pub const Tokenizer = struct { | |
| CharLiteral, | ||
| CharLiteralBackslash, | ||
| CharLiteralHexEscape, | ||
| CharLiteralUnicodeEscapeSawU, | ||
| CharLiteralUnicodeEscape, | ||
| CharLiteralUnicodeInvalid, | ||
| CharLiteralEnd, | ||
| Backslash, | ||
| Equal, | ||
|
|
@@ -296,7 +299,6 @@ pub const Tokenizer = struct { | |
| .end = undefined, | ||
| }; | ||
| var seen_escape_digits: usize = undefined; | ||
| var expected_escape_digits: usize = undefined; | ||
| while (self.index < self.buffer.len) : (self.index += 1) { | ||
| const c = self.buffer[self.index]; | ||
| switch (state) { | ||
|
|
@@ -664,27 +666,19 @@ pub const Tokenizer = struct { | |
| 'x' => { | ||
| state = State.CharLiteralHexEscape; | ||
| seen_escape_digits = 0; | ||
| expected_escape_digits = 2; | ||
| }, | ||
| 'u' => { | ||
| state = State.CharLiteralHexEscape; | ||
| seen_escape_digits = 0; | ||
| expected_escape_digits = 4; | ||
| }, | ||
| 'U' => { | ||
| state = State.CharLiteralHexEscape; | ||
| seen_escape_digits = 0; | ||
| expected_escape_digits = 6; | ||
| state = State.CharLiteralUnicodeEscapeSawU; | ||
| }, | ||
| else => { | ||
| state = State.CharLiteralEnd; | ||
| }, | ||
| }, | ||
|
|
||
| State.CharLiteralHexEscape => switch (c) { | ||
| '0'...'9', 'a'...'z', 'A'...'F' => { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I assume this was a bug (found when new tests were added).
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yep. Thanks! |
||
| '0'...'9', 'a'...'f', 'A'...'F' => { | ||
| seen_escape_digits += 1; | ||
| if (seen_escape_digits == expected_escape_digits) { | ||
| if (seen_escape_digits == 2) { | ||
| state = State.CharLiteralEnd; | ||
| } | ||
| }, | ||
|
|
@@ -694,6 +688,43 @@ pub const Tokenizer = struct { | |
| }, | ||
| }, | ||
|
|
||
| State.CharLiteralUnicodeEscapeSawU => switch (c) { | ||
| '{' => { | ||
| state = State.CharLiteralUnicodeEscape; | ||
| seen_escape_digits = 0; | ||
| }, | ||
| else => { | ||
| result.id = Token.Id.Invalid; | ||
| state = State.CharLiteralUnicodeInvalid; | ||
| }, | ||
| }, | ||
|
|
||
| State.CharLiteralUnicodeEscape => switch (c) { | ||
| '0'...'9', 'a'...'f', 'A'...'F' => { | ||
| seen_escape_digits += 1; | ||
| }, | ||
| '}' => { | ||
| if (seen_escape_digits == 0) { | ||
| result.id = Token.Id.Invalid; | ||
| state = State.CharLiteralUnicodeInvalid; | ||
| } else { | ||
| state = State.CharLiteralEnd; | ||
| } | ||
| }, | ||
| else => { | ||
| result.id = Token.Id.Invalid; | ||
| state = State.CharLiteralUnicodeInvalid; | ||
| }, | ||
| }, | ||
|
|
||
| State.CharLiteralUnicodeInvalid => switch (c) { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I got a little creative here because I thought this behavior might prevent some confusing error output. If it doesn't actually help, I'd be totally fine removing this special state.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's run with this and see what happens. |
||
| // Keep consuming characters until an obvious stopping point. | ||
| // This consolidates e.g. `u{0ab1Q}` into a single invalid token | ||
| // instead of creating the tokens `u{0ab1`, `Q`, `}` | ||
| '0'...'9', 'a'...'z', 'A'...'Z', '}' => {}, | ||
| else => break, | ||
| }, | ||
|
|
||
| State.CharLiteralEnd => switch (c) { | ||
| '\'' => { | ||
| result.id = Token.Id.CharLiteral; | ||
|
|
@@ -1055,6 +1086,9 @@ pub const Tokenizer = struct { | |
| State.CharLiteral, | ||
| State.CharLiteralBackslash, | ||
| State.CharLiteralHexEscape, | ||
| State.CharLiteralUnicodeEscapeSawU, | ||
| State.CharLiteralUnicodeEscape, | ||
| State.CharLiteralUnicodeInvalid, | ||
| State.CharLiteralEnd, | ||
| State.StringLiteralBackslash, | ||
| State.LBracketStar, | ||
|
|
@@ -1208,7 +1242,60 @@ test "tokenizer - unknown length pointer and then c pointer" { | |
| test "tokenizer - char literal with hex escape" { | ||
| testTokenize( | ||
| \\'\x1b' | ||
| , [_]Token.Id{Token.Id.CharLiteral}); | ||
| , [_]Token.Id{.CharLiteral}); | ||
| testTokenize( | ||
| \\'\x1' | ||
| , [_]Token.Id{ .Invalid, .Invalid }); | ||
| } | ||
|
|
||
| test "tokenizer - char literal with unicode escapes" { | ||
| // Valid unicode escapes | ||
| testTokenize( | ||
| \\'\u{3}' | ||
| , [_]Token.Id{.CharLiteral}); | ||
| testTokenize( | ||
| \\'\u{01}' | ||
| , [_]Token.Id{.CharLiteral}); | ||
| testTokenize( | ||
| \\'\u{2a}' | ||
| , [_]Token.Id{.CharLiteral}); | ||
| testTokenize( | ||
| \\'\u{3f9}' | ||
| , [_]Token.Id{.CharLiteral}); | ||
| testTokenize( | ||
| \\'\u{6E09aBc1523}' | ||
| , [_]Token.Id{.CharLiteral}); | ||
| testTokenize( | ||
| \\"\u{440}" | ||
| , [_]Token.Id{.StringLiteral}); | ||
|
|
||
| // Invalid unicode escapes | ||
| testTokenize( | ||
| \\'\u' | ||
| , [_]Token.Id{.Invalid}); | ||
| testTokenize( | ||
| \\'\u{{' | ||
| , [_]Token.Id{ .Invalid, .Invalid }); | ||
| testTokenize( | ||
| \\'\u{}' | ||
| , [_]Token.Id{ .Invalid, .Invalid }); | ||
| testTokenize( | ||
| \\'\u{s}' | ||
| , [_]Token.Id{ .Invalid, .Invalid }); | ||
| testTokenize( | ||
| \\'\u{2z}' | ||
| , [_]Token.Id{ .Invalid, .Invalid }); | ||
| testTokenize( | ||
| \\'\u{4a' | ||
| , [_]Token.Id{.Invalid}); | ||
|
|
||
| // Test old-style unicode literals | ||
| testTokenize( | ||
| \\'\u0333' | ||
| , [_]Token.Id{ .Invalid, .Invalid }); | ||
| testTokenize( | ||
| \\'\U0333' | ||
| , [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid }); | ||
| } | ||
|
|
||
| test "tokenizer - float literal e exponent" { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure what the clearest way to write this is. Could also be something like:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the "1 or more digits" you have below is sufficient