Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 4 additions & 9 deletions doc/langref.html.in
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ test "string literals" {
assert(normal_bytes.len == 5);
assert(normal_bytes[1] == 'e');
assert('e' == '\x65');
assert('\U01f4a9' == 128169);
assert('\u{1f4a9}' == 128169);
assert(mem.eql(u8, "hello", "h\x65llo"));

// A C string literal is a null terminated pointer.
Expand Down Expand Up @@ -616,12 +616,8 @@ test "string literals" {
<td>hexadecimal 8-bit character code (2 digits)</td>
</tr>
<tr>
<td><code>\uNNNN</code></td>
<td>hexadecimal 16-bit Unicode character code UTF-8 encoded (4 digits)</td>
</tr>
<tr>
<td><code>\UNNNNNN</code></td>
<td>hexadecimal 24-bit Unicode character code UTF-8 encoded (6 digits)</td>
<td><code>\u{NNNNNN}</code></td>
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure what the clearest way to write this is. Could also be something like:

\u{N...}

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the "1 or more digits" wording you have below is sufficient.

<td>hexadecimal Unicode character code UTF-8 encoded (1 or more digits)</td>
</tr>
</table>
</div>
Expand Down Expand Up @@ -10008,8 +10004,7 @@ eof &lt;- !.
hex &lt;- [0-9a-fA-F]
char_escape
&lt;- "\\x" hex hex
/ "\\u" hex hex hex hex
/ "\\U" hex hex hex hex hex hex
/ "\\u{" hex+ "}"
/ "\\" [nr\\t'"]
char_char
&lt;- char_escape
Expand Down
115 changes: 61 additions & 54 deletions src/tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ enum TokenizeState {
TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
TokenizeStateString,
TokenizeStateStringEscape,
TokenizeStateStringEscapeUnicodeStart,
TokenizeStateCharLiteral,
TokenizeStateCharLiteralEnd,
TokenizeStateSawStar,
Expand Down Expand Up @@ -241,7 +242,6 @@ struct Tokenize {
int32_t exp_add_amt;
bool is_exp_negative;
size_t char_code_index;
size_t char_code_end;
bool unicode;
uint32_t char_code;
int exponent_in_bin_or_dec;
Expand Down Expand Up @@ -1071,24 +1071,10 @@ void tokenize(Buf *buf, Tokenization *out) {
t.radix = 16;
t.char_code = 0;
t.char_code_index = 0;
t.char_code_end = 2;
t.unicode = false;
break;
case 'u':
t.state = TokenizeStateCharCode;
t.radix = 16;
t.char_code = 0;
t.char_code_index = 0;
t.char_code_end = 4;
t.unicode = true;
break;
case 'U':
t.state = TokenizeStateCharCode;
t.radix = 16;
t.char_code = 0;
t.char_code_index = 0;
t.char_code_end = 6;
t.unicode = true;
t.state = TokenizeStateStringEscapeUnicodeStart;
break;
case 'n':
handle_string_escape(&t, '\n');
Expand All @@ -1112,8 +1098,63 @@ void tokenize(Buf *buf, Tokenization *out) {
invalid_char_error(&t, c);
}
break;
case TokenizeStateStringEscapeUnicodeStart:
switch (c) {
case '{':
t.state = TokenizeStateCharCode;
t.radix = 16;
t.char_code = 0;
t.char_code_index = 0;
t.unicode = true;
break;
default:
invalid_char_error(&t, c);
}
break;
case TokenizeStateCharCode:
{
if (t.unicode && c == '}') {
if (t.char_code_index == 0) {
tokenize_error(&t, "empty unicode escape sequence");
break;
}
if (t.char_code > 0x10ffff) {
tokenize_error(&t, "unicode value out of range: %x", t.char_code);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move this down to the else below?

break;
}
if (t.cur_tok->id == TokenIdCharLiteral) {
t.cur_tok->data.char_lit.c = t.char_code;
t.state = TokenizeStateCharLiteralEnd;
} else if (t.char_code <= 0x7f) {
// 00000000 00000000 00000000 0xxxxxxx
handle_string_escape(&t, (uint8_t)t.char_code);
} else if (t.char_code <= 0x7ff) {
// 00000000 00000000 00000xxx xx000000
handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
// 00000000 00000000 00000000 00xxxxxx
handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
} else if (t.char_code <= 0xffff) {
// 00000000 00000000 xxxx0000 00000000
handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12)));
// 00000000 00000000 0000xxxx xx000000
handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
// 00000000 00000000 00000000 00xxxxxx
handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
} else if (t.char_code <= 0x10ffff) {
// 00000000 000xxx00 00000000 00000000
handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18)));
// 00000000 000000xx xxxx0000 00000000
handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f)));
// 00000000 00000000 0000xxxx xx000000
handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
// 00000000 00000000 00000000 00xxxxxx
handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
} else {
zig_unreachable();
}
break;
}

uint32_t digit_value = get_digit_value(c);
if (digit_value >= t.radix) {
tokenize_error(&t, "invalid digit: '%c'", c);
Expand All @@ -1123,44 +1164,9 @@ void tokenize(Buf *buf, Tokenization *out) {
t.char_code += digit_value;
t.char_code_index += 1;

if (t.char_code_index >= t.char_code_end) {
if (t.unicode) {
if (t.char_code > 0x10ffff) {
tokenize_error(&t, "unicode value out of range: %x", t.char_code);
break;
}
if (t.cur_tok->id == TokenIdCharLiteral) {
t.cur_tok->data.char_lit.c = t.char_code;
t.state = TokenizeStateCharLiteralEnd;
} else if (t.char_code <= 0x7f) {
// 00000000 00000000 00000000 0xxxxxxx
handle_string_escape(&t, (uint8_t)t.char_code);
} else if (t.char_code <= 0x7ff) {
// 00000000 00000000 00000xxx xx000000
handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
// 00000000 00000000 00000000 00xxxxxx
handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
} else if (t.char_code <= 0xffff) {
// 00000000 00000000 xxxx0000 00000000
handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12)));
// 00000000 00000000 0000xxxx xx000000
handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
// 00000000 00000000 00000000 00xxxxxx
handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
} else if (t.char_code <= 0x10ffff) {
// 00000000 000xxx00 00000000 00000000
handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18)));
// 00000000 000000xx xxxx0000 00000000
handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f)));
// 00000000 00000000 0000xxxx xx000000
handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
// 00000000 00000000 00000000 00xxxxxx
handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
}
} else {
assert(t.char_code <= 255);
handle_string_escape(&t, (uint8_t)t.char_code);
}
if (!t.unicode && t.char_code_index >= 2) {
assert(t.char_code <= 255);
handle_string_escape(&t, (uint8_t)t.char_code);
}
}
break;
Expand Down Expand Up @@ -1409,6 +1415,7 @@ void tokenize(Buf *buf, Tokenization *out) {
tokenize_error(&t, "unterminated string");
break;
case TokenizeStateStringEscape:
case TokenizeStateStringEscapeUnicodeStart:
case TokenizeStateCharCode:
if (t.cur_tok->id == TokenIdStringLiteral) {
tokenize_error(&t, "unterminated string");
Expand Down
2 changes: 1 addition & 1 deletion std/zig/parser_test.zig
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ test "zig fmt: enum literal inside array literal" {

test "zig fmt: character literal larger than u8" {
try testCanonical(
\\const x = '\U01f4a9';
\\const x = '\u{01f4a9}';
\\
);
}
Expand Down
113 changes: 100 additions & 13 deletions std/zig/tokenizer.zig
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,9 @@ pub const Tokenizer = struct {
CharLiteral,
CharLiteralBackslash,
CharLiteralHexEscape,
CharLiteralUnicodeEscapeSawU,
CharLiteralUnicodeEscape,
CharLiteralUnicodeInvalid,
CharLiteralEnd,
Backslash,
Equal,
Expand Down Expand Up @@ -296,7 +299,6 @@ pub const Tokenizer = struct {
.end = undefined,
};
var seen_escape_digits: usize = undefined;
var expected_escape_digits: usize = undefined;
while (self.index < self.buffer.len) : (self.index += 1) {
const c = self.buffer[self.index];
switch (state) {
Expand Down Expand Up @@ -664,27 +666,19 @@ pub const Tokenizer = struct {
'x' => {
state = State.CharLiteralHexEscape;
seen_escape_digits = 0;
expected_escape_digits = 2;
},
'u' => {
state = State.CharLiteralHexEscape;
seen_escape_digits = 0;
expected_escape_digits = 4;
},
'U' => {
state = State.CharLiteralHexEscape;
seen_escape_digits = 0;
expected_escape_digits = 6;
state = State.CharLiteralUnicodeEscapeSawU;
},
else => {
state = State.CharLiteralEnd;
},
},

State.CharLiteralHexEscape => switch (c) {
'0'...'9', 'a'...'z', 'A'...'F' => {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume the old `'a'...'z'` range was a bug (found when new tests were added); hex digits only go up to 'f'.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep. thanks!

'0'...'9', 'a'...'f', 'A'...'F' => {
seen_escape_digits += 1;
if (seen_escape_digits == expected_escape_digits) {
if (seen_escape_digits == 2) {
state = State.CharLiteralEnd;
}
},
Expand All @@ -694,6 +688,43 @@ pub const Tokenizer = struct {
},
},

State.CharLiteralUnicodeEscapeSawU => switch (c) {
'{' => {
state = State.CharLiteralUnicodeEscape;
seen_escape_digits = 0;
},
else => {
result.id = Token.Id.Invalid;
state = State.CharLiteralUnicodeInvalid;
},
},

State.CharLiteralUnicodeEscape => switch (c) {
'0'...'9', 'a'...'f', 'A'...'F' => {
seen_escape_digits += 1;
},
'}' => {
if (seen_escape_digits == 0) {
result.id = Token.Id.Invalid;
state = State.CharLiteralUnicodeInvalid;
} else {
state = State.CharLiteralEnd;
}
},
else => {
result.id = Token.Id.Invalid;
state = State.CharLiteralUnicodeInvalid;
},
},

State.CharLiteralUnicodeInvalid => switch (c) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I got a little creative here because I thought this behavior might prevent some confusing error output. If it doesn't actually help, I'd be totally fine removing this special state.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's run with this and see what happens.

// Keep consuming characters until an obvious stopping point.
// This consolidates e.g. `u{0ab1Q}` into a single invalid token
// instead of creating the tokens `u{0ab1`, `Q`, `}`
'0'...'9', 'a'...'z', 'A'...'Z', '}' => {},
else => break,
},

State.CharLiteralEnd => switch (c) {
'\'' => {
result.id = Token.Id.CharLiteral;
Expand Down Expand Up @@ -1055,6 +1086,9 @@ pub const Tokenizer = struct {
State.CharLiteral,
State.CharLiteralBackslash,
State.CharLiteralHexEscape,
State.CharLiteralUnicodeEscapeSawU,
State.CharLiteralUnicodeEscape,
State.CharLiteralUnicodeInvalid,
State.CharLiteralEnd,
State.StringLiteralBackslash,
State.LBracketStar,
Expand Down Expand Up @@ -1208,7 +1242,60 @@ test "tokenizer - unknown length pointer and then c pointer" {
test "tokenizer - char literal with hex escape" {
testTokenize(
\\'\x1b'
, [_]Token.Id{Token.Id.CharLiteral});
, [_]Token.Id{.CharLiteral});
testTokenize(
\\'\x1'
, [_]Token.Id{ .Invalid, .Invalid });
}

// Checks how the tokenizer classifies the braced unicode escape `\u{...}`
// inside character and string literals. Each case feeds one source snippet
// to testTokenize together with the exact sequence of token ids expected.
test "tokenizer - char literal with unicode escapes" {
// Valid unicode escapes: one or more hex digits between `{` and `}`.
testTokenize(
\\'\u{3}'
, [_]Token.Id{.CharLiteral});
testTokenize(
\\'\u{01}'
, [_]Token.Id{.CharLiteral});
testTokenize(
\\'\u{2a}'
, [_]Token.Id{.CharLiteral});
testTokenize(
\\'\u{3f9}'
, [_]Token.Id{.CharLiteral});
// Mixed-case digits are accepted. The expected id is still .CharLiteral even
// though the value is far above 0x10ffff: the CharLiteralUnicodeEscape state
// only counts digits and does not range-check the code point.
testTokenize(
\\'\u{6E09aBc1523}'
, [_]Token.Id{.CharLiteral});
// The same escape form inside a string literal.
testTokenize(
\\"\u{440}"
, [_]Token.Id{.StringLiteral});

// Invalid unicode escapes: missing `{`, empty braces, non-hex characters, or
// a missing `}`. Where two .Invalid tokens are expected, the second one is
// presumably the leftover closing quote being tokenized as the start of an
// unterminated char literal -- NOTE(review): confirm against the CharLiteral
// states.
testTokenize(
\\'\u'
, [_]Token.Id{.Invalid});
testTokenize(
\\'\u{{'
, [_]Token.Id{ .Invalid, .Invalid });
testTokenize(
\\'\u{}'
, [_]Token.Id{ .Invalid, .Invalid });
testTokenize(
\\'\u{s}'
, [_]Token.Id{ .Invalid, .Invalid });
testTokenize(
\\'\u{2z}'
, [_]Token.Id{ .Invalid, .Invalid });
testTokenize(
\\'\u{4a'
, [_]Token.Id{.Invalid});

// Test old-style unicode literals: the brace-less `\uNNNN` form and the `\U`
// form are expected to be rejected now that only `\u{...}` is recognized.
testTokenize(
\\'\u0333'
, [_]Token.Id{ .Invalid, .Invalid });
testTokenize(
\\'\U0333'
, [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid });
}

test "tokenizer - float literal e exponent" {
Expand Down
18 changes: 18 additions & 0 deletions test/compile_errors.zig
Original file line number Diff line number Diff line change
Expand Up @@ -5414,6 +5414,24 @@ pub fn addCases(cases: *tests.CompileErrorContext) void {
"tmp.zig:1:17: error: invalid carriage return, only '\\n' line endings are supported",
);

cases.add(
"invalid legacy unicode escape",
\\export fn entry() void {
\\ const a = '\U1234';
\\}
,
"tmp.zig:2:17: error: invalid character: 'U'",
);

cases.add(
"invalid empty unicode escape",
\\export fn entry() void {
\\ const a = '\u{}';
\\}
,
"tmp.zig:2:19: error: empty unicode escape sequence",
);

cases.add(
"non-printable invalid character",
"\xff\xfe" ++
Expand Down
Loading