Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions doc/langref.html.in
Original file line number Diff line number Diff line change
Expand Up @@ -552,8 +552,7 @@ pub fn main() void {
<p>
Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as
{#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals
and character literals. Once https://github.com/ziglang/zig/issues/2097 is implemented,
character literals will be allowed to have a single UTF-8 encoded codepoint.
and character literals.
</p>
{#code_begin|test#}
const assert = @import("std").debug.assert;
Expand All @@ -567,6 +566,7 @@ test "string literals" {
assert(normal_bytes[1] == 'e');
assert('e' == '\x65');
assert('\u{1f4a9}' == 128169);
assert('💯' == 128175);
assert(mem.eql(u8, "hello", "h\x65llo"));

// A C string literal is a null terminated pointer.
Expand Down
41 changes: 35 additions & 6 deletions lib/std/zig/tokenizer.zig
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,7 @@ pub const Tokenizer = struct {
CharLiteralUnicodeEscapeSawU,
CharLiteralUnicodeEscape,
CharLiteralUnicodeInvalid,
CharLiteralUnicode,
CharLiteralEnd,
Backslash,
Equal,
Expand Down Expand Up @@ -427,6 +428,7 @@ pub const Tokenizer = struct {
.end = undefined,
};
var seen_escape_digits: usize = undefined;
var remaining_code_units: usize = undefined;
while (self.index < self.buffer.len) : (self.index += 1) {
const c = self.buffer[self.index];
switch (state) {
Expand Down Expand Up @@ -774,16 +776,23 @@ pub const Tokenizer = struct {
'\\' => {
state = State.CharLiteralBackslash;
},
'\'' => {
'\'', 0x80...0xbf, 0xf8...0xff => {
result.id = Token.Id.Invalid;
break;
},
0xc0...0xdf => { // 110xxxxx
remaining_code_units = 1;
state = State.CharLiteralUnicode;
},
0xe0...0xef => { // 1110xxxx
remaining_code_units = 2;
state = State.CharLiteralUnicode;
},
0xf0...0xf7 => { // 11110xxx
remaining_code_units = 3;
state = State.CharLiteralUnicode;
},
else => {
if (c < 0x20 or c == 0x7f) {
result.id = Token.Id.Invalid;
break;
}

state = State.CharLiteralEnd;
},
},
Expand Down Expand Up @@ -867,6 +876,19 @@ pub const Tokenizer = struct {
},
},

State.CharLiteralUnicode => switch (c) {
0x80...0xbf => {
remaining_code_units -= 1;
if (remaining_code_units == 0) {
state = State.CharLiteralEnd;
}
},
else => {
result.id = Token.Id.Invalid;
break;
},
},

State.MultilineStringLiteralLine => switch (c) {
'\n' => {
self.index += 1;
Expand Down Expand Up @@ -1220,6 +1242,7 @@ pub const Tokenizer = struct {
State.CharLiteralUnicodeEscape,
State.CharLiteralUnicodeInvalid,
State.CharLiteralEnd,
State.CharLiteralUnicode,
State.StringLiteralBackslash,
State.LBracketStar,
State.LBracketStarC,
Expand Down Expand Up @@ -1428,6 +1451,12 @@ test "tokenizer - char literal with unicode escapes" {
, [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid });
}

// Verifies that a raw (non-escaped) multi-byte UTF-8 encoded code point —
// here '💩' (U+1F4A9, a 4-byte UTF-8 sequence) — placed directly inside a
// character literal is tokenized as a single valid CharLiteral token,
// exercising the new CharLiteralUnicode tokenizer state.
test "tokenizer - char literal with unicode code point" {
    testTokenize(
        \\'💩'
    , [_]Token.Id{.CharLiteral});
}

test "tokenizer - float literal e exponent" {
testTokenize("a = 4.94065645841246544177e-324;\n", [_]Token.Id{
Token.Id.Identifier,
Expand Down
51 changes: 40 additions & 11 deletions src/tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ enum TokenizeState {
TokenizeStateStringEscapeUnicodeStart,
TokenizeStateCharLiteral,
TokenizeStateCharLiteralEnd,
TokenizeStateCharLiteralUnicode,
TokenizeStateSawStar,
TokenizeStateSawStarPercent,
TokenizeStateSawSlash,
Expand Down Expand Up @@ -247,6 +248,7 @@ struct Tokenize {
int exponent_in_bin_or_dec;
BigInt specified_exponent;
BigInt significand;
size_t remaining_code_units;
};

ATTRIBUTE_PRINTF(2, 3)
Expand Down Expand Up @@ -1176,17 +1178,32 @@ void tokenize(Buf *buf, Tokenization *out) {
}
break;
case TokenizeStateCharLiteral:
switch (c) {
case '\'':
tokenize_error(&t, "expected character");
break;
case '\\':
t.state = TokenizeStateStringEscape;
break;
default:
t.cur_tok->data.char_lit.c = c;
t.state = TokenizeStateCharLiteralEnd;
break;
if (c == '\'') {
tokenize_error(&t, "expected character");
} else if (c == '\\') {
t.state = TokenizeStateStringEscape;
} else if ((c >= 0x80 && c <= 0xbf) || c >= 0xf8) {
// 10xxxxxx
// 11111xxx
invalid_char_error(&t, c);
} else if (c >= 0xc0 && c <= 0xdf) {
// 110xxxxx
t.cur_tok->data.char_lit.c = c & 0x1f;
t.remaining_code_units = 1;
t.state = TokenizeStateCharLiteralUnicode;
} else if (c >= 0xe0 && c <= 0xef) {
// 1110xxxx
t.cur_tok->data.char_lit.c = c & 0x0f;
t.remaining_code_units = 2;
t.state = TokenizeStateCharLiteralUnicode;
} else if (c >= 0xf0 && c <= 0xf7) {
// 11110xxx
t.cur_tok->data.char_lit.c = c & 0x07;
t.remaining_code_units = 3;
t.state = TokenizeStateCharLiteralUnicode;
} else {
t.cur_tok->data.char_lit.c = c;
t.state = TokenizeStateCharLiteralEnd;
}
break;
case TokenizeStateCharLiteralEnd:
Expand All @@ -1199,6 +1216,17 @@ void tokenize(Buf *buf, Tokenization *out) {
invalid_char_error(&t, c);
}
break;
case TokenizeStateCharLiteralUnicode:
if (c <= 0x7f || c >= 0xc0) {
invalid_char_error(&t, c);
}
t.cur_tok->data.char_lit.c <<= 6;
t.cur_tok->data.char_lit.c += c & 0x3f;
t.remaining_code_units--;
if (t.remaining_code_units == 0) {
t.state = TokenizeStateCharLiteralEnd;
}
break;
case TokenizeStateZero:
switch (c) {
case 'b':
Expand Down Expand Up @@ -1434,6 +1462,7 @@ void tokenize(Buf *buf, Tokenization *out) {
break;
case TokenizeStateCharLiteral:
case TokenizeStateCharLiteralEnd:
case TokenizeStateCharLiteralUnicode:
tokenize_error(&t, "unterminated character literal");
break;
case TokenizeStateSymbol:
Expand Down
4 changes: 4 additions & 0 deletions test/stage1/behavior/misc.zig
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,10 @@ test "unicode escape in character literal" {
expect(a == 128169);
}

// Behavior test: a character literal containing a raw UTF-8 encoded code
// point evaluates to that code point's numeric value.
// '💩' is U+1F4A9, i.e. 128169 in decimal.
test "unicode character in character literal" {
    expect('💩' == 128169);
}

test "result location zero sized array inside struct field implicit cast to slice" {
const E = struct {
entries: []u32,
Expand Down