From 4734249824c197b9ce0911a80ab1815b48cbcc3a Mon Sep 17 00:00:00 2001 From: takejohn Date: Thu, 11 Sep 2025 15:58:37 +0900 Subject: [PATCH 1/8] =?UTF-8?q?CharStream=E3=81=A7=E3=82=B5=E3=83=AD?= =?UTF-8?q?=E3=82=B2=E3=83=BC=E3=83=88=E3=83=9A=E3=82=A2=E3=82=921?= =?UTF-8?q?=E6=96=87=E5=AD=97=E3=81=A8=E3=81=97=E3=81=A6=E6=89=B1=E3=81=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/parser/streams/char-stream.ts | 68 ++++++++++++++++++------ src/utils/characters.ts | 88 +++++++++++++++++++++++++++++++ test/parser.ts | 51 ++++++++++++++++++ 3 files changed, 192 insertions(+), 15 deletions(-) create mode 100644 src/utils/characters.ts diff --git a/src/parser/streams/char-stream.ts b/src/parser/streams/char-stream.ts index 58b36793c..a79d2594f 100644 --- a/src/parser/streams/char-stream.ts +++ b/src/parser/streams/char-stream.ts @@ -1,3 +1,5 @@ +import { isSurrogatePair } from '../../utils/characters.js'; + /** * 入力文字列から文字を読み取るクラス */ @@ -6,11 +8,13 @@ export class CharStream { private firstPageIndex: number; private lastPageIndex: number; private pageIndex: number; + /** based on UTF-16 code unit */ private address: number; + /** Unicode character */ private _char?: string; - /** zero-based number */ + /** zero-based number, based on Unicode code points */ private line: number; - /** zero-based number */ + /** zero-based number, based on Unicode code points */ private column: number; constructor(source: string, opts?: { line?: number, column?: number }) { @@ -70,8 +74,18 @@ export class CharStream { * カーソル位置を前の文字へ戻します。 */ public prev(): void { - this.decAddr(); this.movePrev(); + this.decAddr(); + if (!this.startOfFile && this._char === '\n') { + this.line--; + const page = this.pages.get(this.pageIndex)!; + const lastLineBreak = page.lastIndexOf('\n', this.address - 1); + const lineStart = lastLineBreak >= 0 ? 
lastLineBreak + 1 : 0; + const line = page.slice(lineStart, this.address); + this.column = [...line].length - 1; + } else { + this.column--; + } } private get isFirstPage(): boolean { @@ -87,6 +101,10 @@ export class CharStream { return (this.address >= page.length); } + private get startOfFile(): boolean { + return this.isFirstPage && this.address === 0; + } + private moveNext(): void { this.loadChar(); while (true) { @@ -101,7 +119,7 @@ export class CharStream { private incAddr(): void { if (!this.endOfPage) { - this.address++; + this.address += this._char!.length; } else if (!this.isLastPage) { this.pageIndex++; this.address = 0; @@ -109,23 +127,20 @@ export class CharStream { } private movePrev(): void { - this.loadChar(); - while (true) { - if (!this.eof && this._char === '\r') { - this.decAddr(); - this.loadChar(); - continue; - } - break; + this.loadPrevChar(); + while (!this.startOfFile && this._char === '\r') { + this.decAddr(); + this.loadPrevChar(); } } private decAddr(): void { if (this.address > 0) { - this.address--; + this.address -= getLastUnicodeChar(this.pages.get(this.pageIndex)!, this.address)!.length; } else if (!this.isFirstPage) { this.pageIndex--; - this.address = this.pages.get(this.pageIndex)!.length - 1; + const page = this.pages.get(this.pageIndex)!; + this.address = page.length - getLastUnicodeChar(page)!.length; } } @@ -133,7 +148,30 @@ export class CharStream { if (this.eof) { this._char = undefined; } else { - this._char = this.pages.get(this.pageIndex)![this.address]!; + this._char = getUnicodeChar(this.pages.get(this.pageIndex)!, this.address); + } + } + + private loadPrevChar(): void { + if (this.address > 0) { + this._char = getLastUnicodeChar(this.pages.get(this.pageIndex)!, this.address)!; + } else if (!this.isFirstPage) { + const page = this.pages.get(this.pageIndex - 1)!; + this._char = getLastUnicodeChar(page)!; } } } + +function getUnicodeChar(string: string, position = 0): string | undefined { + if (isSurrogatePair(string, 
position)) { + return string.slice(position, position + 2); + } + return string[position]; +} + +function getLastUnicodeChar(string: string, position = string.length): string | undefined { + if (isSurrogatePair(string, position - 2)) { + return string.slice(position - 2, position); + } + return string[position - 1]; +} diff --git a/src/utils/characters.ts b/src/utils/characters.ts new file mode 100644 index 000000000..112faa81e --- /dev/null +++ b/src/utils/characters.ts @@ -0,0 +1,88 @@ +const MIN_HIGH_SURROGATE = 0xD800; +const MAX_HIGH_SURROGATE = 0xDBFF; +const MIN_LOW_SURROGATE = 0xDC00; +const MAX_LOW_SURROGATE = 0xDFFF; +const UNICODE_LETTER = /^[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}$_]$/u; +const UNICODE_COMBINING_MARK = /^[\p{Mn}\p{Mc}]$/u; +const UNICODE_DIGIT = /^\p{Nd}$/u; +const UNICODE_CONNECTOR_PUNCTUATION = /^\p{Pc}$/u; +const ZERO_WIDTH_NON_JOINER = String.fromCodePoint(0x200C); +const ZERO_WIDTH_JOINER = String.fromCharCode(0x200D); + +export function isHighSurrogate(string: string, index = 0): boolean { + if (index < 0 || index >= string.length) { + return false; + } + const charCode = string.charCodeAt(index); + return charCode >= MIN_HIGH_SURROGATE && charCode <= MAX_HIGH_SURROGATE; +} + +export function isLowSurrogate(string: string, index = 0): boolean { + if (index < 0 || index >= string.length) { + return false; + } + const charCode = string.charCodeAt(index); + return charCode >= MIN_LOW_SURROGATE && charCode <= MAX_LOW_SURROGATE; +} + +export function isSurrogatePair(string: string, start = 0): boolean { + return isHighSurrogate(string, start) && isLowSurrogate(string, start + 1); +} + +export function isIdentifierStart(char: string): boolean { + return UNICODE_LETTER.test(char) || char === '$' || char === '_'; +} + +export function isIdentifierPart(char: string): boolean { + return UNICODE_LETTER.test(char) + || UNICODE_COMBINING_MARK.test(char) + || UNICODE_DIGIT.test(char) + || UNICODE_CONNECTOR_PUNCTUATION.test(char) + || char === 
ZERO_WIDTH_NON_JOINER + || char === ZERO_WIDTH_JOINER; +} + +export function decodeUnicodeEscapeSequence(string: string): string { + let result = ''; + let state: 'string' | 'escape' | `digit` = 'string'; + let digits = ''; + + for (let i = 0; i < string.length; i++) { + const char = string[i]!; + + switch (state) { + case 'string': { + if (char === '\\') { + state = 'escape'; + } else { + result += char; + } + break; + } + + case 'escape': { + if (char !== 'u') { + throw new SyntaxError('invalid escape sequence'); + } + state = 'digit'; + break; + } + + case 'digit': { + if ((char >= '0' && char <= '9') || (char >= 'a' && char <= 'f') || (char >= 'A' && char <= 'F')) { + digits += char; + } else { + throw new SyntaxError('invalid escape sequence'); + } + if (digits.length === 4) { + result += String.fromCharCode(Number.parseInt(digits, 16)); + state = 'string'; + digits = ''; + } + break; + } + } + } + + return result; +} diff --git a/test/parser.ts b/test/parser.ts index c2588b01f..47be63002 100644 --- a/test/parser.ts +++ b/test/parser.ts @@ -34,6 +34,49 @@ describe('CharStream', () => { stream.prev(); assert.strictEqual('a', stream.char); }); + + test.concurrent('line break', async () => { + const source = 'a\nb'; + const stream = new CharStream(source); + assert.strictEqual('a', stream.char); + stream.next(); + assert.strictEqual('\n', stream.char); + stream.next(); + assert.strictEqual('b', stream.char); + stream.prev(); + assert.strictEqual('\n', stream.char); + assert.deepStrictEqual(stream.getPos(), { line: 1, column: 1 }); + }); + + test.concurrent('line breaks', async () => { + const source = '\n\nc'; + const stream = new CharStream(source); + stream.next(); + stream.next(); + assert.strictEqual('c', stream.char); + stream.prev(); + assert.strictEqual('\n', stream.char); + assert.deepStrictEqual(stream.getPos(), { line: 2, column: 0 }); + }); + + test.concurrent('CRは読み飛ばされる', async () => { + const source = 'a\r\nb'; + const stream = new 
CharStream(source); + stream.next(); + assert.strictEqual('\n', stream.char); + stream.prev(); + assert.strictEqual('a', stream.char); + }); + + test.concurrent('surrogate pair', async () => { + const source = '\ud83e\udd2f'; + const stream = new CharStream(source); + assert.strictEqual('\ud83e\udd2f', stream.char); + stream.next(); + assert.strictEqual(true, stream.eof); + stream.prev(); + assert.strictEqual('\ud83e\udd2f', stream.char); + }); }); test.concurrent('eof', async () => { @@ -71,6 +114,14 @@ describe('CharStream', () => { stream.next(); assert.strictEqual(true, stream.eof); }); + + test.concurrent('surrogate pair', async () => { + const source = '\ud83e\udd2f'; + const stream = new CharStream(source); + assert.strictEqual('\ud83e\udd2f', stream.char); + stream.next(); + assert.strictEqual(true, stream.eof); + }); }); describe('Scanner', () => { From d89995ba9adb1483eb71b1cf1c9dccd3d9738cac Mon Sep 17 00:00:00 2001 From: takejohn Date: Fri, 12 Sep 2025 15:48:43 +0900 Subject: [PATCH 2/8] =?UTF-8?q?JSON5=E3=81=AE=E8=AD=98=E5=88=A5=E5=AD=90?= =?UTF-8?q?=E3=82=92=E4=BD=BF=E3=81=88=E3=82=8B=E3=82=88=E3=81=86=E3=81=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/parser/plugins/validate-keyword.ts | 94 +++++--- src/parser/scanner.ts | 92 +++++++- src/utils/characters.ts | 26 +- test/characters.ts | 187 +++++++++++++++ test/identifiers.ts | 315 +++++++++++++++++++++++++ test/keywords.ts | 167 ------------- test/parser.ts | 2 +- unreleased/json5-identifiers.md | 22 ++ 8 files changed, 684 insertions(+), 221 deletions(-) create mode 100644 test/characters.ts create mode 100644 test/identifiers.ts delete mode 100644 test/keywords.ts create mode 100644 unreleased/json5-identifiers.md diff --git a/src/parser/plugins/validate-keyword.ts b/src/parser/plugins/validate-keyword.ts index 0cbff01da..9446ed53b 100644 --- a/src/parser/plugins/validate-keyword.ts +++ b/src/parser/plugins/validate-keyword.ts @@ -3,7 +3,7 
@@ import { visitNode } from '../visit.js'; import type * as Ast from '../../node.js'; // 予約語となっている識別子があるかを確認する。 -// - キーワードは字句解析の段階でそれぞれのKeywordトークンとなるため除外 +// - キーワードは字句解析の段階でそれぞれのKeywordトークンとなるが、エスケープシーケンスを含む場合はIdentifierトークンとなるので検証を行う。 // - 文脈キーワードは識別子に利用できるため除外 const reservedWord = [ @@ -52,25 +52,63 @@ const reservedWord = [ 'new', ]; -function throwReservedWordError(name: string, loc: Ast.Loc): void { - throw new AiScriptSyntaxError(`Reserved word "${name}" cannot be used as variable name.`, loc.start); +const keywords = [ + 'null', + 'true', + 'false', + 'each', + 'for', + 'loop', + 'do', + 'while', + 'break', + 'continue', + 'match', + 'case', + 'default', + 'if', + 'elif', + 'else', + 'return', + 'eval', + 'var', + 'let', + 'exists', +]; + +function validateName(name: string, pos: Ast.Pos): void { + if (reservedWord.includes(name)) { + throw new AiScriptSyntaxError(`Reserved word "${name}" cannot be used as identifier.`, pos); + } + if (keywords.includes(name)) { + throw new AiScriptSyntaxError(`Keyword "${name}" cannot be used as identifier.`, pos); + } +} + +function validateTypeName(name: string, pos: Ast.Pos): void { + if (name === 'null') { + return; + } + validateName(name, pos); +} + +function throwReservedWordError(name: string, pos: Ast.Pos): void { + throw new AiScriptSyntaxError(`Reserved word "${name}" cannot be used as variable name.`, pos); } function validateDest(node: Ast.Node): Ast.Node { return visitNode(node, node => { switch (node.type) { case 'null': { - throwReservedWordError(node.type, node.loc); + throwReservedWordError(node.type, node.loc.start); break; } case 'bool': { - throwReservedWordError(`${node.value}`, node.loc); + throwReservedWordError(`${node.value}`, node.loc.start); break; } case 'identifier': { - if (reservedWord.includes(node.name)) { - throwReservedWordError(node.name, node.loc); - } + validateName(node.name, node.loc.start); break; } } @@ -81,9 +119,7 @@ function validateDest(node: Ast.Node): Ast.Node { function 
validateTypeParams(node: Ast.Fn | Ast.FnTypeSource): void { for (const typeParam of node.typeParams) { - if (reservedWord.includes(typeParam.name)) { - throwReservedWordError(typeParam.name, node.loc); - } + validateTypeName(typeParam.name, node.loc.start); } } @@ -97,48 +133,46 @@ function validateNode(node: Ast.Node): Ast.Node { case 'attr': case 'identifier': case 'prop': { - if (reservedWord.includes(node.name)) { - throwReservedWordError(node.name, node.loc); - } + validateName(node.name, node.loc.start); break; } case 'meta': { - if (node.name != null && reservedWord.includes(node.name)) { - throwReservedWordError(node.name, node.loc); + if (node.name != null) { + validateName(node.name, node.loc.start); } break; } case 'each': { - if (node.label != null && reservedWord.includes(node.label)) { - throwReservedWordError(node.label, node.loc); + if (node.label != null) { + validateName(node.label, node.loc.start); } validateDest(node.var); break; } case 'for': { - if (node.label != null && reservedWord.includes(node.label)) { - throwReservedWordError(node.label, node.loc); + if (node.label != null) { + validateName(node.label, node.loc.start); } - if (node.var != null && reservedWord.includes(node.var)) { - throwReservedWordError(node.var, node.loc); + if (node.var != null) { + validateName(node.var, node.loc.start); } break; } case 'loop': { - if (node.label != null && reservedWord.includes(node.label)) { - throwReservedWordError(node.label, node.loc); + if (node.label != null) { + validateName(node.label, node.loc.start); } break; } case 'break': { - if (node.label != null && reservedWord.includes(node.label)) { - throwReservedWordError(node.label, node.loc); + if (node.label != null) { + validateName(node.label, node.loc.start); } break; } case 'continue': { - if (node.label != null && reservedWord.includes(node.label)) { - throwReservedWordError(node.label, node.loc); + if (node.label != null) { + validateName(node.label, node.loc.start); } break; } @@ 
-150,9 +184,7 @@ function validateNode(node: Ast.Node): Ast.Node { break; } case 'namedTypeSource': { - if (reservedWord.includes(node.name)) { - throwReservedWordError(node.name, node.loc); - } + validateTypeName(node.name, node.loc.start); break; } case 'fnTypeSource': { diff --git a/src/parser/scanner.ts b/src/parser/scanner.ts index ac55843fa..20109b35c 100644 --- a/src/parser/scanner.ts +++ b/src/parser/scanner.ts @@ -1,4 +1,5 @@ import { AiScriptSyntaxError, AiScriptUnexpectedEOFError } from '../error.js'; +import { decodeUnicodeEscapeSequence, isIdentifierPart, isIdentifierStart } from '../utils/characters.js'; import { CharStream } from './streams/char-stream.js'; import { TOKEN, TokenKind } from './token.js'; import { unexpectedTokenError } from './utils.js'; @@ -9,7 +10,7 @@ import type { Token, TokenPosition } from './token.js'; const spaceChars = [' ', '\t']; const lineBreakChars = ['\r', '\n']; const digit = /^[0-9]$/; -const wordChar = /^[A-Za-z0-9_]$/; +const hexDigit = /^[0-9a-fA-F]$/; const exponentIndicatorPattern = /^[eE]$/; /** @@ -282,6 +283,11 @@ export class Scanner implements ITokenStream { } case '\\': { this.stream.next(); + if (!this.stream.eof && (this.stream.char as string) === 'u') { + this.stream.prev(); + const wordToken = this.tryReadWord(hasLeftSpacing); + if (wordToken) return wordToken; + } return TOKEN(TokenKind.BackSlash, pos, { hasLeftSpacing }); } case ']': { @@ -332,17 +338,39 @@ export class Scanner implements ITokenStream { private tryReadWord(hasLeftSpacing: boolean): Token | undefined { // read a word - let value = ''; + if (this.stream.eof) { + return; + } const pos = this.stream.getPos(); - while (!this.stream.eof && wordChar.test(this.stream.char)) { - value += this.stream.char; - this.stream.next(); - } - if (value.length === 0) { + let rawValue = this.tryReadIdentifierStart(); + if (rawValue === undefined) { return; } + while (!(this.stream.eof as boolean)) { + const matchedIdentifierPart = 
this.tryReadIdentifierPart(); + if (matchedIdentifierPart === undefined) { + break; + } + rawValue += matchedIdentifierPart; + } + + const value = decodeUnicodeEscapeSequence(rawValue); + const [start, ...parts] = value; + if (!isIdentifierStart(start!)) { + throw new AiScriptSyntaxError(`Invalid identifier: "${value}"`, pos); + } + for (const part of parts) { + if (!isIdentifierPart(part)) { + throw new AiScriptSyntaxError(`Invalid identifier: "${value}"`, pos); + } + } + + if (value !== rawValue) { + return TOKEN(TokenKind.Identifier, pos, { hasLeftSpacing, value }); + } + // check word kind switch (value) { case 'null': { @@ -414,6 +442,56 @@ export class Scanner implements ITokenStream { } } + private tryReadIdentifierStart(): string | undefined { + if (this.stream.eof) { + return; + } + if (isIdentifierStart(this.stream.char)) { + const value = this.stream.char; + this.stream.next(); + return value; + } + if (this.stream.char === '\\') { + this.stream.next(); + return '\\' + this.readUnicodeEscapeSequence(); + } + return; + } + + private tryReadIdentifierPart(): string | undefined { + if (this.stream.eof) { + return; + } + const matchedIdentifierStart = this.tryReadIdentifierStart(); + if (matchedIdentifierStart !== undefined) { + return matchedIdentifierStart; + } + if (isIdentifierPart(this.stream.char)) { + const value = this.stream.char; + this.stream.next(); + return value; + } + return; + } + + private readUnicodeEscapeSequence(): `u${string}` { + if (this.stream.eof || (this.stream.char as string) !== 'u') { + throw new AiScriptSyntaxError('character "u" expected', this.stream.getPos()); + } + this.stream.next(); + + let code = ''; + for (let i = 0; i < 4; i++) { + if (this.stream.eof || !hexDigit.test(this.stream.char)) { + throw new AiScriptSyntaxError('hexadecimal digit expected', this.stream.getPos()); + } + code += this.stream.char; + this.stream.next(); + } + + return `u${code}`; + } + private tryReadDigits(hasLeftSpacing: boolean): Token | 
undefined { let wholeNumber = ''; let fractional = ''; diff --git a/src/utils/characters.ts b/src/utils/characters.ts index 112faa81e..f8a7825a7 100644 --- a/src/utils/characters.ts +++ b/src/utils/characters.ts @@ -2,12 +2,9 @@ const MIN_HIGH_SURROGATE = 0xD800; const MAX_HIGH_SURROGATE = 0xDBFF; const MIN_LOW_SURROGATE = 0xDC00; const MAX_LOW_SURROGATE = 0xDFFF; -const UNICODE_LETTER = /^[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}$_]$/u; -const UNICODE_COMBINING_MARK = /^[\p{Mn}\p{Mc}]$/u; -const UNICODE_DIGIT = /^\p{Nd}$/u; -const UNICODE_CONNECTOR_PUNCTUATION = /^\p{Pc}$/u; -const ZERO_WIDTH_NON_JOINER = String.fromCodePoint(0x200C); -const ZERO_WIDTH_JOINER = String.fromCharCode(0x200D); +const IDENTIFIER_START_PATTERN = /^[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}$_]$/u; +const IDENTIFIER_PART_PATTERN = /^[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}$_\p{Mn}\p{Mc}\p{Nd}\p{Pc}\u200c\u200d]$/u; +const HEX_DIGIT = /^[0-9a-fA-F]$/; export function isHighSurrogate(string: string, index = 0): boolean { if (index < 0 || index >= string.length) { @@ -30,21 +27,16 @@ export function isSurrogatePair(string: string, start = 0): boolean { } export function isIdentifierStart(char: string): boolean { - return UNICODE_LETTER.test(char) || char === '$' || char === '_'; + return IDENTIFIER_START_PATTERN.test(char); } export function isIdentifierPart(char: string): boolean { - return UNICODE_LETTER.test(char) - || UNICODE_COMBINING_MARK.test(char) - || UNICODE_DIGIT.test(char) - || UNICODE_CONNECTOR_PUNCTUATION.test(char) - || char === ZERO_WIDTH_NON_JOINER - || char === ZERO_WIDTH_JOINER; + return IDENTIFIER_PART_PATTERN.test(char); } export function decodeUnicodeEscapeSequence(string: string): string { let result = ''; - let state: 'string' | 'escape' | `digit` = 'string'; + let state: 'string' | 'escape' | 'digit' = 'string'; let digits = ''; for (let i = 0; i < string.length; i++) { @@ -69,7 +61,7 @@ export function decodeUnicodeEscapeSequence(string: string): string { } case 'digit': { - if 
((char >= '0' && char <= '9') || (char >= 'a' && char <= 'f') || (char >= 'A' && char <= 'F')) { + if (HEX_DIGIT.test(char)) { digits += char; } else { throw new SyntaxError('invalid escape sequence'); @@ -84,5 +76,9 @@ export function decodeUnicodeEscapeSequence(string: string): string { } } + if (state !== 'string') { + throw new SyntaxError('invalid escape sequence'); + } + return result; } diff --git a/test/characters.ts b/test/characters.ts new file mode 100644 index 000000000..4d5925ad0 --- /dev/null +++ b/test/characters.ts @@ -0,0 +1,187 @@ +import { decodeUnicodeEscapeSequence, isHighSurrogate, isIdentifierPart, isIdentifierStart, isLowSurrogate, isSurrogatePair } from '../src/utils/characters'; +import { describe, expect, test } from 'vitest'; + +describe('isHighSurrogate', () => { + const cases: [string, boolean][] = [ + ['', false], + ['\ud7ff', false], + ['\ud800', true], + ['\udbff', true], + ['\udc00', false], + ['\udfff', false], + ['\ue000', false], + ]; + + test.concurrent.each(cases)('"%s" -> %s', (input, expected) => { + expect(isHighSurrogate(input)).toBe(expected); + }); + + test.concurrent('index out of range', () => { + expect(isHighSurrogate('\uD800', 1)).toBe(false); + }); +}); + +describe('isLowSurrogate', () => { + const cases: [string, boolean][] = [ + ['', false], + ['\ud7ff', false], + ['\ud800', false], + ['\udbff', false], + ['\udc00', true], + ['\udfff', true], + ['\ue000', false], + ]; + + test.concurrent.each(cases)('"%s" -> %s', (input, expected) => { + expect(isLowSurrogate(input)).toBe(expected); + }); + + test.concurrent('index out of range', () => { + expect(isLowSurrogate('\uDC00', 1)).toBe(false); + }); +}); + +describe('isSurrogatePair', () => { + const cases: [string, boolean][] = [ + ['\ud842\udfb7', true], + ['\ud83e\udd2f', true], + ['a', false], + ['\u85cd', false], + ['\ud842', false], + ['\ud8000', false], + ['0\udc00', false], + ['_\ud842\udfb7', false], + ]; + + test.concurrent.each(cases)('"%s" -> %s', (input, 
expected) => { + expect(isSurrogatePair(input)).toBe(expected); + }); + + test.concurrent('start given', () => { + expect(isSurrogatePair('_\ud842\udfb7', 1)).toBe(true); + }); +}); + +describe('isIdentifierStart', () => { + const cases: [string, boolean][] = [ + // UnicodeLetter + ['\u0041', true], // U+0041 (LATIN CAPITAL LETTER A): Uppercase letter (Lu) + ['\u0061', true], // U+0061 (LATIN SMALL LETTER A ): Lowercase letter (Ll) + ['\u01c5', true], // U+01C5 (LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON): Titlecase letter (Lt) + ['\u01c8', true], // U+01C8 (LATIN CAPITAL LETTER L WITH SMALL LETTER J): Titlecase letter (Lt) + ['\u02b0', true], // U+02B0 (Modifier Letter Small H): Modifier letter (Lm) + ['\u03a9', true], // U+03A9 (GREEK CAPITAL LETTER OMEGA): Uppercase letter (Lu) + ['\u03b2', true], // U+03B2 (GREEK SMALL LETTER BETA): Lowercase letter (Ll) + ['\u16ee', true], // U+16EE (Runic Arlaug Symbol): Letter number (Nl) + ['\u2163', true], // U+2163 (Roman Numeral Four): Letter number (Nl) + ['\u3005', true], // U+3005 (Ideographic Iteration Mark): Modifier letter (Lm) + ['\u3042', true], // U+3042 (HIRAGANA LETTER A): Other letter (Lo) + ['\u85cd', true], // U+85CD (CJK Unified Ideograph-85CD): Other letter (Lo) + ['\ud842\udfb7', true], // U+20BB7 (CJK Unified Ideograph-20BB7): Other letter (Lo) + + // $ + ['$', true], + + // _ + ['_', true], + + // Invalid characters + ['\u0021', false], // U+0021 (Exclamation Mark): Other Punctuation (Po) + ['\u0030', false], // U+0030 (Digit Zero): Decimal number (Nd) + ['\u0301', false], // U+0301 (Combining Acute Accent): Non-spacing mark (Mn) + ['\u093e', false], // U+093E (Devanagari Vowel Sign Aa): Combining spacing mark (Mc) + ['\u200c', false], // U+200C (Zero Width Non-Joiner (ZWNJ)): Format (Cf) + ['\u200d', false], // U+200D (Zero Width Joiner (ZWJ)): Format (Cf) + ['\u203f', false], // U+203F (Undertie): Connector punctuation (Pc) + ['\ud83e\udd2f', false], // U+1F92F (Shocked Face 
with Exploding Head): Other Symbol (So) + ]; + + test.concurrent.each(cases)('"%s" -> %s', (input, expected) => { + expect(isIdentifierStart(input)).toBe(expected); + }); +}); + +describe('isIdentifierPart', () => { + const cases: [string, boolean][] = [ + // UnicodeLetter + ['\u0041', true], // U+0041 (LATIN CAPITAL LETTER A): Uppercase letter (Lu) + ['\u0061', true], // U+0061 (LATIN SMALL LETTER A ): Lowercase letter (Ll) + ['\u01c5', true], // U+01C5 (LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON): Titlecase letter (Lt) + ['\u01c8', true], // U+01C8 (LATIN CAPITAL LETTER L WITH SMALL LETTER J): Titlecase letter (Lt) + ['\u02b0', true], // U+02B0 (Modifier Letter Small H): Modifier letter (Lm) + ['\u03a9', true], // U+03A9 (GREEK CAPITAL LETTER OMEGA): Uppercase letter (Lu) + ['\u03b2', true], // U+03B2 (GREEK SMALL LETTER BETA): Lowercase letter (Ll) + ['\u16ee', true], // U+16EE (Runic Arlaug Symbol): Letter number (Nl) + ['\u2163', true], // U+2163 (Roman Numeral Four): Letter number (Nl) + ['\u3005', true], // U+3005 (Ideographic Iteration Mark): Modifier letter (Lm) + ['\u3042', true], // U+3042 (HIRAGANA LETTER A): Other letter (Lo) + ['\u85cd', true], // U+85CD (CJK Unified Ideograph-85CD): Other letter (Lo) + ['\ud842\udfb7', true], // U+20BB7 (CJK Unified Ideograph-20BB7): Other letter (Lo) + + // $ + ['$', true], + + // _ + ['_', true], + + // UnicodeCombiningMark + ['\u0301', true], // U+0301 (Combining Acute Accent): Non-spacing mark (Mn) + ['\u093e', true], // U+093E (Devanagari Vowel Sign Aa): Combining spacing mark (Mc) + + // UnicodeDigit + // Decimal number (Nd) + ['\u0030', true], // U+0030 (Digit Zero): Decimal number (Nd) + + // UnicodeConnectorPunctuation + // Connector punctuation (Pc) + ['\u203f', true], // U+203F (Undertie): Connector punctuation (Pc) + + // ZWNJ + ['\u200c', true], // U+200C (Zero Width Non-Joiner (ZWNJ)): Format (Cf) + + // ZWJ + ['\u200d', true], // U+200D (Zero Width Joiner (ZWJ)): Format (Cf) + + // Invalid 
characters + ['\u0021', false], // U+0021 (Exclamation Mark): Other Punctuation (Po) + ['\ud83e\udd2f', false], // U+1F92F (Shocked Face with Exploding Head): Other Symbol (So) + ]; + + test.concurrent.each(cases)('"%s" -> %s', (input, expected) => { + expect(isIdentifierPart(input)).toBe(expected); + }); +}); + +describe('decodeUnicodeEscapeSequence', () => { + test('plain', () => { + expect(decodeUnicodeEscapeSequence('abc123')).toBe('abc123'); + }); + + test('escape', () => { + expect(decodeUnicodeEscapeSequence('\\u0041')).toBe('A'); + }); + + test('escape lowercase', () => { + expect(decodeUnicodeEscapeSequence('\\u85cd')).toBe('藍'); + }); + + test('escape uppercase', () => { + expect(decodeUnicodeEscapeSequence('\\u85CD')).toBe('藍'); + }); + + test('expects "u", unexpected end', () => { + expect(() => decodeUnicodeEscapeSequence('\\')).toThrow(); + }); + + test('expects "u"', () => { + expect(() => decodeUnicodeEscapeSequence('\\0')).toThrow(); + }); + + test('expects digit, unexpected end', () => { + expect(() => decodeUnicodeEscapeSequence('\\u00')).toThrow(); + }); + + test('expects digit', () => { + expect(() => decodeUnicodeEscapeSequence('\\ug')).toThrow(); + }); +}); diff --git a/test/identifiers.ts b/test/identifiers.ts new file mode 100644 index 000000000..22155f460 --- /dev/null +++ b/test/identifiers.ts @@ -0,0 +1,315 @@ +import { describe, expect, test } from 'vitest'; +import { Parser } from '../src'; +import { AiScriptSyntaxError } from '../src/error'; +import { eq, exe } from './testutils'; +import { NULL, NUM, STR, Value } from '../src/interpreter/value'; + +const reservedWords = [ + // 使用中の語 + 'null', + 'true', + 'false', + 'each', + 'for', + 'do', + 'while', + 'loop', + 'break', + 'continue', + 'match', + 'case', + 'default', + 'if', + 'elif', + 'else', + 'return', + 'eval', + 'var', + 'let', + 'exists', + + // 使用予定の語 + // 文脈キーワードは識別子に利用できるため除外 + 'as', + 'async', + 'attr', + 'attribute', + 'await', + 'catch', + 'class', + // 'const', + 
'component', + 'constructor', + // 'def', + 'dictionary', + 'enum', + 'export', + 'finally', + 'fn', + // 'func', + // 'function', + 'hash', + 'in', + 'interface', + 'out', + 'private', + 'public', + 'ref', + 'static', + 'struct', + 'table', + 'this', + 'throw', + 'trait', + 'try', + 'undefined', + 'use', + 'using', + 'when', + 'yield', + 'import', + 'is', + 'meta', + 'module', + 'namespace', + 'new', +] as const; + +const validIdentifiers = [ + // IdentifierStart + // UnicodeLetter + // Uppercase letter (Lu) + 'A', // U+0041 (LATIN CAPITAL LETTER A) + 'Ω', // U+03A9 (GREEK CAPITAL LETTER OMEGA) + + // Lowercase letter (Ll) + 'a', // U+0061 (LATIN SMALL LETTER A ) + 'β', // U+03B2 (GREEK SMALL LETTER BETA) + + // Titlecase letter (Lt) + 'Dž', // U+01C5 (LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON) + 'Lj', // U+01C8 (LATIN CAPITAL LETTER L WITH SMALL LETTER J) + + // Modifier letter (Lm) + 'ʰ', // U+02B0 (Modifier Letter Small H) + '々', // U+3005 (Ideographic Iteration Mark) + + // Other letter (Lo) + 'あ', // U+3042 (HIRAGANA LETTER A) + '藍', // U+85CD (CJK Unified Ideograph-85CD) + '𠮷', // U+20BB7 (CJK Unified Ideograph-20BB7) + + // Letter number (Nl) + 'ᛮ', // U+16EE (Runic Arlaug Symbol) + 'Ⅳ', // U+2163 (Roman Numeral Four) + + // $ + '$', + + // _ + '_', + + // IdentifierPart + // IdentifierStart + '_A', + '_Ω', + '_a', + '_β', + '_Dž', + '_Lj', + '_ʰ', + '_々', + '_あ', + '_藍', + '_𠮷', + '_ᛮ', + '_Ⅳ', + '_$', + '__', + + // UnicodeCombiningMark + // Non-spacing mark (Mn) + 'á', // U+0301 (Combining Acute Accent) + + // Combining spacing mark (Mc) + 'राम', // U+093E (Devanagari Vowel Sign Aa) + + // UnicodeDigit + // Decimal number (Nd) + 'a0', // U+0030 (Digit Zero) + + // UnicodeConnectorPunctuation + // Connector punctuation (Pc) + 'a‿b', // U+203F (Undertie) + + // + 'बि‌ना', + + // + 'क‍्', +]; + +const validEscapeIdentifiers: [string, string][] = [ + ['\\u85cd', '藍'], + ['\\u85CD', '藍'], + ['\\ud842\\udfb7', '𠮷'], + ['\\uD842\\uDFB7', '𠮷'], + 
['_\\u85cd', '_藍'], + ['_\\u85CD', '_藍'], + ['_\\ud842\\udfb7', '_𠮷'], + ['_\\uD842\\uDFB7', '_𠮷'], +]; + +const invalidIdentifiers = [ + '\\u', + '\\u000x', + '\\u0021', // "!": Other Punctuation (Po) + '\\u0069\\u0066', // "if" + '\\ud83e\\udd2f', '\\uD83E\\uDD2F', // U+1F92F (Shocked Face with Exploding Head): Other Symbol (So) + '_\\u', + '_\\u000x', + '_\\u0021', + '_\\ud83e\\udd2f', + '_\\uD83E\\uDD2F', +]; + +const sampleCodes = Object.entries<[(definedName: string, referredName: string) => string, Value]>({ + variable: [(definedName, referredName) => + ` + let ${definedName} = "ai" + <: ${referredName} + `, STR("ai")], + + function: [(definedName, referredName) => + ` + @${definedName}() { 'ai' } + <: ${referredName}() + `, STR("ai")], + + attribute: [(definedName) => + ` + #[${definedName} 1] + @f() { 1 } + `, NULL], + + namespace: [(definedName, referredName) => + ` + :: ${definedName} { + @f() { 1 } + } + <: ${referredName}:f() + `, NUM(1)], + + prop: [(definedName, referredName) => + ` + let x = { ${definedName}: 1 } + x.${referredName} + `, NUM(1)], + + meta: [(definedName) => + ` + ### ${definedName} 1 + `, NULL], + + forBreak: [(definedName, referredName) => + ` + #${definedName}: for 1 { + break #${referredName} + } + `, NULL], + + eachBreak: [(definedName, referredName) => + ` + #${definedName}: each let v, [0] { + break #${referredName} + } + `, NULL], + + whileBreak: [(definedName, referredName) => + ` + #${definedName}: while false { + break #${referredName} + } + `, NULL], + + forContinue: [(definedName, referredName) => + ` + #${definedName}: for 1 { + continue #${referredName} + } + `, NULL], + + eachContinue: [(definedName, referredName) => + ` + #${definedName}: each let v, [0] { + continue #${referredName} + } + `, NULL], + + whileContinue: [(definedName, referredName) => + ` + var flag = true + #${definedName}: while flag { + flag = false + continue #${referredName} + } + `, NULL], + + typeParam: [(definedName, referredName) => + ` + 
@f<${definedName}>(x): ${referredName} { x } + `, NULL], +}); + +const parser = new Parser(); + +describe.each( + sampleCodes +)('identifier validation on %s', (_, [sampleCode, expected]) => { + + test.concurrent.each( + reservedWords + )('%s must be rejected', (word) => { + expect(() => parser.parse(sampleCode(word, word))).toThrow(AiScriptSyntaxError); + }); + + test.concurrent.each( + reservedWords + )('%scat must be allowed', (word) => { + const wordCat = word + 'cat'; + parser.parse(sampleCode(wordCat, wordCat)); + }); + + test.concurrent.each( + validIdentifiers + )('%s must be allowed', (word) => { + parser.parse(sampleCode(word, word)); + }); + + test.concurrent.each( + validEscapeIdentifiers + )('$0 must be allowed (referred as $1)', (encoded, decoded) => { + parser.parse(sampleCode(encoded, decoded)); + }); + + test.concurrent.each( + validEscapeIdentifiers + )('$1 must be allowed (referred as $0)', async (encoded, decoded) => { + const res = await exe(sampleCode(decoded, encoded)); + eq(res, expected); + }); + + test.concurrent.each( + invalidIdentifiers + )('%s must be rejected', (word) => { + expect(() => parser.parse(sampleCode(word, word))).toThrow(AiScriptSyntaxError); + }); +}); + +test.concurrent('Keyword cannot contain escape characters', async () => { + await expect(async () => await exe(` + \\u0069\\u0066 true { + <: 1 + } + `)).rejects.toThrow(); +}) diff --git a/test/keywords.ts b/test/keywords.ts deleted file mode 100644 index 983b8ea1f..000000000 --- a/test/keywords.ts +++ /dev/null @@ -1,167 +0,0 @@ -import { describe, expect, test } from 'vitest'; -import { Parser } from '../src'; -import { AiScriptSyntaxError } from '../src/error'; - -const reservedWords = [ - // 使用中の語 - 'null', - 'true', - 'false', - 'each', - 'for', - 'do', - 'while', - 'loop', - 'break', - 'continue', - 'match', - 'case', - 'default', - 'if', - 'elif', - 'else', - 'return', - 'eval', - 'var', - 'let', - 'exists', - - // 使用予定の語 - // 文脈キーワードは識別子に利用できるため除外 - 'as', - 
'async', - 'attr', - 'attribute', - 'await', - 'catch', - 'class', - // 'const', - 'component', - 'constructor', - // 'def', - 'dictionary', - 'enum', - 'export', - 'finally', - 'fn', - // 'func', - // 'function', - 'hash', - 'in', - 'interface', - 'out', - 'private', - 'public', - 'ref', - 'static', - 'struct', - 'table', - 'this', - 'throw', - 'trait', - 'try', - 'undefined', - 'use', - 'using', - 'when', - 'yield', - 'import', - 'is', - 'meta', - 'module', - 'namespace', - 'new', -] as const; - -const sampleCodes = Object.entries<(word: string) => string>({ - variable: word => - ` - let ${word} = "ai" - ${word} - `, - - function: word => - ` - @${word}() { 'ai' } - ${word}() - `, - - attribute: word => - ` - #[${word} 1] - @f() { 1 } - `, - - namespace: word => - ` - :: ${word} { - @f() { 1 } - } - ${word}:f() - `, - - prop: word => - ` - let x = { ${word}: 1 } - x.${word} - `, - - meta: word => - ` - ### ${word} 1 - `, - - for: word => - ` - #${word}: for 1 {} - `, - - each: word => - ` - #${word}: each let v, [0] {} - `, - - while: word => - ` - #${word}: while false {} - `, - - break: word => - ` - #${word}: for 1 { - break #${word} - } - `, - - continue: word => - ` - #${word}: for 1 { - continue #${word} - } - `, - - typeParam: word => - ` - @f<${word}>(x): ${word} { x } - `, -}); - -const parser = new Parser(); - -describe.each( - sampleCodes -)('reserved word validation on %s', (_, sampleCode) => { - - test.concurrent.each( - reservedWords - )('%s must be rejected', (word) => { - expect(() => parser.parse(sampleCode(word))).toThrow(AiScriptSyntaxError); - }); - - test.concurrent.each( - reservedWords - )('%scat must be allowed', (word) => { - parser.parse(sampleCode(word+'cat')); - }); - -}); diff --git a/test/parser.ts b/test/parser.ts index 47be63002..80cc15732 100644 --- a/test/parser.ts +++ b/test/parser.ts @@ -153,7 +153,7 @@ describe('Scanner', () => { next(stream, TokenKind.EOF, { line: 1, column: 4 }, { }); }); test.concurrent('invalid token', 
async () => { - const source = '$'; + const source = '~'; try { const stream = new Scanner(source); } catch (e) { diff --git a/unreleased/json5-identifiers.md b/unreleased/json5-identifiers.md new file mode 100644 index 000000000..1f69d641f --- /dev/null +++ b/unreleased/json5-identifiers.md @@ -0,0 +1,22 @@ +- 識別子に使用できる文字の種類を追加 + - 識別子には以下の文字を使用できます。 + - 以下のUnicodeカテゴリに含まれる全ての文字 + - Uppercase letter (Lu) + - Lowercase letter (Ll) + - Titlecase letter (Lt) + - Modifier letter (Lm) + - Other letter (Lo) + - Letter number (Nl) + - `$` + - `_` + - `\u`とそれに続く4桁の16進法の英数字 + - Unicodeエスケープシーケンスとして、与えられた値を持つUTF-16コード単位として解釈されます。 + - 識別子の最初を除いた部分では、以下の文字も使用できます。 + - 以下のUnicodeカテゴリに含まれる全ての文字 + - Non-spacing mark (Mn) + - Combining spacing mark (Mc) + - Decimal number (Nd) + - Connector punctuation (Pc) + - ゼロ幅非接合子 (ZWNJ) + - ゼロ幅接合子 (ZWJ) + - Unicodeエスケープシーケンスを用いないと上記の制約に抵触する文字列を、Unicodeエスケープシーケンスによって識別子とすることはできません。 From c8dc63189b95cb17e71db4c4452f50a5e98c0c3f Mon Sep 17 00:00:00 2001 From: takejohn Date: Fri, 12 Sep 2025 16:05:58 +0900 Subject: [PATCH 3/8] =?UTF-8?q?column=E3=82=92UTF-16=E3=82=B3=E3=83=BC?= =?UTF-8?q?=E3=83=89=E5=8D=98=E4=BD=8D=E3=81=AB=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/parser/streams/char-stream.ts | 10 +++++----- test/parser.ts | 31 +++++++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/src/parser/streams/char-stream.ts b/src/parser/streams/char-stream.ts index a79d2594f..b60a477f1 100644 --- a/src/parser/streams/char-stream.ts +++ b/src/parser/streams/char-stream.ts @@ -12,9 +12,9 @@ export class CharStream { private address: number; /** Unicode character */ private _char?: string; - /** zero-based number, based on Unicode code points */ + /** zero-based number */ private line: number; - /** zero-based number, based on Unicode code points */ + /** zero-based number, based on UTF-16 code unit */ private column: number; 
constructor(source: string, opts?: { line?: number, column?: number }) { @@ -64,7 +64,7 @@ export class CharStream { this.line++; this.column = 0; } else { - this.column++; + this.column += this._char!.length; } this.incAddr(); this.moveNext(); @@ -82,9 +82,9 @@ export class CharStream { const lastLineBreak = page.lastIndexOf('\n', this.address - 1); const lineStart = lastLineBreak >= 0 ? lastLineBreak + 1 : 0; const line = page.slice(lineStart, this.address); - this.column = [...line].length - 1; + this.column = line.length; } else { - this.column--; + this.column -= this._char!.length; } } diff --git a/test/parser.ts b/test/parser.ts index 80cc15732..a73ec87c9 100644 --- a/test/parser.ts +++ b/test/parser.ts @@ -45,7 +45,7 @@ describe('CharStream', () => { assert.strictEqual('b', stream.char); stream.prev(); assert.strictEqual('\n', stream.char); - assert.deepStrictEqual(stream.getPos(), { line: 1, column: 1 }); + assert.deepStrictEqual(stream.getPos(), { line: 1, column: 2 }); }); test.concurrent('line breaks', async () => { @@ -56,7 +56,7 @@ describe('CharStream', () => { assert.strictEqual('c', stream.char); stream.prev(); assert.strictEqual('\n', stream.char); - assert.deepStrictEqual(stream.getPos(), { line: 2, column: 0 }); + assert.deepStrictEqual(stream.getPos(), { line: 2, column: 1 }); }); test.concurrent('CRは読み飛ばされる', async () => { @@ -77,6 +77,26 @@ describe('CharStream', () => { stream.prev(); assert.strictEqual('\ud83e\udd2f', stream.char); }); + + test.concurrent('column is based on UTF-16 code unit', async () => { + const source = '\ud83e\udd2f!'; + const stream = new CharStream(source); + stream.next(); + stream.next(); + stream.prev(); + assert.strictEqual(stream.char, '!'); + assert.deepStrictEqual(stream.getPos(), { line: 1, column: 3 }); + }); + + test.concurrent('column is based on UTF-16 code unit, line break', async () => { + const source = '\ud83e\udd2f\n'; + const stream = new CharStream(source); + stream.next(); + stream.next(); + 
stream.prev(); + assert.strictEqual(stream.char, '\n'); + assert.deepStrictEqual(stream.getPos(), { line: 1, column: 3 }); + }); }); test.concurrent('eof', async () => { @@ -122,6 +142,13 @@ describe('CharStream', () => { stream.next(); assert.strictEqual(true, stream.eof); }); + + test.concurrent('column is based on UTF-16 code unit', async () => { + const source = '\ud83e\udd2f'; + const stream = new CharStream(source); + stream.next(); + assert.deepStrictEqual(stream.getPos(), { line: 1, column: 3 }); + }); }); describe('Scanner', () => { From b47dcfeceff9af6fef27fb3661f38b6cc2f65ac2 Mon Sep 17 00:00:00 2001 From: takejohn Date: Fri, 12 Sep 2025 16:30:09 +0900 Subject: [PATCH 4/8] =?UTF-8?q?=E4=BD=BF=E7=94=A8=E4=B8=AD=E3=80=81?= =?UTF-8?q?=E4=BD=BF=E7=94=A8=E4=BA=88=E5=AE=9A=E3=81=AE=E4=BA=88=E7=B4=84?= =?UTF-8?q?=E8=AA=9E=E3=81=AE=E3=82=A8=E3=83=A9=E3=83=BC=E3=82=92=E5=85=B1?= =?UTF-8?q?=E9=80=9A=E3=81=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/parser/plugins/validate-keyword.ts | 55 ++++++++++++-------------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/src/parser/plugins/validate-keyword.ts b/src/parser/plugins/validate-keyword.ts index 9446ed53b..ca79dc590 100644 --- a/src/parser/plugins/validate-keyword.ts +++ b/src/parser/plugins/validate-keyword.ts @@ -7,6 +7,30 @@ import type * as Ast from '../../node.js'; // - 文脈キーワードは識別子に利用できるため除外 const reservedWord = [ + // 使用中の語 + 'null', + 'true', + 'false', + 'each', + 'for', + 'loop', + 'do', + 'while', + 'break', + 'continue', + 'match', + 'case', + 'default', + 'if', + 'elif', + 'else', + 'return', + 'eval', + 'var', + 'let', + 'exists', + + // 使用予定の語 'as', 'async', 'attr', @@ -52,36 +76,9 @@ const reservedWord = [ 'new', ]; -const keywords = [ - 'null', - 'true', - 'false', - 'each', - 'for', - 'loop', - 'do', - 'while', - 'break', - 'continue', - 'match', - 'case', - 'default', - 'if', - 'elif', - 'else', - 'return', - 'eval', - 
'var', - 'let', - 'exists', -]; - function validateName(name: string, pos: Ast.Pos): void { if (reservedWord.includes(name)) { - throw new AiScriptSyntaxError(`Reserved word "${name}" cannot be used as identifier.`, pos); - } - if (keywords.includes(name)) { - throw new AiScriptSyntaxError(`Keyword "${name}" cannot be used as identifier.`, pos); + throwReservedWordError(name, pos); } } @@ -92,7 +89,7 @@ function validateTypeName(name: string, pos: Ast.Pos): void { validateName(name, pos); } -function throwReservedWordError(name: string, pos: Ast.Pos): void { +function throwReservedWordError(name: string, pos: Ast.Pos): never { throw new AiScriptSyntaxError(`Reserved word "${name}" cannot be used as variable name.`, pos); } From ee45ee03dbf95ee20396790462ba34a4a4d694f1 Mon Sep 17 00:00:00 2001 From: takejohn Date: Fri, 12 Sep 2025 17:04:14 +0900 Subject: [PATCH 5/8] =?UTF-8?q?=E3=82=A8=E3=82=B9=E3=82=B1=E3=83=BC?= =?UTF-8?q?=E3=83=97=E3=81=95=E3=82=8C=E3=81=9F=E4=BA=88=E7=B4=84=E8=AA=9E?= =?UTF-8?q?=E3=82=92=E3=82=AD=E3=83=BC=E3=81=AB=E6=8C=81=E3=81=A4=E3=82=AA?= =?UTF-8?q?=E3=83=96=E3=82=B8=E3=82=A7=E3=82=AF=E3=83=88=E3=83=AA=E3=83=86?= =?UTF-8?q?=E3=83=A9=E3=83=AB=E3=81=AE=E3=83=86=E3=82=B9=E3=83=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/literals.ts | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/literals.ts b/test/literals.ts index 72e2fc304..9b52da5df 100644 --- a/test/literals.ts +++ b/test/literals.ts @@ -241,6 +241,15 @@ describe('literal', () => { }); }); + test.concurrent('obj (escaped reserved word as key)', async () => { + const res = await exe(` + <: { + \\u0064\\u0065\\u0066\\u0061\\u0075\\u006c\\u0074: 42, + } + `); + eq(res, OBJ(new Map([['default', NUM(42)]]))); + }) + test.concurrent('obj (invalid key)', async () => { assert.rejects(() => exe(` <: { From fef775bce7763af427c233bedca652ed452613d6 Mon Sep 17 00:00:00 2001 From: takejohn Date: Sat, 13 Sep 2025 20:31:05 +0900 
Subject: [PATCH 6/8] =?UTF-8?q?isIdentifierStart,=20isIdentifierPart?= =?UTF-8?q?=E9=96=A2=E6=95=B0=E3=82=92=E5=89=8A=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/parser/scanner.ts | 12 +++--- src/utils/characters.ts | 10 ----- test/characters.ts | 92 +---------------------------------------- 3 files changed, 8 insertions(+), 106 deletions(-) diff --git a/src/parser/scanner.ts b/src/parser/scanner.ts index 20109b35c..e3caa1bff 100644 --- a/src/parser/scanner.ts +++ b/src/parser/scanner.ts @@ -1,5 +1,5 @@ import { AiScriptSyntaxError, AiScriptUnexpectedEOFError } from '../error.js'; -import { decodeUnicodeEscapeSequence, isIdentifierPart, isIdentifierStart } from '../utils/characters.js'; +import { decodeUnicodeEscapeSequence } from '../utils/characters.js'; import { CharStream } from './streams/char-stream.js'; import { TOKEN, TokenKind } from './token.js'; import { unexpectedTokenError } from './utils.js'; @@ -10,6 +10,8 @@ import type { Token, TokenPosition } from './token.js'; const spaceChars = [' ', '\t']; const lineBreakChars = ['\r', '\n']; const digit = /^[0-9]$/; +const identifierStart = /^[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}$_]$/u; +const identifierPart = /^[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}$_\p{Mn}\p{Mc}\p{Nd}\p{Pc}\u200c\u200d]$/u; const hexDigit = /^[0-9a-fA-F]$/; const exponentIndicatorPattern = /^[eE]$/; @@ -358,11 +360,11 @@ export class Scanner implements ITokenStream { const value = decodeUnicodeEscapeSequence(rawValue); const [start, ...parts] = value; - if (!isIdentifierStart(start!)) { + if (!identifierStart.test(start!)) { throw new AiScriptSyntaxError(`Invalid identifier: "${value}"`, pos); } for (const part of parts) { - if (!isIdentifierPart(part)) { + if (!identifierPart.test(part)) { throw new AiScriptSyntaxError(`Invalid identifier: "${value}"`, pos); } } @@ -446,7 +448,7 @@ export class Scanner implements ITokenStream { if (this.stream.eof) { return; } - if 
(isIdentifierStart(this.stream.char)) { + if (identifierStart.test(this.stream.char)) { const value = this.stream.char; this.stream.next(); return value; @@ -466,7 +468,7 @@ export class Scanner implements ITokenStream { if (matchedIdentifierStart !== undefined) { return matchedIdentifierStart; } - if (isIdentifierPart(this.stream.char)) { + if (identifierPart.test(this.stream.char)) { const value = this.stream.char; this.stream.next(); return value; diff --git a/src/utils/characters.ts b/src/utils/characters.ts index f8a7825a7..2c5a170ab 100644 --- a/src/utils/characters.ts +++ b/src/utils/characters.ts @@ -2,8 +2,6 @@ const MIN_HIGH_SURROGATE = 0xD800; const MAX_HIGH_SURROGATE = 0xDBFF; const MIN_LOW_SURROGATE = 0xDC00; const MAX_LOW_SURROGATE = 0xDFFF; -const IDENTIFIER_START_PATTERN = /^[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}$_]$/u; -const IDENTIFIER_PART_PATTERN = /^[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}$_\p{Mn}\p{Mc}\p{Nd}\p{Pc}\u200c\u200d]$/u; const HEX_DIGIT = /^[0-9a-fA-F]$/; export function isHighSurrogate(string: string, index = 0): boolean { @@ -26,14 +24,6 @@ export function isSurrogatePair(string: string, start = 0): boolean { return isHighSurrogate(string, start) && isLowSurrogate(string, start + 1); } -export function isIdentifierStart(char: string): boolean { - return IDENTIFIER_START_PATTERN.test(char); -} - -export function isIdentifierPart(char: string): boolean { - return IDENTIFIER_PART_PATTERN.test(char); -} - export function decodeUnicodeEscapeSequence(string: string): string { let result = ''; let state: 'string' | 'escape' | 'digit' = 'string'; diff --git a/test/characters.ts b/test/characters.ts index 4d5925ad0..71ace67ea 100644 --- a/test/characters.ts +++ b/test/characters.ts @@ -1,4 +1,4 @@ -import { decodeUnicodeEscapeSequence, isHighSurrogate, isIdentifierPart, isIdentifierStart, isLowSurrogate, isSurrogatePair } from '../src/utils/characters'; +import { decodeUnicodeEscapeSequence, isHighSurrogate, isLowSurrogate, isSurrogatePair } 
from '../src/utils/characters'; import { describe, expect, test } from 'vitest'; describe('isHighSurrogate', () => { @@ -62,96 +62,6 @@ describe('isSurrogatePair', () => { }); }); -describe('isIdentifierStart', () => { - const cases: [string, boolean][] = [ - // UnicodeLetter - ['\u0041', true], // U+0041 (LATIN CAPITAL LETTER A): Uppercase letter (Lu) - ['\u0061', true], // U+0061 (LATIN SMALL LETTER A ): Lowercase letter (Ll) - ['\u01c5', true], // U+01C5 (LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON): Titlecase letter (Lt) - ['\u01c8', true], // U+01C8 (LATIN CAPITAL LETTER L WITH SMALL LETTER J): Titlecase letter (Lt) - ['\u02b0', true], // U+02B0 (Modifier Letter Small H): Modifier letter (Lm) - ['\u03a9', true], // U+03A9 (GREEK CAPITAL LETTER OMEGA): Uppercase letter (Lu) - ['\u03b2', true], // U+03B2 (GREEK SMALL LETTER BETA): Lowercase letter (Ll) - ['\u16ee', true], // U+16EE (Runic Arlaug Symbol): Letter number (Nl) - ['\u2163', true], // U+2163 (Roman Numeral Four): Letter number (Nl) - ['\u3005', true], // U+3005 (Ideographic Iteration Mark): Modifier letter (Lm) - ['\u3042', true], // U+3042 (HIRAGANA LETTER A): Other letter (Lo) - ['\u85cd', true], // U+85CD (CJK Unified Ideograph-85CD): Other letter (Lo) - ['\ud842\udfb7', true], // U+20BB7 (CJK Unified Ideograph-20BB7): Other letter (Lo) - - // $ - ['$', true], - - // _ - ['_', true], - - // Invalid characters - ['\u0021', false], // U+0021 (Exclamation Mark): Other Punctuation (Po) - ['\u0030', false], // U+0030 (Digit Zero): Decimal number (Nd) - ['\u0301', false], // U+0301 (Combining Acute Accent): Non-spacing mark (Mn) - ['\u093e', false], // U+093E (Devanagari Vowel Sign Aa): Combining spacing mark (Mc) - ['\u200c', false], // U+200C (Zero Width Non-Joiner (ZWNJ)): Format (Cf) - ['\u200d', false], // U+200D (Zero Width Joiner (ZWJ)): Format (Cf) - ['\u203f', false], // U+203F (Undertie): Connector punctuation (Pc) - ['\ud83e\udd2f', false], // U+1F92F (Shocked Face with Exploding 
Head): Other Symbol (So) - ]; - - test.concurrent.each(cases)('"%s" -> %s', (input, expected) => { - expect(isIdentifierStart(input)).toBe(expected); - }); -}); - -describe('isIdentifierPart', () => { - const cases: [string, boolean][] = [ - // UnicodeLetter - ['\u0041', true], // U+0041 (LATIN CAPITAL LETTER A): Uppercase letter (Lu) - ['\u0061', true], // U+0061 (LATIN SMALL LETTER A ): Lowercase letter (Ll) - ['\u01c5', true], // U+01C5 (LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON): Titlecase letter (Lt) - ['\u01c8', true], // U+01C8 (LATIN CAPITAL LETTER L WITH SMALL LETTER J): Titlecase letter (Lt) - ['\u02b0', true], // U+02B0 (Modifier Letter Small H): Modifier letter (Lm) - ['\u03a9', true], // U+03A9 (GREEK CAPITAL LETTER OMEGA): Uppercase letter (Lu) - ['\u03b2', true], // U+03B2 (GREEK SMALL LETTER BETA): Lowercase letter (Ll) - ['\u16ee', true], // U+16EE (Runic Arlaug Symbol): Letter number (Nl) - ['\u2163', true], // U+2163 (Roman Numeral Four): Letter number (Nl) - ['\u3005', true], // U+3005 (Ideographic Iteration Mark): Modifier letter (Lm) - ['\u3042', true], // U+3042 (HIRAGANA LETTER A): Other letter (Lo) - ['\u85cd', true], // U+85CD (CJK Unified Ideograph-85CD): Other letter (Lo) - ['\ud842\udfb7', true], // U+20BB7 (CJK Unified Ideograph-20BB7): Other letter (Lo) - - // $ - ['$', true], - - // _ - ['_', true], - - // UnicodeCombiningMark - ['\u0301', true], // U+0301 (Combining Acute Accent): Non-spacing mark (Mn) - ['\u093e', true], // U+093E (Devanagari Vowel Sign Aa): Combining spacing mark (Mc) - - // UnicodeDigit - // Decimal number (Nd) - ['\u0030', true], // U+0030 (Digit Zero): Decimal number (Nd) - - // UnicodeConnectorPunctuation - // Connector punctuation (Pc) - ['\u203f', true], // U+203F (Undertie): Connector punctuation (Pc) - - // ZWNJ - ['\u200c', true], // U+200C (Zero Width Non-Joiner (ZWNJ)): Format (Cf) - - // ZWJ - ['\u200d', true], // U+200D (Zero Width Joiner (ZWJ)): Format (Cf) - - // Invalid characters - 
['\u0021', false], // U+0021 (Exclamation Mark): Other Punctuation (Po) - ['\ud83e\udd2f', false], // U+1F92F (Shocked Face with Exploding Head): Other Symbol (So) - ]; - - test.concurrent.each(cases)('"%s" -> %s', (input, expected) => { - expect(isIdentifierPart(input)).toBe(expected); - }); -}); - describe('decodeUnicodeEscapeSequence', () => { test('plain', () => { expect(decodeUnicodeEscapeSequence('abc123')).toBe('abc123'); From a80a323b5d151c32034ca8ef5a240e91b48cdf95 Mon Sep 17 00:00:00 2001 From: takejohn Date: Sat, 13 Sep 2025 22:29:18 +0900 Subject: [PATCH 7/8] =?UTF-8?q?=E6=96=87=E5=AD=97=E7=A8=AE=E3=81=AE?= =?UTF-8?q?=E8=BF=BD=E5=8A=A0=E3=82=92=E5=8F=96=E3=82=8A=E6=B6=88=E3=81=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/parser/scanner.ts | 6 +- test/identifiers.ts | 135 +++++++++++++++++++++--------------------- test/literals.ts | 9 ++- 3 files changed, 73 insertions(+), 77 deletions(-) diff --git a/src/parser/scanner.ts b/src/parser/scanner.ts index e3caa1bff..1eec385d1 100644 --- a/src/parser/scanner.ts +++ b/src/parser/scanner.ts @@ -10,8 +10,8 @@ import type { Token, TokenPosition } from './token.js'; const spaceChars = [' ', '\t']; const lineBreakChars = ['\r', '\n']; const digit = /^[0-9]$/; -const identifierStart = /^[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}$_]$/u; -const identifierPart = /^[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}$_\p{Mn}\p{Mc}\p{Nd}\p{Pc}\u200c\u200d]$/u; +const identifierStart = /^[A-Za-z_]$/u; +const identifierPart = /^[A-Za-z0-9_]$/u; const hexDigit = /^[0-9a-fA-F]$/; const exponentIndicatorPattern = /^[eE]$/; @@ -370,7 +370,7 @@ export class Scanner implements ITokenStream { } if (value !== rawValue) { - return TOKEN(TokenKind.Identifier, pos, { hasLeftSpacing, value }); + throw new AiScriptSyntaxError(`Cannot use escape characters in identifier: "${rawValue}"`, pos); } // check word kind diff --git a/test/identifiers.ts b/test/identifiers.ts index 22155f460..9b676b8ec 
100644 --- a/test/identifiers.ts +++ b/test/identifiers.ts @@ -75,104 +75,108 @@ const reservedWords = [ 'new', ] as const; -const validIdentifiers = [ +// ['識別子', 使用可否] +const identifierCases: [string, boolean][] = [ + // IdentifierStart // UnicodeLetter // Uppercase letter (Lu) - 'A', // U+0041 (LATIN CAPITAL LETTER A) - 'Ω', // U+03A9 (GREEK CAPITAL LETTER OMEGA) + ['A', true], // U+0041 (LATIN CAPITAL LETTER A) + ['Ω', false], // U+03A9 (GREEK CAPITAL LETTER OMEGA) // Lowercase letter (Ll) - 'a', // U+0061 (LATIN SMALL LETTER A ) - 'β', // U+03B2 (GREEK SMALL LETTER BETA) + ['a', true], // U+0061 (LATIN SMALL LETTER A ) + ['β', false], // U+03B2 (GREEK SMALL LETTER BETA) // Titlecase letter (Lt) - 'Dž', // U+01C5 (LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON) - 'Lj', // U+01C8 (LATIN CAPITAL LETTER L WITH SMALL LETTER J) + ['Dž', false], // U+01C5 (LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON) + ['Lj', false], // U+01C8 (LATIN CAPITAL LETTER L WITH SMALL LETTER J) // Modifier letter (Lm) - 'ʰ', // U+02B0 (Modifier Letter Small H) - '々', // U+3005 (Ideographic Iteration Mark) + ['ʰ', false], // U+02B0 (Modifier Letter Small H) + ['々', false], // U+3005 (Ideographic Iteration Mark) // Other letter (Lo) - 'あ', // U+3042 (HIRAGANA LETTER A) - '藍', // U+85CD (CJK Unified Ideograph-85CD) - '𠮷', // U+20BB7 (CJK Unified Ideograph-20BB7) + ['あ', false], // U+3042 (HIRAGANA LETTER A) + ['藍', false], // U+85CD (CJK Unified Ideograph-85CD) + ['𠮷', false], // U+20BB7 (CJK Unified Ideograph-20BB7) // Letter number (Nl) - 'ᛮ', // U+16EE (Runic Arlaug Symbol) - 'Ⅳ', // U+2163 (Roman Numeral Four) + ['ᛮ', false], // U+16EE (Runic Arlaug Symbol) + ['Ⅳ', false], // U+2163 (Roman Numeral Four) // $ - '$', + ['$', false], // _ - '_', + ['_', true], // IdentifierPart // IdentifierStart - '_A', - '_Ω', - '_a', - '_β', - '_Dž', - '_Lj', - '_ʰ', - '_々', - '_あ', - '_藍', - '_𠮷', - '_ᛮ', - '_Ⅳ', - '_$', - '__', + ['_A', true], + ['_Ω', false], + ['_a', true], + ['_β', 
false], + ['_Dž', false], + ['_Lj', false], + ['_ʰ', false], + ['_々', false], + ['_あ', false], + ['_藍', false], + ['_𠮷', false], + ['_ᛮ', false], + ['_Ⅳ', false], + ['_$', false], + ['__', true], // UnicodeCombiningMark // Non-spacing mark (Mn) - 'á', // U+0301 (Combining Acute Accent) + ['á', false], // U+0301 (Combining Acute Accent) // Combining spacing mark (Mc) - 'राम', // U+093E (Devanagari Vowel Sign Aa) + ['राम', false], // U+093E (Devanagari Vowel Sign Aa) // UnicodeDigit // Decimal number (Nd) - 'a0', // U+0030 (Digit Zero) + ['a0', true], // U+0030 (Digit Zero) + ['a๑', false], // U+0E51 (Thai Digit One) // UnicodeConnectorPunctuation // Connector punctuation (Pc) - 'a‿b', // U+203F (Undertie) + ['a‿b', false], // U+203F (Undertie) // - 'बि‌ना', + ['बि‌ना', false], // - 'क‍्', + ['क‍्', false], + + ['\\u', false], + ['\\u000x', false], + ['\\u0021', false], // "!": Other Punctuation (Po) + ['\\u0069\\u0066', false], // "if" + ['\\ud83e\\udd2f', false], // U+1F92F (Shocked Face with Exploding Head): Other Symbol (So) + ['\\uD83E\\uDD2F', false], + ['_\\u', false], + ['_\\u000x', false], + ['_\\u0021', false], + ['_\\ud83e\\udd2f', false], + ['_\\uD83E\\uDD2F', false], ]; -const validEscapeIdentifiers: [string, string][] = [ +const escapeIdentifiers: [string, string][] = [ + ['\\u0041', 'A'], ['\\u85cd', '藍'], ['\\u85CD', '藍'], ['\\ud842\\udfb7', '𠮷'], ['\\uD842\\uDFB7', '𠮷'], + ['_\\u0041', '_A'], ['_\\u85cd', '_藍'], ['_\\u85CD', '_藍'], ['_\\ud842\\udfb7', '_𠮷'], ['_\\uD842\\uDFB7', '_𠮷'], ]; -const invalidIdentifiers = [ - '\\u', - '\\u000x', - '\\u0021', // "!": Other Punctuation (Po) - '\\u0069\\u0066', // "if" - '\\ud83e\\udd2f', '\\uD83E\\uDD2F', // U+1F92F (Shocked Face with Exploding Head): Other Symbol (So) - '_\\u', - '_\\u000x', - '_\\u0021', - '_\\ud83e\\udd2f', - '_\\uD83E\\uDD2F', -]; - const sampleCodes = Object.entries<[(definedName: string, referredName: string) => string, Value]>({ variable: [(definedName, referredName) => ` @@ -281,27 
+285,20 @@ describe.each( }); test.concurrent.each( - validIdentifiers - )('%s must be allowed', (word) => { - parser.parse(sampleCode(word, word)); - }); - - test.concurrent.each( - validEscapeIdentifiers - )('$0 must be allowed (referred as $1)', (encoded, decoded) => { - parser.parse(sampleCode(encoded, decoded)); + identifierCases + )('%s is allowed: %s', async (word, allowed) => { + expect.hasAssertions(); + if (allowed) { + const res = await exe(sampleCode(word, word)); + eq(res, expected); + } else { + expect(() => parser.parse(sampleCode(word, word))).toThrow(AiScriptSyntaxError); + } }); test.concurrent.each( - validEscapeIdentifiers - )('$1 must be allowed (referred as $0)', async (encoded, decoded) => { - const res = await exe(sampleCode(decoded, encoded)); - eq(res, expected); - }); - - test.concurrent.each( - invalidIdentifiers - )('%s must be rejected', (word) => { + escapeIdentifiers + )('escape sequence is not allowed: %s', async (word) => { expect(() => parser.parse(sampleCode(word, word))).toThrow(AiScriptSyntaxError); }); }); diff --git a/test/literals.ts b/test/literals.ts index 9b52da5df..c5f587e8e 100644 --- a/test/literals.ts +++ b/test/literals.ts @@ -1,8 +1,8 @@ import * as assert from 'assert'; -import { describe, test } from 'vitest'; +import { describe, expect, test } from 'vitest'; import { } from '../src'; import { NUM, STR, NULL, ARR, OBJ, BOOL, TRUE, FALSE, ERROR ,FN_NATIVE } from '../src/interpreter/value'; -import { } from '../src/error'; +import { AiScriptSyntaxError } from '../src/error'; import { exe, eq } from './testutils'; describe('literal', () => { @@ -242,12 +242,11 @@ describe('literal', () => { }); test.concurrent('obj (escaped reserved word as key)', async () => { - const res = await exe(` + await expect(async () => await exe(` <: { \\u0064\\u0065\\u0066\\u0061\\u0075\\u006c\\u0074: 42, } - `); - eq(res, OBJ(new Map([['default', NUM(42)]]))); + `)).rejects.toThrow(AiScriptSyntaxError); }) test.concurrent('obj (invalid 
key)', async () => { From daeb89ce656640b792f3dd5c3921887d2c8f9981 Mon Sep 17 00:00:00 2001 From: takejohn Date: Sat, 13 Sep 2025 22:41:57 +0900 Subject: [PATCH 8/8] =?UTF-8?q?Unicode=E3=82=A8=E3=82=B9=E3=82=B1=E3=83=BC?= =?UTF-8?q?=E3=83=97=E3=82=B7=E3=83=BC=E3=82=B1=E3=83=B3=E3=82=B9=E3=81=AE?= =?UTF-8?q?=E6=A4=9C=E8=A8=BC=E5=87=A6=E7=90=86=E3=82=92=E5=89=8A=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/parser/scanner.ts | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/parser/scanner.ts b/src/parser/scanner.ts index 1eec385d1..30dc4a8d3 100644 --- a/src/parser/scanner.ts +++ b/src/parser/scanner.ts @@ -359,18 +359,8 @@ export class Scanner implements ITokenStream { } const value = decodeUnicodeEscapeSequence(rawValue); - const [start, ...parts] = value; - if (!identifierStart.test(start!)) { - throw new AiScriptSyntaxError(`Invalid identifier: "${value}"`, pos); - } - for (const part of parts) { - if (!identifierPart.test(part)) { - throw new AiScriptSyntaxError(`Invalid identifier: "${value}"`, pos); - } - } - if (value !== rawValue) { - throw new AiScriptSyntaxError(`Cannot use escape characters in identifier: "${rawValue}"`, pos); + throw new AiScriptSyntaxError(`Invalid identifier: "${rawValue}"`, pos); } // check word kind