-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlexer.py
More file actions
77 lines (67 loc) · 2.39 KB
/
lexer.py
File metadata and controls
77 lines (67 loc) · 2.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import re
from typing import Dict, List, NamedTuple
class Token(NamedTuple):
    """One lexed token together with the position where it was found.

    Attributes:
        type:   token category name (e.g. "ID", "CONSTANT", "LPAREN")
        value:  the exact source text matched for this token
        line:   1-based line number where the token appears
        column: 1-based column where the token starts
    """

    type: str
    value: str
    line: int
    column: int
# Master token table. Ordering matters: keyword patterns come first so a
# keyword wins over the more general ID pattern, and MISMATCH sits last
# as the catch-all for anything no other pattern recognizes.
KEYWORDS: List[str] = ["return", "void", "int"]
TOKEN_KEYWORDS: Dict[str, str] = {
    word.upper(): word + r'\b' for word in KEYWORDS
}
TOKEN_SPECIFICATION: Dict[str, str] = TOKEN_KEYWORDS | {
    "NEWLINE": r"\n",
    "LPAREN": r"\(",
    "RPAREN": r"\)",
    "END": r";",
    "LSQB": r"\[",
    "RSQB": r"\]",
    "LCUB": r"\{",
    "RCUB": r"\}",
    "CONSTANT": r"[0-9]+\b",
    "ID": r"[a-zA-Z_]\w*\b",
    "WHITESPACE": r"[ \t]+",
    "MISMATCH": r".",
}
# Fold every pattern into a single alternation of named groups; the name
# of the group that matched identifies the token type (via lastgroup).
token_regex: str = '|'.join(
    f'(?P<{name}>{pattern})'
    for name, pattern in TOKEN_SPECIFICATION.items()
)
get_token = re.compile(token_regex).match
def lex(code: str) -> List[Token]:
    """
    Tokenize input source code into a list of Token objects.

    Args:
        code: Source code string to tokenize

    Returns:
        List of Token objects representing the lexical structure
        (WHITESPACE tokens are skipped; NEWLINE tokens are kept)

    Raises:
        RuntimeError: If an invalid character is encountered
    """
    tokens: List[Token] = []
    pos: int = 0
    line: int = 1
    column: int = 1
    while pos < len(code):
        match = get_token(code, pos)
        if match is None:
            # Defensive guard: unreachable in practice, because MISMATCH
            # matches any non-newline character and NEWLINE matches '\n',
            # so some alternative always matches.
            raise RuntimeError(f'Lexer: Unexpected character {code[pos]!r}')
        kind = match.lastgroup or 'MISMATCH'
        value = match.group()
        if kind == 'MISMATCH':
            # Fix: the original f-string broke its replacement field across
            # a line boundary, a SyntaxError on Python < 3.12 (PEP 701).
            raise RuntimeError(
                f'Lexer: Unexpected character {value!r} '
                f'at line {line} column {column}.')
        pos = match.end()
        if kind == 'WHITESPACE':
            # Whitespace produces no token but still advances the column.
            column += len(value)
            continue
        tokens.append(Token(kind, value, line, column))
        column += len(value)
        if kind == 'NEWLINE':
            line += 1
            column = 1
    return tokens