-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlexer.py
More file actions
77 lines (67 loc) · 2.39 KB
/
lexer.py
File metadata and controls
77 lines (67 loc) · 2.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import re
from typing import Dict, List, NamedTuple
class Token(NamedTuple):
    """One lexed token together with the position where it was found.

    Attributes:
        type:   token category name (e.g. "ID", "CONSTANT", "LPAREN")
        value:  the exact source text matched for this token
        line:   1-based line number where the token appears
        column: 1-based column where the token starts
    """

    type: str
    value: str
    line: int
    column: int
# Master token table. Ordering matters: keyword patterns come first so a
# keyword wins over the more general ID pattern, and MISMATCH sits last
# as the catch-all for anything no other pattern recognizes.
KEYWORDS: List[str] = ["return", "void", "int"]
TOKEN_KEYWORDS: Dict[str, str] = {
    word.upper(): word + r'\b' for word in KEYWORDS
}
TOKEN_SPECIFICATION: Dict[str, str] = TOKEN_KEYWORDS | {
    "NEWLINE": r"\n",
    "LPAREN": r"\(",
    "RPAREN": r"\)",
    "END": r";",
    "LSQB": r"\[",
    "RSQB": r"\]",
    "LCUB": r"\{",
    "RCUB": r"\}",
    "CONSTANT": r"[0-9]+\b",
    "ID": r"[a-zA-Z_]\w*\b",
    "WHITESPACE": r"[ \t]+",
    "MISMATCH": r".",
}
# Fold every pattern into a single alternation of named groups; the name
# of the group that matched identifies the token type (via lastgroup).
token_regex: str = '|'.join(
    f'(?P<{name}>{pattern})'
    for name, pattern in TOKEN_SPECIFICATION.items()
)
get_token = re.compile(token_regex).match
def lex(code: str) -> List[Token]:
    """
    Tokenize input source code into a list of Token objects.

    Args:
        code: Source code string to tokenize

    Returns:
        List of Token objects representing the lexical structure
        (WHITESPACE tokens are skipped; NEWLINE tokens are kept)

    Raises:
        RuntimeError: If an invalid character is encountered
    """
    tokens: List[Token] = []
    pos: int = 0
    line: int = 1
    column: int = 1
    while pos < len(code):
        match = get_token(code, pos)
        if match is None:
            # Defensive guard: unreachable in practice, because MISMATCH
            # matches any non-newline character and NEWLINE matches '\n',
            # so some alternative always matches.
            raise RuntimeError(f'Lexer: Unexpected character {code[pos]!r}')
        kind = match.lastgroup or 'MISMATCH'
        value = match.group()
        if kind == 'MISMATCH':
            # Fix: the original f-string broke its replacement field across
            # a line boundary, a SyntaxError on Python < 3.12 (PEP 701).
            raise RuntimeError(
                f'Lexer: Unexpected character {value!r} '
                f'at line {line} column {column}.')
        pos = match.end()
        if kind == 'WHITESPACE':
            # Whitespace produces no token but still advances the column.
            column += len(value)
            continue
        tokens.append(Token(kind, value, line, column))
        column += len(value)
        if kind == 'NEWLINE':
            line += 1
            column = 1
    return tokens