From af320d3ce3644a2bbbaac084b096e7a67910921d Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 26 Apr 2023 15:35:36 -0600 Subject: [PATCH 1/8] gh-103656: Transfer f-string buffers to parser to avoid use-after-free --- Grammar/python.gram | 11 ++- Lib/test/test_fstring.py | 14 ++++ Parser/action_helpers.c | 71 +++++++++++----- Parser/parser.c | 24 +++--- Parser/pegen.c | 12 +++ Parser/pegen.h | 13 ++- Parser/tokenizer.c | 176 +++++++++++++++++++++------------------ Parser/tokenizer.h | 1 + 8 files changed, 199 insertions(+), 123 deletions(-) diff --git a/Grammar/python.gram b/Grammar/python.gram index 3a356c65a75195..6361dcd0985b99 100644 --- a/Grammar/python.gram +++ b/Grammar/python.gram @@ -881,14 +881,13 @@ fstring_middle[expr_ty]: | fstring_replacement_field | t=FSTRING_MIDDLE { _PyPegen_constant_from_token(p, t) } fstring_replacement_field[expr_ty]: - | '{' a=(yield_expr | star_expressions) debug_expr="="? conversion=[fstring_conversion] format=[fstring_full_format_spec] '}' { - _PyPegen_formatted_value(p, a, debug_expr, conversion, format, EXTRA) - } + | '{' a=(yield_expr | star_expressions) debug_expr="="? conversion=[fstring_conversion] format=[fstring_full_format_spec] rbrace='}' { + _PyPegen_formatted_value(p, a, debug_expr, conversion, format, rbrace, EXTRA) } | invalid_replacement_field -fstring_conversion[expr_ty]: +fstring_conversion[ResultTokenWithMetadata*]: | conv_token="!" conv=NAME { _PyPegen_check_fstring_conversion(p, conv_token, conv) } -fstring_full_format_spec[expr_ty]: - | ':' spec=fstring_format_spec* { spec ? _PyAST_JoinedStr((asdl_expr_seq*)spec, EXTRA) : NULL } +fstring_full_format_spec[ResultTokenWithMetadata*]: + | colon=':' spec=fstring_format_spec* { _PyPegen_setup_full_format_spec(p, colon, (asdl_expr_seq *) spec, EXTRA) } fstring_format_spec[expr_ty]: | t=FSTRING_MIDDLE { _PyPegen_constant_from_token(p, t) } | fstring_replacement_field diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index 9d5e16628f04b6..5e94c99ae65af1 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -1535,5 +1535,19 @@ def test_not_closing_quotes(self): self.assertAllRaise(SyntaxError, "unterminated triple-quoted f-string literal", ['f"""', "f'''"]) + def test_syntax_error_after_debug(self): + self.assertAllRaise(SyntaxError, "f-string: expecting a valid expression after '{'", + [ + "f'{1=}{;'", + "f'{1=}{+;'", + "f'{1=}{2}{;'", + "f'{1=}{3}{;'", + ]) + self.assertAllRaise(SyntaxError, "f-string: expecting '=', or '!', or ':', or '}'", + [ + "f'{1=}{1;'", + "f'{1=}{1;}'", + ]) + if __name__ == '__main__': unittest.main() diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c index 55c0f6fdd620f4..168e01015fb761 100644 --- a/Parser/action_helpers.c +++ b/Parser/action_helpers.c @@ -965,17 +965,43 @@ _PyPegen_check_legacy_stmt(Parser *p, expr_ty name) { return 0; } -expr_ty -_PyPegen_check_fstring_conversion(Parser *p, Token* symbol, expr_ty conv) { - if (symbol->lineno != conv->lineno || symbol->end_col_offset != conv->col_offset) { +static ResultTokenWithMetadata * +result_token_with_metadata(Parser *p, void *result, PyObject *metadata) +{ + ResultTokenWithMetadata *res = _PyArena_Malloc(p->arena, sizeof(ResultTokenWithMetadata)); + if (res == NULL) { + return NULL; + } + res->metadata = metadata; + res->result = result; + return res; +} + +ResultTokenWithMetadata * +_PyPegen_check_fstring_conversion(Parser *p, Token* conv_token, expr_ty conv) +{ + if (conv_token->lineno != conv->lineno || conv_token->end_col_offset != conv->col_offset) { return RAISE_SYNTAX_ERROR_KNOWN_RANGE( - symbol, conv, + conv_token, conv, "f-string: conversion type must come right after the exclamanation mark" ); } - return conv; + return result_token_with_metadata(p, conv, conv_token->metadata); } +ResultTokenWithMetadata * +_PyPegen_setup_full_format_spec(Parser *p, Token *colon, asdl_expr_seq *spec, int lineno, int col_offset, + int end_lineno, int end_col_offset, PyArena *arena) +{ + if (!spec) { + return NULL; + } + expr_ty res = _PyAST_JoinedStr(spec, lineno, col_offset, end_lineno, end_col_offset, p->arena); + if (!res) { + return NULL; + } + return result_token_with_metadata(p, res, colon->metadata); +} const char * _PyPegen_get_expr_name(expr_ty e) @@ -1386,19 +1412,20 @@ expr_ty _PyPegen_constant_from_string(Parser* p, Token* tok) { return _PyAST_Constant(s, kind, tok->lineno, tok->col_offset, tok->end_lineno, tok->end_col_offset, p->arena); } -expr_ty _PyPegen_formatted_value(Parser *p, expr_ty expression, Token *debug, expr_ty conversion, - expr_ty format, int lineno, int col_offset, int end_lineno, int end_col_offset, - PyArena *arena) { +expr_ty _PyPegen_formatted_value(Parser *p, expr_ty expression, Token *debug, ResultTokenWithMetadata *conversion, + ResultTokenWithMetadata *format, Token *closing_brace, int lineno, int col_offset, + int end_lineno, int end_col_offset, PyArena *arena) { int conversion_val = -1; if (conversion != NULL) { - assert(conversion->kind == Name_kind); - Py_UCS4 first = PyUnicode_READ_CHAR(conversion->v.Name.id, 0); + expr_ty conversion_expr = (expr_ty) conversion->result; + assert(conversion_expr->kind == Name_kind); + Py_UCS4 first = PyUnicode_READ_CHAR(conversion_expr->v.Name.id, 0); - if (PyUnicode_GET_LENGTH(conversion->v.Name.id) > 1 || + if (PyUnicode_GET_LENGTH(conversion_expr->v.Name.id) > 1 || !(first == 's' || first == 'r' || first == 'a')) { - RAISE_SYNTAX_ERROR_KNOWN_LOCATION(conversion, + RAISE_SYNTAX_ERROR_KNOWN_LOCATION(conversion_expr, "f-string: invalid conversion character %R: expected 's', 'r', or 'a'", - conversion->v.Name.id); + conversion_expr->v.Name.id); return NULL; } @@ -1410,7 +1437,7 @@ expr_ty _PyPegen_formatted_value(Parser *p, expr_ty expression, Token *debug, ex } expr_ty formatted_value = _PyAST_FormattedValue( - expression, conversion_val, format, + expression, conversion_val, format ? (expr_ty) format->result : NULL, lineno, col_offset, end_lineno, end_col_offset, arena ); @@ -1418,22 +1445,26 @@ expr_ty _PyPegen_formatted_value(Parser *p, expr_ty expression, Token *debug, ex if (debug) { /* Find the non whitespace token after the "=" */ int debug_end_line, debug_end_offset; + PyObject *debug_metadata; if (conversion) { - debug_end_line = conversion->lineno; - debug_end_offset = conversion->col_offset; + debug_end_line = ((expr_ty) conversion->result)->lineno; + debug_end_offset = ((expr_ty) conversion->result)->col_offset; + debug_metadata = conversion->metadata; } else if (format) { - debug_end_line = format->lineno; - debug_end_offset = format->col_offset + 1; // HACK: ?? + debug_end_line = ((expr_ty) format->result)->lineno; + debug_end_offset = ((expr_ty) format->result)->col_offset + 1; + debug_metadata = format->metadata; } else { debug_end_line = end_lineno; debug_end_offset = end_col_offset; + debug_metadata = closing_brace->metadata; } - expr_ty debug_text = decode_fstring_buffer(p, lineno, col_offset + 1, - debug_end_line, debug_end_offset - 1); + expr_ty debug_text = _PyAST_Constant(debug_metadata, NULL, lineno, col_offset + 1, debug_end_line, + debug_end_offset - 1, p->arena); if (!debug_text) { return NULL; } diff --git a/Parser/parser.c b/Parser/parser.c index 771366844fc489..6eb985a7d3e123 100644 --- a/Parser/parser.c +++ b/Parser/parser.c @@ -738,8 +738,8 @@ static NameDefaultPair* lambda_param_maybe_default_rule(Parser *p); static arg_ty lambda_param_rule(Parser *p); static expr_ty fstring_middle_rule(Parser *p); static expr_ty fstring_replacement_field_rule(Parser *p); -static expr_ty fstring_conversion_rule(Parser *p); -static expr_ty fstring_full_format_spec_rule(Parser *p); +static ResultTokenWithMetadata* fstring_conversion_rule(Parser *p); +static ResultTokenWithMetadata* fstring_full_format_spec_rule(Parser *p); static expr_ty fstring_format_spec_rule(Parser *p); static expr_ty string_rule(Parser *p); static expr_ty strings_rule(Parser *p); @@ -15639,11 +15639,11 @@ fstring_replacement_field_rule(Parser *p) } D(fprintf(stderr, "%*c> fstring_replacement_field[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "'{' (yield_expr | star_expressions) \"=\"? fstring_conversion? fstring_full_format_spec? '}'")); Token * _literal; - Token * _literal_1; void *a; void *conversion; void *debug_expr; void *format; + Token * rbrace; if ( (_literal = _PyPegen_expect_token(p, 25)) // token='{' && @@ -15655,7 +15655,7 @@ fstring_replacement_field_rule(Parser *p) && (format = fstring_full_format_spec_rule(p), !p->error_indicator) // fstring_full_format_spec? && - (_literal_1 = _PyPegen_expect_token(p, 26)) // token='}' + (rbrace = _PyPegen_expect_token(p, 26)) // token='}' ) { D(fprintf(stderr, "%*c+ fstring_replacement_field[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "'{' (yield_expr | star_expressions) \"=\"? fstring_conversion? fstring_full_format_spec? '}'")); @@ -15668,7 +15668,7 @@ fstring_replacement_field_rule(Parser *p) UNUSED(_end_lineno); // Only used by EXTRA macro int _end_col_offset = _token->end_col_offset; UNUSED(_end_col_offset); // Only used by EXTRA macro - _res = _PyPegen_formatted_value ( p , a , debug_expr , conversion , format , EXTRA ); + _res = _PyPegen_formatted_value ( p , a , debug_expr , conversion , format , rbrace , EXTRA ); if (_res == NULL && PyErr_Occurred()) { p->error_indicator = 1; p->level--; @@ -15706,7 +15706,7 @@ fstring_replacement_field_rule(Parser *p) } // fstring_conversion: "!" NAME -static expr_ty +static ResultTokenWithMetadata* fstring_conversion_rule(Parser *p) { if (p->level++ == MAXSTACK) { @@ -15717,7 +15717,7 @@ fstring_conversion_rule(Parser *p) p->level--; return NULL; } - expr_ty _res = NULL; + ResultTokenWithMetadata* _res = NULL; int _mark = p->mark; { // "!" NAME if (p->error_indicator) { @@ -15753,7 +15753,7 @@ fstring_conversion_rule(Parser *p) } // fstring_full_format_spec: ':' fstring_format_spec* -static expr_ty +static ResultTokenWithMetadata* fstring_full_format_spec_rule(Parser *p) { if (p->level++ == MAXSTACK) { @@ -15764,7 +15764,7 @@ fstring_full_format_spec_rule(Parser *p) p->level--; return NULL; } - expr_ty _res = NULL; + ResultTokenWithMetadata* _res = NULL; int _mark = p->mark; if (p->mark == p->fill && _PyPegen_fill_token(p) < 0) { p->error_indicator = 1; @@ -15781,10 +15781,10 @@ fstring_full_format_spec_rule(Parser *p) return NULL; } D(fprintf(stderr, "%*c> fstring_full_format_spec[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "':' fstring_format_spec*")); - Token * _literal; + Token * colon; asdl_seq * spec; if ( - (_literal = _PyPegen_expect_token(p, 11)) // token=':' + (colon = _PyPegen_expect_token(p, 11)) // token=':' && (spec = _loop0_112_rule(p)) // fstring_format_spec* ) @@ -15799,7 +15799,7 @@ fstring_full_format_spec_rule(Parser *p) UNUSED(_end_lineno); // Only used by EXTRA macro int _end_col_offset = _token->end_col_offset; UNUSED(_end_col_offset); // Only used by EXTRA macro - _res = spec ? _PyAST_JoinedStr ( ( asdl_expr_seq* ) spec , EXTRA ) : NULL; + _res = _PyPegen_setup_full_format_spec ( p , colon , ( asdl_expr_seq* ) spec , EXTRA ); if (_res == NULL && PyErr_Occurred()) { p->error_indicator = 1; p->level--; diff --git a/Parser/pegen.c b/Parser/pegen.c index 262bfabfba7a25..97f86e1372f7c0 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -155,6 +155,17 @@ initialize_token(Parser *p, Token *parser_token, struct token *new_token, int to return -1; } + if (new_token->metadata != NULL) { + parser_token->metadata = new_token->metadata; + if (_PyArena_AddPyObject(p->arena, parser_token->metadata) < 0) { + Py_DECREF(parser_token->metadata); + return -1; + } + } + else { + parser_token->metadata = NULL; + } + parser_token->level = new_token->level; parser_token->lineno = new_token->lineno; parser_token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->col_offset @@ -198,6 +209,7 @@ int _PyPegen_fill_token(Parser *p) { struct token new_token; + new_token.metadata = NULL; int type = _PyTokenizer_Get(p->tok, &new_token); // Record and skip '# type: ignore' comments diff --git a/Parser/pegen.h b/Parser/pegen.h index 6962013c2d18b4..8800e9f97f5e04 100644 --- a/Parser/pegen.h +++ b/Parser/pegen.h @@ -39,6 +39,7 @@ typedef struct { int level; int lineno, col_offset, end_lineno, end_col_offset; Memo *memo; + PyObject *metadata; } Token; typedef struct { @@ -118,6 +119,11 @@ typedef struct { int is_keyword; } KeywordOrStarred; +typedef struct { + void *result; + PyObject *metadata; +} ResultTokenWithMetadata; + // Internal parser functions #if defined(Py_DEBUG) void _PyPegen_clear_memo_statistics(void); @@ -310,7 +316,8 @@ StarEtc *_PyPegen_star_etc(Parser *, arg_ty, asdl_seq *, arg_ty); arguments_ty _PyPegen_make_arguments(Parser *, asdl_arg_seq *, SlashWithDefault *, asdl_arg_seq *, asdl_seq *, StarEtc *); arguments_ty _PyPegen_empty_arguments(Parser *); -expr_ty _PyPegen_formatted_value(Parser *, expr_ty, Token *, expr_ty, expr_ty, int, int, int, int, PyArena *); +expr_ty _PyPegen_formatted_value(Parser *, expr_ty, Token *, ResultTokenWithMetadata *, ResultTokenWithMetadata *, Token *, + int, int, int, int, PyArena *); AugOperator *_PyPegen_augoperator(Parser*, operator_ty type); stmt_ty _PyPegen_function_def_decorators(Parser *, asdl_expr_seq *, stmt_ty); stmt_ty _PyPegen_class_def_decorators(Parser *, asdl_expr_seq *, stmt_ty); @@ -329,7 +336,9 @@ expr_ty _PyPegen_ensure_real(Parser *p, expr_ty); asdl_seq *_PyPegen_join_sequences(Parser *, asdl_seq *, asdl_seq *); int _PyPegen_check_barry_as_flufl(Parser *, Token *); int _PyPegen_check_legacy_stmt(Parser *p, expr_ty t); -expr_ty _PyPegen_check_fstring_conversion(Parser *p, Token *, expr_ty t); +ResultTokenWithMetadata *_PyPegen_check_fstring_conversion(Parser *p, Token *, expr_ty t); +ResultTokenWithMetadata *_PyPegen_setup_full_format_spec(Parser *, Token *, asdl_expr_seq *, int, int, + int, int, PyArena *); mod_ty _PyPegen_make_module(Parser *, asdl_stmt_seq *); void *_PyPegen_arguments_parsing_error(Parser *, expr_ty); expr_ty _PyPegen_get_last_comprehension_item(comprehension_ty comprehension); diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index a8649b8547e256..4fd5343c364ef1 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -31,6 +31,7 @@ /* Don't ever change this -- it would break the portability of Python code */ #define TABSIZE 8 +#define TOK_NEXTC() tok_nextc(tok, token) #define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end) #define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\ type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end)) @@ -58,7 +59,7 @@ static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) { /* Forward */ static struct tok_state *tok_new(void); -static int tok_nextc(struct tok_state *tok); +static int tok_nextc(struct tok_state *tok, struct token *token); static void tok_backup(struct tok_state *tok, int c); static int syntaxerror(struct tok_state *tok, const char *format, ...); @@ -391,7 +392,7 @@ restore_fstring_buffers(struct tok_state *tok) } static int -update_fstring_expr(struct tok_state *tok, char cur) +update_fstring_expr(struct tok_state *tok, struct token *token, char cur) { assert(tok->cur != NULL); @@ -432,6 +433,15 @@ update_fstring_expr(struct tok_state *tok, char cur) case ':': if (tok_mode->last_expr_end == -1) { tok_mode->last_expr_end = strlen(tok->start); + PyObject *res = PyUnicode_DecodeUTF8( + tok_mode->last_expr_buffer, + tok_mode->last_expr_size - tok_mode->last_expr_end, + NULL + ); + if (!res) { + goto error; + } + token->metadata = res; } break; default: @@ -959,7 +969,7 @@ _PyTokenizer_Free(struct tok_state *tok) } static int -tok_readline_raw(struct tok_state *tok) +tok_readline_raw(struct tok_state *tok, struct token *token) { do { if (!tok_reserve_buf(tok, BUFSIZ)) { @@ -971,7 +981,7 @@ tok_readline_raw(struct tok_state *tok) if (line == NULL) { return 1; } - if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) { + if (tok->tok_mode_stack_index && !update_fstring_expr(tok, token, 0)) { return 0; } if (tok->fp_interactive && @@ -1009,7 +1019,7 @@ tok_underflow_string(struct tok_state *tok) { } static int -tok_underflow_interactive(struct tok_state *tok) { +tok_underflow_interactive(struct tok_state *tok, struct token *token) { if (tok->interactive_underflow == IUNDERFLOW_STOP) { tok->done = E_INTERACT_STOP; return 1; @@ -1094,14 +1104,14 @@ tok_underflow_interactive(struct tok_state *tok) { return 0; } - if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) { + if (tok->tok_mode_stack_index && !update_fstring_expr(tok, token, 0)) { return 0; } return 1; } static int -tok_underflow_file(struct tok_state *tok) { +tok_underflow_file(struct tok_state *tok, struct token *token) { if (tok->start == NULL) { tok->cur = tok->inp = tok->buf; } @@ -1124,7 +1134,7 @@ tok_underflow_file(struct tok_state *tok) { } else { /* We want a 'raw' read. */ - if (!tok_readline_raw(tok)) { + if (!tok_readline_raw(tok, token)) { return 0; } } @@ -1192,7 +1202,7 @@ print_escape(FILE *f, const char *s, Py_ssize_t size) /* Get next char, updating state; error code goes into tok->done */ static int -tok_nextc(struct tok_state *tok) +tok_nextc(struct tok_state *tok, struct token *token) { int rc; for (;;) { @@ -1207,10 +1217,10 @@ tok_nextc(struct tok_state *tok) rc = tok_underflow_string(tok); } else if (tok->prompt != NULL) { - rc = tok_underflow_interactive(tok); + rc = tok_underflow_interactive(tok, token); } else { - rc = tok_underflow_file(tok); + rc = tok_underflow_file(tok, token); } #if defined(Py_DEBUG) if (tok->debug) { @@ -1399,12 +1409,12 @@ warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_cha } static int -lookahead(struct tok_state *tok, const char *test) +lookahead(struct tok_state *tok, struct token *token, const char *test) { const char *s = test; int res = 0; while (1) { - int c = tok_nextc(tok); + int c = TOK_NEXTC(); if (*s == 0) { res = !is_potential_identifier_char(c); } @@ -1422,7 +1432,7 @@ lookahead(struct tok_state *tok, const char *test) } static int -verify_end_of_number(struct tok_state *tok, int c, const char *kind) +verify_end_of_number(struct tok_state *tok, struct token *token, int c, const char *kind) { /* Emit a deprecation warning only if the numeric literal is immediately * followed by one of keywords which can occur after a numeric literal @@ -1436,26 +1446,26 @@ verify_end_of_number(struct tok_state *tok, int c, const char *kind) */ int r = 0; if (c == 'a') { - r = lookahead(tok, "nd"); + r = lookahead(tok, token, "nd"); } else if (c == 'e') { - r = lookahead(tok, "lse"); + r = lookahead(tok, token, "lse"); } else if (c == 'f') { - r = lookahead(tok, "or"); + r = lookahead(tok, token, "or"); } else if (c == 'i') { - int c2 = tok_nextc(tok); + int c2 = TOK_NEXTC(); if (c2 == 'f' || c2 == 'n' || c2 == 's') { r = 1; } tok_backup(tok, c2); } else if (c == 'o') { - r = lookahead(tok, "r"); + r = lookahead(tok, token, "r"); } else if (c == 'n') { - r = lookahead(tok, "ot"); + r = lookahead(tok, token, "ot"); } if (r) { tok_backup(tok, c); @@ -1464,7 +1474,7 @@ verify_end_of_number(struct tok_state *tok, int c, const char *kind) { return 0; } - tok_nextc(tok); + TOK_NEXTC(); } else /* In future releases, only error will remain. */ if (is_potential_identifier_char(c)) { @@ -1532,18 +1542,18 @@ verify_identifier(struct tok_state *tok) } static int -tok_decimal_tail(struct tok_state *tok) +tok_decimal_tail(struct tok_state *tok, struct token *token) { int c; while (1) { do { - c = tok_nextc(tok); + c = TOK_NEXTC(); } while (isdigit(c)); if (c != '_') { break; } - c = tok_nextc(tok); + c = TOK_NEXTC(); if (!isdigit(c)) { tok_backup(tok, c); syntaxerror(tok, "invalid decimal literal"); @@ -1555,13 +1565,13 @@ tok_decimal_tail(struct tok_state *tok) static inline int -tok_continuation_line(struct tok_state *tok) { - int c = tok_nextc(tok); +tok_continuation_line(struct tok_state *tok, struct token *token) { + int c = TOK_NEXTC(); if (c != '\n') { tok->done = E_LINECONT; return -1; } - c = tok_nextc(tok); + c = TOK_NEXTC(); if (c == EOF) { tok->done = E_EOF; tok->cur = tok->inp; @@ -1628,7 +1638,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t tok->atbol = 0; int cont_line_col = 0; for (;;) { - c = tok_nextc(tok); + c = TOK_NEXTC(); if (c == ' ') { col++, altcol++; } @@ -1645,7 +1655,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t // preceded by whitespace, **the first one we find** determines // the level of indentation of whatever comes next. cont_line_col = cont_line_col ? cont_line_col : col; - if ((c = tok_continuation_line(tok)) == -1) { + if ((c = tok_continuation_line(tok, token)) == -1) { return MAKE_TOKEN(ERRORTOKEN); } } @@ -1733,7 +1743,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } /* Peek ahead at the next character */ - c = tok_nextc(tok); + c = TOK_NEXTC(); tok_backup(tok, c); /* Check if we are closing an async function */ if (tok->async_def @@ -1761,7 +1771,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t tok->start = NULL; /* Skip spaces */ do { - c = tok_nextc(tok); + c = TOK_NEXTC(); } while (c == ' ' || c == '\t' || c == '\014'); /* Set start of current token */ @@ -1779,7 +1789,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t int current_starting_col_offset; while (c != EOF && c != '\n') { - c = tok_nextc(tok); + c = TOK_NEXTC(); } if (tok->type_comments) { @@ -1825,7 +1835,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* If this type ignore is the only thing on the line, consume the newline also. */ if (blankline) { - tok_nextc(tok); + TOK_NEXTC(); tok->atbol = 1; } return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset); @@ -1874,7 +1884,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t else { break; } - c = tok_nextc(tok); + c = TOK_NEXTC(); if (c == '"' || c == '\'') { if (saw_f) { goto f_string_quote; @@ -1886,7 +1896,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t if (c >= 128) { nonascii = 1; } - c = tok_nextc(tok); + c = TOK_NEXTC(); } tok_backup(tok, c); if (nonascii && !verify_identifier(tok)) { @@ -1963,11 +1973,11 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* Period or number starting with period? */ if (c == '.') { - c = tok_nextc(tok); + c = TOK_NEXTC(); if (isdigit(c)) { goto fraction; } else if (c == '.') { - c = tok_nextc(tok); + c = TOK_NEXTC(); if (c == '.') { p_start = tok->start; p_end = tok->cur; @@ -1990,32 +2000,32 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t if (isdigit(c)) { if (c == '0') { /* Hex, octal or binary -- maybe. */ - c = tok_nextc(tok); + c = TOK_NEXTC(); if (c == 'x' || c == 'X') { /* Hex */ - c = tok_nextc(tok); + c = TOK_NEXTC(); do { if (c == '_') { - c = tok_nextc(tok); + c = TOK_NEXTC(); } if (!isxdigit(c)) { tok_backup(tok, c); return MAKE_TOKEN(syntaxerror(tok, "invalid hexadecimal literal")); } do { - c = tok_nextc(tok); + c = TOK_NEXTC(); } while (isxdigit(c)); } while (c == '_'); - if (!verify_end_of_number(tok, c, "hexadecimal")) { + if (!verify_end_of_number(tok, token, c, "hexadecimal")) { return MAKE_TOKEN(ERRORTOKEN); } } else if (c == 'o' || c == 'O') { /* Octal */ - c = tok_nextc(tok); + c = TOK_NEXTC(); do { if (c == '_') { - c = tok_nextc(tok); + c = TOK_NEXTC(); } if (c < '0' || c >= '8') { if (isdigit(c)) { @@ -2028,23 +2038,23 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } } do { - c = tok_nextc(tok); + c = TOK_NEXTC(); } while ('0' <= c && c < '8'); } while (c == '_'); if (isdigit(c)) { return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in octal literal", c)); } - if (!verify_end_of_number(tok, c, "octal")) { + if (!verify_end_of_number(tok, token, c, "octal")) { return MAKE_TOKEN(ERRORTOKEN); } } else if (c == 'b' || c == 'B') { /* Binary */ - c = tok_nextc(tok); + c = TOK_NEXTC(); do { if (c == '_') { - c = tok_nextc(tok); + c = TOK_NEXTC(); } if (c != '0' && c != '1') { if (isdigit(c)) { @@ -2056,13 +2066,13 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } } do { - c = tok_nextc(tok); + c = TOK_NEXTC(); } while (c == '0' || c == '1'); } while (c == '_'); if (isdigit(c)) { return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c)); } - if (!verify_end_of_number(tok, c, "binary")) { + if (!verify_end_of_number(tok, token, c, "binary")) { return MAKE_TOKEN(ERRORTOKEN); } } @@ -2072,7 +2082,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* in any case, allow '0' as a literal */ while (1) { if (c == '_') { - c = tok_nextc(tok); + c = TOK_NEXTC(); if (!isdigit(c)) { tok_backup(tok, c); return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal")); @@ -2081,18 +2091,18 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t if (c != '0') { break; } - c = tok_nextc(tok); + c = TOK_NEXTC(); } char* zeros_end = tok->cur; if (isdigit(c)) { nonzero = 1; - c = tok_decimal_tail(tok); + c = tok_decimal_tail(tok, token); if (c == 0) { return MAKE_TOKEN(ERRORTOKEN); } } if (c == '.') { - c = tok_nextc(tok); + c = TOK_NEXTC(); goto fraction; } else if (c == 'e' || c == 'E') { @@ -2111,25 +2121,25 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t "literals are not permitted; " "use an 0o prefix for octal integers")); } - if (!verify_end_of_number(tok, c, "decimal")) { + if (!verify_end_of_number(tok, token, c, "decimal")) { return MAKE_TOKEN(ERRORTOKEN); } } } else { /* Decimal */ - c = tok_decimal_tail(tok); + c = tok_decimal_tail(tok, token); if (c == 0) { return MAKE_TOKEN(ERRORTOKEN); } { /* Accept floating point numbers. */ if (c == '.') { - c = tok_nextc(tok); + c = TOK_NEXTC(); fraction: /* Fraction */ if (isdigit(c)) { - c = tok_decimal_tail(tok); + c = tok_decimal_tail(tok, token); if (c == 0) { return MAKE_TOKEN(ERRORTOKEN); } @@ -2140,16 +2150,16 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t exponent: e = c; /* Exponent part */ - c = tok_nextc(tok); + c = TOK_NEXTC(); if (c == '+' || c == '-') { - c = tok_nextc(tok); + c = TOK_NEXTC(); if (!isdigit(c)) { tok_backup(tok, c); return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal")); } } else if (!isdigit(c)) { tok_backup(tok, c); - if (!verify_end_of_number(tok, e, "decimal")) { + if (!verify_end_of_number(tok, token, e, "decimal")) { return MAKE_TOKEN(ERRORTOKEN); } tok_backup(tok, e); @@ -2157,7 +2167,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t p_end = tok->cur; return MAKE_TOKEN(NUMBER); } - c = tok_decimal_tail(tok); + c = tok_decimal_tail(tok, token); if (c == 0) { return MAKE_TOKEN(ERRORTOKEN); } @@ -2165,12 +2175,12 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t if (c == 'j' || c == 'J') { /* Imaginary part */ imaginary: - c = tok_nextc(tok); - if (!verify_end_of_number(tok, c, "imaginary")) { + c = TOK_NEXTC(); + if (!verify_end_of_number(tok, token, c, "imaginary")) { return MAKE_TOKEN(ERRORTOKEN); } } - else if (!verify_end_of_number(tok, c, "decimal")) { + else if (!verify_end_of_number(tok, token, c, "decimal")) { return MAKE_TOKEN(ERRORTOKEN); } } @@ -2194,9 +2204,9 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t tok->multi_line_start = tok->line_start; /* Find the quote size and start of string */ - int after_quote = tok_nextc(tok); + int after_quote = TOK_NEXTC(); if (after_quote == quote) { - int after_after_quote = tok_nextc(tok); + int after_after_quote = TOK_NEXTC(); if (after_after_quote == quote) { quote_size = 3; } @@ -2258,9 +2268,9 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t tok->multi_line_start = tok->line_start; /* Find the quote size and start of string */ - c = tok_nextc(tok); + c = TOK_NEXTC(); if (c == quote) { - c = tok_nextc(tok); + c = TOK_NEXTC(); if (c == quote) { quote_size = 3; } @@ -2274,7 +2284,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* Get rest of string */ while (end_quote_size != quote_size) { - c = tok_nextc(tok); + c = TOK_NEXTC(); if (tok->done == E_DECODE) break; if (c == EOF || (quote_size == 1 && c == '\n')) { @@ -2324,7 +2334,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t else { end_quote_size = 0; if (c == '\\') { - tok_nextc(tok); /* skip escaped char */ + TOK_NEXTC(); /* skip escaped char */ } } } @@ -2336,7 +2346,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* Line continuation */ if (c == '\\') { - if ((c = tok_continuation_line(tok)) == -1) { + if ((c = tok_continuation_line(tok, token)) == -1) { return MAKE_TOKEN(ERRORTOKEN); } tok->cont_line = 1; @@ -2351,7 +2361,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t * to adjust it manually */ int cursor = current_tok->curly_bracket_depth - (c != '{'); - if (cursor == 0 && !update_fstring_expr(tok, c)) { + if (cursor == 0 && !update_fstring_expr(tok, token, c)) { return MAKE_TOKEN(ENDMARKER); } @@ -2365,10 +2375,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* Check for two-character token */ { - int c2 = tok_nextc(tok); + int c2 = TOK_NEXTC(); int current_token = _PyToken_TwoChars(c, c2); if (current_token != OP) { - int c3 = tok_nextc(tok); + int c3 = TOK_NEXTC(); int current_token3 = _PyToken_ThreeChars(c, c2, c3); if (current_token3 != OP) { current_token = current_token3; @@ -2478,9 +2488,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct // If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize // before it. - int start_char = tok_nextc(tok); + int start_char = TOK_NEXTC(); if (start_char == '{') { - int peek1 = tok_nextc(tok); + int peek1 = TOK_NEXTC(); tok_backup(tok, peek1); tok_backup(tok, start_char); if (peek1 != '{') { @@ -2498,7 +2508,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct // Check if we are at the end of the string for (int i = 0; i < current_tok->f_string_quote_size; i++) { - int quote = tok_nextc(tok); + int quote = TOK_NEXTC(); if (quote != current_tok->f_string_quote) { tok_backup(tok, quote); goto f_string_middle; @@ -2520,7 +2530,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct f_string_middle: while (end_quote_size != current_tok->f_string_quote_size) { - int c = tok_nextc(tok); + int c = TOK_NEXTC(); if (c == EOF || (current_tok->f_string_quote_size == 1 && c == '\n')) { assert(tok->multi_line_start != NULL); // shift the tok_state's location into @@ -2557,7 +2567,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct INSIDE_FSTRING_EXPR(current_tok) ); if (c == '{') { - int peek = tok_nextc(tok); + int peek = TOK_NEXTC(); if (peek != '{' || in_format_spec) { tok_backup(tok, peek); tok_backup(tok, c); @@ -2579,7 +2589,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct p_end = tok->cur; return MAKE_TOKEN(FSTRING_MIDDLE); } - int peek = tok_nextc(tok); + int peek = TOK_NEXTC(); // The tokenizer can only be in the format spec if we have already completed the expression // scanning (indicated by the end of the expression being set) and we are not at the top level @@ -2597,7 +2607,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct } return MAKE_TOKEN(FSTRING_MIDDLE); } else if (c == '\\') { - int peek = tok_nextc(tok); + int peek = TOK_NEXTC(); // Special case when the backslash is right before a curly // brace. We have to restore and return the control back // to the loop for the next iteration. @@ -2614,7 +2624,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct if (!current_tok->f_string_raw) { if (peek == 'N') { /* Handle named unicode escapes (\N{BULLET}) */ - peek = tok_nextc(tok); + peek = TOK_NEXTC(); if (peek == '{') { unicode_escape = 1; } else { diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index 2b94aecce626c3..97b54dc2a625ef 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -31,6 +31,7 @@ struct token { int level; int lineno, col_offset, end_lineno, end_col_offset; const char *start, *end; + PyObject *metadata; }; enum tokenizer_mode_kind_t { From 51d7d83af3b2331e1b6e0f1262527edc8e177fe5 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 27 Apr 2023 00:18:02 +0100 Subject: [PATCH 2/8] fixup! gh-103656: Transfer f-string buffers to parser to avoid use-after-free --- Parser/action_helpers.c | 21 ----- Parser/tokenizer.c | 198 +++++++++++++++++++++------------------- 2 files changed, 104 insertions(+), 115 deletions(-) diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c index 168e01015fb761..0aaaed64c4037c 100644 --- a/Parser/action_helpers.c +++ b/Parser/action_helpers.c @@ -1223,27 +1223,6 @@ _PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq // Fstring stuff -static expr_ty -decode_fstring_buffer(Parser *p, int lineno, int col_offset, int end_lineno, - int end_col_offset) -{ - tokenizer_mode *tok_mode = &(p->tok->tok_mode_stack[p->tok->tok_mode_stack_index]); - assert(tok_mode->last_expr_buffer != NULL); - assert(tok_mode->last_expr_size >= 0 && tok_mode->last_expr_end >= 0); - - PyObject *res = PyUnicode_DecodeUTF8( - tok_mode->last_expr_buffer, - tok_mode->last_expr_size - tok_mode->last_expr_end, - NULL - ); - if (!res || _PyArena_AddPyObject(p->arena, res) < 0) { - Py_XDECREF(res); - return NULL; - } - - return _PyAST_Constant(res, NULL, lineno, col_offset, end_lineno, end_col_offset, p->arena); -} - static expr_ty _PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant) { assert(PyUnicode_CheckExact(constant->v.Constant.value)); diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 4fd5343c364ef1..045136d4c6af62 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -31,7 +31,6 @@ /* Don't ever change this -- it would break the portability of Python code */ #define TABSIZE 8 -#define TOK_NEXTC() tok_nextc(tok, token) #define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end) #define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\ type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end)) @@ -59,7 +58,7 @@ static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) { /* Forward */ static struct tok_state *tok_new(void); -static int tok_nextc(struct tok_state *tok, struct token *token); +static int tok_nextc(struct tok_state *tok); static void tok_backup(struct tok_state *tok, int c); static int syntaxerror(struct tok_state *tok, const char *format, ...); @@ -392,7 +391,25 @@ restore_fstring_buffers(struct tok_state *tok) } static int -update_fstring_expr(struct tok_state *tok, struct token *token, char cur) +set_fstring_expr(struct tok_state* tok, struct token *token, char c) { + assert(token != NULL); + assert(c == '{' || c == ':' || c == '!'); + + tokenizer_mode *tok_mode = TOK_GET_MODE(tok); + PyObject *res = PyUnicode_DecodeUTF8( + tok_mode->last_expr_buffer, + tok_mode->last_expr_size - tok_mode->last_expr_end, + NULL + ); + if (!res) { + return -1; + } + token->metadata = res; + return 0; +} + +static int +update_fstring_expr(struct tok_state *tok, char cur) { assert(tok->cur != NULL); @@ -433,15 +450,6 @@ update_fstring_expr(struct tok_state *tok, struct token *token, char cur) case ':': if (tok_mode->last_expr_end == -1) { tok_mode->last_expr_end = strlen(tok->start); - PyObject *res = PyUnicode_DecodeUTF8( - tok_mode->last_expr_buffer, - tok_mode->last_expr_size - tok_mode->last_expr_end, - NULL - ); - if (!res) { - goto error; - } - token->metadata = res; } break; default: @@ -969,7 +977,7 @@ _PyTokenizer_Free(struct tok_state *tok) } static int -tok_readline_raw(struct tok_state *tok, struct token *token) +tok_readline_raw(struct tok_state *tok) { do { if (!tok_reserve_buf(tok, BUFSIZ)) { @@ -981,7 +989,7 @@ tok_readline_raw(struct tok_state *tok, struct token *token) if (line == NULL) { return 1; } - if (tok->tok_mode_stack_index && !update_fstring_expr(tok, token, 0)) { + if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) { return 0; } if (tok->fp_interactive && @@ -1019,7 +1027,7 @@ tok_underflow_string(struct tok_state *tok) { } static int -tok_underflow_interactive(struct tok_state *tok, struct token *token) { +tok_underflow_interactive(struct tok_state *tok) { if (tok->interactive_underflow == IUNDERFLOW_STOP) { tok->done = E_INTERACT_STOP; return 1; @@ -1104,14 +1112,14 @@ tok_underflow_interactive(struct tok_state *tok, struct token *token) { return 0; } - if (tok->tok_mode_stack_index && !update_fstring_expr(tok, token, 0)) { + if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) { return 0; } return 1; } static int -tok_underflow_file(struct tok_state *tok, struct token *token) { +tok_underflow_file(struct tok_state *tok) { if (tok->start == NULL) { tok->cur = tok->inp = tok->buf; } @@ -1134,7 +1142,7 @@ tok_underflow_file(struct tok_state *tok, struct token *token) { } else { /* We want a 'raw' read. */ - if (!tok_readline_raw(tok, token)) { + if (!tok_readline_raw(tok)) { return 0; } } @@ -1202,7 +1210,7 @@ print_escape(FILE *f, const char *s, Py_ssize_t size) /* Get next char, updating state; error code goes into tok->done */ static int -tok_nextc(struct tok_state *tok, struct token *token) +tok_nextc(struct tok_state *tok) { int rc; for (;;) { @@ -1217,10 +1225,10 @@ tok_nextc(struct tok_state *tok, struct token *token) rc = tok_underflow_string(tok); } else if (tok->prompt != NULL) { - rc = tok_underflow_interactive(tok, token); + rc = tok_underflow_interactive(tok); } else { - rc = tok_underflow_file(tok, token); + rc = tok_underflow_file(tok); } #if defined(Py_DEBUG) if (tok->debug) { @@ -1409,12 +1417,12 @@ warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_cha } static int -lookahead(struct tok_state *tok, struct token *token, const char *test) +lookahead(struct tok_state *tok, const char *test) { const char *s = test; int res = 0; while (1) { - int c = TOK_NEXTC(); + int c = tok_nextc(tok); if (*s == 0) { res = !is_potential_identifier_char(c); } @@ -1432,7 +1440,7 @@ lookahead(struct tok_state *tok, struct token *token, const char *test) } static int -verify_end_of_number(struct tok_state *tok, struct token *token, int c, const char *kind) +verify_end_of_number(struct tok_state *tok, int c, const char *kind) { /* Emit a deprecation warning only if the numeric literal is immediately * followed by one of keywords which can occur after a numeric literal @@ -1446,26 +1454,26 @@ verify_end_of_number(struct tok_state *tok, struct token *token, int c, const ch */ int r = 0; if (c == 'a') { - r = lookahead(tok, token, "nd"); + r = lookahead(tok, "nd"); } else if (c == 'e') { - r = lookahead(tok, token, "lse"); + r = lookahead(tok, "lse"); } else if (c == 'f') { - r = lookahead(tok, token, "or"); + r = lookahead(tok, "or"); } else if (c == 'i') { - int c2 = TOK_NEXTC(); + int c2 = tok_nextc(tok); if (c2 == 'f' || c2 == 'n' || c2 == 's') { r = 1; } tok_backup(tok, c2); } else if (c == 'o') { - r = lookahead(tok, token, "r"); + r = lookahead(tok, "r"); } else if (c == 'n') { - r = lookahead(tok, token, "ot"); + r = lookahead(tok, "ot"); } if (r) { tok_backup(tok, c); @@ -1474,7 +1482,7 @@ verify_end_of_number(struct tok_state *tok, struct token *token, int c, const ch { return 0; } - TOK_NEXTC(); + tok_nextc(tok); } else /* In future releases, only error will remain. */ if (is_potential_identifier_char(c)) { @@ -1542,18 +1550,18 @@ verify_identifier(struct tok_state *tok) } static int -tok_decimal_tail(struct tok_state *tok, struct token *token) +tok_decimal_tail(struct tok_state *tok) { int c; while (1) { do { - c = TOK_NEXTC(); + c = tok_nextc(tok); } while (isdigit(c)); if (c != '_') { break; } - c = TOK_NEXTC(); + c = tok_nextc(tok); if (!isdigit(c)) { tok_backup(tok, c); syntaxerror(tok, "invalid decimal literal"); @@ -1565,13 +1573,13 @@ tok_decimal_tail(struct tok_state *tok, struct token *token) static inline int -tok_continuation_line(struct tok_state *tok, struct token *token) { - int c = TOK_NEXTC(); +tok_continuation_line(struct tok_state *tok) { + int c = tok_nextc(tok); if (c != '\n') { tok->done = E_LINECONT; return -1; } - c = TOK_NEXTC(); + c = tok_nextc(tok); if (c == EOF) { tok->done = E_EOF; tok->cur = tok->inp; @@ -1638,7 +1646,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t tok->atbol = 0; int cont_line_col = 0; for (;;) { - c = TOK_NEXTC(); + c = tok_nextc(tok); if (c == ' ') { col++, altcol++; } @@ -1655,7 +1663,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t // preceded by whitespace, **the first one we find** determines // the level of indentation of whatever comes next. cont_line_col = cont_line_col ? cont_line_col : col; - if ((c = tok_continuation_line(tok, token)) == -1) { + if ((c = tok_continuation_line(tok)) == -1) { return MAKE_TOKEN(ERRORTOKEN); } } @@ -1743,7 +1751,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } /* Peek ahead at the next character */ - c = TOK_NEXTC(); + c = tok_nextc(tok); tok_backup(tok, c); /* Check if we are closing an async function */ if (tok->async_def @@ -1771,7 +1779,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t tok->start = NULL; /* Skip spaces */ do { - c = TOK_NEXTC(); + c = tok_nextc(tok); } while (c == ' ' || c == '\t' || c == '\014'); /* Set start of current token */ @@ -1789,7 +1797,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t int current_starting_col_offset; while (c != EOF && c != '\n') { - c = TOK_NEXTC(); + c = tok_nextc(tok); } if (tok->type_comments) { @@ -1835,7 +1843,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* If this type ignore is the only thing on the line, consume the newline also. */ if (blankline) { - TOK_NEXTC(); + tok_nextc(tok); tok->atbol = 1; } return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset); @@ -1884,7 +1892,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t else { break; } - c = TOK_NEXTC(); + c = tok_nextc(tok); if (c == '"' || c == '\'') { if (saw_f) { goto f_string_quote; @@ -1896,7 +1904,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t if (c >= 128) { nonascii = 1; } - c = TOK_NEXTC(); + c = tok_nextc(tok); } tok_backup(tok, c); if (nonascii && !verify_identifier(tok)) { @@ -1973,11 +1981,11 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* Period or number starting with period? */ if (c == '.') { - c = TOK_NEXTC(); + c = tok_nextc(tok); if (isdigit(c)) { goto fraction; } else if (c == '.') { - c = TOK_NEXTC(); + c = tok_nextc(tok); if (c == '.') { p_start = tok->start; p_end = tok->cur; @@ -2000,32 +2008,32 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t if (isdigit(c)) { if (c == '0') { /* Hex, octal or binary -- maybe. */ - c = TOK_NEXTC(); + c = tok_nextc(tok); if (c == 'x' || c == 'X') { /* Hex */ - c = TOK_NEXTC(); + c = tok_nextc(tok); do { if (c == '_') { - c = TOK_NEXTC(); + c = tok_nextc(tok); } if (!isxdigit(c)) { tok_backup(tok, c); return MAKE_TOKEN(syntaxerror(tok, "invalid hexadecimal literal")); } do { - c = TOK_NEXTC(); + c = tok_nextc(tok); } while (isxdigit(c)); } while (c == '_'); - if (!verify_end_of_number(tok, token, c, "hexadecimal")) { + if (!verify_end_of_number(tok, c, "hexadecimal")) { return MAKE_TOKEN(ERRORTOKEN); } } else if (c == 'o' || c == 'O') { /* Octal */ - c = TOK_NEXTC(); + c = tok_nextc(tok); do { if (c == '_') { - c = TOK_NEXTC(); + c = tok_nextc(tok); } if (c < '0' || c >= '8') { if (isdigit(c)) { @@ -2038,23 +2046,23 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } } do { - c = TOK_NEXTC(); + c = tok_nextc(tok); } while ('0' <= c && c < '8'); } while (c == '_'); if (isdigit(c)) { return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in octal literal", c)); } - if (!verify_end_of_number(tok, token, c, "octal")) { + if (!verify_end_of_number(tok, c, "octal")) { return MAKE_TOKEN(ERRORTOKEN); } } else if (c == 'b' || c == 'B') { /* Binary */ - c = TOK_NEXTC(); + c = tok_nextc(tok); do { if (c == '_') { - c = TOK_NEXTC(); + c = tok_nextc(tok); } if (c != '0' && c != '1') { if (isdigit(c)) { @@ -2066,13 +2074,13 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } } do { - c = TOK_NEXTC(); + c = tok_nextc(tok); } while (c == '0' || c == '1'); } while (c == '_'); if (isdigit(c)) { return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c)); } - if (!verify_end_of_number(tok, token, c, "binary")) { + if (!verify_end_of_number(tok, c, "binary")) { return MAKE_TOKEN(ERRORTOKEN); } } @@ -2082,7 +2090,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* in any case, allow '0' as a literal */ while (1) { if (c == '_') { - c = TOK_NEXTC(); + c = tok_nextc(tok); if (!isdigit(c)) { tok_backup(tok, c); return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal")); @@ -2091,18 +2099,18 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t if (c != '0') { break; } - c = TOK_NEXTC(); + c = tok_nextc(tok); } char* zeros_end = tok->cur; if (isdigit(c)) { nonzero = 1; - c = tok_decimal_tail(tok, token); + c = tok_decimal_tail(tok); if (c == 0) { return MAKE_TOKEN(ERRORTOKEN); } } if (c == '.') { - c = TOK_NEXTC(); + c = tok_nextc(tok); goto fraction; } else if (c == 'e' || c == 'E') { @@ -2121,25 +2129,25 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t "literals are not permitted; " "use an 0o prefix for octal integers")); } - if (!verify_end_of_number(tok, token, c, "decimal")) { + if (!verify_end_of_number(tok, c, "decimal")) { return MAKE_TOKEN(ERRORTOKEN); } } } else { /* Decimal */ - c = tok_decimal_tail(tok, token); + c = tok_decimal_tail(tok); if (c == 0) { return MAKE_TOKEN(ERRORTOKEN); } { /* Accept floating point numbers. */ if (c == '.') { - c = TOK_NEXTC(); + c = tok_nextc(tok); fraction: /* Fraction */ if (isdigit(c)) { - c = tok_decimal_tail(tok, token); + c = tok_decimal_tail(tok); if (c == 0) { return MAKE_TOKEN(ERRORTOKEN); } @@ -2150,16 +2158,16 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t exponent: e = c; /* Exponent part */ - c = TOK_NEXTC(); + c = tok_nextc(tok); if (c == '+' || c == '-') { - c = TOK_NEXTC(); + c = tok_nextc(tok); if (!isdigit(c)) { tok_backup(tok, c); return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal")); } } else if (!isdigit(c)) { tok_backup(tok, c); - if (!verify_end_of_number(tok, token, e, "decimal")) { + if (!verify_end_of_number(tok, e, "decimal")) { return MAKE_TOKEN(ERRORTOKEN); } tok_backup(tok, e); @@ -2167,7 +2175,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t p_end = tok->cur; return MAKE_TOKEN(NUMBER); } - c = tok_decimal_tail(tok, token); + c = tok_decimal_tail(tok); if (c == 0) { return MAKE_TOKEN(ERRORTOKEN); } @@ -2175,12 +2183,12 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t if (c == 'j' || c == 'J') { /* Imaginary part */ imaginary: - c = TOK_NEXTC(); - if (!verify_end_of_number(tok, token, c, "imaginary")) { + c = tok_nextc(tok); + if (!verify_end_of_number(tok, c, "imaginary")) { return MAKE_TOKEN(ERRORTOKEN); } } - else if (!verify_end_of_number(tok, token, c, "decimal")) { + else if (!verify_end_of_number(tok, c, "decimal")) { return MAKE_TOKEN(ERRORTOKEN); } } @@ -2204,9 +2212,9 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t tok->multi_line_start = tok->line_start; /* Find the quote size and start of string */ - int after_quote = TOK_NEXTC(); + int after_quote = tok_nextc(tok); if (after_quote == quote) { - int after_after_quote = TOK_NEXTC(); + int after_after_quote = tok_nextc(tok); if (after_after_quote == quote) { quote_size = 3; } @@ -2268,9 +2276,9 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t tok->multi_line_start = tok->line_start; /* Find the quote size and start of string */ - c = TOK_NEXTC(); + c = tok_nextc(tok); if (c == quote) { - c = TOK_NEXTC(); + c = tok_nextc(tok); if (c == quote) { quote_size = 3; } @@ -2284,7 +2292,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* Get rest of string */ while (end_quote_size != quote_size) { - c = TOK_NEXTC(); + c = tok_nextc(tok); if (tok->done == E_DECODE) break; if (c == EOF || (quote_size == 1 && c == '\n')) { @@ -2334,7 +2342,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t else { end_quote_size = 0; if (c == '\\') { - TOK_NEXTC(); /* skip escaped char */ + tok_nextc(tok); /* skip escaped char */ } } } @@ -2346,7 +2354,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* Line continuation */ if (c == '\\') { - if ((c = tok_continuation_line(tok, token)) == -1) { + if ((c = tok_continuation_line(tok)) == -1) { return MAKE_TOKEN(ERRORTOKEN); } tok->cont_line = 1; @@ -2360,10 +2368,12 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t * by the `{` case, so for ensuring that we are on the 0th level, we need * to adjust it manually */ int cursor = current_tok->curly_bracket_depth - (c != '{'); - - if (cursor == 0 && !update_fstring_expr(tok, token, c)) { + if (cursor == 0 && !update_fstring_expr(tok, c)) { return MAKE_TOKEN(ENDMARKER); } + if (cursor == 0 && c != '{' && set_fstring_expr(tok, token, c)) { + return MAKE_TOKEN(ERRORTOKEN); + } if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) { current_tok->kind = TOK_FSTRING_MODE; @@ -2375,10 +2385,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* Check for two-character token */ { - int c2 = TOK_NEXTC(); + int c2 = tok_nextc(tok); int current_token = _PyToken_TwoChars(c, c2); if (current_token != OP) { - int c3 = TOK_NEXTC(); + int c3 = tok_nextc(tok); int current_token3 = _PyToken_ThreeChars(c, c2, c3); if (current_token3 != OP) { current_token = current_token3; @@ -2488,9 +2498,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct // If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize // before it. - int start_char = TOK_NEXTC(); + int start_char = tok_nextc(tok); if (start_char == '{') { - int peek1 = TOK_NEXTC(); + int peek1 = tok_nextc(tok); tok_backup(tok, peek1); tok_backup(tok, start_char); if (peek1 != '{') { @@ -2508,7 +2518,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct // Check if we are at the end of the string for (int i = 0; i < current_tok->f_string_quote_size; i++) { - int quote = TOK_NEXTC(); + int quote = tok_nextc(tok); if (quote != current_tok->f_string_quote) { tok_backup(tok, quote); goto f_string_middle; @@ -2530,7 +2540,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct f_string_middle: while (end_quote_size != current_tok->f_string_quote_size) { - int c = TOK_NEXTC(); + int c = tok_nextc(tok); if (c == EOF || (current_tok->f_string_quote_size == 1 && c == '\n')) { assert(tok->multi_line_start != NULL); // shift the tok_state's location into @@ -2567,7 +2577,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct INSIDE_FSTRING_EXPR(current_tok) ); if (c == '{') { - int peek = TOK_NEXTC(); + int peek = tok_nextc(tok); if (peek != '{' || in_format_spec) { tok_backup(tok, peek); tok_backup(tok, c); @@ -2589,7 +2599,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct p_end = tok->cur; return MAKE_TOKEN(FSTRING_MIDDLE); } - int peek = TOK_NEXTC(); + int peek = tok_nextc(tok); // The tokenizer can only be in the format spec if we have already completed the expression // scanning (indicated by the end of the expression being set) and we are not at the top level @@ -2607,7 +2617,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct } return MAKE_TOKEN(FSTRING_MIDDLE); } else if (c == '\\') { - int peek = TOK_NEXTC(); + int peek = tok_nextc(tok); // Special case when the backslash is right before a curly // brace. We have to restore and return the control back // to the loop for the next iteration. @@ -2624,7 +2634,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct if (!current_tok->f_string_raw) { if (peek == 'N') { /* Handle named unicode escapes (\N{BULLET}) */ - peek = TOK_NEXTC(); + peek = tok_nextc(tok); if (peek == '{') { unicode_escape = 1; } else { From 326059ea6cba7a7e94fafc53f3e4cf939b26ea9d Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 27 Apr 2023 00:29:58 +0100 Subject: [PATCH 3/8] fixup! fixup! gh-103656: Transfer f-string buffers to parser to avoid use-after-free --- Parser/pegen.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Parser/pegen.c b/Parser/pegen.c index 97f86e1372f7c0..07d5e9f4a59843 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -155,15 +155,13 @@ initialize_token(Parser *p, Token *parser_token, struct token *new_token, int to return -1; } + parser_token->metadata = NULL; if (new_token->metadata != NULL) { - parser_token->metadata = new_token->metadata; - if (_PyArena_AddPyObject(p->arena, parser_token->metadata) < 0) { + if (_PyArena_AddPyObject(p->arena, new_token->metadata) < 0) { Py_DECREF(parser_token->metadata); return -1; } - } - else { - parser_token->metadata = NULL; + parser_token->metadata = new_token->metadata; } parser_token->level = new_token->level; From f2754e1cc63e103d4925e22d89ed25291f52b04c Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 27 Apr 2023 00:53:52 +0100 Subject: [PATCH 4/8] fixup! fixup! fixup! gh-103656: Transfer f-string buffers to parser to avoid use-after-free --- Parser/pegen.c | 1 + Parser/tokenizer.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Parser/pegen.c b/Parser/pegen.c index 07d5e9f4a59843..83aad33aa906cd 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -161,6 +161,7 @@ initialize_token(Parser *p, Token *parser_token, struct token *new_token, int to Py_DECREF(parser_token->metadata); return -1; } + Py_DECREF(parser_token->metadata); parser_token->metadata = new_token->metadata; } diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 045136d4c6af62..fb41489a990d8b 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -393,7 +393,7 @@ restore_fstring_buffers(struct tok_state *tok) static int set_fstring_expr(struct tok_state* tok, struct token *token, char c) { assert(token != NULL); - assert(c == '{' || c == ':' || c == '!'); + assert(c == '}' || c == ':' || c == '!'); tokenizer_mode *tok_mode = TOK_GET_MODE(tok); PyObject *res = PyUnicode_DecodeUTF8( From ee801d8cde1c66ca59118ea6ed99aa5d12482bb1 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 27 Apr 2023 01:00:33 +0100 Subject: [PATCH 5/8] fixup! fixup! fixup! fixup! gh-103656: Transfer f-string buffers to parser to avoid use-after-free --- Parser/pegen.c | 1 - 1 file changed, 1 deletion(-) diff --git a/Parser/pegen.c b/Parser/pegen.c index 83aad33aa906cd..07d5e9f4a59843 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -161,7 +161,6 @@ initialize_token(Parser *p, Token *parser_token, struct token *new_token, int to Py_DECREF(parser_token->metadata); return -1; } - Py_DECREF(parser_token->metadata); parser_token->metadata = new_token->metadata; } From e84a02bacc0183c10d3d905641eb5ad0282a127c Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 27 Apr 2023 01:26:30 +0100 Subject: [PATCH 6/8] Fix refleaks --- Parser/pegen.c | 10 +++++++--- Parser/pegen_errors.c | 2 ++ Parser/tokenizer.c | 3 +++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Parser/pegen.c b/Parser/pegen.c index 07d5e9f4a59843..da410ea84ecb8e 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -162,6 +162,7 @@ initialize_token(Parser *p, Token *parser_token, struct token *new_token, int to return -1; } parser_token->metadata = new_token->metadata; + new_token->metadata = NULL; } parser_token->level = new_token->level; @@ -216,14 +217,14 @@ _PyPegen_fill_token(Parser *p) char *tag = PyMem_Malloc(len + 1); if (tag == NULL) { PyErr_NoMemory(); - return -1; + goto error; } strncpy(tag, new_token.start, len); tag[len] = '\0'; // Ownership of tag passes to the growable array if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) { PyErr_NoMemory(); - return -1; + goto error; } type = _PyTokenizer_Get(p->tok, &new_token); } @@ -244,11 +245,14 @@ _PyPegen_fill_token(Parser *p) // Check if we are at the limit of the token array capacity and resize if needed if ((p->fill == p->size) && (_resize_tokens_array(p) != 0)) { - return -1; + goto error; } Token *t = p->tokens[p->fill]; return initialize_token(p, t, &new_token, type); +error: + Py_XDECREF(new_token.metadata); + return -1; } #if defined(Py_DEBUG) diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c index e26bad20a27575..1f227da0194e3c 100644 --- a/Parser/pegen_errors.c +++ b/Parser/pegen_errors.c @@ -165,6 +165,7 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) { int ret = 0; struct token new_token; + new_token.metadata = NULL; for (;;) { switch (_PyTokenizer_Get(p->tok, &new_token)) { @@ -192,6 +193,7 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) { exit: + Py_XDECREF(new_token.metadata); // If we're in an f-string, we want the syntax error in the expression part // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards // do not swallow it. diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index fb41489a990d8b..32f7cfcb4c15bf 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -394,6 +394,9 @@ static int set_fstring_expr(struct tok_state* tok, struct token *token, char c) { assert(token != NULL); assert(c == '}' || c == ':' || c == '!'); + if (token->metadata) { + return 0; + } tokenizer_mode *tok_mode = TOK_GET_MODE(tok); PyObject *res = PyUnicode_DecodeUTF8( From 05293a4fb4d9ab538e60fada83c8b97ee7525b0e Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 27 Apr 2023 01:34:18 +0100 Subject: [PATCH 7/8] fixup! Fix refleaks --- Parser/tokenizer.c | 14 ++++++++++---- Parser/tokenizer.h | 1 + 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 32f7cfcb4c15bf..6b5478b3a94ab4 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -111,7 +111,7 @@ tok_new(void) tok->interactive_underflow = IUNDERFLOW_NORMAL; tok->str = NULL; tok->report_warnings = 1; - tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0}; + tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0}; tok->tok_mode_stack_index = 0; tok->tok_report_warnings = 1; #ifdef Py_DEBUG @@ -394,11 +394,12 @@ static int set_fstring_expr(struct tok_state* tok, struct token *token, char c) { assert(token != NULL); assert(c == '}' || c == ':' || c == '!'); - if (token->metadata) { + tokenizer_mode *tok_mode = TOK_GET_MODE(tok); + + if (!tok_mode->f_string_debug || token->metadata) { return 0; } - tokenizer_mode *tok_mode = TOK_GET_MODE(tok); PyObject *res = PyUnicode_DecodeUTF8( tok_mode->last_expr_buffer, tok_mode->last_expr_size - tok_mode->last_expr_end, @@ -2237,7 +2238,6 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok); the_current_tok->kind = TOK_FSTRING_MODE; the_current_tok->f_string_quote = quote; - the_current_tok->f_string_quote_size = quote_size; the_current_tok->f_string_start = tok->start; the_current_tok->f_string_multi_line_start = tok->line_start; the_current_tok->f_string_start_offset = -1; @@ -2245,6 +2245,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t the_current_tok->last_expr_buffer = NULL; the_current_tok->last_expr_size = 0; the_current_tok->last_expr_end = -1; + the_current_tok->f_string_debug = 0; switch (*tok->start) { case 'F': @@ -2468,6 +2469,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) { current_tok->curly_bracket_expr_start_depth--; current_tok->kind = TOK_FSTRING_MODE; + current_tok->f_string_debug = 0; } } break; @@ -2481,6 +2483,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%s", hex)); } + if( c == '=' && INSIDE_FSTRING_EXPR(current_tok)) { + current_tok->f_string_debug = 1; + } + /* Punctuation character */ p_start = tok->start; p_end = tok->cur; diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index 97b54dc2a625ef..8b4213c4ce3b5a 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -59,6 +59,7 @@ typedef struct _tokenizer_mode { Py_ssize_t last_expr_size; Py_ssize_t last_expr_end; char* last_expr_buffer; + int f_string_debug; } tokenizer_mode; /* Tokenizer state */ From 87805024cd787205dc8d7276f6e2e36e33cdfa3d Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 26 Apr 2023 19:04:01 -0600 Subject: [PATCH 8/8] Add quote_size back in --- Parser/tokenizer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 6b5478b3a94ab4..8de0572a1fc459 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -2238,6 +2238,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok); the_current_tok->kind = TOK_FSTRING_MODE; the_current_tok->f_string_quote = quote; + the_current_tok->f_string_quote_size = quote_size; the_current_tok->f_string_start = tok->start; the_current_tok->f_string_multi_line_start = tok->line_start; the_current_tok->f_string_start_offset = -1;