Skip to content

Commit 125c375

Browse files
committed
Fix character literal forced encoding
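If a character literal was followed by a string concatenation, then the forced encoding of the string concatenation could accidentally overwrite the explicit encoding of the character literal. We now handle this properly: the character literal's node is created and its encoding flag is set before the parser lexes past the literal, so the following string can no longer clobber it.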
1 parent 25c7cfd commit 125c375

File tree

4 files changed

+59
-11
lines changed

4 files changed

+59
-11
lines changed

snapshots/character_literal.txt

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
@ ProgramNode (location: (2,0)-(2,11))
2+
├── flags: ∅
3+
├── locals: []
4+
└── statements:
5+
@ StatementsNode (location: (2,0)-(2,11))
6+
├── flags: ∅
7+
└── body: (length: 1)
8+
└── @ CallNode (location: (2,0)-(2,11))
9+
├── flags: newline, ignore_visibility
10+
├── receiver: ∅
11+
├── call_operator_loc: ∅
12+
├── name: :p
13+
├── message_loc: (2,0)-(2,1) = "p"
14+
├── opening_loc: ∅
15+
├── arguments:
16+
│ @ ArgumentsNode (location: (2,2)-(2,11))
17+
│ ├── flags: ∅
18+
│ └── arguments: (length: 1)
19+
│ └── @ InterpolatedStringNode (location: (2,2)-(2,11))
20+
│ ├── flags: static_literal
21+
│ ├── opening_loc: ∅
22+
│ ├── parts: (length: 2)
23+
│ │ ├── @ StringNode (location: (2,2)-(2,9))
24+
│ │ │ ├── flags: static_literal, forced_utf8_encoding, frozen
25+
│ │ │ ├── opening_loc: (2,2)-(2,3) = "?"
26+
│ │ │ ├── content_loc: (2,3)-(2,9) = "\\u3042"
27+
│ │ │ ├── closing_loc: ∅
28+
│ │ │ └── unescaped: "\x{E381}\x82"
29+
│ │ └── @ StringNode (location: (2,9)-(2,11))
30+
│ │ ├── flags: static_literal, frozen
31+
│ │ ├── opening_loc: (2,9)-(2,10) = "\""
32+
│ │ ├── content_loc: (2,10)-(2,10) = ""
33+
│ │ ├── closing_loc: (2,10)-(2,11) = "\""
34+
│ │ └── unescaped: ""
35+
│ └── closing_loc: ∅
36+
├── closing_loc: ∅
37+
└── block: ∅

src/prism.c

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18491,20 +18491,28 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
1849118491
return (pm_node_t *) node;
1849218492
}
1849318493
case PM_TOKEN_CHARACTER_LITERAL: {
18494-
parser_lex(parser);
18495-
18496-
pm_token_t opening = parser->previous;
18497-
opening.type = PM_TOKEN_STRING_BEGIN;
18498-
opening.end = opening.start + 1;
18499-
18500-
pm_token_t content = parser->previous;
18501-
content.type = PM_TOKEN_STRING_CONTENT;
18502-
content.start = content.start + 1;
18503-
1850418494
pm_token_t closing = not_provided(parser);
18505-
pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &content, &closing);
18495+
pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(
18496+
parser,
18497+
&(pm_token_t) {
18498+
.type = PM_TOKEN_STRING_BEGIN,
18499+
.start = parser->current.start,
18500+
.end = parser->current.start + 1
18501+
},
18502+
&(pm_token_t) {
18503+
.type = PM_TOKEN_STRING_CONTENT,
18504+
.start = parser->current.start + 1,
18505+
.end = parser->current.end
18506+
},
18507+
&closing
18508+
);
18509+
1850618510
pm_node_flag_set(node, parse_unescaped_encoding(parser));
1850718511

18512+
// Skip past the character literal here, since now we have handled
18513+
// parser->explicit_encoding correctly.
18514+
parser_lex(parser);
18515+
1850818516
// Characters can be followed by strings in which case they are
1850918517
// automatically concatenated.
1851018518
if (match1(parser, PM_TOKEN_STRING_BEGIN)) {
test/prism/fixtures/character_literal.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# encoding: Windows-31J
2+
p ?\u3042""

test/prism/ruby/ruby_parser_test.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
module Prism
1717
class RubyParserTest < TestCase
1818
todos = [
19+
"character_literal.txt",
1920
"encoding_euc_jp.txt",
2021
"regex_char_width.txt",
2122
"seattlerb/masgn_colon3.txt",

0 commit comments

Comments
 (0)