Fix character literal forced encoding

kddnewton · kddnewton · commit 125c375d74b8 · 2025-09-15T10:18:46.000-04:00
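If a character literal was followed by a string concatenation, then
the forced encoding of the string concatenation could accidentally
overwrite the explicit encoding of the character literal. We now
create the character literal's string node and set its encoding flags
before lexing past the literal, so parser->explicit_encoding is read
while it still describes the character literal.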
diff --git a/snapshots/character_literal.txt b/snapshots/character_literal.txt
@@ -0,0 +1,37 @@
+@ ProgramNode (location: (2,0)-(2,11))
+├── flags: ∅
+├── locals: []
+└── statements:
+    @ StatementsNode (location: (2,0)-(2,11))
+    ├── flags: ∅
+    └── body: (length: 1)
+        └── @ CallNode (location: (2,0)-(2,11))
+            ├── flags: newline, ignore_visibility
+            ├── receiver: ∅
+            ├── call_operator_loc: ∅
+            ├── name: :p
+            ├── message_loc: (2,0)-(2,1) = "p"
+            ├── opening_loc: ∅
+            ├── arguments:
+            │   @ ArgumentsNode (location: (2,2)-(2,11))
+            │   ├── flags: ∅
+            │   └── arguments: (length: 1)
+            │       └── @ InterpolatedStringNode (location: (2,2)-(2,11))
+            │           ├── flags: static_literal
+            │           ├── opening_loc: ∅
+            │           ├── parts: (length: 2)
+            │           │   ├── @ StringNode (location: (2,2)-(2,9))
+            │           │   │   ├── flags: static_literal, forced_utf8_encoding, frozen
+            │           │   │   ├── opening_loc: (2,2)-(2,3) = "?"
+            │           │   │   ├── content_loc: (2,3)-(2,9) = "\\u3042"
+            │           │   │   ├── closing_loc: ∅
+            │           │   │   └── unescaped: "\x{E381}\x82"
+            │           │   └── @ StringNode (location: (2,9)-(2,11))
+            │           │       ├── flags: static_literal, frozen
+            │           │       ├── opening_loc: (2,9)-(2,10) = "\""
+            │           │       ├── content_loc: (2,10)-(2,10) = ""
+            │           │       ├── closing_loc: (2,10)-(2,11) = "\""
+            │           │       └── unescaped: ""
+            │           └── closing_loc: ∅
+            ├── closing_loc: ∅
+            └── block: ∅
diff --git a/src/prism.c b/src/prism.c
@@ -18491,20 +18491,28 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
             return (pm_node_t *) node;
         }
         case PM_TOKEN_CHARACTER_LITERAL: {
-            parser_lex(parser);
-
-            pm_token_t opening = parser->previous;
-            opening.type = PM_TOKEN_STRING_BEGIN;
-            opening.end = opening.start + 1;
-
-            pm_token_t content = parser->previous;
-            content.type = PM_TOKEN_STRING_CONTENT;
-            content.start = content.start + 1;
-
             pm_token_t closing = not_provided(parser);
-            pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &content, &closing);
+            pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(
+                parser,
+                &(pm_token_t) {
+                    .type = PM_TOKEN_STRING_BEGIN,
+                    .start = parser->current.start,
+                    .end = parser->current.start + 1
+                },
+                &(pm_token_t) {
+                    .type = PM_TOKEN_STRING_CONTENT,
+                    .start = parser->current.start + 1,
+                    .end = parser->current.end
+                },
+                &closing
+            );
+
             pm_node_flag_set(node, parse_unescaped_encoding(parser));
 
+            // Skip past the character literal here, since now we have handled
+            // parser->explicit_encoding correctly.
+            parser_lex(parser);
+
             // Characters can be followed by strings in which case they are
             // automatically concatenated.
             if (match1(parser, PM_TOKEN_STRING_BEGIN)) {
diff --git a/test/prism/fixtures/character_literal.txt b/test/prism/fixtures/character_literal.txt
@@ -0,0 +1,2 @@
+# encoding: Windows-31J
+p ?\u3042""
diff --git a/test/prism/ruby/ruby_parser_test.rb b/test/prism/ruby/ruby_parser_test.rb
@@ -16,6 +16,7 @@
 module Prism
   class RubyParserTest < TestCase
     todos = [
+      "character_literal.txt",
       "encoding_euc_jp.txt",
       "regex_char_width.txt",
       "seattlerb/masgn_colon3.txt",

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# encoding: Windows-31J`
	`2`	`+p ?\u3042""`