Skip to content
This repository was archived by the owner on Nov 5, 2024. It is now read-only.

Commit bd35eed

Browse files
committed
added bytes_to_utf8_lossy fn; updated README
1 parent 4b17930 commit bd35eed

File tree

7 files changed

+143
-25
lines changed

7 files changed

+143
-25
lines changed

README.md

Lines changed: 104 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,31 @@ Each node has its own JavaScript class, so
66
1. it's possible to distinguish them by checking `instanceof`
77
2. they can be extended in pure JavaScript
88

9-
Here's how it looks like:
9+
Basic usage:
1010

1111
```js
12+
const inspect = require('util').inspect
13+
const { parse, Send } = require('lib-ruby-parser');
14+
15+
function print_parser_result(parser_result) {
16+
console.log(inspect(parser_result, { showHidden: false, depth: null }))
17+
}
18+
19+
// This function must be defined by you.
20+
// It takes a string and returns an array of bytes.
21+
// The following code is just an example:
22+
function bytes(str) {
23+
const bytes = unescape(encodeURIComponent(str)).split('').map(c => c.charCodeAt(0))
24+
return new Uint8Array(bytes)
25+
}
26+
27+
const input = bytes("2 + 3 # x");
28+
const options = { record_tokens: true };
29+
const result = parse(input, options);
30+
print_parser_result(result);
31+
32+
// prints:
33+
1234
ParserResult {
1335
ast: Send {
1436
recv: Int {
@@ -34,37 +56,58 @@ ParserResult {
3456
tokens: [
3557
Token {
3658
name: 'tINTEGER',
37-
value: '2',
59+
value: Uint8Array(1) [ 50 ],
3860
loc: Loc { begin: 0, end: 1 }
3961
},
40-
Token { name: 'tPLUS', value: '+', loc: Loc { begin: 2, end: 3 } },
62+
Token {
63+
name: 'tPLUS',
64+
value: Uint8Array(1) [ 43 ],
65+
loc: Loc { begin: 2, end: 3 }
66+
},
4167
Token {
4268
name: 'tINTEGER',
43-
value: '3',
69+
value: Uint8Array(1) [ 51 ],
4470
loc: Loc { begin: 4, end: 5 }
4571
},
46-
Token { name: 'EOF', value: '', loc: Loc { begin: 5, end: 5 } }
72+
Token {
73+
name: 'tNL',
74+
value: Uint8Array(1) [ 10 ],
75+
loc: Loc { begin: 8, end: 9 }
76+
},
77+
Token {
78+
name: 'EOF',
79+
value: Uint8Array(0) [],
80+
loc: Loc { begin: 8, end: 8 }
81+
}
4782
],
4883
diagnostics: [],
49-
comments: [],
84+
comments: [
85+
Comment {
86+
kind: 'inline',
87+
location: Range { begin_pos: 6, end_pos: 9 }
88+
}
89+
],
5090
magic_comments: [],
51-
input: '2 + 3'
91+
input: Uint8Array(9) [
92+
50, 32, 43, 32, 51,
93+
32, 35, 32, 120
94+
]
5295
}
5396
```
5497

5598
## API
5699

57100
TL;DR: all classes mirror the Rust implementation.
58101

59-
TypeScript definition:
102+
Rough TypeScript definition:
60103

61104
```ts
62105
interface Loc { begin: number, end: number }
63106
interface Range { begin_pos: number, end_pos: number }
64107

65108
interface Token {
66109
name: string,
67-
value: string,
110+
value: Uint8Array,
68111
loc: Loc
69112
}
70113

@@ -93,3 +136,55 @@ type Node = Args | Class | ... /* other nodes */;
93136

94137
function parse(code: String): ParserResult
95138
```
139+
140+
`String` and `Symbol` nodes are slightly exceptional: they contain the Rust `StringValue` structure, which is represented as a `Uint8Array` here.
141+
142+
This structure can be converted into a JS `String` with the `bytes_to_utf8_lossy` function. Keep in mind that it replaces invalid UTF-8 byte sequences with the Unicode replacement character (`U+FFFD`); if you want a different strategy, you are free to define your own conversion function:
143+
144+
```js
145+
const { parse, bytes_to_utf8_lossy } = require(path_to_require)
146+
147+
const result = parse(bytes('"a\\xFFb"'), { record_tokens: true });
148+
console.log(result.ast)
149+
console.log(bytes_to_utf8_lossy(result.ast.value))
150+
151+
// prints
152+
153+
Str {
154+
value: Uint8Array(3) [ 97, 255, 98 ], // "a" = 97, "\xFF" = 255, "b" = 98
155+
begin_l: Range { begin_pos: 0, end_pos: 1 },
156+
end_l: Range { begin_pos: 7, end_pos: 8 },
157+
expression_l: Range { begin_pos: 0, end_pos: 8 }
158+
}
159+
160+
a�b
161+
```
162+
163+
## Encodings
164+
165+
If you want to support encodings other than UTF-8/ASCII-8BIT/BINARY, you need a custom decoder:
166+
167+
```js
168+
const custom_decoder = (encoding: String, input: Uint8Array) => {
169+
// Do some **real** decoding into UTF-8 here
170+
//
171+
// Here for simplicity we convert all "2" into "3"
172+
//
173+
assert(encoding === "US-ASCII");
174+
assert(input === bytes("# encoding: us-ascii\n2 + 2"));
175+
176+
return bytes("# encoding: us-ascii\n3 + 3");
177+
}
178+
179+
const result = parse(
180+
bytes("# encoding: us-ascii\n2 + 2"),
181+
{ custom_decoder }
182+
);
183+
assert(result.ast.recv.value === "3");
184+
```
185+
186+
## Platform support
187+
188+
Currently, the npm packages include pre-compiled `.node` files for macOS and Linux.
189+
190+
If you need Windows support, first go to the [C++ bindings](https://github.com/lib-ruby-parser/cpp-bindings) repository and create an issue there. This repo is just a wrapper around its builds.

build-convert/build.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,6 @@ fn main() {
115115
#define LIB_RUBY_PARSER_CONVERT_GEN_H
116116
117117
#include <napi.h>
118-
#include <iostream>
119118
#include \"lib-ruby-parser.h\"
120119
121120
template<class> inline constexpr bool always_false_v = false;

bytes.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,16 @@ namespace lib_ruby_parser_node
1111
class Bytes : public lib_ruby_parser::Bytes
1212
{
1313
public:
14-
explicit Bytes(lib_ruby_parser::Bytes bytes)
14+
explicit Bytes(lib_ruby_parser::Bytes bytes) : lib_ruby_parser::Bytes(bytes.into_ptr())
1515
{
16-
lib_ruby_parser::Bytes(bytes.into_ptr());
1716
}
1817

19-
Napi::Value to_v8_value(Napi::Env env) const
18+
Napi::Value ToV8(Napi::Env env) const
2019
{
21-
Napi::TypedArray array = Napi::Uint8Array::New(env, this->size());
20+
Napi::Uint8Array array = Napi::Uint8Array::New(env, this->size());
2221
for (size_t i = 0; i < this->size(); i++)
2322
{
24-
array.Set(i, this->at(i));
23+
array[i] = this->at(i);
2524
}
2625
return array;
2726
}

convert.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <napi.h>
55
#include "lib-ruby-parser.h"
66
#include "convert_gen.h"
7+
#include "bytes.h"
78

89
namespace lib_ruby_parser_node
910
{
@@ -111,7 +112,7 @@ namespace lib_ruby_parser_node
111112
{
112113
return TokenCtor.New({
113114
Napi::Value::From(env, token.name()),
114-
Napi::Value::From(env, token.token_value.to_string_lossy()),
115+
Napi::Value::From(env, Bytes(std::move(token.token_value)).ToV8(env)),
115116
convert(std::move(token.loc), env),
116117
});
117118
}
@@ -233,7 +234,7 @@ namespace lib_ruby_parser_node
233234
convert(std::move(result->diagnostics), env),
234235
convert(std::move(result->comments), env),
235236
convert(std::move(result->magic_comments), env),
236-
Napi::String::New(env, result->input.to_string_lossy()),
237+
Bytes(std::move(result->input)).ToV8(env),
237238
});
238239
}
239240

custom_decoder.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
#include <napi.h>
55
#include "lib-ruby-parser.h"
66
#include "bytes.h"
7-
#include <iostream>
87

98
namespace lib_ruby_parser_node
109
{

node_bindings.cc

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
#include "custom_decoder.h"
55
#include "bytes.h"
66
#include "result.h"
7-
#include <iostream>
87
#include <variant>
98
#include <tuple>
109

@@ -69,8 +68,6 @@ namespace lib_ruby_parser_node
6968

7069
Result<std::unique_ptr<lib_ruby_parser::ParserResult>> parse(const Napi::CallbackInfo &info)
7170
{
72-
Napi::Env env = info.Env();
73-
7471
if (info.Length() != 2)
7572
{
7673
return "Wrong number of arguments (expected 2)";
@@ -102,11 +99,39 @@ namespace lib_ruby_parser_node
10299
return convert(result.get(), env);
103100
}
104101

102+
Result<lib_ruby_parser::Bytes> bytes_to_utf8_lossy(const Napi::CallbackInfo &info)
103+
{
104+
if (info.Length() != 1)
105+
{
106+
return "Wrong number of arguments (expected 1)";
107+
}
108+
109+
UNWRAP_RESULT(bytes, Bytes::FromV8(info[0]));
110+
111+
return std::move(bytes);
112+
}
113+
114+
Napi::Value js_bytes_to_utf8_lossy(const Napi::CallbackInfo &info)
115+
{
116+
auto env = info.Env();
117+
118+
auto result = bytes_to_utf8_lossy(info);
119+
if (result.is_err())
120+
{
121+
return JsThrow(env, result.get_err());
122+
}
123+
auto bytes = result.get();
124+
return Napi::String::New(info.Env(), bytes.to_string_lossy());
125+
}
126+
105127
Napi::Object Init(Napi::Env env, Napi::Object exports)
106128
{
107129
exports.Set(Napi::String::New(env, "parse"),
108130
Napi::Function::New(env, js_parse));
109131

132+
exports.Set(Napi::String::New(env, "bytes_to_utf8_lossy"),
133+
Napi::Function::New(env, js_bytes_to_utf8_lossy));
134+
110135
InitCustomTypes(env, exports);
111136
InitNodeTypes(env, exports);
112137

test.js

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ if (!path_to_require) {
88
}
99

1010
console.log(`requiring ${path_to_require}`)
11-
const { parse, Range, Send, Self_, Int, Token, Loc, Diagnostic } = require(path_to_require)
11+
const { parse, Range, Send, Self_, Int, Token, Loc, Diagnostic, bytes_to_utf8_lossy } = require(path_to_require)
1212
const assert = require('assert').strict
1313
const inspect = require('util').inspect
1414

@@ -47,13 +47,13 @@ function assert_range(range, begin_pos, end_pos, prefix) {
4747
function assert_token(token, name, value, loc, prefix) {
4848
assert(token instanceof Token, `[${prefix}] expected ${token} to be an instance of Token`)
4949
assert_eq(token.name, name, `[${prefix}].name`)
50-
assert_eq(token.value, value, `[${prefix}].value`)
50+
assert_eq(bytes_to_utf8_lossy(token.value), value, `[${prefix}].value`)
5151
assert_eq(token.loc.begin, loc.begin, `[${prefix}].loc.begin`)
5252
assert_eq(token.loc.end, loc.end, `[${prefix}].loc.end`)
5353
}
5454

55-
function print_parse_result(parse_result) {
56-
console.log(inspect(result, { showHidden: false, depth: null }))
55+
function print_parser_result(parser_result) {
56+
console.log(inspect(parser_result, { showHidden: false, depth: null }))
5757
}
5858

5959
function bytes(str) {

0 commit comments

Comments
 (0)