Skip to content

Commit 3d261d3

Browse files
committed
icu: add icu module
* Moves transcode and normalize to buffer module statics * Moves getCharacterProperty and getColumnWidth to util
1 parent 6b443d1 commit 3d261d3

File tree

13 files changed

+1048
-99
lines changed

13 files changed

+1048
-99
lines changed

doc/api/buffer.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2304,6 +2304,27 @@ added: v3.0.0
23042304
On 32-bit architectures, this value is `(2^30)-1` (~1GB).
23052305
On 64-bit architectures, this value is `(2^31)-1` (~2GB).
23062306

2307+
## buffer.normalize(buf, form[, encoding])
2308+
2309+
* `buf` {Buffer} A `Buffer` instance
2310+
* `form` {String} A Unicode normalization form (one of: `'NFC'`, `'NFD'`,
2311+
`'NFKC'`, or `'NFKD'`)
2312+
* `encoding` {String} The source character encoding of the `buf`. Defaults to
2313+
`'utf8'`
2314+
2315+
Performs Unicode Normalization on `buf` and returns a new `Buffer` instance
2316+
containing the UTF-8 encoded results. Throws if the `form` does not specify a
2317+
valid Normalization form or if the normalization cannot be successfully applied.
2318+
2319+
## buffer.transcode(buf, from_enc, to_enc)
2320+
2321+
* `buf` {Buffer} A `Buffer` instance
2322+
* `from_enc` {string} The current encoding
2323+
* `to_enc` {string} The target encoding
2324+
2325+
Re-encodes the given `Buffer` from one character encoding to another. Returns
2326+
a new `Buffer` instance.
2327+
23072328
## Class: SlowBuffer
23082329
<!-- YAML
23092330
deprecated: v6.0.0

doc/api/util.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ module developers as well. It can be accessed using:
1010
const util = require('util');
1111
```
1212

13+
## util.constants
14+
15+
Constants for use with `util.getCharacterProperty()`.
16+
1317
## util.debuglog(section)
1418
<!-- YAML
1519
added: v0.11.3
@@ -133,6 +137,20 @@ Each argument is converted to a string using `util.inspect()`.
133137
util.format(1, 2, 3); // '1 2 3'
134138
```
135139

140+
## util.getCharacterProperty(codepoint, property)
141+
142+
* `codepoint` {number} A Unicode codepoint value
143+
* `property` {number} A Unicode character property constant (from `util.constants.*`)
144+
145+
Returns a specific Unicode codepoint property for the given codepoint value.
146+
147+
## util.getColumnWidth(cp)
148+
149+
* `cp` {number | String} A Unicode codepoint value or a String
150+
151+
Returns the number of terminal columns to be used to display the given Unicode
152+
codepoint or string.
153+
136154
## util.inherits(constructor, superConstructor)
137155
<!-- YAML
138156
added: v0.3.0

lib/buffer.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ const binding = process.binding('buffer');
55
const { isArrayBuffer, isSharedArrayBuffer } = process.binding('util');
66
const bindingObj = {};
77
const internalUtil = require('internal/util');
8+
const internalBuffer = require('internal/buffer');
89

910
class FastBuffer extends Uint8Array {
1011
constructor(arg1, arg2, arg3) {
@@ -19,6 +20,8 @@ exports.Buffer = Buffer;
1920
exports.SlowBuffer = SlowBuffer;
2021
exports.INSPECT_MAX_BYTES = 50;
2122
exports.kMaxLength = binding.kMaxLength;
23+
exports.transcode = internalBuffer.transcode;
24+
exports.normalize = internalBuffer.normalize;
2225

2326
const kFromErrorMsg = 'First argument must be a string, Buffer, ' +
2427
'ArrayBuffer, Array, or array-like object.';

lib/internal/buffer.js

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
'use strict';
2+
3+
const Buffer = require('buffer').Buffer;
4+
const normalizeEncoding = require('internal/util').normalizeEncoding;
5+
6+
if (process.binding('config').hasIntl) {
7+
8+
const icu = process.binding('icu');
9+
10+
// Maps the supported transcoding conversions. The top key is the from_enc,
11+
// the child key is the to_enc. The value is the transcoding function to use.
12+
const conversions = {
13+
'ascii': {
14+
'latin1': (source) => {
15+
return Buffer.from(source);
16+
},
17+
'utf8': (source) => {
18+
return Buffer.from(source);
19+
},
20+
'utf16le': (source) => {
21+
return icu.convertToUcs2('us-ascii', source);
22+
}
23+
},
24+
'latin1': {
25+
'ascii': (source) => {
26+
return icu.convert('us-ascii', 'iso8859-1', source);
27+
},
28+
'utf8': (source) => {
29+
return icu.convert('utf-8', 'iso8859-1', source);
30+
},
31+
'utf16le': (source) => {
32+
return icu.convertToUcs2('iso8859-1', source);
33+
}
34+
},
35+
'utf8': {
36+
'ascii': (source) => {
37+
return icu.convert('us-ascii', 'utf-8', source);
38+
},
39+
'latin1': (source) => {
40+
return icu.convert('iso-8859-1', 'utf-8', source);
41+
},
42+
'utf16le': icu.convertToUcs2FromUtf8,
43+
},
44+
'utf16le': {
45+
'ascii': (source) => {
46+
if (source.length % 2 !== 0)
47+
throw new TypeError('Invalid UCS2 Buffer');
48+
return icu.convertFromUcs2('us-ascii', source);
49+
},
50+
'latin1': (source) => {
51+
if (source.length % 2 !== 0)
52+
throw new TypeError('Invalid UCS2 Buffer');
53+
return icu.convertFromUcs2('iso-8859-1', source);
54+
},
55+
'utf8': (source) => {
56+
if (source.length % 2 !== 0)
57+
throw new TypeError('Invalid UCS2 Buffer');
58+
return icu.convertToUtf8FromUcs2(source);
59+
}
60+
}
61+
};
62+
63+
// Transcodes the Buffer from one encoding to another, returning a new
64+
// Buffer instance.
65+
exports.transcode = function transcode(source, from_enc, to_enc) {
66+
if (!source || !(source.buffer instanceof ArrayBuffer))
67+
throw new TypeError('"source" argument must be a Buffer');
68+
if (source.length === 0) return Buffer.alloc(0);
69+
70+
from_enc = normalizeEncoding(from_enc) || from_enc;
71+
to_enc = normalizeEncoding(to_enc) || to_enc;
72+
73+
if (from_enc === to_enc)
74+
return Buffer.from(source);
75+
76+
const cnv_from = conversions[from_enc];
77+
78+
if (cnv_from) {
79+
const cnv_to = cnv_from[to_enc];
80+
if (cnv_to)
81+
return cnv_to(source);
82+
}
83+
throw new TypeError(`Unsupported conversion: ${from_enc} to ${to_enc}`);
84+
};
85+
86+
// Perform Unicode Normalization on the Buffer.
87+
exports.normalize = function normalize(buf, form, encoding) {
88+
if (!buf || !(buf.buffer instanceof ArrayBuffer))
89+
throw new TypeError('First argument must be a Buffer');
90+
encoding = normalizeEncoding(encoding);
91+
if (encoding === 'ascii')
92+
encoding == 'us-ascii';
93+
return icu.normalize(buf, encoding, String(form).toUpperCase());
94+
};
95+
96+
}

lib/internal/readline.js

Lines changed: 99 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -2,102 +2,128 @@
22

33
// Regexes used for ansi escape code splitting
44
// eslint-disable-next-line no-control-regex
5-
const metaKeyCodeReAnywhere = /(?:\x1b)([a-zA-Z0-9])/;
6-
const functionKeyCodeReAnywhere = new RegExp('(?:\x1b+)(O|N|\\[|\\[\\[)(?:' + [
7-
'(\\d+)(?:;(\\d+))?([~^$])',
8-
'(?:M([@ #!a`])(.)(.))', // mouse
9-
'(?:1;)?(\\d+)?([a-zA-Z])'
10-
].join('|') + ')');
5+
// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js
6+
// License: MIT, authors: @sindresorhus, Qix-, and arjunmehta
7+
const ansi =
8+
/[\u001b\u009b][[()#;?]*(?:[0-9]{1,4}(?:;[0-9]{0,4})*)?[0-9A-ORZcf-nqry=><]/g;
119

12-
13-
module.exports = {
10+
exports = module.exports = {
1411
emitKeys,
15-
getStringWidth,
16-
isFullWidthCodePoint,
1712
stripVTControlCharacters
1813
};
1914

2015

21-
/**
22-
* Returns the number of columns required to display the given string.
23-
*/
24-
function getStringWidth(str) {
25-
let width = 0;
26-
27-
str = stripVTControlCharacters(str);
28-
29-
for (let i = 0; i < str.length; i++) {
30-
const code = str.codePointAt(i);
16+
if (process.binding('config').hasIntl) {
17+
const util = require('util');
18+
exports.getStringWidth = function getStringWidth(str) {
19+
return util.getColumnWidth(stripVTControlCharacters(str));
20+
};
21+
22+
exports.isFullWidthCodePoint = function isFullWidthCodePoint(code) {
23+
// Defined here largely for legacy support reasons. Updated to
24+
// use character properties rather than fixed ranges.
25+
const eaw =
26+
util.getCharacterProperty(code,
27+
util.constants.UCHAR_EAST_ASIAN_WIDTH);
28+
const emoji =
29+
util.getCharacterProperty(code,
30+
util.constants.UCHAR_EMOJI_PRESENTATION) &&
31+
!util.getCharacterProperty(code,
32+
util.constants.UCHAR_EMOJI_MODIFIER);
33+
return eaw === util.constants.U_EA_FULLWIDTH ||
34+
eaw === util.constants.U_EA_WIDE ||
35+
emoji;
36+
};
37+
38+
} else {
39+
// These old implementations are used as fallbacks only when Node.js
40+
// is compiled without ICU. The getStringWidth implementation here is
41+
// about 30% slower than the ICU based implementation and does not
42+
// work properly for emoji and newer unicode characters. The new impl
43+
// uses ICU's built in character properties data to provide more accurate
44+
// results.
45+
/**
46+
* Returns the number of columns required to display the given string.
47+
*/
48+
function getStringWidth(str) {
49+
let width = 0;
50+
51+
str = stripVTControlCharacters(str);
52+
53+
for (let i = 0; i < str.length; i++) {
54+
const code = str.codePointAt(i);
55+
56+
if (code >= 0x10000) { // surrogates
57+
i++;
58+
}
3159

32-
if (code >= 0x10000) { // surrogates
33-
i++;
60+
if (isFullWidthCodePoint(code)) {
61+
width += 2;
62+
} else {
63+
width++;
64+
}
3465
}
3566

36-
if (isFullWidthCodePoint(code)) {
37-
width += 2;
38-
} else {
39-
width++;
40-
}
67+
return width;
4168
}
4269

43-
return width;
44-
}
4570

71+
/**
72+
* Returns true if the character represented by a given
73+
* Unicode code point is full-width. Otherwise returns false.
74+
*/
75+
function isFullWidthCodePoint(code) {
76+
if (isNaN(code)) {
77+
return false;
78+
}
4679

47-
/**
48-
* Returns true if the character represented by a given
49-
* Unicode code point is full-width. Otherwise returns false.
50-
*/
51-
function isFullWidthCodePoint(code) {
52-
if (isNaN(code)) {
53-
return false;
54-
}
80+
// Code points are derived from:
81+
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
82+
if (code >= 0x1100 && (
83+
code <= 0x115f || // Hangul Jamo
84+
0x2329 === code || // LEFT-POINTING ANGLE BRACKET
85+
0x232a === code || // RIGHT-POINTING ANGLE BRACKET
86+
// CJK Radicals Supplement .. Enclosed CJK Letters and Months
87+
(0x2e80 <= code && code <= 0x3247 && code !== 0x303f) ||
88+
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
89+
0x3250 <= code && code <= 0x4dbf ||
90+
// CJK Unified Ideographs .. Yi Radicals
91+
0x4e00 <= code && code <= 0xa4c6 ||
92+
// Hangul Jamo Extended-A
93+
0xa960 <= code && code <= 0xa97c ||
94+
// Hangul Syllables
95+
0xac00 <= code && code <= 0xd7a3 ||
96+
// CJK Compatibility Ideographs
97+
0xf900 <= code && code <= 0xfaff ||
98+
// Vertical Forms
99+
0xfe10 <= code && code <= 0xfe19 ||
100+
// CJK Compatibility Forms .. Small Form Variants
101+
0xfe30 <= code && code <= 0xfe6b ||
102+
// Halfwidth and Fullwidth Forms
103+
0xff01 <= code && code <= 0xff60 ||
104+
0xffe0 <= code && code <= 0xffe6 ||
105+
// Kana Supplement
106+
0x1b000 <= code && code <= 0x1b001 ||
107+
// Enclosed Ideographic Supplement
108+
0x1f200 <= code && code <= 0x1f251 ||
109+
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
110+
0x20000 <= code && code <= 0x3fffd)) {
111+
return true;
112+
}
55113

56-
// Code points are derived from:
57-
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
58-
if (code >= 0x1100 && (
59-
code <= 0x115f || // Hangul Jamo
60-
0x2329 === code || // LEFT-POINTING ANGLE BRACKET
61-
0x232a === code || // RIGHT-POINTING ANGLE BRACKET
62-
// CJK Radicals Supplement .. Enclosed CJK Letters and Months
63-
(0x2e80 <= code && code <= 0x3247 && code !== 0x303f) ||
64-
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
65-
0x3250 <= code && code <= 0x4dbf ||
66-
// CJK Unified Ideographs .. Yi Radicals
67-
0x4e00 <= code && code <= 0xa4c6 ||
68-
// Hangul Jamo Extended-A
69-
0xa960 <= code && code <= 0xa97c ||
70-
// Hangul Syllables
71-
0xac00 <= code && code <= 0xd7a3 ||
72-
// CJK Compatibility Ideographs
73-
0xf900 <= code && code <= 0xfaff ||
74-
// Vertical Forms
75-
0xfe10 <= code && code <= 0xfe19 ||
76-
// CJK Compatibility Forms .. Small Form Variants
77-
0xfe30 <= code && code <= 0xfe6b ||
78-
// Halfwidth and Fullwidth Forms
79-
0xff01 <= code && code <= 0xff60 ||
80-
0xffe0 <= code && code <= 0xffe6 ||
81-
// Kana Supplement
82-
0x1b000 <= code && code <= 0x1b001 ||
83-
// Enclosed Ideographic Supplement
84-
0x1f200 <= code && code <= 0x1f251 ||
85-
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
86-
0x20000 <= code && code <= 0x3fffd)) {
87-
return true;
114+
return false;
88115
}
89116

90-
return false;
117+
exports.isFullWidthCodePoint = isFullWidthCodePoint;
118+
exports.getStringWidth = getStringWidth;
91119
}
92120

93-
94121
/**
95122
* Tries to remove all VT control characters. Use to estimate displayed
96123
* string width. May be buggy due to not running a real state machine
97124
*/
98125
function stripVTControlCharacters(str) {
99-
str = str.replace(new RegExp(functionKeyCodeReAnywhere.source, 'g'), '');
100-
return str.replace(new RegExp(metaKeyCodeReAnywhere.source, 'g'), '');
126+
return str.replace(ansi, '');
101127
}
102128

103129

lib/util.js

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1053,3 +1053,15 @@ exports._exceptionWithHostPort = function(err,
10531053
}
10541054
return ex;
10551055
};
1056+
1057+
if (process.binding('config').hasIntl) {
1058+
const icu = process.binding('icu');
1059+
const constants = process.binding('constants').icu;
1060+
Object.defineProperty(exports, 'constants', {
1061+
configurable: false,
1062+
enumerable: true,
1063+
value: constants
1064+
});
1065+
exports.getCharacterProperty = icu.getCharacterProperty;
1066+
exports.getColumnWidth = icu.getColumnWidth;
1067+
}

node.gyp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@
7474
'lib/v8.js',
7575
'lib/vm.js',
7676
'lib/zlib.js',
77+
'lib/internal/buffer.js',
7778
'lib/internal/child_process.js',
7879
'lib/internal/cluster.js',
7980
'lib/internal/freelist.js',

0 commit comments

Comments
 (0)