Skip to content

Commit 72547fe

Browse files
committed
readline: use icu based string width calculation
Rather than the pseudo-wcwidth impl used currently, use the ICU character properties database to calculate string width and determine if a character is full width or not. This allows the algorithm to correctly identify emoji as full width, ensures the algorithm will continue to function properly as new Unicode codepoints are added, and it's faster. This was originally part of a proposal to add a new unicode module, but has been split out. Refs: #8075 PR-URL: #9040 Reviewed-By: Ben Noordhuis <[email protected]> Reviewed-By: Steven R Loomis <[email protected]>
1 parent 52670fc commit 72547fe

File tree

4 files changed

+228
-73
lines changed

4 files changed

+228
-73
lines changed

lib/internal/readline.js

+87-73
Original file line numberDiff line numberDiff line change
@@ -1,103 +1,117 @@
11
'use strict';
22

3-
// Regexes used for ansi escape code splitting
3+
// Regex used for ansi escape code splitting
44
// eslint-disable-next-line no-control-regex
5-
const metaKeyCodeReAnywhere = /(?:\x1b)([a-zA-Z0-9])/;
6-
const functionKeyCodeReAnywhere = new RegExp('(?:\x1b+)(O|N|\\[|\\[\\[)(?:' + [
7-
'(\\d+)(?:;(\\d+))?([~^$])',
8-
'(?:M([@ #!a`])(.)(.))', // mouse
9-
'(?:1;)?(\\d+)?([a-zA-Z])'
10-
].join('|') + ')');
5+
// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js
6+
// License: MIT, authors: @sindresorhus, Qix-, and arjunmehta
7+
// Matches all ansi escape code sequences in a string
8+
const ansi =
9+
/[\u001b\u009b][[()#;?]*(?:[0-9]{1,4}(?:;[0-9]{0,4})*)?[0-9A-ORZcf-nqry=><]/g;
1110

1211

1312
module.exports = {
1413
emitKeys,
15-
getStringWidth,
16-
isFullWidthCodePoint,
1714
stripVTControlCharacters
1815
};
1916

17+
if (process.binding('config').hasIntl) {
18+
const icu = process.binding('icu');
19+
module.exports.getStringWidth = function getStringWidth(str, options) {
20+
options = options || {};
21+
if (!Number.isInteger(str))
22+
str = stripVTControlCharacters(String(str));
23+
return icu.getStringWidth(str,
24+
Boolean(options.ambiguousAsFullWidth),
25+
Boolean(options.expandEmojiSequence));
26+
};
27+
module.exports.isFullWidthCodePoint =
28+
function isFullWidthCodePoint(code, options) {
29+
if (typeof code !== 'number')
30+
return false;
31+
return icu.getStringWidth(code, options) === 2;
32+
};
33+
} else {
34+
/**
35+
* Returns the number of columns required to display the given string.
36+
*/
37+
module.exports.getStringWidth = function getStringWidth(str) {
38+
if (Number.isInteger(str))
39+
return module.exports.isFullWidthCodePoint(str) ? 2 : 1;
2040

21-
/**
22-
* Returns the number of columns required to display the given string.
23-
*/
24-
function getStringWidth(str) {
25-
let width = 0;
41+
let width = 0;
2642

27-
str = stripVTControlCharacters(str);
43+
str = stripVTControlCharacters(String(str));
2844

29-
for (var i = 0; i < str.length; i++) {
30-
const code = str.codePointAt(i);
45+
for (var i = 0; i < str.length; i++) {
46+
const code = str.codePointAt(i);
3147

32-
if (code >= 0x10000) { // surrogates
33-
i++;
34-
}
48+
if (code >= 0x10000) { // surrogates
49+
i++;
50+
}
3551

36-
if (isFullWidthCodePoint(code)) {
37-
width += 2;
38-
} else {
39-
width++;
52+
if (module.exports.isFullWidthCodePoint(code)) {
53+
width += 2;
54+
} else {
55+
width++;
56+
}
4057
}
41-
}
42-
43-
return width;
44-
}
4558

59+
return width;
60+
};
4661

47-
/**
48-
* Returns true if the character represented by a given
49-
* Unicode code point is full-width. Otherwise returns false.
50-
*/
51-
function isFullWidthCodePoint(code) {
52-
if (isNaN(code)) {
53-
return false;
54-
}
62+
/**
63+
* Returns true if the character represented by a given
64+
* Unicode code point is full-width. Otherwise returns false.
65+
*/
66+
module.exports.isFullWidthCodePoint = function isFullWidthCodePoint(code) {
67+
if (!Number.isInteger(code)) {
68+
return false;
69+
}
5570

56-
// Code points are derived from:
57-
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
58-
if (code >= 0x1100 && (
59-
code <= 0x115f || // Hangul Jamo
60-
0x2329 === code || // LEFT-POINTING ANGLE BRACKET
61-
0x232a === code || // RIGHT-POINTING ANGLE BRACKET
62-
// CJK Radicals Supplement .. Enclosed CJK Letters and Months
63-
(0x2e80 <= code && code <= 0x3247 && code !== 0x303f) ||
64-
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
65-
0x3250 <= code && code <= 0x4dbf ||
66-
// CJK Unified Ideographs .. Yi Radicals
67-
0x4e00 <= code && code <= 0xa4c6 ||
68-
// Hangul Jamo Extended-A
69-
0xa960 <= code && code <= 0xa97c ||
70-
// Hangul Syllables
71-
0xac00 <= code && code <= 0xd7a3 ||
72-
// CJK Compatibility Ideographs
73-
0xf900 <= code && code <= 0xfaff ||
74-
// Vertical Forms
75-
0xfe10 <= code && code <= 0xfe19 ||
76-
// CJK Compatibility Forms .. Small Form Variants
77-
0xfe30 <= code && code <= 0xfe6b ||
78-
// Halfwidth and Fullwidth Forms
79-
0xff01 <= code && code <= 0xff60 ||
80-
0xffe0 <= code && code <= 0xffe6 ||
81-
// Kana Supplement
82-
0x1b000 <= code && code <= 0x1b001 ||
83-
// Enclosed Ideographic Supplement
84-
0x1f200 <= code && code <= 0x1f251 ||
85-
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
86-
0x20000 <= code && code <= 0x3fffd)) {
87-
return true;
88-
}
71+
// Code points are derived from:
72+
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
73+
if (code >= 0x1100 && (
74+
code <= 0x115f || // Hangul Jamo
75+
0x2329 === code || // LEFT-POINTING ANGLE BRACKET
76+
0x232a === code || // RIGHT-POINTING ANGLE BRACKET
77+
// CJK Radicals Supplement .. Enclosed CJK Letters and Months
78+
(0x2e80 <= code && code <= 0x3247 && code !== 0x303f) ||
79+
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
80+
0x3250 <= code && code <= 0x4dbf ||
81+
// CJK Unified Ideographs .. Yi Radicals
82+
0x4e00 <= code && code <= 0xa4c6 ||
83+
// Hangul Jamo Extended-A
84+
0xa960 <= code && code <= 0xa97c ||
85+
// Hangul Syllables
86+
0xac00 <= code && code <= 0xd7a3 ||
87+
// CJK Compatibility Ideographs
88+
0xf900 <= code && code <= 0xfaff ||
89+
// Vertical Forms
90+
0xfe10 <= code && code <= 0xfe19 ||
91+
// CJK Compatibility Forms .. Small Form Variants
92+
0xfe30 <= code && code <= 0xfe6b ||
93+
// Halfwidth and Fullwidth Forms
94+
0xff01 <= code && code <= 0xff60 ||
95+
0xffe0 <= code && code <= 0xffe6 ||
96+
// Kana Supplement
97+
0x1b000 <= code && code <= 0x1b001 ||
98+
// Enclosed Ideographic Supplement
99+
0x1f200 <= code && code <= 0x1f251 ||
100+
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
101+
0x20000 <= code && code <= 0x3fffd)) {
102+
return true;
103+
}
89104

90-
return false;
105+
return false;
106+
};
91107
}
92108

93-
94109
/**
95110
* Tries to remove all VT control characters. Use to estimate displayed
96111
* string width. May be buggy due to not running a real state machine
97112
*/
98113
function stripVTControlCharacters(str) {
99-
str = str.replace(new RegExp(functionKeyCodeReAnywhere.source, 'g'), '');
100-
return str.replace(new RegExp(metaKeyCodeReAnywhere.source, 'g'), '');
114+
return str.replace(ansi, '');
101115
}
102116

103117

lib/readline.js

+8
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,14 @@ function Interface(input, output, completer, terminal) {
124124

125125
function onkeypress(s, key) {
126126
self._ttyWrite(s, key);
127+
if (key && key.sequence) {
128+
// if the key.sequence is half of a surrogate pair
129+
// (>= 0xd800 and <= 0xdfff), refresh the line so
130+
// the character is displayed appropriately.
131+
const ch = key.sequence.codePointAt(0);
132+
if (ch >= 0xd800 && ch <= 0xdfff)
133+
self._refreshLine();
134+
}
127135
}
128136

129137
function onresize() {

src/node_i18n.cc

+90
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include "v8.h"
3232

3333
#include <unicode/putil.h>
34+
#include <unicode/uchar.h>
3435
#include <unicode/udata.h>
3536
#include <unicode/uidna.h>
3637

@@ -185,13 +186,102 @@ static void ToASCII(const FunctionCallbackInfo<Value>& args) {
185186
len).ToLocalChecked());
186187
}
187188

189+
// This is similar to wcwidth except that it takes the current unicode
190+
// character properties database into consideration, allowing it to
191+
// correctly calculate the column widths of things like emoji and
192+
// newer wide characters. wcwidth, on the other hand, uses a fixed
193+
// algorithm that does not take things like emoji into proper
194+
// consideration.
195+
static int GetColumnWidth(UChar32 codepoint,
196+
bool ambiguous_as_full_width = false) {
197+
if (!u_isdefined(codepoint) ||
198+
u_iscntrl(codepoint) ||
199+
u_getCombiningClass(codepoint) > 0 ||
200+
u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER)) {
201+
return 0;
202+
}
203+
// UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a
204+
// codepoint as being full width, wide, ambiguous, neutral, narrow,
205+
// or halfwidth.
206+
const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH);
207+
switch (eaw) {
208+
case U_EA_FULLWIDTH:
209+
case U_EA_WIDE:
210+
return 2;
211+
case U_EA_AMBIGUOUS:
212+
// See: http://www.unicode.org/reports/tr11/#Ambiguous for details
213+
if (ambiguous_as_full_width) {
214+
return 2;
215+
}
216+
// Fall through if ambiguous_as_full_width is false.
217+
case U_EA_NEUTRAL:
218+
if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) {
219+
return 2;
220+
}
221+
// Fall through
222+
case U_EA_HALFWIDTH:
223+
case U_EA_NARROW:
224+
default:
225+
return 1;
226+
}
227+
}
228+
229+
// Returns the column width for the given String.
230+
static void GetStringWidth(const FunctionCallbackInfo<Value>& args) {
231+
Environment* env = Environment::GetCurrent(args);
232+
if (args.Length() < 1)
233+
return;
234+
235+
bool ambiguous_as_full_width = args[1]->BooleanValue();
236+
bool expand_emoji_sequence = args[2]->BooleanValue();
237+
238+
if (args[0]->IsNumber()) {
239+
args.GetReturnValue().Set(
240+
GetColumnWidth(args[0]->Uint32Value(),
241+
ambiguous_as_full_width));
242+
return;
243+
}
244+
245+
TwoByteValue value(env->isolate(), args[0]);
246+
// reinterpret_cast is required by windows to compile
247+
UChar* str = reinterpret_cast<UChar*>(*value);
248+
UChar32 c;
249+
UChar32 p;
250+
size_t n = 0;
251+
uint32_t width = 0;
252+
253+
while (n < value.length()) {
254+
p = c;
255+
U16_NEXT(str, n, value.length(), c);
256+
// Don't count individual emoji codepoints that occur within an
257+
// emoji sequence. This is not necessarily foolproof. Some
258+
// environments display emoji sequences in the appropriate
259+
// condensed form (as a single emoji glyph), other environments
260+
// may not understand an emoji sequence and will display each
261+
// individual emoji separately. When this happens, the width
262+
// calculated will be off, and there's no reliable way of knowing
263+
// in advance if a particular sequence is going to be supported.
264+
// The expand_emoji_sequence option allows the caller to skip this
265+
// check and count each code within an emoji sequence separately.
266+
if (!expand_emoji_sequence &&
267+
n > 0 && p == 0x200d && // 0x200d == ZWJ (zero width joiner)
268+
(u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) ||
269+
u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) {
270+
continue;
271+
}
272+
width += GetColumnWidth(c, ambiguous_as_full_width);
273+
}
274+
args.GetReturnValue().Set(width);
275+
}
276+
188277
void Init(Local<Object> target,
189278
Local<Value> unused,
190279
Local<Context> context,
191280
void* priv) {
192281
Environment* env = Environment::GetCurrent(context);
193282
env->SetMethod(target, "toUnicode", ToUnicode);
194283
env->SetMethod(target, "toASCII", ToASCII);
284+
env->SetMethod(target, "getStringWidth", GetStringWidth);
195285
}
196286

197287
} // namespace i18n
+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
// Flags: --expose_internals
2+
'use strict';
3+
4+
const common = require('../common');
5+
const assert = require('assert');
6+
const readline = require('internal/readline');
7+
8+
if (!process.binding('config').hasIntl) {
9+
common.skip('missing intl... skipping test');
10+
return;
11+
}
12+
13+
// Test column width
14+
assert.strictEqual(readline.getStringWidth('a'), 1);
15+
assert.strictEqual(readline.getStringWidth('丁'), 2);
16+
assert.strictEqual(readline.getStringWidth('\ud83d\udc78\ud83c\udfff'), 2);
17+
assert.strictEqual(readline.getStringWidth('👅'), 2);
18+
assert.strictEqual(readline.getStringWidth('\n'), 0);
19+
assert.strictEqual(readline.getStringWidth('\u200Ef\u200F'), 1);
20+
assert.strictEqual(readline.getStringWidth(97), 1);
21+
22+
// The following is an emoji sequence. In some implementations, it is
23+
// represented as a single glyph, in other implementations as a sequence
24+
// of individual glyphs. By default, the algorithm will assume the single
25+
// glyph interpretation and return a value of 2. By passing the
26+
// expandEmojiSequence: true option, each component will be counted
27+
// individually.
28+
assert.strictEqual(readline.getStringWidth('👩‍👩‍👧‍👧'), 2);
29+
assert.strictEqual(
30+
readline.getStringWidth('👩‍👩‍👧‍👧', {expandEmojiSequence: true}), 8);
31+
32+
// By default, unicode characters whose width is considered ambiguous will
33+
// be considered half-width. For these characters, getStringWidth will return
34+
// 1. In some contexts, however, it is more appropriate to consider them full
35+
// width. By default, the algorithm will assume half width. By passing
36+
// the ambiguousAsFullWidth: true option, ambiguous characters will be counted
37+
// as 2 columns.
38+
assert.strictEqual(readline.getStringWidth('\u01d4'), 1);
39+
assert.strictEqual(
40+
readline.getStringWidth('\u01d4', {ambiguousAsFullWidth: true}), 2);
41+
42+
// Control chars and combining chars are zero
43+
assert.strictEqual(readline.getStringWidth('\u200E\n\u220A\u20D2'), 1);

0 commit comments

Comments
Β (0)