Skip to content

Commit 3168d60

Browse files
committed
fs: add encoding option to glob and globSync
fs.glob, fs.globSync, and fsPromises.glob always decoded directory entries as UTF-8, so on POSIX any non-UTF-8 bytes in file names were silently replaced with U+FFFD and the original byte sequence could not be recovered. Add an `encoding` option mirroring fs.readdir. When set to 'buffer', glob returns Buffer paths and (when withFileTypes is true) Dirents whose `name` and `parentPath` are Buffers. The internal walker continues to operate on strings (latin1-encoded so each byte maps 1:1 to a code unit, preserving arbitrary bytes), and only the fs binding boundary and the result-emission boundary round-trip through Buffer; minimatch, path joining, and the seen-path cache are unchanged. Fixes: #59202 Signed-off-by: Maruthan G <[email protected]>
1 parent 21436f0 commit 3168d60

3 files changed

Lines changed: 345 additions & 24 deletions

File tree

doc/api/fs.md

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1354,6 +1354,9 @@ changes:
13541354
- version: REPLACEME
13551355
pr-url: https://github.com/nodejs/node/pull/62695
13561356
description: Add support for the `followSymlinks` option.
1357+
- version: REPLACEME
1358+
pr-url: https://github.com/nodejs/node/issues/59202
1359+
description: Add support for the `encoding` option.
13571360
- version:
13581361
- v24.1.0
13591362
- v22.17.0
@@ -1377,6 +1380,13 @@ changes:
13771380
* `pattern` {string|string\[]}
13781381
* `options` {Object}
13791382
* `cwd` {string|URL} current working directory. **Default:** `process.cwd()`
1383+
* `encoding` {string} The character encoding to use for the yielded paths.
1384+
If set to `'buffer'`, paths are yielded as {Buffer} objects (and, when
1385+
`withFileTypes` is `true`, the `name` and `parentPath` of each yielded
1386+
{fs.Dirent} are {Buffer}s). This is useful when matching file names that
1387+
contain byte sequences which are not valid UTF-8, since otherwise such
1388+
bytes are silently replaced when decoded into a string. **Default:**
1389+
`'utf8'`.
13801390
* `exclude` {Function|string\[]} Function to filter out files/directories or a
13811391
list of glob patterns to be excluded. If a function is provided, return
13821392
`true` to exclude the item, `false` to include it. **Default:** `undefined`.
@@ -3475,6 +3485,9 @@ changes:
34753485
- version: REPLACEME
34763486
pr-url: https://github.com/nodejs/node/pull/62695
34773487
description: Add support for the `followSymlinks` option.
3488+
- version: REPLACEME
3489+
pr-url: https://github.com/nodejs/node/issues/59202
3490+
description: Add support for the `encoding` option.
34783491
- version:
34793492
- v24.1.0
34803493
- v22.17.0
@@ -3499,6 +3512,13 @@ changes:
34993512
35003513
* `options` {Object}
35013514
* `cwd` {string|URL} current working directory. **Default:** `process.cwd()`
3515+
* `encoding` {string} The character encoding to use for the returned paths.
3516+
If set to `'buffer'`, paths are returned as {Buffer} objects (and, when
3517+
`withFileTypes` is `true`, the `name` and `parentPath` of each returned
3518+
{fs.Dirent} are {Buffer}s). This is useful when matching file names that
3519+
contain byte sequences which are not valid UTF-8, since otherwise such
3520+
bytes are silently replaced when decoded into a string. **Default:**
3521+
`'utf8'`.
35023522
* `exclude` {Function|string\[]} Function to filter out files/directories or a
35033523
list of glob patterns to be excluded. If a function is provided, return
35043524
`true` to exclude the item, `false` to include it. **Default:** `undefined`.
@@ -6057,6 +6077,9 @@ changes:
60576077
- version: REPLACEME
60586078
pr-url: https://github.com/nodejs/node/pull/62695
60596079
description: Add support for the `followSymlinks` option.
6080+
- version: REPLACEME
6081+
pr-url: https://github.com/nodejs/node/issues/59202
6082+
description: Add support for the `encoding` option.
60606083
- version:
60616084
- v24.1.0
60626085
- v22.17.0
@@ -6080,14 +6103,22 @@ changes:
60806103
* `pattern` {string|string\[]}
60816104
* `options` {Object}
60826105
* `cwd` {string|URL} current working directory. **Default:** `process.cwd()`
6106+
* `encoding` {string} The character encoding to use for the returned paths.
6107+
If set to `'buffer'`, paths are returned as {Buffer} objects (and, when
6108+
`withFileTypes` is `true`, the `name` and `parentPath` of each returned
6109+
{fs.Dirent} are {Buffer}s). This is useful when matching file names that
6110+
contain byte sequences which are not valid UTF-8, since otherwise such
6111+
bytes are silently replaced when decoded into a string. **Default:**
6112+
`'utf8'`.
60836113
* `exclude` {Function|string\[]} Function to filter out files/directories or a
60846114
list of glob patterns to be excluded. If a function is provided, return
60856115
`true` to exclude the item, `false` to include it. **Default:** `undefined`.
60866116
* `followSymlinks` {boolean} When `true`, symbolic links to directories are
60876117
followed while expanding `**` patterns. **Default:** `false`.
60886118
* `withFileTypes` {boolean} `true` if the glob should return paths as Dirents,
60896119
`false` otherwise. **Default:** `false`.
6090-
* Returns: {string\[]} paths of files that match the pattern.
6120+
* Returns: {string\[]|Buffer\[]|fs.Dirent\[]} paths (or Dirents, when
6121+
`withFileTypes` is `true`) of files that match the pattern.
60916122
60926123
When `followSymlinks` is enabled, detected symbolic link cycles are not
60936124
traversed recursively.

lib/internal/fs/glob.js

Lines changed: 132 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ const {
2828
realpath,
2929
stat,
3030
} = require('fs/promises');
31+
const { Buffer } = require('buffer');
3132
const { join, resolve, basename, isAbsolute, dirname } = require('path');
3233

3334
const {
@@ -41,7 +42,7 @@ const {
4142
validateString,
4243
validateStringArray,
4344
} = require('internal/validators');
44-
const { DirentFromStats } = require('internal/fs/utils');
45+
const { assertEncoding, DirentFromStats } = require('internal/fs/utils');
4546
const {
4647
codes: {
4748
ERR_INVALID_ARG_TYPE,
@@ -58,13 +59,60 @@ function lazyMinimatch() {
5859
}
5960

6061
/**
 * Re-encode a byte-per-code-unit (latin1) string path as a Buffer holding the
 * same byte sequence. Used at the fs binding boundary when the caller asked
 * for `encoding: 'buffer'`.
 *
 * Only code units in the range 0..255 round-trip; Buffer's latin1 encoder
 * keeps just the low byte of anything larger.
 * NOTE(review): globSync passes `join(this.#root, path)` through this helper
 * (via the cache), and the root is a user-supplied string. A root containing
 * non-ASCII characters would therefore be written as latin1 bytes rather than
 * UTF-8 - confirm the root/patterns are normalized before they reach here.
 * @param {string} path latin1-encoded path (one code unit per byte).
 * @returns {Buffer} the raw bytes of `path`.
 */
function toRawBuffer(path) {
  const rawBytes = Buffer.from(path, 'latin1');
  return rawBytes;
}
71+
72+
/**
73+
* When `readdir` is called with `encoding: 'buffer'`, each Dirent's `name`
74+
* (and `parentPath`) is a Buffer. The matching machinery in this module
75+
* operates on strings, so we decode them losslessly via latin1 (so each byte
76+
* maps to a single 0..255 code unit). Names can be re-encoded to Buffer at
77+
* the result-emission boundary via `toRawBuffer`.
78+
* @param {Dirent} dirent
79+
* @returns {Dirent}
80+
*/
81+
function decodeDirentName(dirent) {
82+
if (Buffer.isBuffer(dirent.name)) {
83+
dirent.name = dirent.name.toString('latin1');
84+
}
85+
if (Buffer.isBuffer(dirent.parentPath)) {
86+
dirent.parentPath = dirent.parentPath.toString('latin1');
87+
}
88+
return dirent;
89+
}
90+
91+
/**
 * Produce a Dirent whose `name`/`parentPath` are the original byte sequences
 * (Buffers) from a Dirent whose `name`/`parentPath` are latin1-encoded
 * strings.
 *
 * The input is NOT modified; a clone is returned instead. The Dirents handed
 * to this function can be owned by the Cache (globSync passes in the object
 * returned by, and stored in, `Cache.statSync`), so rewriting them in place
 * would leave Buffer-valued names inside caches that the string-based walker
 * reads.
 * NOTE(review): the async `glob()` passes values yielded by
 * `#iterateSubpatterns`; presumably those include cached readdir entries as
 * well - confirm.
 * @param {Dirent} dirent Dirent with string (or already-Buffer) name fields.
 * @returns {Dirent} a new object with the same prototype and own properties
 *   as `dirent`, with string `name`/`parentPath` replaced by Buffers.
 */
function bufferifyDirent(dirent) {
  // Copy via property descriptors so symbol-keyed own properties (which the
  // Dirent methods may rely on) and the prototype are carried over.
  // NOTE(review): if this file is linted for primordials, swap these for the
  // corresponding `ObjectCreate`/`ObjectGetPrototypeOf`/
  // `ObjectGetOwnPropertyDescriptors` primordials.
  const copy = Object.create(
    Object.getPrototypeOf(dirent),
    Object.getOwnPropertyDescriptors(dirent),
  );
  if (typeof copy.name === 'string') {
    copy.name = toRawBuffer(copy.name);
  }
  if (typeof copy.parentPath === 'string') {
    copy.parentPath = toRawBuffer(copy.parentPath);
  }
  return copy;
}
106+
107+
/**
108+
* @param {string} path
109+
* @param {boolean} useBuffer
62110
* @returns {Promise<DirentFromStats|null>}
63111
*/
64-
async function getDirent(path) {
112+
async function getDirent(path, useBuffer) {
65113
let stat;
66114
try {
67-
stat = await lstat(path);
115+
stat = await lstat(useBuffer ? toRawBuffer(path) : path);
68116
} catch {
69117
return null;
70118
}
@@ -73,12 +121,13 @@ async function getDirent(path) {
73121

74122
/**
75123
* @param {string} path
124+
* @param {boolean} useBuffer
76125
* @returns {DirentFromStats|null}
77126
*/
78-
function getDirentSync(path) {
127+
function getDirentSync(path, useBuffer) {
79128
let stat;
80129
try {
81-
stat = lstatSync(path);
130+
stat = lstatSync(useBuffer ? toRawBuffer(path) : path);
82131
} catch {
83132
return null;
84133
}
@@ -138,13 +187,26 @@ class Cache {
138187
#followStatsCache = new SafeMap();
139188
#readdirCache = new SafeMap();
140189
#realpathCache = new SafeMap();
190+
#useBuffer = false;
191+
192+
setUseBuffer(useBuffer) {
193+
this.#useBuffer = useBuffer;
194+
}
195+
196+
// When the caller requested `encoding: 'buffer'`, paths are tracked
197+
// internally as latin1 strings (so each code unit corresponds to a single
198+
// byte), but every fs binding call must receive a Buffer with the original
199+
// byte sequence so that non-UTF-8 file names round-trip correctly.
200+
#fsPath(path) {
201+
return this.#useBuffer ? toRawBuffer(path) : path;
202+
}
141203

142204
stat(path) {
143205
const cached = this.#statsCache.get(path);
144206
if (cached) {
145207
return cached;
146208
}
147-
const promise = getDirent(path);
209+
const promise = getDirent(path, this.#useBuffer);
148210
this.#statsCache.set(path, promise);
149211
return promise;
150212
}
@@ -154,7 +216,7 @@ class Cache {
154216
if (cached && !(cached instanceof Promise)) {
155217
return cached;
156218
}
157-
const val = getDirentSync(path);
219+
const val = getDirentSync(path, this.#useBuffer);
158220
this.#statsCache.set(path, val);
159221
return val;
160222
}
@@ -163,7 +225,7 @@ class Cache {
163225
if (cached) {
164226
return cached;
165227
}
166-
const promise = PromisePrototypeThen(stat(path), null, () => null);
228+
const promise = PromisePrototypeThen(stat(this.#fsPath(path)), null, () => null);
167229
this.#followStatsCache.set(path, promise);
168230
return promise;
169231
}
@@ -174,7 +236,7 @@ class Cache {
174236
}
175237
let val;
176238
try {
177-
val = statSync(path);
239+
val = statSync(this.#fsPath(path));
178240
} catch {
179241
val = null;
180242
}
@@ -186,7 +248,12 @@ class Cache {
186248
if (cached) {
187249
return cached;
188250
}
189-
const promise = PromisePrototypeThen(realpath(path), null, () => null);
251+
const useBuffer = this.#useBuffer;
252+
const promise = PromisePrototypeThen(
253+
realpath(this.#fsPath(path), useBuffer ? { __proto__: null, encoding: 'buffer' } : undefined),
254+
(val) => (useBuffer && Buffer.isBuffer(val) ? val.toString('latin1') : val),
255+
() => null,
256+
);
190257
this.#realpathCache.set(path, promise);
191258
return promise;
192259
}
@@ -197,7 +264,10 @@ class Cache {
197264
}
198265
let val;
199266
try {
200-
val = realpathSync(path);
267+
val = realpathSync(this.#fsPath(path), this.#useBuffer ? { __proto__: null, encoding: 'buffer' } : undefined);
268+
if (this.#useBuffer && Buffer.isBuffer(val)) {
269+
val = val.toString('latin1');
270+
}
201271
} catch {
202272
val = null;
203273
}
@@ -212,7 +282,15 @@ class Cache {
212282
if (cached) {
213283
return cached;
214284
}
215-
const promise = PromisePrototypeThen(readdir(path, { __proto__: null, withFileTypes: true }), null, () => []);
285+
const useBuffer = this.#useBuffer;
286+
const opts = useBuffer ?
287+
{ __proto__: null, withFileTypes: true, encoding: 'buffer' } :
288+
{ __proto__: null, withFileTypes: true };
289+
const promise = PromisePrototypeThen(
290+
readdir(this.#fsPath(path), opts),
291+
(entries) => (useBuffer ? ArrayPrototypeMap(entries, decodeDirentName) : entries),
292+
() => [],
293+
);
216294
this.#readdirCache.set(path, promise);
217295
return promise;
218296
}
@@ -223,7 +301,14 @@ class Cache {
223301
}
224302
let val;
225303
try {
226-
val = readdirSync(path, { __proto__: null, withFileTypes: true });
304+
const useBuffer = this.#useBuffer;
305+
const opts = useBuffer ?
306+
{ __proto__: null, withFileTypes: true, encoding: 'buffer' } :
307+
{ __proto__: null, withFileTypes: true };
308+
val = readdirSync(this.#fsPath(path), opts);
309+
if (useBuffer) {
310+
val = ArrayPrototypeMap(val, decodeDirentName);
311+
}
227312
} catch {
228313
val = [];
229314
}
@@ -336,15 +421,23 @@ class Glob {
336421
#patterns;
337422
#withFileTypes;
338423
#followSymlinks = false;
424+
#useBuffer = false;
339425
#isExcluded = () => false;
340426
constructor(pattern, options = kEmptyObject) {
341427
validateObject(options, 'options');
342-
const { exclude, cwd, followSymlinks, withFileTypes } = options;
428+
const { encoding, exclude, cwd, followSymlinks, withFileTypes } = options;
343429
this.#root = toPathIfFileURL(cwd) ?? '.';
344430
if (followSymlinks != null) {
345431
validateBoolean(followSymlinks, 'options.followSymlinks');
346432
this.#followSymlinks = followSymlinks;
347433
}
434+
if (encoding !== undefined && encoding !== null) {
435+
if (encoding !== 'buffer') {
436+
assertEncoding(encoding);
437+
}
438+
this.#useBuffer = encoding === 'buffer';
439+
this.#cache.setUseBuffer(this.#useBuffer);
440+
}
348441
this.#withFileTypes = !!withFileTypes;
349442
if (exclude != null) {
350443
validateStringArrayOrFunction(exclude, 'options.exclude');
@@ -391,14 +484,21 @@ class Glob {
391484
.forEach((patterns, path) => ArrayPrototypePush(this.#queue, { __proto__: null, path, patterns }));
392485
this.#subpatterns.clear();
393486
}
394-
return ArrayFrom(
395-
this.#results,
396-
this.#withFileTypes ? (path) => this.#cache.statSync(
397-
isAbsolute(path) ?
398-
path :
399-
join(this.#root, path),
400-
) : undefined,
401-
);
487+
const useBuffer = this.#useBuffer;
488+
let mapper;
489+
if (this.#withFileTypes) {
490+
mapper = (path) => {
491+
const dirent = this.#cache.statSync(
492+
isAbsolute(path) ?
493+
path :
494+
join(this.#root, path),
495+
);
496+
return useBuffer && dirent ? bufferifyDirent(dirent) : dirent;
497+
};
498+
} else if (useBuffer) {
499+
mapper = toRawBuffer;
500+
}
501+
return ArrayFrom(this.#results, mapper);
402502
}
403503
#isDirectorySync(path, stat, pattern) {
404504
if (stat?.isDirectory()) {
@@ -686,11 +786,20 @@ class Glob {
686786

687787

688788
async* glob() {
789+
const useBuffer = this.#useBuffer;
790+
const withFileTypes = this.#withFileTypes;
689791
ArrayPrototypePush(this.#queue, { __proto__: null, path: '.', patterns: this.#patterns });
690792
while (this.#queue.length > 0) {
691793
const item = ArrayPrototypePop(this.#queue);
692794
for (let i = 0; i < item.patterns.length; i++) {
693-
yield* this.#iterateSubpatterns(item.path, item.patterns[i]);
795+
const iter = this.#iterateSubpatterns(item.path, item.patterns[i]);
796+
if (useBuffer) {
797+
for await (const value of iter) {
798+
yield withFileTypes ? bufferifyDirent(value) : toRawBuffer(value);
799+
}
800+
} else {
801+
yield* iter;
802+
}
694803
}
695804
this.#subpatterns
696805
.forEach((patterns, path) => ArrayPrototypePush(this.#queue, { __proto__: null, path, patterns }));

0 commit comments

Comments
 (0)