diff --git a/doc/api/fs.md b/doc/api/fs.md index b2231bd20cc420..71d0c976207bac 100644 --- a/doc/api/fs.md +++ b/doc/api/fs.md @@ -1354,6 +1354,9 @@ changes: - version: REPLACEME pr-url: https://github.com/nodejs/node/pull/62695 description: Add support for the `followSymlinks` option. + - version: REPLACEME + pr-url: https://github.com/nodejs/node/issues/59202 + description: Add support for the `encoding` option. - version: - v24.1.0 - v22.17.0 @@ -1377,6 +1380,13 @@ changes: * `pattern` {string|string\[]} * `options` {Object} * `cwd` {string|URL} current working directory. **Default:** `process.cwd()` + * `encoding` {string} If set to `'buffer'`, paths are yielded as {Buffer} + objects (and, when `withFileTypes` is `true`, the `name` and `parentPath` + of each yielded {fs.Dirent} are {Buffer}s). This is useful when matching + file names that contain byte sequences which are not valid UTF-8, since + otherwise such bytes are silently replaced when decoded into a string. + Any other valid encoding is accepted but has no effect: paths are still + yielded as UTF-8 strings. **Default:** `'utf8'`. * `exclude` {Function|string\[]} Function to filter out files/directories or a list of glob patterns to be excluded. If a function is provided, return `true` to exclude the item, `false` to include it. **Default:** `undefined`. @@ -3475,6 +3485,9 @@ changes: - version: REPLACEME pr-url: https://github.com/nodejs/node/pull/62695 description: Add support for the `followSymlinks` option. + - version: REPLACEME + pr-url: https://github.com/nodejs/node/issues/59202 + description: Add support for the `encoding` option. - version: - v24.1.0 - v22.17.0 @@ -3499,6 +3512,13 @@ changes: * `options` {Object} * `cwd` {string|URL} current working directory. **Default:** `process.cwd()` + * `encoding` {string} If set to `'buffer'`, paths are returned as {Buffer} 
+ objects (and, when `withFileTypes` is `true`, the `name` and `parentPath` + of each returned {fs.Dirent} are {Buffer}s). This is useful when matching + file names that contain byte sequences which are not valid UTF-8, since + otherwise such bytes are silently replaced when decoded into a string. + Any other valid encoding is accepted but has no effect: paths are still + returned as UTF-8 strings. **Default:** `'utf8'`. * `exclude` {Function|string\[]} Function to filter out files/directories or a list of glob patterns to be excluded. If a function is provided, return `true` to exclude the item, `false` to include it. **Default:** `undefined`. @@ -6057,6 +6077,9 @@ changes: - version: REPLACEME pr-url: https://github.com/nodejs/node/pull/62695 description: Add support for the `followSymlinks` option. + - version: REPLACEME + pr-url: https://github.com/nodejs/node/issues/59202 + description: Add support for the `encoding` option. - version: - v24.1.0 - v22.17.0 @@ -6080,6 +6103,13 @@ changes: * `pattern` {string|string\[]} * `options` {Object} * `cwd` {string|URL} current working directory. **Default:** `process.cwd()` + * `encoding` {string} If set to `'buffer'`, paths are returned as {Buffer} + objects (and, when `withFileTypes` is `true`, the `name` and `parentPath` + of each returned {fs.Dirent} are {Buffer}s). This is useful when matching + file names that contain byte sequences which are not valid UTF-8, since + otherwise such bytes are silently replaced when decoded into a string. + Any other valid encoding is accepted but has no effect: paths are still + returned as UTF-8 strings. **Default:** `'utf8'`. * `exclude` {Function|string\[]} Function to filter out files/directories or a list of glob patterns to be excluded. If a function is provided, return `true` to exclude the item, `false` to include it. **Default:** `undefined`. @@ -6087,7 +6117,8 @@ changes: followed while expanding `**` patterns. **Default:** `false`. * `withFileTypes` {boolean} `true` if the glob should return paths as Dirents, `false` otherwise. **Default:** `false`. 
-* Returns: {string\[]} paths of files that match the pattern. +* Returns: {string\[]|Buffer\[]|fs.Dirent\[]} paths (or Dirents, when + `withFileTypes` is `true`) of files that match the pattern. When `followSymlinks` is enabled, detected symbolic link cycles are not traversed recursively. diff --git a/lib/internal/fs/glob.js b/lib/internal/fs/glob.js index c5bbdb9813c0d1..be385cbe255499 100644 --- a/lib/internal/fs/glob.js +++ b/lib/internal/fs/glob.js @@ -28,6 +28,7 @@ const { realpath, stat, } = require('fs/promises'); +const { Buffer } = require('buffer'); const { join, resolve, basename, isAbsolute, dirname } = require('path'); const { @@ -41,7 +42,7 @@ const { validateString, validateStringArray, } = require('internal/validators'); -const { DirentFromStats } = require('internal/fs/utils'); +const { assertEncoding, DirentFromStats } = require('internal/fs/utils'); const { codes: { ERR_INVALID_ARG_TYPE, @@ -58,13 +59,60 @@ function lazyMinimatch() { } /** + * Convert a latin1-encoded string path into a Buffer that preserves the + * original byte sequence, suitable for passing to the fs bindings when the + * caller requested `encoding: 'buffer'`. * @param {string} path + * @returns {Buffer} + */ +function toRawBuffer(path) { + return Buffer.from(path, 'latin1'); +} + +/** + * When `readdir` is called with `encoding: 'buffer'`, each Dirent's `name` + * (and `parentPath`) is a Buffer. The matching machinery in this module + * operates on strings, so we decode them losslessly via latin1 (so each byte + * maps to a single 0..255 code unit). Names can be re-encoded to Buffer at + * the result-emission boundary via `toRawBuffer`. 
+ * @param {Dirent} dirent + * @returns {Dirent} + */ +function decodeDirentName(dirent) { + if (Buffer.isBuffer(dirent.name)) { + dirent.name = dirent.name.toString('latin1'); + } + if (Buffer.isBuffer(dirent.parentPath)) { + dirent.parentPath = dirent.parentPath.toString('latin1'); + } + return dirent; +} + +/** + * Convert a Dirent whose name/parentPath are latin1-encoded strings into a + * Dirent whose name/parentPath are the original Buffer byte sequences. + * @param {Dirent} dirent + * @returns {Dirent} + */ +function bufferifyDirent(dirent) { + if (typeof dirent.name === 'string') { + dirent.name = toRawBuffer(dirent.name); + } + if (typeof dirent.parentPath === 'string') { + dirent.parentPath = toRawBuffer(dirent.parentPath); + } + return dirent; +} + +/** + * @param {string} path + * @param {boolean} useBuffer * @returns {Promise} */ -async function getDirent(path) { +async function getDirent(path, useBuffer) { let stat; try { - stat = await lstat(path); + stat = await lstat(useBuffer ? toRawBuffer(path) : path); } catch { return null; } @@ -73,12 +121,13 @@ async function getDirent(path) { /** * @param {string} path + * @param {boolean} useBuffer * @returns {DirentFromStats|null} */ -function getDirentSync(path) { +function getDirentSync(path, useBuffer) { let stat; try { - stat = lstatSync(path); + stat = lstatSync(useBuffer ? toRawBuffer(path) : path); } catch { return null; } @@ -138,13 +187,26 @@ class Cache { #followStatsCache = new SafeMap(); #readdirCache = new SafeMap(); #realpathCache = new SafeMap(); + #useBuffer = false; + + setUseBuffer(useBuffer) { + this.#useBuffer = useBuffer; + } + + // When the caller requested `encoding: 'buffer'`, paths are tracked + // internally as latin1 strings (so each code unit corresponds to a single + // byte), but every fs binding call must receive a Buffer with the original + // byte sequence so that non-UTF-8 file names round-trip correctly. + #fsPath(path) { + return this.#useBuffer ? 
toRawBuffer(path) : path; + } stat(path) { const cached = this.#statsCache.get(path); if (cached) { return cached; } - const promise = getDirent(path); + const promise = getDirent(path, this.#useBuffer); this.#statsCache.set(path, promise); return promise; } @@ -154,7 +216,7 @@ class Cache { if (cached && !(cached instanceof Promise)) { return cached; } - const val = getDirentSync(path); + const val = getDirentSync(path, this.#useBuffer); this.#statsCache.set(path, val); return val; } @@ -163,7 +225,7 @@ class Cache { if (cached) { return cached; } - const promise = PromisePrototypeThen(stat(path), null, () => null); + const promise = PromisePrototypeThen(stat(this.#fsPath(path)), null, () => null); this.#followStatsCache.set(path, promise); return promise; } @@ -174,7 +236,7 @@ class Cache { } let val; try { - val = statSync(path); + val = statSync(this.#fsPath(path)); } catch { val = null; } @@ -186,7 +248,12 @@ class Cache { if (cached) { return cached; } - const promise = PromisePrototypeThen(realpath(path), null, () => null); + const useBuffer = this.#useBuffer; + const promise = PromisePrototypeThen( + realpath(this.#fsPath(path), useBuffer ? { __proto__: null, encoding: 'buffer' } : undefined), + (val) => (useBuffer && Buffer.isBuffer(val) ? val.toString('latin1') : val), + () => null, + ); this.#realpathCache.set(path, promise); return promise; } @@ -197,7 +264,10 @@ class Cache { } let val; try { - val = realpathSync(path); + val = realpathSync(this.#fsPath(path), this.#useBuffer ? { __proto__: null, encoding: 'buffer' } : undefined); + if (this.#useBuffer && Buffer.isBuffer(val)) { + val = val.toString('latin1'); + } } catch { val = null; } @@ -212,7 +282,15 @@ class Cache { if (cached) { return cached; } - const promise = PromisePrototypeThen(readdir(path, { __proto__: null, withFileTypes: true }), null, () => []); + const useBuffer = this.#useBuffer; + const opts = useBuffer ? 
+ { __proto__: null, withFileTypes: true, encoding: 'buffer' } : + { __proto__: null, withFileTypes: true }; + const promise = PromisePrototypeThen( + readdir(this.#fsPath(path), opts), + (entries) => (useBuffer ? ArrayPrototypeMap(entries, decodeDirentName) : entries), + () => [], + ); this.#readdirCache.set(path, promise); return promise; } @@ -223,7 +301,14 @@ class Cache { } let val; try { - val = readdirSync(path, { __proto__: null, withFileTypes: true }); + const useBuffer = this.#useBuffer; + const opts = useBuffer ? + { __proto__: null, withFileTypes: true, encoding: 'buffer' } : + { __proto__: null, withFileTypes: true }; + val = readdirSync(this.#fsPath(path), opts); + if (useBuffer) { + val = ArrayPrototypeMap(val, decodeDirentName); + } } catch { val = []; } @@ -336,15 +421,23 @@ class Glob { #patterns; #withFileTypes; #followSymlinks = false; + #useBuffer = false; #isExcluded = () => false; constructor(pattern, options = kEmptyObject) { validateObject(options, 'options'); - const { exclude, cwd, followSymlinks, withFileTypes } = options; + const { encoding, exclude, cwd, followSymlinks, withFileTypes } = options; this.#root = toPathIfFileURL(cwd) ?? '.'; if (followSymlinks != null) { validateBoolean(followSymlinks, 'options.followSymlinks'); this.#followSymlinks = followSymlinks; } + if (encoding !== undefined && encoding !== null) { + if (encoding !== 'buffer') { + assertEncoding(encoding); + } + this.#useBuffer = encoding === 'buffer'; + this.#cache.setUseBuffer(this.#useBuffer); + } this.#withFileTypes = !!withFileTypes; if (exclude != null) { validateStringArrayOrFunction(exclude, 'options.exclude'); @@ -391,14 +484,21 @@ class Glob { .forEach((patterns, path) => ArrayPrototypePush(this.#queue, { __proto__: null, path, patterns })); this.#subpatterns.clear(); } - return ArrayFrom( - this.#results, - this.#withFileTypes ? (path) => this.#cache.statSync( - isAbsolute(path) ? 
- path : - join(this.#root, path), - ) : undefined, - ); + const useBuffer = this.#useBuffer; + let mapper; + if (this.#withFileTypes) { + mapper = (path) => { + const dirent = this.#cache.statSync( + isAbsolute(path) ? + path : + join(this.#root, path), + ); + return useBuffer && dirent ? bufferifyDirent(dirent) : dirent; + }; + } else if (useBuffer) { + mapper = toRawBuffer; + } + return ArrayFrom(this.#results, mapper); } #isDirectorySync(path, stat, pattern) { if (stat?.isDirectory()) { @@ -686,11 +786,20 @@ class Glob { async* glob() { + const useBuffer = this.#useBuffer; + const withFileTypes = this.#withFileTypes; ArrayPrototypePush(this.#queue, { __proto__: null, path: '.', patterns: this.#patterns }); while (this.#queue.length > 0) { const item = ArrayPrototypePop(this.#queue); for (let i = 0; i < item.patterns.length; i++) { - yield* this.#iterateSubpatterns(item.path, item.patterns[i]); + const iter = this.#iterateSubpatterns(item.path, item.patterns[i]); + if (useBuffer) { + for await (const value of iter) { + yield withFileTypes ? bufferifyDirent(value) : toRawBuffer(value); + } + } else { + yield* iter; + } } this.#subpatterns .forEach((patterns, path) => ArrayPrototypePush(this.#queue, { __proto__: null, path, patterns })); diff --git a/test/parallel/test-fs-glob-encoding.mjs b/test/parallel/test-fs-glob-encoding.mjs new file mode 100644 index 00000000000000..ba225ca741193d --- /dev/null +++ b/test/parallel/test-fs-glob-encoding.mjs @@ -0,0 +1,181 @@ +// Regression test for https://github.com/nodejs/node/issues/59202 +// +// `fs.glob*` APIs lacked an `encoding` option, so they decoded directory +// entries as UTF-8 and silently replaced invalid byte sequences (mojibake). +// With `encoding: 'buffer'`, the raw bytes of the file name must round-trip +// through `globSync`, `glob` (callback) and `fsPromises.glob`, in both +// path-string and `withFileTypes` modes. 
+ +import * as common from '../common/index.mjs'; +import tmpdir from '../common/tmpdir.js'; +import { mkdir, writeFile, glob as asyncGlob } from 'node:fs/promises'; +import { glob, globSync, Dirent } from 'node:fs'; +import { test, describe } from 'node:test'; +import { promisify } from 'node:util'; +import { Buffer } from 'node:buffer'; +import { sep } from 'node:path'; +import assert from 'node:assert'; + +const promisifiedGlob = promisify(glob); + +// A lone byte 0xE9 (`é` in Latin-1) is not valid UTF-8; it stands in for any +// such sequence. The point of this test is not the specific byte but that the +// glob APIs must not lose information when given `encoding: 'buffer'`. +const fileBuffer = Buffer.from([0x66, 0x6f, 0xe9, 0x2e, 0x74, 0x78, 0x74]); // "foé.txt" +const dirBuffer = Buffer.from([0x73, 0x75, 0x62, 0xe9]); // "subé" +const nestedBuffer = Buffer.from([0x62, 0x61, 0xe9, 0x72]); // "baér" + +// Windows stores file names as UTF-16 and macOS (APFS) rejects names that +// are not valid UTF-8, so non-UTF-8 names cannot be created there. Skip the +// non-UTF-8 portion of the test on both. Pure ASCII Buffer round-tripping +// (which exercises the new option's plumbing) is still validated below. +const supportsNonUtf8 = !common.isWindows && !common.isMacOS; + +tmpdir.refresh(); + +const fixtureDir = tmpdir.resolve('glob-encoding'); +await mkdir(fixtureDir, { recursive: true }); + +if (supportsNonUtf8) { + // Place the byte-named file inside fixtureDir using a Buffer path so + // Node's UTF-8 conversion never touches it. 
+ const filePath = Buffer.concat([ + Buffer.from(fixtureDir + sep), + fileBuffer, + ]); + await writeFile(filePath, 'hello'); + + const subDir = Buffer.concat([ + Buffer.from(fixtureDir + sep), + dirBuffer, + ]); + await mkdir(subDir); + const nestedFile = Buffer.concat([ + subDir, + Buffer.from(sep), + nestedBuffer, + ]); + await writeFile(nestedFile, 'world'); +} + +// Regardless of whether non-UTF-8 names can be created, also drop a regular +// ASCII file in the directory so the basic plumbing of the option is +// exercised on every platform. +await writeFile(`${fixtureDir}/plain.txt`, 'plain'); + +describe('fs.globSync with encoding option', () => { + test('returns Buffer paths when encoding is "buffer"', () => { + const matches = globSync('*', { cwd: fixtureDir, encoding: 'buffer' }); + for (const m of matches) { + assert.ok(Buffer.isBuffer(m), `expected Buffer, got ${typeof m}`); + } + // The plain ASCII file must always be present and decodable as UTF-8. + const names = matches.map((m) => m.toString('utf8')); + assert.ok(names.includes('plain.txt')); + }); + + test('preserves non-UTF-8 bytes in returned Buffer paths', { skip: !supportsNonUtf8 }, () => { + const matches = globSync('*', { cwd: fixtureDir, encoding: 'buffer' }); + const names = matches.map((m) => (Buffer.isBuffer(m) ? 
m : Buffer.from(m))); + const found = names.some((n) => n.equals(fileBuffer)); + assert.ok(found, `expected to find ${fileBuffer.toString('hex')} ` + + `in ${names.map((n) => n.toString('hex')).join(', ')}`); + }); + + test('returns Dirent with Buffer name and parentPath when withFileTypes', () => { + const matches = globSync('*', { + cwd: fixtureDir, + encoding: 'buffer', + withFileTypes: true, + }); + for (const m of matches) { + assert.ok(m instanceof Dirent); + assert.ok(Buffer.isBuffer(m.name), `expected Dirent.name to be a Buffer, got ${typeof m.name}`); + assert.ok(Buffer.isBuffer(m.parentPath), `expected Dirent.parentPath to be a Buffer, got ${typeof m.parentPath}`); + } + }); + + test('preserves non-UTF-8 bytes in Dirent.name when withFileTypes', { skip: !supportsNonUtf8 }, () => { + const matches = globSync('*', { + cwd: fixtureDir, + encoding: 'buffer', + withFileTypes: true, + }); + const names = matches.map((d) => d.name); + const found = names.some((n) => n.equals(fileBuffer)); + assert.ok(found, `expected to find ${fileBuffer.toString('hex')} ` + + `in ${names.map((n) => n.toString('hex')).join(', ')}`); + }); + + test('returns strings when encoding is omitted (existing behavior)', () => { + const matches = globSync('*', { cwd: fixtureDir }); + for (const m of matches) { + assert.strictEqual(typeof m, 'string'); + } + }); + + test('rejects unknown encodings', () => { + assert.throws(() => globSync('*', { cwd: fixtureDir, encoding: 'not-an-encoding' }), { + code: 'ERR_INVALID_ARG_VALUE', + }); + }); + + test('walks into directories with non-UTF-8 names', { skip: !supportsNonUtf8 }, () => { + const matches = globSync('**/*', { cwd: fixtureDir, encoding: 'buffer' }); + const names = matches.map((m) => m.toString('latin1')); + const expectedNested = `${dirBuffer.toString('latin1')}/${nestedBuffer.toString('latin1')}`; + assert.ok(names.some((n) => n === expectedNested), + `expected to find nested entry ${expectedNested} in ${names.join(', ')}`); + }); 
+}); + +describe('fs.glob (callback) with encoding option', () => { + test('returns Buffer paths when encoding is "buffer"', async () => { + const matches = await promisifiedGlob('*', { cwd: fixtureDir, encoding: 'buffer' }); + for (const m of matches) { + assert.ok(Buffer.isBuffer(m)); + } + }); + + test('preserves non-UTF-8 bytes in returned paths', { skip: !supportsNonUtf8 }, async () => { + const matches = await promisifiedGlob('*', { cwd: fixtureDir, encoding: 'buffer' }); + const found = matches.some((n) => Buffer.isBuffer(n) && n.equals(fileBuffer)); + assert.ok(found); + }); +}); + +describe('fsPromises.glob with encoding option', () => { + test('yields Buffer paths when encoding is "buffer"', async () => { + const collected = []; + for await (const item of asyncGlob('*', { cwd: fixtureDir, encoding: 'buffer' })) { + collected.push(item); + } + for (const m of collected) { + assert.ok(Buffer.isBuffer(m)); + } + }); + + test('yields Dirent with Buffer name when withFileTypes and encoding is "buffer"', async () => { + const collected = []; + for await (const item of asyncGlob('*', { + cwd: fixtureDir, + encoding: 'buffer', + withFileTypes: true, + })) { + collected.push(item); + } + for (const d of collected) { + assert.ok(d instanceof Dirent); + assert.ok(Buffer.isBuffer(d.name)); + } + }); + + test('preserves non-UTF-8 bytes in yielded paths', { skip: !supportsNonUtf8 }, async () => { + const collected = []; + for await (const item of asyncGlob('*', { cwd: fixtureDir, encoding: 'buffer' })) { + collected.push(item); + } + const found = collected.some((n) => Buffer.isBuffer(n) && n.equals(fileBuffer)); + assert.ok(found); + }); +});