Fix garbled characters (Mojibake) in Output panel for UTF-8 build output on non-UTF-8 Windows (#4724)

Copilot · hanniavalera · web-flow · commit 9e99b21fdffc · 2026-02-23T22:59:24.000Z
When cmake.outputLogEncoding is 'auto' on Windows, the build output is now validated as UTF-8 before falling back to the system code page encoding. This fixes the issue where MSVC compiler output encoded as UTF-8 (via /utf-8) was incorrectly decoded using the system code page (e.g. GBK on Chinese Windows).

Fixes #4520

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: hanniavalera <90047725+hanniavalera@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,6 +18,7 @@ Improvements:
 - Allow preset modification commands to target CMakeUserPresets.json. The target file is determined by the focused editor, or by prompting the user when both files exist. [#4564](https://github.com/microsoft/vscode-cmake-tools/issues/4564)
 
 Bug Fixes:
+- Fix garbled characters (Mojibake) in the Output panel when MSVC outputs UTF-8 (e.g., with `/utf-8`) on non-UTF-8 Windows systems. When `cmake.outputLogEncoding` is `auto`, the build output is now validated as UTF-8 before falling back to the system code page. [#4520](https://github.com/microsoft/vscode-cmake-tools/issues/4520)
 - Fix CMakePresets.json discovery failing in multi-folder workspaces when the presets file is in a subdirectory specified by `cmake.sourceDirectory`. [#4727](https://github.com/microsoft/vscode-cmake-tools/issues/4727)
 - Fix initial kit scan ignoring `cmake.enableAutomaticKitScan: false` on first workspace open, and prevent redundant concurrent scans in multi-project workspaces. [#4726](https://github.com/microsoft/vscode-cmake-tools/issues/4726)
 - Fix `cmake.installPrefix` not being passed to CMake as `CMAKE_INSTALL_PREFIX` when using presets. [#4358](https://github.com/microsoft/vscode-cmake-tools/issues/4358)
diff --git a/src/drivers/cmakeDriver.ts b/src/drivers/cmakeDriver.ts
@@ -2036,7 +2036,8 @@ export abstract class CMakeDriver implements vscode.Disposable {
         const buildcmd = await this.getCMakeBuildCommand(targets);
         if (buildcmd) {
             let outputEnc = this.config.outputLogEncoding;
-            if (outputEnc === 'auto') {
+            const isAutoEncoding = outputEnc === 'auto';
+            if (isAutoEncoding) {
                 if (process.platform === 'win32') {
                     outputEnc = await codepages.getWindowsCodepage();
                 } else {
@@ -2053,7 +2054,7 @@ export abstract class CMakeDriver implements vscode.Disposable {
                     }
                 }
             } else {
-                const exeOpt: proc.ExecutionOptions = { environment: buildcmd.build_env, outputEncoding: outputEnc };
+                const exeOpt: proc.ExecutionOptions = { environment: buildcmd.build_env, outputEncoding: outputEnc, useAutoEncoding: isAutoEncoding };
                 this.cmakeBuildRunner.setBuildProcess(this.executeCommand(buildcmd.command, buildcmd.args, consumer, exeOpt));
             }
             const result = await this.cmakeBuildRunner.getResult();
diff --git a/src/encodingUtils.ts b/src/encodingUtils.ts
@@ -0,0 +1,80 @@
+/**
+ * Check whether a buffer contains valid UTF-8 byte sequences.
+ * Incomplete multi-byte sequences at the end of the buffer are treated as valid
+ * (they may continue in the next data chunk).
+ */
+export function isValidUtf8(buffer: Buffer): boolean {
+    let i = 0;
+    while (i < buffer.length) {
+        const byte = buffer[i];
+        if (byte <= 0x7F) {
+            // Single-byte ASCII
+            i += 1;
+        } else if (byte >= 0xC2 && byte <= 0xDF) {
+            // 2-byte sequence
+            if (i + 1 >= buffer.length) {
+                return true; // Incomplete at end — OK
+            }
+            if (buffer[i + 1] < 0x80 || buffer[i + 1] > 0xBF) {
+                return false;
+            }
+            i += 2;
+        } else if (byte >= 0xE0 && byte <= 0xEF) {
+            // 3-byte sequence
+            if (i + 1 >= buffer.length) {
+                return true;
+            }
+            const b1 = buffer[i + 1];
+            if (b1 < 0x80 || b1 > 0xBF) {
+                return false;
+            }
+            // Reject overlong 3-byte sequences and surrogates
+            if (byte === 0xE0 && b1 < 0xA0) {
+                return false;
+            }
+            if (byte === 0xED && b1 > 0x9F) {
+                return false;
+            }
+            if (i + 2 >= buffer.length) {
+                return true;
+            }
+            if (buffer[i + 2] < 0x80 || buffer[i + 2] > 0xBF) {
+                return false;
+            }
+            i += 3;
+        } else if (byte >= 0xF0 && byte <= 0xF4) {
+            // 4-byte sequence
+            if (i + 1 >= buffer.length) {
+                return true;
+            }
+            const b1 = buffer[i + 1];
+            if (b1 < 0x80 || b1 > 0xBF) {
+                return false;
+            }
+            // Reject overlong 4-byte sequences and code points > U+10FFFF
+            if (byte === 0xF0 && b1 < 0x90) {
+                return false;
+            }
+            if (byte === 0xF4 && b1 > 0x8F) {
+                return false;
+            }
+            if (i + 2 >= buffer.length) {
+                return true;
+            }
+            if (buffer[i + 2] < 0x80 || buffer[i + 2] > 0xBF) {
+                return false;
+            }
+            if (i + 3 >= buffer.length) {
+                return true;
+            }
+            if (buffer[i + 3] < 0x80 || buffer[i + 3] > 0xBF) {
+                return false;
+            }
+            i += 4;
+        } else {
+            // Invalid leading byte (0x80-0xBF, 0xC0-0xC1, 0xF5-0xFF)
+            return false;
+        }
+    }
+    return true;
+}
diff --git a/src/proc.ts b/src/proc.ts
@@ -8,6 +8,7 @@ import * as proc from 'child_process';
 import * as iconv from 'iconv-lite';
 
 import { createLogger } from '@cmt/logging';
+import { isValidUtf8 } from '@cmt/encodingUtils';
 import rollbar from '@cmt/rollbar';
 import * as util from '@cmt/util';
 import * as nls from 'vscode-nls';
@@ -118,11 +119,14 @@ export interface ExecutionOptions {
     cwd?: string;
     encoding?: BufferEncoding;
     outputEncoding?: string;
+    useAutoEncoding?: boolean;
     overrideLocale?: boolean;
     timeout?: number;
     showOutputOnError?: boolean;
 }
 
+export { isValidUtf8 } from '@cmt/encodingUtils';
+
 export function buildCmdStr(command: string, args?: string[]): string {
     let cmdarr = [command];
     if (args) {
@@ -219,7 +223,20 @@ export function execute(command: string, args?: string[], outputConsumer?: Outpu
         child.stdout?.setEncoding(options.encoding);
     }
 
-    const encoding = options.outputEncoding && iconv.encodingExists(options.outputEncoding) ? options.outputEncoding : 'utf8';
+    const fallbackEncoding = options.outputEncoding && iconv.encodingExists(options.outputEncoding) ? options.outputEncoding : 'utf8';
+    // When useAutoEncoding is true (i.e., outputLogEncoding is 'auto' on Windows),
+    // try UTF-8 first for each chunk and fall back to the system code page encoding
+    // only if the chunk contains bytes that are not valid UTF-8.
+    // This correctly handles compilers that output UTF-8 (e.g., MSVC with /utf-8)
+    // on systems where the default code page is non-UTF-8 (e.g., GBK on Chinese Windows).
+    const useAutoEncoding = options.useAutoEncoding === true && fallbackEncoding !== 'utf8';
+    const decodeData = (data: Uint8Array): string => {
+        const buf = Buffer.from(data);
+        if (useAutoEncoding) {
+            return isValidUtf8(buf) ? iconv.decode(buf, 'utf8') : iconv.decode(buf, fallbackEncoding);
+        }
+        return iconv.decode(buf, fallbackEncoding);
+    };
     const accumulate = (str1: string, str2: string) => {
         try {
             return str1 + str2;
@@ -259,7 +276,7 @@ export function execute(command: string, args?: string[], outputConsumer?: Outpu
         });
         child?.stdout?.on('data', (data: Uint8Array) => {
             rollbar.invoke(localize('processing.data.event.stdout', 'Processing {0} event from proc stdout', "\"data\""), { data, command, args }, () => {
-                const str = iconv.decode(Buffer.from(data), encoding);
+                const str = decodeData(data);
                 const lines = str.split('\n').map(l => l.endsWith('\r') ? l.substr(0, l.length - 1) : l);
                 while (lines.length > 1) {
                     line_acc = accumulate(line_acc, lines[0]);
@@ -279,7 +296,7 @@ export function execute(command: string, args?: string[], outputConsumer?: Outpu
         });
         child?.stderr?.on('data', (data: Uint8Array) => {
             rollbar.invoke(localize('processing.data.event.stderr', 'Processing {0} event from proc stderr', "\"data\""), { data, command, args }, () => {
-                const str = iconv.decode(Buffer.from(data), encoding);
+                const str = decodeData(data);
                 const lines = str.split('\n').map(l => l.endsWith('\r') ? l.substr(0, l.length - 1) : l);
                 while (lines.length > 1) {
                     stderr_line_acc = accumulate(stderr_line_acc, lines[0]);
diff --git a/test/unit-tests/backend/encoding.test.ts b/test/unit-tests/backend/encoding.test.ts
@@ -0,0 +1,102 @@
+import { expect } from 'chai';
+import { isValidUtf8 } from '@cmt/encodingUtils';
+
+suite('isValidUtf8', () => {
+    test('ASCII-only content is valid UTF-8', () => {
+        expect(isValidUtf8(Buffer.from('Hello, World!', 'ascii'))).to.be.true;
+    });
+
+    test('Empty buffer is valid UTF-8', () => {
+        expect(isValidUtf8(Buffer.alloc(0))).to.be.true;
+    });
+
+    test('Valid UTF-8 with Chinese characters', () => {
+        // "无法打开包括文件" in UTF-8
+        const utf8Buf = Buffer.from('无法打开包括文件', 'utf8');
+        expect(isValidUtf8(utf8Buf)).to.be.true;
+    });
+
+    test('Valid UTF-8 with mixed ASCII and multibyte', () => {
+        // MSVC error message: "fatal error C1083: 无法打开包括文件"
+        const utf8Buf = Buffer.from('fatal error C1083: 无法打开包括文件: "a.h": No such file', 'utf8');
+        expect(isValidUtf8(utf8Buf)).to.be.true;
+    });
+
+    test('Valid UTF-8 with 2-byte sequences (Latin)', () => {
+        // "café" contains é (U+00E9) → 0xC3 0xA9
+        const utf8Buf = Buffer.from('café', 'utf8');
+        expect(isValidUtf8(utf8Buf)).to.be.true;
+    });
+
+    test('Valid UTF-8 with 4-byte sequences (emoji)', () => {
+        const utf8Buf = Buffer.from('Hello 🌍', 'utf8');
+        expect(isValidUtf8(utf8Buf)).to.be.true;
+    });
+
+    test('GBK-encoded Chinese is NOT valid UTF-8', () => {
+        // "无法" in GBK encoding: each Chinese character is 2 bytes in GBK
+        // GBK bytes for "无法" are: 0xCE 0xDE 0xB7 0xA8
+        // 0xCE 0xDE: 0xCE is a valid UTF-8 leading byte (2-byte), but 0xDE > 0xBF so invalid continuation
+        const gbkBuf = Buffer.from([0xCE, 0xDE, 0xB7, 0xA8]);
+        expect(isValidUtf8(gbkBuf)).to.be.false;
+    });
+
+    test('GBK-encoded MSVC error message is NOT valid UTF-8', () => {
+        // Simulate GBK output from cl.exe without /utf-8
+        // "无法打开包括文件" in GBK
+        const iconv = require('iconv-lite');
+        const gbkBuf: Buffer = iconv.encode('无法打开包括文件', 'gbk');
+        expect(isValidUtf8(gbkBuf)).to.be.false;
+    });
+
+    test('Invalid: bare continuation byte', () => {
+        expect(isValidUtf8(Buffer.from([0x80]))).to.be.false;
+    });
+
+    test('Invalid: overlong 2-byte sequence (0xC0 0x80)', () => {
+        expect(isValidUtf8(Buffer.from([0xC0, 0x80]))).to.be.false;
+    });
+
+    test('Invalid: overlong 2-byte sequence (0xC1 0xBF)', () => {
+        expect(isValidUtf8(Buffer.from([0xC1, 0xBF]))).to.be.false;
+    });
+
+    test('Invalid: overlong 3-byte sequence (0xE0 0x80 0x80)', () => {
+        expect(isValidUtf8(Buffer.from([0xE0, 0x80, 0x80]))).to.be.false;
+    });
+
+    test('Invalid: surrogate pair (0xED 0xA0 0x80 = U+D800)', () => {
+        expect(isValidUtf8(Buffer.from([0xED, 0xA0, 0x80]))).to.be.false;
+    });
+
+    test('Invalid: byte 0xFF', () => {
+        expect(isValidUtf8(Buffer.from([0xFF]))).to.be.false;
+    });
+
+    test('Invalid: byte 0xFE', () => {
+        expect(isValidUtf8(Buffer.from([0xFE]))).to.be.false;
+    });
+
+    test('Incomplete 2-byte sequence at end is valid (boundary split)', () => {
+        // 0xC3 is a valid 2-byte leading byte; incomplete at end
+        expect(isValidUtf8(Buffer.from([0x41, 0xC3]))).to.be.true;
+    });
+
+    test('Incomplete 3-byte sequence at end is valid (boundary split)', () => {
+        // 0xE4 0xB8 is start of a 3-byte CJK character
+        expect(isValidUtf8(Buffer.from([0x41, 0xE4, 0xB8]))).to.be.true;
+    });
+
+    test('Incomplete 4-byte sequence at end is valid (boundary split)', () => {
+        // 0xF0 0x9F is start of an emoji 4-byte sequence
+        expect(isValidUtf8(Buffer.from([0x41, 0xF0, 0x9F]))).to.be.true;
+    });
+
+    test('Invalid continuation in 2-byte sequence', () => {
+        expect(isValidUtf8(Buffer.from([0xC3, 0x00]))).to.be.false;
+    });
+
+    test('Invalid continuation in 3-byte sequence', () => {
+        expect(isValidUtf8(Buffer.from([0xE4, 0xB8, 0x00]))).to.be.false;
+    });
+});

Original file line number	Diff line number	Diff line change
`@@ -2036,7 +2036,8 @@ export abstract class CMakeDriver implements vscode.Disposable {`
`2036`	`2036`	`const buildcmd = await this.getCMakeBuildCommand(targets);`
`2037`	`2037`	`if (buildcmd) {`
`2038`	`2038`	`let outputEnc = this.config.outputLogEncoding;`
`2039`		`- if (outputEnc === 'auto') {`
	`2039`	`+ const isAutoEncoding = outputEnc === 'auto';`
	`2040`	`+ if (isAutoEncoding) {`
`2040`	`2041`	`if (process.platform === 'win32') {`
`2041`	`2042`	`outputEnc = await codepages.getWindowsCodepage();`
`2042`	`2043`	`} else {`
`@@ -2053,7 +2054,7 @@ export abstract class CMakeDriver implements vscode.Disposable {`
`2053`	`2054`	`}`
`2054`	`2055`	`}`
`2055`	`2056`	`} else {`
`2056`		`- const exeOpt: proc.ExecutionOptions = { environment: buildcmd.build_env, outputEncoding: outputEnc };`
	`2057`	`+ const exeOpt: proc.ExecutionOptions = { environment: buildcmd.build_env, outputEncoding: outputEnc, useAutoEncoding: isAutoEncoding };`
`2057`	`2058`	`this.cmakeBuildRunner.setBuildProcess(this.executeCommand(buildcmd.command, buildcmd.args, consumer, exeOpt));`
`2058`	`2059`	`}`
`2059`	`2060`	`const result = await this.cmakeBuildRunner.getResult();`