Skip to content

Commit 9e99b21

Browse files
Fix garbled characters (Mojibake) in Output panel for UTF-8 build output on non-UTF-8 Windows (#4724)
When cmake.outputLogEncoding is 'auto' on Windows, the build output is now validated as UTF-8 before falling back to the system code page encoding. This fixes the issue where MSVC compiler output encoded as UTF-8 (via /utf-8) was incorrectly decoded using the system code page (e.g. GBK on Chinese Windows). Fixes #4520 Co-authored-by: copilot-swe-agent[bot] <[email protected]> Co-authored-by: hanniavalera <[email protected]>
1 parent f842235 commit 9e99b21

5 files changed

Lines changed: 206 additions & 5 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ Improvements:
1818
- Allow preset modification commands to target CMakeUserPresets.json. The target file is determined by the focused editor, or by prompting the user when both files exist. [#4564](https://github.com/microsoft/vscode-cmake-tools/issues/4564)
1919

2020
Bug Fixes:
21+
- Fix garbled characters (Mojibake) in the Output panel when MSVC outputs UTF-8 (e.g., with `/utf-8`) on non-UTF-8 Windows systems. When `cmake.outputLogEncoding` is `auto`, the build output is now validated as UTF-8 before falling back to the system code page. [#4520](https://github.com/microsoft/vscode-cmake-tools/issues/4520)
2122
- Fix CMakePresets.json discovery failing in multi-folder workspaces when the presets file is in a subdirectory specified by `cmake.sourceDirectory`. [#4727](https://github.com/microsoft/vscode-cmake-tools/issues/4727)
2223
- Fix initial kit scan ignoring `cmake.enableAutomaticKitScan: false` on first workspace open, and prevent redundant concurrent scans in multi-project workspaces. [#4726](https://github.com/microsoft/vscode-cmake-tools/issues/4726)
2324
- Fix `cmake.installPrefix` not being passed to CMake as `CMAKE_INSTALL_PREFIX` when using presets. [#4358](https://github.com/microsoft/vscode-cmake-tools/issues/4358)

src/drivers/cmakeDriver.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2036,7 +2036,8 @@ export abstract class CMakeDriver implements vscode.Disposable {
20362036
const buildcmd = await this.getCMakeBuildCommand(targets);
20372037
if (buildcmd) {
20382038
let outputEnc = this.config.outputLogEncoding;
2039-
if (outputEnc === 'auto') {
2039+
const isAutoEncoding = outputEnc === 'auto';
2040+
if (isAutoEncoding) {
20402041
if (process.platform === 'win32') {
20412042
outputEnc = await codepages.getWindowsCodepage();
20422043
} else {
@@ -2053,7 +2054,7 @@ export abstract class CMakeDriver implements vscode.Disposable {
20532054
}
20542055
}
20552056
} else {
2056-
const exeOpt: proc.ExecutionOptions = { environment: buildcmd.build_env, outputEncoding: outputEnc };
2057+
const exeOpt: proc.ExecutionOptions = { environment: buildcmd.build_env, outputEncoding: outputEnc, useAutoEncoding: isAutoEncoding };
20572058
this.cmakeBuildRunner.setBuildProcess(this.executeCommand(buildcmd.command, buildcmd.args, consumer, exeOpt));
20582059
}
20592060
const result = await this.cmakeBuildRunner.getResult();

src/encodingUtils.ts

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
/**
 * Report whether every byte sequence in `buffer` is well-formed UTF-8.
 *
 * Overlong encodings, UTF-16 surrogate code points (U+D800–U+DFFF) and code
 * points above U+10FFFF are rejected. A multi-byte sequence that is cut short
 * by the end of the buffer is accepted as long as the bytes present so far are
 * legal, because the remainder may arrive in the next data chunk.
 *
 * @param buffer Raw bytes to inspect.
 * @returns `true` if the buffer is valid (possibly truncated) UTF-8, otherwise `false`.
 */
export function isValidUtf8(buffer: Buffer): boolean {
    const total = buffer.length;
    let pos = 0;
    while (pos < total) {
        const lead = buffer[pos];
        if (lead <= 0x7F) {
            // Plain ASCII occupies a single byte.
            pos += 1;
            continue;
        }

        // Classify the lead byte: total sequence length plus the legal range of
        // the second byte, which is narrower than 0x80-0xBF for a few leads.
        let seqLen: number;
        let secondMin = 0x80;
        let secondMax = 0xBF;
        if (lead >= 0xC2 && lead <= 0xDF) {
            seqLen = 2;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            seqLen = 3;
            if (lead === 0xE0) {
                secondMin = 0xA0; // anything lower is an overlong 3-byte form
            } else if (lead === 0xED) {
                secondMax = 0x9F; // anything higher encodes a surrogate
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            seqLen = 4;
            if (lead === 0xF0) {
                secondMin = 0x90; // anything lower is an overlong 4-byte form
            } else if (lead === 0xF4) {
                secondMax = 0x8F; // anything higher exceeds U+10FFFF
            }
        } else {
            // 0x80-0xBF (stray continuation), 0xC0-0xC1 (overlong), 0xF5-0xFF (out of range)
            return false;
        }

        for (let offset = 1; offset < seqLen; offset++) {
            if (pos + offset >= total) {
                // Sequence truncated by the end of the buffer: treat as valid.
                return true;
            }
            const cont = buffer[pos + offset];
            const lowest = offset === 1 ? secondMin : 0x80;
            const highest = offset === 1 ? secondMax : 0xBF;
            if (cont < lowest || cont > highest) {
                return false;
            }
        }
        pos += seqLen;
    }
    return true;
}

src/proc.ts

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import * as proc from 'child_process';
88
import * as iconv from 'iconv-lite';
99

1010
import { createLogger } from '@cmt/logging';
11+
import { isValidUtf8 } from '@cmt/encodingUtils';
1112
import rollbar from '@cmt/rollbar';
1213
import * as util from '@cmt/util';
1314
import * as nls from 'vscode-nls';
@@ -118,11 +119,14 @@ export interface ExecutionOptions {
118119
cwd?: string;
119120
encoding?: BufferEncoding;
120121
outputEncoding?: string;
122+
useAutoEncoding?: boolean;
121123
overrideLocale?: boolean;
122124
timeout?: number;
123125
showOutputOnError?: boolean;
124126
}
125127

128+
export { isValidUtf8 } from '@cmt/encodingUtils';
129+
126130
export function buildCmdStr(command: string, args?: string[]): string {
127131
let cmdarr = [command];
128132
if (args) {
@@ -219,7 +223,20 @@ export function execute(command: string, args?: string[], outputConsumer?: Outpu
219223
child.stdout?.setEncoding(options.encoding);
220224
}
221225

222-
const encoding = options.outputEncoding && iconv.encodingExists(options.outputEncoding) ? options.outputEncoding : 'utf8';
226+
// Encoding applied whenever UTF-8 detection is disabled or rejects a chunk.
// An unset `outputEncoding`, or one iconv does not know, degrades to UTF-8.
const fallbackEncoding = options.outputEncoding && iconv.encodingExists(options.outputEncoding) ? options.outputEncoding : 'utf8';
// Per-chunk detection only matters when the fallback is something other than
// UTF-8 and the caller opted in (the build driver sets `useAutoEncoding` when
// `cmake.outputLogEncoding` is 'auto'). This lets tools that emit UTF-8
// (e.g., MSVC with /utf-8) display correctly on systems whose default code
// page is not UTF-8 (e.g., GBK on Chinese Windows).
const useAutoEncoding = fallbackEncoding !== 'utf8' && options.useAutoEncoding === true;
/**
 * Decode one raw output chunk to text.
 *
 * With auto-detection enabled, a chunk made up entirely of valid UTF-8 is
 * decoded as UTF-8; any other chunk is decoded with `fallbackEncoding`.
 * Without auto-detection, `fallbackEncoding` is always used.
 *
 * NOTE(review): each chunk is decoded on its own, with no state carried over.
 * `isValidUtf8` accepts a chunk that ends in the middle of a multi-byte
 * character, but the following chunk then begins with continuation bytes,
 * fails validation, and is decoded with the fallback encoding. A character
 * split across two `data` events can therefore presumably still come out
 * garbled; confirm, and consider a per-stream streaming decoder.
 */
const decodeData = (data: Uint8Array): string => {
    const bytes = Buffer.from(data);
    const chunkEncoding = useAutoEncoding && isValidUtf8(bytes) ? 'utf8' : fallbackEncoding;
    return iconv.decode(bytes, chunkEncoding);
};
223240
const accumulate = (str1: string, str2: string) => {
224241
try {
225242
return str1 + str2;
@@ -259,7 +276,7 @@ export function execute(command: string, args?: string[], outputConsumer?: Outpu
259276
});
260277
child?.stdout?.on('data', (data: Uint8Array) => {
261278
rollbar.invoke(localize('processing.data.event.stdout', 'Processing {0} event from proc stdout', "\"data\""), { data, command, args }, () => {
262-
const str = iconv.decode(Buffer.from(data), encoding);
279+
const str = decodeData(data);
263280
const lines = str.split('\n').map(l => l.endsWith('\r') ? l.substr(0, l.length - 1) : l);
264281
while (lines.length > 1) {
265282
line_acc = accumulate(line_acc, lines[0]);
@@ -279,7 +296,7 @@ export function execute(command: string, args?: string[], outputConsumer?: Outpu
279296
});
280297
child?.stderr?.on('data', (data: Uint8Array) => {
281298
rollbar.invoke(localize('processing.data.event.stderr', 'Processing {0} event from proc stderr', "\"data\""), { data, command, args }, () => {
282-
const str = iconv.decode(Buffer.from(data), encoding);
299+
const str = decodeData(data);
283300
const lines = str.split('\n').map(l => l.endsWith('\r') ? l.substr(0, l.length - 1) : l);
284301
while (lines.length > 1) {
285302
stderr_line_acc = accumulate(stderr_line_acc, lines[0]);
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import { expect } from 'chai';
2+
import { isValidUtf8 } from '@cmt/encodingUtils';
3+
4+
/**
 * Unit tests for `isValidUtf8`.
 *
 * Covers well-formed input of every sequence length, code-page (GBK) output
 * that must be rejected, each class of malformed sequence (stray continuation,
 * overlong forms, surrogates, out-of-range code points, bad continuation
 * bytes), and the rule that a sequence truncated by the end of the buffer is
 * still considered valid.
 */
suite('isValidUtf8', () => {
    // ---- Well-formed input ----

    test('ASCII-only content is valid UTF-8', () => {
        expect(isValidUtf8(Buffer.from('Hello, World!', 'ascii'))).to.be.true;
    });

    test('Empty buffer is valid UTF-8', () => {
        expect(isValidUtf8(Buffer.alloc(0))).to.be.true;
    });

    test('Valid UTF-8 with Chinese characters', () => {
        // "无法打开包括文件" in UTF-8
        const utf8Buf = Buffer.from('无法打开包括文件', 'utf8');
        expect(isValidUtf8(utf8Buf)).to.be.true;
    });

    test('Valid UTF-8 with mixed ASCII and multibyte', () => {
        // MSVC error message: "fatal error C1083: 无法打开包括文件"
        const utf8Buf = Buffer.from('fatal error C1083: 无法打开包括文件: "a.h": No such file', 'utf8');
        expect(isValidUtf8(utf8Buf)).to.be.true;
    });

    test('Valid UTF-8 with 2-byte sequences (Latin)', () => {
        // "café" contains é (U+00E9) → 0xC3 0xA9
        const utf8Buf = Buffer.from('café', 'utf8');
        expect(isValidUtf8(utf8Buf)).to.be.true;
    });

    test('Valid UTF-8 with 4-byte sequences (emoji)', () => {
        const utf8Buf = Buffer.from('Hello 🌍', 'utf8');
        expect(isValidUtf8(utf8Buf)).to.be.true;
    });

    test('Valid: smallest and largest encodings next to each rejected range', () => {
        // U+0800: lowest code point that legitimately needs 3 bytes
        expect(isValidUtf8(Buffer.from([0xE0, 0xA0, 0x80]))).to.be.true;
        // U+D7FF and U+E000: immediately below and above the surrogate range
        expect(isValidUtf8(Buffer.from([0xED, 0x9F, 0xBF]))).to.be.true;
        expect(isValidUtf8(Buffer.from([0xEE, 0x80, 0x80]))).to.be.true;
        // U+10000: lowest code point that legitimately needs 4 bytes
        expect(isValidUtf8(Buffer.from([0xF0, 0x90, 0x80, 0x80]))).to.be.true;
        // U+10FFFF: highest code point
        expect(isValidUtf8(Buffer.from([0xF4, 0x8F, 0xBF, 0xBF]))).to.be.true;
    });

    // ---- Code-page output that must not be mistaken for UTF-8 ----

    test('GBK-encoded Chinese is NOT valid UTF-8', () => {
        // "无法打开" in GBK encoding: each Chinese character is 2 bytes in GBK
        // GBK bytes for "无法" are: 0xCE 0xDE 0xB7 0xA8
        // 0xCE 0xDE: 0xCE is a valid UTF-8 leading byte (2-byte), but 0xDE > 0xBF so invalid continuation
        const gbkBuf = Buffer.from([0xCE, 0xDE, 0xB7, 0xA8]);
        expect(isValidUtf8(gbkBuf)).to.be.false;
    });

    test('GBK-encoded MSVC error message is NOT valid UTF-8', () => {
        // Simulate GBK output from cl.exe without /utf-8
        // "无法打开包括文件" in GBK
        const iconv = require('iconv-lite');
        const gbkBuf: Buffer = iconv.encode('无法打开包括文件', 'gbk');
        expect(isValidUtf8(gbkBuf)).to.be.false;
    });

    // ---- Malformed sequences ----

    test('Invalid: bare continuation byte', () => {
        expect(isValidUtf8(Buffer.from([0x80]))).to.be.false;
    });

    test('Invalid: overlong 2-byte sequence (0xC0 0x80)', () => {
        expect(isValidUtf8(Buffer.from([0xC0, 0x80]))).to.be.false;
    });

    test('Invalid: overlong 2-byte sequence (0xC1 0xBF)', () => {
        expect(isValidUtf8(Buffer.from([0xC1, 0xBF]))).to.be.false;
    });

    test('Invalid: overlong 3-byte sequence (0xE0 0x80 0x80)', () => {
        expect(isValidUtf8(Buffer.from([0xE0, 0x80, 0x80]))).to.be.false;
    });

    test('Invalid: surrogate pair (0xED 0xA0 0x80 = U+D800)', () => {
        expect(isValidUtf8(Buffer.from([0xED, 0xA0, 0x80]))).to.be.false;
    });

    test('Invalid: overlong 4-byte sequence (0xF0 0x80 0x80 0x80)', () => {
        expect(isValidUtf8(Buffer.from([0xF0, 0x80, 0x80, 0x80]))).to.be.false;
    });

    test('Invalid: code point above U+10FFFF (0xF4 0x90 0x80 0x80)', () => {
        expect(isValidUtf8(Buffer.from([0xF4, 0x90, 0x80, 0x80]))).to.be.false;
    });

    test('Invalid: leading byte 0xF5', () => {
        expect(isValidUtf8(Buffer.from([0xF5, 0x80, 0x80, 0x80]))).to.be.false;
    });

    test('Invalid: byte 0xFF', () => {
        expect(isValidUtf8(Buffer.from([0xFF]))).to.be.false;
    });

    test('Invalid: byte 0xFE', () => {
        expect(isValidUtf8(Buffer.from([0xFE]))).to.be.false;
    });

    // ---- Sequences truncated by the end of the buffer ----

    test('Incomplete 2-byte sequence at end is valid (boundary split)', () => {
        // 0xC3 is a valid 2-byte leading byte; incomplete at end
        expect(isValidUtf8(Buffer.from([0x41, 0xC3]))).to.be.true;
    });

    test('Incomplete 3-byte sequence at end is valid (boundary split)', () => {
        // 0xE4 0xB8 is start of a 3-byte CJK character
        expect(isValidUtf8(Buffer.from([0x41, 0xE4, 0xB8]))).to.be.true;
    });

    test('Incomplete 4-byte sequence at end is valid (boundary split)', () => {
        // 0xF0 0x9F is start of an emoji 4-byte sequence
        expect(isValidUtf8(Buffer.from([0x41, 0xF0, 0x9F]))).to.be.true;
    });

    test('Incomplete 4-byte sequence with three bytes present is valid (boundary split)', () => {
        // 0xF0 0x9F 0x8C is an emoji missing only its final byte
        expect(isValidUtf8(Buffer.from([0x41, 0xF0, 0x9F, 0x8C]))).to.be.true;
    });

    test('Incomplete sequence at end with an illegal second byte is NOT valid', () => {
        // Truncation tolerance must not bypass the overlong / surrogate checks
        expect(isValidUtf8(Buffer.from([0x41, 0xE0, 0x9F]))).to.be.false;
        expect(isValidUtf8(Buffer.from([0x41, 0xED, 0xA0]))).to.be.false;
    });

    test('Incomplete sequence in the middle of the buffer is NOT valid', () => {
        // 0xE4 0xB8 followed by ASCII: the sequence is cut short by data, not by the buffer end
        expect(isValidUtf8(Buffer.from([0xE4, 0xB8, 0x41]))).to.be.false;
    });

    // ---- Bad continuation bytes ----

    test('Invalid continuation in 2-byte sequence', () => {
        expect(isValidUtf8(Buffer.from([0xC3, 0x00]))).to.be.false;
    });

    test('Invalid continuation in 3-byte sequence', () => {
        expect(isValidUtf8(Buffer.from([0xE4, 0xB8, 0x00]))).to.be.false;
    });

    test('Invalid continuation in 4-byte sequence', () => {
        // Bad third byte, then bad fourth byte
        expect(isValidUtf8(Buffer.from([0xF0, 0x9F, 0x00, 0x8D]))).to.be.false;
        expect(isValidUtf8(Buffer.from([0xF0, 0x9F, 0x8C, 0x00]))).to.be.false;
    });
});

0 commit comments

Comments
 (0)