Skip to content

Commit a78274e

Browse files
committed
fix: use super.setEncoding() to properly initialize StringDecoder for multi-byte UTF-8
The custom setEncoding() in BodyReadable only set _readableState.encoding without initializing a StringDecoder instance. This caused multi-byte UTF-8 characters (e.g. CJK characters) split across chunk boundaries to be corrupted with U+FFFD replacement characters when consuming the response body via the streaming API (on("data") / for-await-of).

Node.js Readable.prototype.setEncoding() properly creates a StringDecoder that buffers incomplete multi-byte sequences across chunks. By delegating to super.setEncoding(), we get this behavior for free.

Fixes: #5002
1 parent a434502 commit a78274e

File tree

2 files changed

+144
-4
lines changed

2 files changed

+144
-4
lines changed

lib/api/readable.js

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -323,10 +323,9 @@ class BodyReadable extends Readable {
323323
* @returns {this}
324324
*/
325325
setEncoding (encoding) {
326-
if (Buffer.isEncoding(encoding)) {
327-
this._readableState.encoding = encoding
328-
}
329-
return this
326+
// Delegate to the parent class which properly initializes StringDecoder
327+
// to handle incomplete multi-byte UTF-8 sequences across chunk boundaries.
328+
return super.setEncoding(encoding)
330329
}
331330
}
332331

test/issue-5002.js

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
'use strict'
2+
3+
const { tspl } = require('@matteo.collina/tspl')
4+
const { test, after } = require('node:test')
5+
const { Client } = require('..')
6+
const { createServer } = require('node:http')
7+
8+
// Regression test for https://github.com/nodejs/undici/issues/5002
9+
// setEncoding('utf8') corrupts multi-byte UTF-8 characters at chunk boundaries
10+
11+
test('setEncoding("utf8") handles 3-byte UTF-8 chars split across chunks (streaming)', async (t) => {
12+
t = tspl(t, { plan: 1 })
13+
14+
// U+4E16 is encoded as 3 bytes: 0xE4 0xB8 0x96
15+
// U+754C is encoded as 3 bytes: 0xE7 0x95 0x8C
16+
// We deliberately split the buffer in the middle of a 3-byte sequence
17+
const text = '世界世界世界世界'
18+
const buf = Buffer.from(text, 'utf8')
19+
20+
// Split at byte offset 2, which is in the middle of the first 3-byte char
21+
const chunk1 = buf.subarray(0, 2)
22+
const chunk2 = buf.subarray(2)
23+
24+
const server = createServer({ joinDuplicateHeaders: true }, (req, res) => {
25+
// Send the first chunk (incomplete UTF-8 sequence)
26+
res.write(chunk1)
27+
// Send the rest after a small delay to ensure separate chunks
28+
setTimeout(() => {
29+
res.write(chunk2)
30+
res.end()
31+
}, 50)
32+
})
33+
after(server.close.bind(server))
34+
35+
server.listen(0, async () => {
36+
const client = new Client(`http://localhost:${server.address().port}`)
37+
after(client.destroy.bind(client))
38+
39+
const { body } = await client.request({
40+
path: '/',
41+
method: 'GET'
42+
})
43+
body.setEncoding('utf8')
44+
45+
let result = ''
46+
for await (const chunk of body) {
47+
result += chunk
48+
}
49+
50+
// Without the fix, this would contain U+FFFD replacement characters
51+
t.strictEqual(result, text)
52+
})
53+
54+
await t.completed
55+
})
56+
57+
test('setEncoding("utf8") handles 4-byte UTF-8 emoji split across chunks (streaming)', async (t) => {
58+
t = tspl(t, { plan: 1 })
59+
60+
// U+1F600 is encoded as 4 bytes: 0xF0 0x9F 0x98 0x80
61+
const text = '😀🎉🚀💡🌍'
62+
const buf = Buffer.from(text, 'utf8')
63+
64+
// Split at byte offset 3, leaving the last byte of the first 4-byte emoji for the next chunk
65+
const chunk1 = buf.subarray(0, 3)
66+
const chunk2 = buf.subarray(3)
67+
68+
const server = createServer({ joinDuplicateHeaders: true }, (req, res) => {
69+
res.write(chunk1)
70+
setTimeout(() => {
71+
res.write(chunk2)
72+
res.end()
73+
}, 50)
74+
})
75+
after(server.close.bind(server))
76+
77+
server.listen(0, async () => {
78+
const client = new Client(`http://localhost:${server.address().port}`)
79+
after(client.destroy.bind(client))
80+
81+
const { body } = await client.request({
82+
path: '/',
83+
method: 'GET'
84+
})
85+
body.setEncoding('utf8')
86+
87+
let result = ''
88+
for await (const chunk of body) {
89+
result += chunk
90+
}
91+
92+
t.strictEqual(result, text)
93+
})
94+
95+
await t.completed
96+
})
97+
98+
test('setEncoding("utf8") with on("data") handles split multi-byte chars', async (t) => {
99+
t = tspl(t, { plan: 1 })
100+
101+
const text = '世界'
102+
const buf = Buffer.from(text, 'utf8') // 6 bytes: E4 B8 96 E7 95 8C
103+
104+
// Split at byte offset 1, right after the first byte (0xE4) of the first character
105+
const chunk1 = buf.subarray(0, 1)
106+
const chunk2 = buf.subarray(1)
107+
108+
const server = createServer({ joinDuplicateHeaders: true }, (req, res) => {
109+
res.write(chunk1)
110+
setTimeout(() => {
111+
res.write(chunk2)
112+
res.end()
113+
}, 50)
114+
})
115+
after(server.close.bind(server))
116+
117+
server.listen(0, async () => {
118+
const client = new Client(`http://localhost:${server.address().port}`)
119+
after(client.destroy.bind(client))
120+
121+
const { body } = await client.request({
122+
path: '/',
123+
method: 'GET'
124+
})
125+
body.setEncoding('utf8')
126+
127+
await new Promise((resolve) => {
128+
let result = ''
129+
body.on('data', (chunk) => {
130+
// With setEncoding, chunks should be strings
131+
result += chunk
132+
})
133+
body.on('end', () => {
134+
t.strictEqual(result, text)
135+
resolve()
136+
})
137+
})
138+
})
139+
140+
await t.completed
141+
})

0 commit comments

Comments
 (0)