fix: use super.setEncoding() to properly initialize StringDecoder for multi-byte UTF-8

joecwu · joecwu · commit a78274ebb3d4 · 2026-04-09T11:55:09.000+08:00
The custom setEncoding() in BodyReadable only set _readableState.encoding without initializing a StringDecoder instance. This caused multi-byte UTF-8 characters (e.g. CJK characters) split across chunk boundaries to be corrupted with U+FFFD replacement characters when consuming the response body via the streaming API (on("data") / for-await-of). Node.js Readable.prototype.setEncoding() properly creates a StringDecoder that buffers incomplete multi-byte sequences across chunks. By delegating to super.setEncoding(), we get this behavior for free. Fixes: #5002
diff --git a/lib/api/readable.js b/lib/api/readable.js
@@ -323,10 +323,9 @@ class BodyReadable extends Readable {
    * @returns {this}
    */
   setEncoding (encoding) {
-    if (Buffer.isEncoding(encoding)) {
-      this._readableState.encoding = encoding
-    }
-    return this
+    // Delegate to the parent class, which sets up a StringDecoder so that multi-byte
+    // sequences split across chunks decode intact. NOTE(review): the Buffer.isEncoding() guard is gone - confirm intended.
+    return super.setEncoding(encoding)
   }
 }
 
diff --git a/test/issue-5002.js b/test/issue-5002.js
@@ -0,0 +1,141 @@
+'use strict'
+
+const { tspl } = require('@matteo.collina/tspl')
+const { test, after } = require('node:test')
+const { Client } = require('..')
+const { createServer } = require('node:http')
+
+// Regression test for https://github.com/nodejs/undici/issues/5002
+// setEncoding('utf8') corrupts multi-byte UTF-8 characters at chunk boundaries
+
+test('setEncoding("utf8") handles 3-byte UTF-8 chars split across chunks (streaming)', async (t) => {
+  t = tspl(t, { plan: 1 })
+
+  // U+4E16 is encoded as 3 bytes: 0xE4 0xB8 0x96
+  // U+754C is encoded as 3 bytes: 0xE7 0x95 0x8C
+  // The payload repeats these two characters; it is cut inside the first 3-byte sequence
+  const text = '世界世界世界世界'
+  const buf = Buffer.from(text, 'utf8')
+
+  // Split at byte offset 2: chunk1 holds only the first two bytes (E4 B8) of the first char
+  const chunk1 = buf.subarray(0, 2)
+  const chunk2 = buf.subarray(2)
+
+  const server = createServer({ joinDuplicateHeaders: true }, (req, res) => {
+    // Send the first chunk, which ends with an incomplete UTF-8 sequence
+    res.write(chunk1)
+    // Delay the remainder by 50 ms so it is written separately from the first chunk
+    setTimeout(() => {
+      res.write(chunk2)
+      res.end()
+    }, 50)
+  })
+  after(server.close.bind(server))
+
+  server.listen(0, async () => {
+    const client = new Client(`http://localhost:${server.address().port}`)
+    after(client.destroy.bind(client))
+
+    const { body } = await client.request({
+      path: '/',
+      method: 'GET'
+    })
+    body.setEncoding('utf8')
+
+    let result = ''
+    for await (const chunk of body) {
+      result += chunk
+    }
+
+    // Without the fix, the concatenated result would contain U+FFFD replacement characters
+    t.strictEqual(result, text)
+  })
+
+  await t.completed
+})
+
+test('setEncoding("utf8") handles 4-byte UTF-8 emoji split across chunks (streaming)', async (t) => {
+  t = tspl(t, { plan: 1 })
+
+  // U+1F600 (the first emoji below) is encoded as 4 bytes: 0xF0 0x9F 0x98 0x80
+  const text = '😀🎉🚀💡🌍'
+  const buf = Buffer.from(text, 'utf8')
+
+  // Split at byte offset 3: chunk1 holds the first three of that emoji's four bytes (F0 9F 98)
+  const chunk1 = buf.subarray(0, 3)
+  const chunk2 = buf.subarray(3)
+
+  const server = createServer({ joinDuplicateHeaders: true }, (req, res) => {
+    res.write(chunk1)
+    setTimeout(() => {
+      res.write(chunk2)
+      res.end()
+    }, 50)
+  })
+  after(server.close.bind(server))
+
+  server.listen(0, async () => {
+    const client = new Client(`http://localhost:${server.address().port}`)
+    after(client.destroy.bind(client))
+
+    const { body } = await client.request({
+      path: '/',
+      method: 'GET'
+    })
+    body.setEncoding('utf8')
+
+    let result = ''
+    for await (const chunk of body) {
+      result += chunk
+    }
+
+    t.strictEqual(result, text)
+  })
+
+  await t.completed
+})
+
+test('setEncoding("utf8") with on("data") handles split multi-byte chars', async (t) => {
+  t = tspl(t, { plan: 1 })
+
+  const text = '世界'
+  const buf = Buffer.from(text, 'utf8') // 6 bytes: E4 B8 96 E7 95 8C
+
+  // Split at byte offset 1: chunk1 holds only the lead byte (E4) of the first character
+  const chunk1 = buf.subarray(0, 1)
+  const chunk2 = buf.subarray(1)
+
+  const server = createServer({ joinDuplicateHeaders: true }, (req, res) => {
+    res.write(chunk1)
+    setTimeout(() => {
+      res.write(chunk2)
+      res.end()
+    }, 50)
+  })
+  after(server.close.bind(server))
+
+  server.listen(0, async () => {
+    const client = new Client(`http://localhost:${server.address().port}`)
+    after(client.destroy.bind(client))
+
+    const { body } = await client.request({
+      path: '/',
+      method: 'GET'
+    })
+    body.setEncoding('utf8')
+
+    await new Promise((resolve) => {
+      let result = ''
+      body.on('data', (chunk) => {
+        // After setEncoding, chunks are expected to be strings, so += concatenates text
+        result += chunk
+      })
+      body.on('end', () => {
+        t.strictEqual(result, text)
+        resolve()
+      })
+    })
+  })
+
+  await t.completed
+})

Original file line number	Diff line number	Diff line change
`@@ -323,10 +323,9 @@ class BodyReadable extends Readable {`
`323`	`323`	`* @returns {this}`
`324`	`324`	`*/`
`325`	`325`	`setEncoding (encoding) {`
`326`		`- if (Buffer.isEncoding(encoding)) {`
`327`		`- this._readableState.encoding = encoding`
`328`		`- }`
`329`		`- return this`
	`326`	`+ // Delegate to the parent class which properly initializes StringDecoder`
	`327`	`+ // to handle incomplete multi-byte UTF-8 sequences across chunk boundaries.`
	`328`	`+ return super.setEncoding(encoding)`
`330`	`329`	`}`
`331`	`330`	`}`
`332`	`331`