diff --git a/scripts/runner.node.mjs b/scripts/runner.node.mjs index c223ca3d2c4..6f750f09220 100755 --- a/scripts/runner.node.mjs +++ b/scripts/runner.node.mjs @@ -1773,9 +1773,12 @@ async function getVendorTests(cwd) { } if (typeof skipTests === "object") { + // `readdirSync` yields platform-native separators on Windows + // (`\`). Normalize to `/` so vendor.json keys are portable. + const normalized = path.replace(/\\/g, "/"); for (const [glob, reason] of Object.entries(skipTests)) { const pattern = new RegExp(`^${glob.replace(/\*/g, ".*")}$`); - if (pattern.test(path) && reason) { + if (pattern.test(normalized) && reason) { return false; } } diff --git a/src/bun.js/VirtualMachine.zig b/src/bun.js/VirtualMachine.zig index ee3cd23d644..93225f9aaee 100644 --- a/src/bun.js/VirtualMachine.zig +++ b/src/bun.js/VirtualMachine.zig @@ -346,6 +346,12 @@ pub fn mimeType(this: *VirtualMachine, str: []const u8) ?bun.http.MimeType { return this.rareData().mimeTypeFromString(this.allocator, str); } +/// Interning lookup for Blob/File `type` that preserves the raw MIME +/// string (no charset substitution). See `rare_data.mimeTypeInternedValue`. +pub fn mimeTypeInternedValue(this: *VirtualMachine, str: []const u8) ?[]const u8 { + return this.rareData().mimeTypeInternedValue(this.allocator, str); +} + pub fn onAfterEventLoop(this: *VirtualMachine) void { if (this.after_event_loop_callback) |cb| { const ctx = this.after_event_loop_callback_ctx; diff --git a/src/bun.js/rare_data.zig b/src/bun.js/rare_data.zig index 21d0fb96749..0887597dccf 100644 --- a/src/bun.js/rare_data.zig +++ b/src/bun.js/rare_data.zig @@ -319,6 +319,30 @@ pub fn mimeTypeFromString(this: *RareData, allocator: std.mem.Allocator, str: [] return null; } +/// Look up a MIME type string in the interned table and return its raw +/// (uncanonicalized) static-string slice, if it exists. +/// +/// Unlike `mimeTypeFromString`, this does NOT substitute canonical +/// charset-appended forms (e.g. `text/plain` is returned as-is, not as +/// `text/plain;charset=utf-8`). Use this where the WHATWG File/Blob API +/// requires preserving the user-supplied MIME type verbatim. +/// +/// Returns a slice into a static `_bytes` blob — safe to store without +/// allocation tracking. +pub fn mimeTypeInternedValue(this: *RareData, allocator: std.mem.Allocator, str: []const u8) ?[]const u8 { + if (this.mime_types == null) { + this.mime_types = bun.http.MimeType.createHashTable( + allocator, + ) catch |err| bun.handleOom(err); + } + + if (this.mime_types.?.get(str)) |entry| { + return entry.slice(); + } + + return null; +} + pub const HotMap = struct { _map: bun.StringArrayHashMap(Entry), diff --git a/src/bun.js/webcore/Blob.zig b/src/bun.js/webcore/Blob.zig index 8f7bebce421..d0ec61b7a07 100644 --- a/src/bun.js/webcore/Blob.zig +++ b/src/bun.js/webcore/Blob.zig @@ -1829,8 +1829,18 @@ pub fn JSDOMFile__construct_(globalThis: *jsc.JSGlobalObject, callframe: *jsc.Ca } blob.content_type_was_set = true; - if (globalThis.bunVM().mimeType(slice)) |mime| { - blob.content_type = mime.value; + // WHATWG File API: the stored `type` must be the + // lowercased input verbatim — do NOT canonicalize into + // charset-appended forms like `text/plain;charset=utf-8`. + // + // `blob` may have come from `get()` → `dupe()`, which + // shallow-copies a parent's `content_type_allocated=true` + // flag along with an aliased pointer. Reset the flag + // before overwriting so we don't mark a static slice + // (or a future copyLowercase buffer) with stale state. + blob.content_type_allocated = false; + if (globalThis.bunVM().mimeTypeInternedValue(slice)) |interned| { + blob.content_type = interned; break :inner; } const content_type_buf = bun.handleOom(allocator.alloc(u8, slice.len)); @@ -1929,8 +1939,15 @@ pub fn constructBunFile( break :inner; } blob.content_type_was_set = true; - if (vm.mimeType(str.slice())) |entry| { - blob.content_type = entry.value; + // WHATWG File API: preserve the lowercased input + // verbatim; do not canonicalize to charset-appended forms. + // + // `findOrCreateFileFromPath` can return a duped blob + // (standalone module graph path), which shallow-copies + // `content_type_allocated=true`. Reset before overwrite. + blob.content_type_allocated = false; + if (vm.mimeTypeInternedValue(slice)) |interned| { + blob.content_type = interned; break :inner; } const content_type_buf = bun.handleOom(allocator.alloc(u8, slice.len)); @@ -2323,11 +2340,14 @@ pub fn doWrite(this: *Blob, globalThis: *jsc.JSGlobalObject, callframe: *jsc.Cal if (strings.isAllASCII(slice)) { if (this.content_type_allocated) { bun.default_allocator.free(this.content_type); + this.content_type_allocated = false; } this.content_type_was_set = true; - if (globalThis.bunVM().mimeType(slice)) |mime| { - this.content_type = mime.value; + // WHATWG File API: preserve the lowercased input + // verbatim; do not canonicalize to charset-appended forms. + if (globalThis.bunVM().mimeTypeInternedValue(slice)) |interned| { + this.content_type = interned; } else { const content_type_buf = bun.handleOom(bun.default_allocator.alloc(u8, slice.len)); this.content_type = strings.copyLowercase(slice, content_type_buf); @@ -2666,11 +2686,14 @@ pub fn getWriter( if (strings.isAllASCII(slice)) { if (this.content_type_allocated) { bun.default_allocator.free(this.content_type); + this.content_type_allocated = false; } this.content_type_was_set = true; - if (globalThis.bunVM().mimeType(slice)) |mime| { - this.content_type = mime.value; + // WHATWG File API: preserve the lowercased input + // verbatim; do not canonicalize to charset-appended forms. + if (globalThis.bunVM().mimeTypeInternedValue(slice)) |interned| { + this.content_type = interned; } else { const content_type_buf = bun.handleOom(bun.default_allocator.alloc(u8, slice.len)); this.content_type = strings.copyLowercase(slice, content_type_buf); @@ -2843,7 +2866,13 @@ pub fn getSliceFrom(this: *Blob, globalThis: *jsc.JSGlobalObject, relativeStart: blob.content_type = content_type; } blob.content_type_allocated = content_type_was_allocated; - blob.content_type_was_set = this.content_type_was_set or content_type_was_allocated; + // The slice's `content_type_was_set` is true if the parent already + // had one set *or* the caller of `.slice(start, end, type)` passed a + // non-empty `type` argument. Checking the argument slice directly + // (rather than the old `content_type_was_allocated` flag) is required + // for the interned path, which points at a static slice and leaves + // `content_type_was_allocated` false. + blob.content_type_was_set = this.content_type_was_set or content_type.len > 0; var blob_ = Blob.new(blob); return blob_.toJS(globalThis); @@ -2926,8 +2955,10 @@ pub fn getSlice( break :inner; } - if (globalThis.bunVM().mimeType(slice)) |mime| { - content_type = mime.value; + // WHATWG File API: preserve the lowercased input + // verbatim; do not canonicalize to charset-appended forms. + if (globalThis.bunVM().mimeTypeInternedValue(slice)) |interned| { + content_type = interned; break :inner; } @@ -3354,8 +3385,16 @@ pub fn constructor(globalThis: *jsc.JSGlobalObject, callframe: *jsc.CallFrame) b } blob.content_type_was_set = true; - if (globalThis.bunVM().mimeType(slice)) |mime| { - blob.content_type = mime.value; + // WHATWG File API: preserve the lowercased input + // verbatim; do not canonicalize to charset-appended forms. + // + // `blob` came from `get()` which may shallow-copy + // a parent's `content_type_allocated=true` via + // `dupe()`. Reset before overwrite so we do not + // mark a static slice as owned. + blob.content_type_allocated = false; + if (globalThis.bunVM().mimeTypeInternedValue(slice)) |interned| { + blob.content_type = interned; break :inner; } const content_type_buf = bun.handleOom(allocator.alloc(u8, slice.len)); @@ -3520,8 +3559,18 @@ pub fn dupeWithContentType(this: *const Blob, include_content_type: bool) Blob { if (this.store != null) this.store.?.ref(); var duped = this.*; duped.setNotHeapAllocated(); + // NOTE: both branches below are currently unreachable — `setNotHeapAllocated` + // above zeroes the ref count so `duped.isHeapAllocated()` is always false. + // That means neither the use-after-free workaround (first branch) nor the + // `content_type` duplication for `include_content_type=true` (second branch) + // ever runs at runtime. Left in place because fixing both guards (e.g. by + // checking `this.isHeapAllocated()` instead) would activate previously-dead + // behavior and is out of scope for the WHATWG-compliance fix; it needs + // its own testing and is tracked as a separate follow-up. If/when the + // guards are revived, `jsc.VirtualMachine.get().mimeType(duped.content_type)` + // below must be swapped for `mimeTypeInternedValue` or it will silently + // re-introduce the charset canonicalization this PR removes elsewhere. if (duped.content_type_allocated and duped.isHeapAllocated() and !include_content_type) { - // for now, we just want to avoid a use-after-free here if (jsc.VirtualMachine.get().mimeType(duped.content_type)) |mime| { duped.content_type = mime.value; diff --git a/src/bun.js/webcore/S3File.zig b/src/bun.js/webcore/S3File.zig index 41e900fb987..d8a7a72f4c4 100644 --- a/src/bun.js/webcore/S3File.zig +++ b/src/bun.js/webcore/S3File.zig @@ -283,8 +283,10 @@ pub fn constructS3FileWithS3CredentialsAndOptions( break :inner; } blob.content_type_was_set = true; - if (globalObject.bunVM().mimeType(str.slice())) |entry| { - blob.content_type = entry.value; + // WHATWG File API: preserve the lowercased input + // verbatim; do not canonicalize to charset-appended forms. + if (globalObject.bunVM().mimeTypeInternedValue(slice)) |interned| { + blob.content_type = interned; break :inner; } const content_type_buf = bun.handleOom(allocator.alloc(u8, slice.len)); @@ -327,8 +329,10 @@ pub fn constructS3FileWithS3Credentials( break :inner; } blob.content_type_was_set = true; - if (globalObject.bunVM().mimeType(str.slice())) |entry| { - blob.content_type = entry.value; + // WHATWG File API: preserve the lowercased input + // verbatim; do not canonicalize to charset-appended forms. + if (globalObject.bunVM().mimeTypeInternedValue(slice)) |interned| { + blob.content_type = interned; break :inner; } const content_type_buf = bun.handleOom(allocator.alloc(u8, slice.len)); diff --git a/test/js/bun/s3/s3.test.ts b/test/js/bun/s3/s3.test.ts index 6ede69dddc6..ac932eb4d5e 100644 --- a/test/js/bun/s3/s3.test.ts +++ b/test/js/bun/s3/s3.test.ts @@ -170,8 +170,10 @@ describe.concurrent.skipIf(!r2Credentials.endpoint && !isCI)("Virtual Hosted-Sty type: "text/plain", }) .slice(10); + // Per WHATWG File API the user-supplied `text/plain` is preserved + // verbatim — no charset canonicalization. See #29257. expect(Bun.inspect(file)).toBe( - 'S3Ref ("bucket/filename.txt") {\n type: "text/plain;charset=utf-8",\n offset: 10,\n endpoint: "bucket.test.r2.cloudflarestorage.com",\n region: "auto",\n accessKeyId: "[REDACTED]",\n secretAccessKey: "[REDACTED]",\n partSize: 5242880,\n queueSize: 5,\n retry: 3\n}', + 'S3Ref ("bucket/filename.txt") {\n type: "text/plain",\n offset: 10,\n endpoint: "bucket.test.r2.cloudflarestorage.com",\n region: "auto",\n accessKeyId: "[REDACTED]",\n secretAccessKey: "[REDACTED]",\n partSize: 5242880,\n queueSize: 5,\n retry: 3\n}', ); } }); diff --git a/test/js/bun/util/inspect.test.js b/test/js/bun/util/inspect.test.js index 02b3be838c0..2bfdd6aa6fe 100644 --- a/test/js/bun/util/inspect.test.js +++ b/test/js/bun/util/inspect.test.js @@ -587,24 +587,26 @@ describe("console.logging class displays names and extends", async () => { }); it("console.log on a Blob shows name", () => { + // Per WHATWG File API, `text/plain` must NOT be canonicalized to + // `text/plain;charset=utf-8` — see #29257. const blob = new Blob(["foo"], { type: "text/plain" }); - expect(Bun.inspect(blob)).toBe('Blob (3 bytes) {\n type: "text/plain;charset=utf-8"\n}'); + expect(Bun.inspect(blob)).toBe('Blob (3 bytes) {\n type: "text/plain"\n}'); blob.name = "bar"; - expect(Bun.inspect(blob)).toBe('Blob (3 bytes) {\n name: "bar",\n type: "text/plain;charset=utf-8"\n}'); + expect(Bun.inspect(blob)).toBe('Blob (3 bytes) {\n name: "bar",\n type: "text/plain"\n}'); blob.name = "foobar"; - expect(Bun.inspect(blob)).toBe('Blob (3 bytes) {\n name: "foobar",\n type: "text/plain;charset=utf-8"\n}'); + expect(Bun.inspect(blob)).toBe('Blob (3 bytes) {\n name: "foobar",\n type: "text/plain"\n}'); const file = new File(["foo"], "bar.txt", { type: "text/plain" }); expect(Bun.inspect(file)).toBe( - `File (3 bytes) {\n name: "bar.txt",\n type: "text/plain;charset=utf-8",\n lastModified: ${file.lastModified}\n}`, + `File (3 bytes) {\n name: "bar.txt",\n type: "text/plain",\n lastModified: ${file.lastModified}\n}`, ); file.name = "foobar"; expect(Bun.inspect(file)).toBe( - `File (3 bytes) {\n name: "foobar",\n type: "text/plain;charset=utf-8",\n lastModified: ${file.lastModified}\n}`, + `File (3 bytes) {\n name: "foobar",\n type: "text/plain",\n lastModified: ${file.lastModified}\n}`, ); file.name = ""; expect(Bun.inspect(file)).toBe( - `File (3 bytes) {\n name: "",\n type: "text/plain;charset=utf-8",\n lastModified: ${file.lastModified}\n}`, + `File (3 bytes) {\n name: "",\n type: "text/plain",\n lastModified: ${file.lastModified}\n}`, ); }); diff --git a/test/js/web/structured-clone-blob-file.test.ts b/test/js/web/structured-clone-blob-file.test.ts index ac3507e7fa4..b449a47167d 100644 --- a/test/js/web/structured-clone-blob-file.test.ts +++ b/test/js/web/structured-clone-blob-file.test.ts @@ -15,7 +15,9 @@ describe("structuredClone with Blob and File", () => { const cloned = structuredClone(blob); expect(cloned).toBeInstanceOf(Blob); expect(cloned.size).toBe(11); - expect(cloned.type).toBe("text/plain;charset=utf-8"); + // Per WHATWG File API, `text/plain` must NOT be canonicalized to + // `text/plain;charset=utf-8` — see #29257. + expect(cloned.type).toBe("text/plain"); const originalText = await blob.text(); const clonedText = await cloned.text(); @@ -63,11 +65,11 @@ describe("structuredClone with Blob and File", () => { expect(cloned.first).toBeInstanceOf(Blob); expect(cloned.first.size).toBe(5); - expect(cloned.first.type).toBe("text/plain;charset=utf-8"); + expect(cloned.first.type).toBe("text/plain"); expect(cloned.second).toBeInstanceOf(Blob); expect(cloned.second.size).toBe(5); - expect(cloned.second.type).toBe("text/html;charset=utf-8"); + expect(cloned.second.type).toBe("text/html"); }); test("deeply nested Blob", () => { @@ -92,7 +94,7 @@ describe("structuredClone with Blob and File", () => { expect(cloned).toBeInstanceOf(File); expect(cloned.name).toBe("test.txt"); expect(cloned.size).toBe(7); - expect(cloned.type).toBe("text/plain;charset=utf-8"); + expect(cloned.type).toBe("text/plain"); expect(cloned.lastModified).toBe(1234567890000); }); @@ -103,7 +105,7 @@ describe("structuredClone with Blob and File", () => { expect(cloned).toBeInstanceOf(File); expect(cloned.name).toBe("test.txt"); expect(cloned.size).toBe(7); - expect(cloned.type).toBe("text/plain;charset=utf-8"); + expect(cloned.type).toBe("text/plain"); expect(cloned.lastModified).toBeGreaterThan(0); }); @@ -125,7 +127,7 @@ describe("structuredClone with Blob and File", () => { expect(cloned.file).toBeInstanceOf(File); expect(cloned.file.name).toBe("test.txt"); expect(cloned.file.size).toBe(4); - expect(cloned.file.type).toBe("text/plain;charset=utf-8"); + expect(cloned.file.type).toBe("text/plain"); }); test("multiple Files in object", () => { @@ -136,11 +138,11 @@ describe("structuredClone with Blob and File", () => { expect(cloned.txt).toBeInstanceOf(File); expect(cloned.txt.name).toBe("hello.txt"); - expect(cloned.txt.type).toBe("text/plain;charset=utf-8"); + expect(cloned.txt.type).toBe("text/plain"); expect(cloned.html).toBeInstanceOf(File); expect(cloned.html.name).toBe("world.html"); - expect(cloned.html.type).toBe("text/html;charset=utf-8"); + expect(cloned.html.type).toBe("text/html"); }); }); @@ -153,12 +155,12 @@ describe("structuredClone with Blob and File", () => { expect(cloned.blob).toBeInstanceOf(Blob); expect(cloned.blob.size).toBe(12); - expect(cloned.blob.type).toBe("text/plain;charset=utf-8"); + expect(cloned.blob.type).toBe("text/plain"); expect(cloned.file).toBeInstanceOf(File); expect(cloned.file.name).toBe("test.txt"); expect(cloned.file.size).toBe(12); - expect(cloned.file.type).toBe("text/plain;charset=utf-8"); + expect(cloned.file.type).toBe("text/plain"); }); test("array with mixed Blob and File", () => { diff --git a/test/regression/issue/29257.test.ts b/test/regression/issue/29257.test.ts new file mode 100644 index 00000000000..bba0ded221f --- /dev/null +++ b/test/regression/issue/29257.test.ts @@ -0,0 +1,89 @@ +import { expect, test } from "bun:test"; + +// https://github.com/oven-sh/bun/issues/29257 +// +// Bun was rewriting `text/plain` (and `text/css`, `text/html`, +// `application/json`, ...) to their charset-appended canonical forms +// (`text/plain;charset=utf-8`, etc.) when the user set the `type` on a +// Blob/File at construction time. +// +// Per the WHATWG File API (https://w3c.github.io/FileAPI/#blob), user +// agents must NOT append a charset parameter to the media type. + +test("new File(..., { type: 'text/plain' }).type is preserved verbatim", () => { + const file = new File([], "empty.txt", { type: "text/plain" }); + expect(file.type).toBe("text/plain"); +}); + +test("new Blob([], { type: 'text/plain' }).type is preserved verbatim", () => { + const blob = new Blob([], { type: "text/plain" }); + expect(blob.type).toBe("text/plain"); +}); + +test("File/Blob type is preserved for other types Bun used to canonicalize", () => { + // These are the types Compact.toMimeType() substitutes into + // charset-appended forms for HTTP responses. None of them should leak + // the substitution into the File/Blob `type` property. + const types = [ + "text/plain", + "text/css", + "text/html", + "text/javascript", + "application/json", + "application/javascript", + ]; + for (const type of types) { + expect(new File([], "x", { type }).type).toBe(type); + expect(new Blob([], { type }).type).toBe(type); + } +}); + +test("File/Blob type with explicit charset is preserved verbatim", () => { + // A user who explicitly passes a charset parameter should get it back + // unchanged — not silently swapped for a different canonical form. + const file = new File([], "x.txt", { type: "text/plain;charset=utf-8" }); + expect(file.type).toBe("text/plain;charset=utf-8"); + + const blob = new Blob([], { type: "text/plain;charset=utf-8" }); + expect(blob.type).toBe("text/plain;charset=utf-8"); +}); + +test("File/Blob type is lowercased (per WHATWG spec)", () => { + // The spec requires lowercasing but not charset canonicalization. + expect(new File([], "x", { type: "TEXT/PLAIN" }).type).toBe("text/plain"); + expect(new Blob([], { type: "Text/Plain" }).type).toBe("text/plain"); +}); + +test("uncommon MIME types still round-trip unchanged", () => { + // Types not in the interning table take the copyLowercase path. They + // should also round-trip verbatim (lowercased) — check both the File + // and Blob constructor paths since they share logic but are separate + // call sites in src/bun.js/webcore/Blob.zig. + const file = new File([], "x", { type: "application/x-custom-type" }); + expect(file.type).toBe("application/x-custom-type"); + const blob = new Blob([], { type: "application/x-custom-type" }); + expect(blob.type).toBe("application/x-custom-type"); +}); + +test("Bun.file(path, { type: 'text/plain' }).type is preserved verbatim", () => { + // Covers the `constructBunFile` path in Blob.zig. + const file = Bun.file(import.meta.path, { type: "text/plain" }); + expect(file.type).toBe("text/plain"); +}); + +test("Blob.prototype.slice(start, end, type) preserves the type verbatim", () => { + // Covers the `getSlice` / `getSliceFrom` path in Blob.zig. This hits + // the interning fast-path and also exercises the fix to the + // `content_type_was_set` flag computation in `getSliceFrom`. + const parent = new Blob(["hello world"]); + const slice = parent.slice(0, 5, "text/plain"); + expect(slice.type).toBe("text/plain"); +}); + +test("Bun.s3.file(path, { type: 'text/plain' }).type is preserved verbatim", () => { + // Covers the S3File constructor paths in S3File.zig (same bug, different + // file). The object is never actually touched over the network — we only + // check that the `type` field is set from our argument verbatim. + const file = Bun.s3.file("test.txt", { type: "text/plain" }); + expect(file.type).toBe("text/plain"); +}); diff --git a/test/vendor.json b/test/vendor.json index 05ca430f3ae..e5084ca9177 100644 --- a/test/vendor.json +++ b/test/vendor.json @@ -2,6 +2,9 @@ { "package": "elysia", "repository": "https://github.com/elysiajs/elysia", - "tag": "1.4.12" + "tag": "1.4.12", + "skipTests": { + "path/path.test.ts": "asserts pre-WHATWG File.type canonicalization (text/plain;charset=utf-8); see oven-sh/bun#29257" + } } ]