diff --git a/core/src/bitmap/bitmap_data.rs b/core/src/bitmap/bitmap_data.rs
index 29eb34fdc16c..bf959a952025 100644
--- a/core/src/bitmap/bitmap_data.rs
+++ b/core/src/bitmap/bitmap_data.rs
@@ -4,10 +4,12 @@ use crate::display_object::{BoundsMode, DisplayObject, DisplayObjectWeak, TDispl
 use bitflags::bitflags;
 use gc_arena::lock::GcRefLock;
 use gc_arena::{Collect, Gc, Mutation};
-use ruffle_render::backend::RenderBackend;
+use ruffle_render::backend::{RenderBackend, RenderOffscreenBatches};
 use ruffle_render::bitmap::{
     Bitmap, BitmapFormat, BitmapHandle, PixelRegion, PixelSnapping, SyncHandle,
 };
+use ruffle_render::commands::CommandList;
+use ruffle_render::quality::StageQuality;
 use ruffle_wstr::WStr;
 use std::cell::Ref;
 use std::fmt::Debug;
@@ -344,6 +346,42 @@ impl<'gc> BitmapData<'gc> {
         self.0.render(smoothing, context, pixel_snapping);
     }
 
+    pub fn append_gpu_commands(
+        &self,
+        gc: &Mutation<'gc>,
+        renderer: &mut dyn RenderBackend,
+        commands: CommandList,
+        dirty: PixelRegion,
+        quality: StageQuality,
+    ) {
+        self.0
+            .append_gpu_commands(gc, renderer, commands, dirty, quality);
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    pub fn append_render_bitmap(
+        &self,
+        gc: &Mutation<'gc>,
+        renderer: &mut dyn RenderBackend,
+        bitmap: BitmapHandle,
+        transform: ruffle_render::transform::Transform,
+        smoothing: bool,
+        pixel_snapping: PixelSnapping,
+        dirty: PixelRegion,
+        quality: StageQuality,
+    ) {
+        self.0.append_render_bitmap(
+            gc,
+            renderer,
+            bitmap,
+            transform,
+            smoothing,
+            pixel_snapping,
+            dirty,
+            quality,
+        );
+    }
+
     pub fn can_read(&self, read_area: PixelRegion) -> bool {
         self.0.can_read(read_area)
     }
@@ -362,6 +400,16 @@ impl<'gc> BitmapData<'gc> {
     }
 }
 
+/// A sub-batch of queued GPU commands together with the dirty region those
+/// commands cover. Sub-batches within a pending sequence become separate
+/// render passes at flush time.
+type PendingBatch = (CommandList, PixelRegion);
+
+/// The state of `BitmapRawData::pending_gpu_commands`: the list of queued
+/// sub-batches, the union of all their dirty regions, and the stage quality
+/// all sub-batches share.
+type PendingGpuCommands = (Vec<PendingBatch>, PixelRegion, StageQuality);
+
 #[derive(Collect)]
 #[collect(no_drop)]
 pub struct BitmapRawData<'gc> {
@@ -397,6 +445,15 @@ pub struct BitmapRawData<'gc> {
     #[collect(require_static)]
     dirty_state: DirtyState,
 
+    /// Accumulated GPU commands awaiting batch execution. Stored as a
+    /// sequence of sub-batches, each paired with its own dirty region, plus
+    /// the union dirty region for the whole sequence and the shared render
+    /// quality. Each sub-batch becomes a separate render pass at flush time,
+    /// letting callers insert resolve boundaries between overlapping draws
+    /// under MSAA while non-overlapping draws merge into one pass.
+    #[collect(require_static)]
+    pending_gpu_commands: Option<PendingGpuCommands>,
+
     /// Holds an egui texture handle, used for rendering this Bitmap in the debug ui.
     /// This is automatically set to `None` when the texture is updated (either from
    /// marking the CPU side dirty, or from performing a GPU -> CPU sync).
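Aside: a minimal standalone sketch of the sub-batch merge rule that the field's doc comment above describes, with hypothetical `Region` and `Cmds` stand-ins for `PixelRegion` and `CommandList`. Illustrative only, not part of the patch:

    #[derive(Clone, Copy)]
    struct Region {
        x0: u32,
        y0: u32,
        x1: u32,
        y1: u32,
    }

    impl Region {
        fn intersects(self, o: Region) -> bool {
            self.x0 < o.x1 && o.x0 < self.x1 && self.y0 < o.y1 && o.y0 < self.y1
        }
        fn union(&mut self, o: Region) {
            self.x0 = self.x0.min(o.x0);
            self.y0 = self.y0.min(o.y0);
            self.x1 = self.x1.max(o.x1);
            self.y1 = self.y1.max(o.y1);
        }
    }

    /// Stand-in for `CommandList`: an opaque list of draw commands.
    type Cmds = Vec<&'static str>;

    /// Mirror of the append logic: start a new sub-batch (a new render pass
    /// at flush time) only when MSAA is on and the draw overlaps the last
    /// sub-batch; otherwise merge into the last sub-batch in place.
    fn append(batches: &mut Vec<(Cmds, Region)>, cmds: Cmds, dirty: Region, msaa: bool) {
        let needs_boundary =
            msaa && batches.last().is_some_and(|(_, d)| dirty.intersects(*d));
        match batches.last_mut() {
            Some(last) if !needs_boundary => {
                last.0.extend(cmds);
                last.1.union(dirty);
            }
            _ => batches.push((cmds, dirty)),
        }
    }

Under MSAA, a draw overlapping the last sub-batch starts a new sub-batch, which becomes a new render pass with a resolve between the two; every other draw keeps extending the last sub-batch in place.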
@@ -423,9 +480,10 @@ mod wrapper {
     use gc_arena::barrier::Write;
     use gc_arena::lock::GcRefLock;
     use gc_arena::{Collect, Gc, Mutation};
-    use ruffle_render::backend::RenderBackend;
+    use ruffle_render::backend::{RenderBackend, RenderOffscreenBatches};
     use ruffle_render::bitmap::{BitmapHandle, PixelRegion, PixelSnapping};
-    use ruffle_render::commands::CommandHandler;
+    use ruffle_render::commands::{CommandHandler, CommandList};
+    use ruffle_render::quality::StageQuality;
     use std::cell::Ref;
 
     use super::{BitmapRawData, DirtyState, copy_pixels_to_bitmapdata};
@@ -484,6 +542,7 @@
                 avm2_object: None,
                 display_objects: vec![],
                 dirty_state: DirtyState::Clean,
+                pending_gpu_commands: None,
                 #[cfg(feature = "egui")]
                 egui_texture: Default::default(),
             }
@@ -508,6 +567,7 @@
                 display_objects: vec![],
                 // We have no GPU texture, so there's no need to mark as dirty
                 dirty_state: DirtyState::Clean,
+                pending_gpu_commands: None,
                 #[cfg(feature = "egui")]
                 egui_texture: Default::default(),
             }
@@ -523,6 +583,21 @@
             let mut write = unsafe { Write::assume(Gc::as_ref(self.0)) }
                 .unlock()
                 .borrow_mut();
+
+            // Flush any pending GPU commands before syncing - the flush
+            // leaves `dirty_state` as GpuModified, with a sync handle
+            // covering the union of all pending writes.
+            if let Some((batches, dirty, quality)) = write.pending_gpu_commands.take() {
+                let handle = write.bitmap_handle(renderer);
+                let command_lists: RenderOffscreenBatches =
+                    batches.into_iter().map(|(cmds, _)| cmds).collect();
+                if let Some(sync_handle) =
+                    renderer.render_offscreen(handle, command_lists, quality, dirty)
+                {
+                    write.dirty_state = DirtyState::GpuModified(sync_handle, dirty);
+                }
+            }
+
             match std::mem::replace(&mut write.dirty_state, DirtyState::Clean) {
                 DirtyState::GpuModified(sync_handle, bounds) => {
                     renderer
@@ -554,10 +629,121 @@
             renderer: &mut dyn RenderBackend,
         ) -> BitmapHandle {
             let mut bitmap_data = self.0.borrow_mut(gc_context);
+            bitmap_data.flush_pending_gpu_commands(renderer, gc_context);
             bitmap_data.update_dirty_texture(renderer);
             bitmap_data.bitmap_handle(renderer)
         }
 
+        /// Append a `CommandList` to the deferred GPU batch for this bitmap.
+        /// The batch is flushed lazily on the next `bitmap_handle`/`sync` call
+        /// so repeated draws onto the same bitmap submit as one `render_offscreen`
+        /// invocation instead of one per call.
+        pub fn append_gpu_commands(
+            &self,
+            gc: &Mutation<'gc>,
+            renderer: &mut dyn RenderBackend,
+            commands: CommandList,
+            dirty: PixelRegion,
+            quality: StageQuality,
+        ) {
+            let mut data = self.0.borrow_mut(gc);
+            data.update_dirty_texture(renderer);
+
+            // Quality must stay constant inside one pending batch - each
+            // render pass in the sequence shares the same MSAA sample count.
+            if let Some((_, _, existing_quality)) = &data.pending_gpu_commands
+                && *existing_quality != quality
+            {
+                data.flush_pending_gpu_commands(renderer, gc);
+            }
+
+            match &mut data.pending_gpu_commands {
+                Some((batches, union_dirty, _)) => {
+                    // For MSAA, insert a resolve boundary between draws that
+                    // overlap, so the next draw samples the already-resolved
+                    // 8-bit pixels. Only check against the last sub-batch's
+                    // dirty region: a draw that overlaps only earlier,
+                    // already-resolved sub-batches can still merge into the
+                    // current pass.
+                    let needs_resolve_boundary = quality.sample_count() > 1
+                        && batches.last().is_some_and(|(_, d)| dirty.intersects(*d));
+                    if needs_resolve_boundary || batches.is_empty() {
+                        batches.push((commands, dirty));
+                    } else {
+                        let last = batches.last_mut().expect("batches is non-empty");
+                        last.0.commands.extend(commands.commands);
+                        last.1.union(dirty);
+                    }
+                    union_dirty.union(dirty);
+                }
+                None => {
+                    let mut merged = dirty;
+                    if let DirtyState::GpuModified(_, old) = &data.dirty_state {
+                        merged.union(*old);
+                    }
+                    data.pending_gpu_commands = Some((vec![(commands, dirty)], merged, quality));
+                }
+            }
+        }
+
+        /// Append one `render_bitmap` command to the deferred batch. When
+        /// the command can merge into the current sub-batch, no new
+        /// `CommandList` is allocated - the command is pushed onto the
+        /// existing list in place.
+        #[allow(clippy::too_many_arguments)]
+        pub fn append_render_bitmap(
+            &self,
+            gc: &Mutation<'gc>,
+            renderer: &mut dyn RenderBackend,
+            bitmap: BitmapHandle,
+            transform: ruffle_render::transform::Transform,
+            smoothing: bool,
+            pixel_snapping: PixelSnapping,
+            dirty: PixelRegion,
+            quality: StageQuality,
+        ) {
+            let mut data = self.0.borrow_mut(gc);
+            data.update_dirty_texture(renderer);
+
+            if let Some((_, _, existing_quality)) = &data.pending_gpu_commands
+                && *existing_quality != quality
+            {
+                data.flush_pending_gpu_commands(renderer, gc);
+            }
+
+            let command = ruffle_render::commands::Command::RenderBitmap {
+                bitmap,
+                transform,
+                smoothing,
+                pixel_snapping,
+            };
+
+            match &mut data.pending_gpu_commands {
+                Some((batches, union_dirty, _)) => {
+                    let needs_resolve_boundary = quality.sample_count() > 1
+                        && batches.last().is_some_and(|(_, d)| dirty.intersects(*d));
+                    if needs_resolve_boundary || batches.is_empty() {
+                        let mut list = CommandList::new();
+                        list.commands.push(command);
+                        batches.push((list, dirty));
+                    } else {
+                        let last = batches.last_mut().expect("batches is non-empty");
+                        last.0.commands.push(command);
+                        last.1.union(dirty);
+                    }
+                    union_dirty.union(dirty);
+                }
+                None => {
+                    let mut merged = dirty;
+                    if let DirtyState::GpuModified(_, old) = &data.dirty_state {
+                        merged.union(*old);
+                    }
+                    let mut list = CommandList::new();
+                    list.commands.push(command);
+                    data.pending_gpu_commands = Some((vec![(list, dirty)], merged, quality));
+                }
+            }
+        }
+
         /// Provides access to the underlying `BitmapData`.
         /// This should only be used when you will be overwriting the entire
         /// `pixels` vec without reading from it. Cancels any in-progress GPU -> CPU sync.
@@ -567,6 +753,12 @@ mod wrapper {
             mc: &Mutation<'gc>,
         ) -> (GcRefLock<'gc, BitmapRawData<'gc>>, Option<PixelRegion>) {
             let mut write = self.0.borrow_mut(mc);
+            // Caller is about to fully overwrite the CPU pixels; any pending
+            // GPU work would land on a texture that's about to be replaced
+            // by the upcoming CPU->GPU sync, so drop it now to keep the
+            // invariant that `pending_gpu_commands.is_some()` only holds
+            // when `dirty_state` is Clean or GpuModified, never CpuModified.
+            write.discard_pending_gpu_commands();
             let dirty_rect = match write.dirty_state {
                 DirtyState::GpuModified(_, rect) => {
                     write.dirty_state = DirtyState::Clean;
@@ -587,13 +779,19 @@ mod wrapper {
             read_area: PixelRegion,
             renderer: &mut dyn RenderBackend,
         ) -> Ref<'_, BitmapRawData<'gc>> {
-            let needs_update = if let DirtyState::GpuModified(_, area) = self.0.borrow().dirty_state
-            {
+            let data = self.0.borrow();
+            // Pending GPU commands haven't run yet, so the CPU pixels are
+            // stale even if `dirty_state` reads Clean. Force a sync so the
+            // queued work is rendered and resolved back to the CPU before
+            // the caller observes the pixels.
+            let needs_flush = data.pending_gpu_commands.is_some();
+            let needs_sync = if let DirtyState::GpuModified(_, area) = data.dirty_state {
                 area.intersects(read_area)
             } else {
                 false
             };
-            if needs_update {
+            drop(data);
+            if needs_flush || needs_sync {
                 self.sync(renderer);
             }
             self.0.borrow()
         }
@@ -661,11 +859,13 @@ mod wrapper {
             context: &mut RenderContext<'_, 'gc>,
             pixel_snapping: PixelSnapping,
         ) {
-            let mut inner_bitmap_data = self.0.borrow_mut(context.gc());
+            let gc = context.gc();
+            let mut inner_bitmap_data = self.0.borrow_mut(gc);
             if inner_bitmap_data.disposed() {
                 return;
             }
 
+            inner_bitmap_data.flush_pending_gpu_commands(context.renderer, gc);
             // Note - we do a CPU -> GPU sync, but we do *not* do a GPU -> CPU sync
             // (rendering is done on the GPU, so the CPU pixels don't need to be up-to-date).
             inner_bitmap_data.update_dirty_texture(context.renderer);
@@ -680,7 +880,14 @@ mod wrapper {
         }
 
         pub fn can_read(&self, read_area: PixelRegion) -> bool {
-            if let DirtyState::GpuModified(_, area) = self.0.borrow().dirty_state {
+            let data = self.0.borrow();
+            // Pending GPU commands mean the CPU pixels are stale: even if
+            // `dirty_state` reads Clean, a flush would update the GPU texture
+            // and mark GpuModified across a region that may cover `read_area`.
+            if data.pending_gpu_commands.is_some() {
+                return false;
+            }
+            if let DirtyState::GpuModified(_, area) = data.dirty_state {
                 !area.intersects(read_area)
             } else {
                 true
@@ -742,6 +949,7 @@ impl<'gc> BitmapRawData<'gc> {
             avm2_object: None,
             display_objects: vec![],
             dirty_state: DirtyState::Clean,
+            pending_gpu_commands: None,
             #[cfg(feature = "egui")]
             egui_texture: Default::default(),
         }
@@ -762,6 +970,7 @@
             avm2_object: None,
             disposed: false,
             dirty_state: DirtyState::Clean,
+            pending_gpu_commands: None,
             display_objects: vec![],
             #[cfg(feature = "egui")]
             egui_texture: Default::default(),
         }
@@ -830,6 +1039,14 @@
     pub fn set_cpu_dirty(&mut self, gc_context: &Mutation<'gc>, region: PixelRegion) {
         debug_assert!(region.x_max <= self.width);
         debug_assert!(region.y_max <= self.height);
+        // Invariant: a CPU-dirty bitmap has no queued GPU commands. Callers
+        // must clear pending first - either by flushing (via sync/read_area)
+        // or by discarding (via overwrite_cpu_pixels_from_gpu for full
+        // overwrites) - before declaring CPU-side modifications.
+        debug_assert!(
+            self.pending_gpu_commands.is_none(),
+            "set_cpu_dirty called while GPU commands are still pending"
+        );
 
         #[cfg(feature = "egui")]
         self.egui_texture.borrow_mut().take();
@@ -926,6 +1143,33 @@
         }
     }
 
+    /// Drop any accumulated GPU commands without executing them. Used when
+    /// the caller knows the bitmap is about to be fully overwritten (e.g. a
+    /// full-texture `fillRect`) and the pending draws would be wasted work.
+    pub fn discard_pending_gpu_commands(&mut self) {
+        self.pending_gpu_commands = None;
+    }
+
+    /// Submit accumulated GPU commands as a single `render_offscreen` call,
+    /// leaving `dirty_state` marked `GpuModified` across the union of all
+    /// sub-batch dirty regions so a later CPU-side read will sync back.
+    pub fn flush_pending_gpu_commands(
+        &mut self,
+        renderer: &mut dyn RenderBackend,
+        gc: &Mutation<'gc>,
+    ) {
+        if let Some((batches, dirty, quality)) = self.pending_gpu_commands.take() {
+            let handle = self.bitmap_handle(renderer);
+            let command_lists: RenderOffscreenBatches =
+                batches.into_iter().map(|(cmds, _)| cmds).collect();
+            if let Some(sync_handle) =
+                renderer.render_offscreen(handle, command_lists, quality, dirty)
+            {
+                self.set_gpu_dirty(gc, sync_handle, dirty);
+            }
+        }
+    }
+
     fn inform_display_objects(&self, gc_context: &Mutation<'gc>) {
         for object in &self.display_objects {
             if let Some(object) = object.upgrade(gc_context) {
diff --git a/core/src/bitmap/operations.rs b/core/src/bitmap/operations.rs
index c4bbeb0ee1e6..909912700028 100644
--- a/core/src/bitmap/operations.rs
+++ b/core/src/bitmap/operations.rs
@@ -1065,6 +1065,85 @@ pub fn copy_pixels<'gc>(
         return;
     }
 
+    let needs_blend = (source_transparency && !transparency) || merge_alpha;
+    let different_bitmaps = !source_bitmap.ptr_eq(target);
+
+    // Fast path 1: plain replace (no blending) from a GPU-resident source
+    // onto a different target. Uses a raw texture-to-texture copy so we
+    // skip the blocking GPU->CPU readback that `copy_on_cpu` would do.
+    if !needs_blend
+        && different_bitmaps
+        && !source_bitmap.can_read(source_region)
+        && context.renderer.is_offscreen_supported()
+    {
+        // `bitmap_handle` flushes each side's pending batch, so our raw
+        // copy observes the right source pixels and isn't clobbered by a
+        // later `render_offscreen` into the target.
+        let source_handle = source_bitmap.bitmap_handle(context.gc(), context.renderer);
+        let target_handle = target.bitmap_handle(context.gc(), context.renderer);
+        if let Some(sync_handle) = context.renderer.copy_pixels_to_texture(
+            source_handle,
+            source_region,
+            target_handle,
+            (dest_region.x_min, dest_region.y_min),
+        ) {
+            let (target_ref, old_dirty) = target.overwrite_cpu_pixels_from_gpu(context.gc());
+            let mut write = target_ref.borrow_mut(context.gc());
+            let mut dirty = dest_region;
+            if let Some(old) = old_dirty {
+                dirty.union(old);
+            }
+            write.set_gpu_dirty(context.gc(), sync_handle, dirty);
+            return;
+        }
+    }
+
+    // Fast path 2: when the original src_rect covers the full source and
+    // the target's current pixels don't need to come back to the CPU, we
+    // can queue a `render_bitmap` into the pending GPU batch instead of
+    // reading back both sides and blending on the CPU. Using the original
+    // dest_point (before clamp) lets the GPU naturally clip pixels that
+    // fall outside the target texture.
+    let original_covers_full_source = src_min_x <= 0
+        && src_min_y <= 0
+        && src_width >= source_bitmap.width() as i32
+        && src_height >= source_bitmap.height() as i32;
+    if original_covers_full_source
+        && different_bitmaps
+        && !target.can_read(dest_region)
+        && context.renderer.is_offscreen_supported()
+    {
+        let source_handle = source_bitmap.bitmap_handle(context.gc(), context.renderer);
+        let transform = Transform {
+            matrix: Matrix {
+                a: 1.0,
+                b: 0.0,
+                c: 0.0,
+                d: 1.0,
+                tx: Twips::from_pixels(dest_min_x as f64),
+                ty: Twips::from_pixels(dest_min_y as f64),
+            },
+            color_transform: ColorTransform::default(),
+            perspective_projection: None,
+        };
+
+        // copyPixels is a plain blit - MSAA on it just wastes a 4x buffer
+        // and a resolve pass. Use `Low` so contiguous copyPixels runs skip
+        // MSAA entirely; a later vector `draw()` with higher stage quality
+        // will flush this batch and start its own MSAA pass.
+        target.append_render_bitmap(
+            context.gc(),
+            context.renderer,
+            source_handle,
+            transform,
+            false,
+            PixelSnapping::Never,
+            dest_region,
+            StageQuality::Low,
+        );
+        return;
+    }
+
     copy_on_cpu(
         context.gc(),
         context.renderer,
@@ -1072,7 +1151,7 @@ pub fn copy_pixels<'gc>(
         target,
         source_region,
         dest_region,
-        (source_transparency && !transparency) || merge_alpha,
+        needs_blend,
     );
 }
@@ -1592,8 +1671,6 @@ pub fn draw<'gc>(
         render_context.commands.pop_mask();
     }
 
-    let handle = target.bitmap_handle(render_context.gc(), render_context.renderer);
-
     let commands = if blend_mode == BlendMode::Normal {
         render_context.commands
     } else {
@@ -1605,28 +1682,25 @@ pub fn draw<'gc>(
         commands
     };
 
-    let (target, include_dirty_area) = target.overwrite_cpu_pixels_from_gpu(context.gc());
-    let mut write = target.borrow_mut(context.gc());
-    // If we have another dirty area to preserve, expand this to include it
-    if let Some(old) = include_dirty_area {
-        dirty_region.union(old);
-    }
-
     assert!(
         cache_draws.is_empty(),
         "BitmapData.draw() should not use cacheAsBitmap"
     );
-    let image = context
-        .renderer
-        .render_offscreen(handle, commands, quality, dirty_region);
-
-    match image {
-        Some(sync_handle) => {
-            write.set_gpu_dirty(context.gc(), sync_handle, dirty_region);
-            Ok(())
-        }
-        None => Err(BitmapDataDrawError::Unimplemented),
-    }
+
+    // Defer the draw into the target's pending GPU batch instead of calling
+    // `render_offscreen` synchronously. Consecutive `BitmapData.draw()` calls
+    // onto the same target then share one submission at flush time - crucial
+    // for scenes (e.g. cached backgrounds) that do tens of thousands of
+    // draws per frame and would otherwise hit the backend's per-frame draw
+    // cap with fresh command-encoder allocations.
+    target.append_gpu_commands(
+        context.gc(),
+        context.renderer,
+        commands,
+        dirty_region,
+        quality,
+    );
+    Ok(())
 }
 
 pub fn get_vector<'gc>(
diff --git a/render/canvas/src/lib.rs b/render/canvas/src/lib.rs
index 7af736176d91..20fcc3ac8c5a 100644
--- a/render/canvas/src/lib.rs
+++ b/render/canvas/src/lib.rs
@@ -4,7 +4,7 @@ use ruffle_render::backend::{
     BitmapCacheEntry, Context3D, Context3DProfile, PixelBenderOutput, PixelBenderTarget,
-    RenderBackend, ShapeHandle, ShapeHandleImpl, ViewportDimensions,
+    RenderBackend, RenderOffscreenBatches, ShapeHandle, ShapeHandleImpl, ViewportDimensions,
 };
 use ruffle_render::bitmap::{
     Bitmap, BitmapHandle, BitmapHandleImpl, BitmapSource, PixelRegion, PixelSnapping, RgbaBufRead,
@@ -509,7 +509,7 @@ impl RenderBackend for WebCanvasRenderBackend {
     fn render_offscreen(
         &mut self,
         _handle: BitmapHandle,
-        _commands: CommandList,
+        _batches: RenderOffscreenBatches,
         _quality: StageQuality,
         _bounds: PixelRegion,
     ) -> Option<Box<dyn SyncHandle>> {
diff --git a/render/src/backend.rs b/render/src/backend.rs
index 4e424b1c60f3..95bace9bbf1f 100644
--- a/render/src/backend.rs
+++ b/render/src/backend.rs
@@ -9,6 +9,13 @@ use crate::pixel_bender_support::PixelBenderShaderArgument;
 use crate::quality::StageQuality;
 use crate::shape_utils::DistilledShape;
 use ruffle_wstr::{FromWStr, WStr};
+use smallvec::SmallVec;
+
+/// A sequence of command-list sub-batches passed to `render_offscreen`.
+/// Inline size of 1 covers the single-batch call path (e.g. `BitmapData.draw()`)
+/// without a heap allocation; longer sequences from deferred batching spill to
+/// the heap just as a `Vec` would.
+pub type RenderOffscreenBatches = SmallVec<[CommandList; 1]>;
 use std::any::Any;
 use std::borrow::Cow;
 use std::cell::RefCell;
@@ -36,10 +43,16 @@ pub trait RenderBackend: Any {
         bitmap_source: &dyn BitmapSource,
     ) -> ShapeHandle;
 
+    /// Renders the given sequence of command lists onto the texture bound
+    /// to `handle` and returns a sync handle for the resulting write. Each
+    /// entry in `batches` is rendered as its own render pass; with MSAA,
+    /// this means the previous pass's resolve is visible as input to the
+    /// next. A single long batch stays in one pass, so callers that pack
+    /// non-overlapping draws into one batch skip any per-draw resolve.
     fn render_offscreen(
         &mut self,
         handle: BitmapHandle,
-        commands: CommandList,
+        batches: RenderOffscreenBatches,
         quality: StageQuality,
         bounds: PixelRegion,
     ) -> Option<Box<dyn SyncHandle>>;
@@ -70,6 +83,20 @@ pub trait RenderBackend: Any {
         false
     }
 
+    /// Copies a rectangular region from one GPU texture to another without
+    /// CPU readback or blending. Returns a `SyncHandle` the caller can use
+    /// to mark the destination `GpuModified`. Backends without GPU support
+    /// return `None` and the caller should fall back to the CPU copy path.
+    fn copy_pixels_to_texture(
+        &mut self,
+        _source: BitmapHandle,
+        _source_region: PixelRegion,
+        _destination: BitmapHandle,
+        _dest_point: (u32, u32),
+    ) -> Option<Box<dyn SyncHandle>> {
+        None
+    }
+
     fn submit_frame(
         &mut self,
         clear: swf::Color,
diff --git a/render/src/backend/null.rs b/render/src/backend/null.rs
index c8856ddb21a8..4881688a87b5 100644
--- a/render/src/backend/null.rs
+++ b/render/src/backend/null.rs
@@ -3,7 +3,8 @@ use std::num::NonZeroU32;
 use std::sync::Arc;
 
 use crate::backend::{
-    BitmapCacheEntry, RenderBackend, ShapeHandle, ShapeHandleImpl, ViewportDimensions,
+    BitmapCacheEntry, RenderBackend, RenderOffscreenBatches, ShapeHandle, ShapeHandleImpl,
+    ViewportDimensions,
 };
 use crate::bitmap::{
     Bitmap, BitmapHandle, BitmapHandleImpl, BitmapSize, BitmapSource, PixelRegion, RgbaBufRead,
@@ -66,7 +67,7 @@ impl RenderBackend for NullRenderer {
     fn render_offscreen(
         &mut self,
         _handle: BitmapHandle,
-        _commands: CommandList,
+        _batches: RenderOffscreenBatches,
         _quality: StageQuality,
         _bounds: PixelRegion,
     ) -> Option<Box<dyn SyncHandle>> {
diff --git a/render/webgl/src/lib.rs b/render/webgl/src/lib.rs
index 1bb43fa415a4..ae37562eaa28 100644
--- a/render/webgl/src/lib.rs
+++ b/render/webgl/src/lib.rs
@@ -5,7 +5,7 @@ use bytemuck::{Pod, Zeroable};
 
 use ruffle_render::backend::{
     BitmapCacheEntry, Context3D, Context3DProfile, PixelBenderOutput, PixelBenderTarget,
-    RenderBackend, ShapeHandle, ShapeHandleImpl, ViewportDimensions,
+    RenderBackend, RenderOffscreenBatches, ShapeHandle, ShapeHandleImpl, ViewportDimensions,
 };
 use ruffle_render::bitmap::{
     Bitmap, BitmapFormat, BitmapHandle, BitmapHandleImpl, BitmapSource, PixelRegion, PixelSnapping,
@@ -995,7 +995,7 @@ impl RenderBackend for WebGlRenderBackend {
     fn render_offscreen(
         &mut self,
         _handle: BitmapHandle,
-        _commands: CommandList,
+        _batches: RenderOffscreenBatches,
         _quality: StageQuality,
         _bounds: PixelRegion,
     ) -> Option<Box<dyn SyncHandle>> {
diff --git a/render/wgpu/src/backend.rs b/render/wgpu/src/backend.rs
index bd2b7a6a023d..7eeafa5dce3f 100644
--- a/render/wgpu/src/backend.rs
+++ b/render/wgpu/src/backend.rs
@@ -17,7 +17,9 @@ use image::imageops::FilterType;
 use ruffle_render::backend::{
     BitmapCacheEntry, Context3D, Context3DProfile, PixelBenderOutput, PixelBenderTarget,
 };
-use ruffle_render::backend::{RenderBackend, ShapeHandle, ViewportDimensions};
+use ruffle_render::backend::{
+    RenderBackend, RenderOffscreenBatches, ShapeHandle, ViewportDimensions,
+};
 use ruffle_render::bitmap::{
     Bitmap, BitmapFormat, BitmapHandle, BitmapSource, PixelRegion, RgbaBufRead, SyncHandle,
 };
@@ -729,10 +731,14 @@ impl<T: RenderTarget + 'static> RenderBackend for WgpuRenderBackend<T> {
     fn render_offscreen(
         &mut self,
         handle: BitmapHandle,
-        commands: CommandList,
+        batches: RenderOffscreenBatches,
         quality: StageQuality,
         bounds: PixelRegion,
     ) -> Option<Box<dyn SyncHandle>> {
+        if batches.is_empty() {
+            return None;
+        }
+
         let texture = as_texture(&handle);
 
         let extent = wgpu::Extent3d {
@@ -741,17 +747,13 @@
             depth_or_array_layers: 1,
         };
 
-        let mut target = TextureTarget {
+        let target = TextureTarget {
             size: extent,
             texture: texture.texture.clone(),
             format: wgpu::TextureFormat::Rgba8Unorm,
             buffer: None,
         };
 
-        let frame_output = target
-            .get_next_texture()
-            .expect("TextureTargetFrame.get_next_texture is infallible");
-
         let mut surface = Surface::new(
             &self.descriptors,
             quality,
@@ -759,19 +761,30 @@
             texture.texture.height(),
             wgpu::TextureFormat::Rgba8Unorm,
         );
-        surface.draw_commands_and_copy_to(
-            frame_output.view(),
-            RenderTargetMode::FreshWithTexture(target.get_texture()),
-            &self.descriptors,
-            &mut self.active_frame.staging_belt,
-            &self.dynamic_transforms,
-            &mut self.active_frame.command_encoder,
-            &self.meshes,
-            commands,
-            LayerRef::Current,
-            &mut self.offscreen_texture_pool,
-        );
+        for commands in batches {
+            // Each batch is rendered as its own pass against the running
+            // target texture - the previous pass's resolve (if any) is
+            // visible as input to this one.
+            let frame_view = target.texture.create_view(&Default::default());
+            surface.draw_commands_and_copy_to(
+                &frame_view,
+                RenderTargetMode::FreshWithTexture(target.get_texture()),
+                &self.descriptors,
+                &mut self.active_frame.staging_belt,
+                &self.dynamic_transforms,
+                &mut self.active_frame.command_encoder,
+                &self.meshes,
+                commands,
+                LayerRef::Current,
+                &mut self.offscreen_texture_pool,
+            );
+        }
+        // One flush per call covers both the single- and multi-batch paths
+        // with one encoder reset. The frame-scoped OOM valve
+        // (MAX_DRAWS_PER_FLUSH) still protects a caller that submits many
+        // render_offscreen calls without hitting any other flush point -
+        // each call still increments draws_since_flush.
         self.active_frame.maybe_flush(&self.descriptors);
 
         Some(self.make_queue_sync_handle(target, None, handle, bounds))
     }
@@ -793,6 +806,85 @@
         true
     }
 
+    fn copy_pixels_to_texture(
+        &mut self,
+        source: BitmapHandle,
+        source_region: PixelRegion,
+        destination: BitmapHandle,
+        dest_point: (u32, u32),
+    ) -> Option<Box<dyn SyncHandle>> {
+        let source_texture = as_texture(&source);
+        let dest_texture = as_texture(&destination);
+
+        let copy_width = source_region
+            .width()
+            .min(dest_texture.texture.width().saturating_sub(dest_point.0))
+            .min(
+                source_texture
+                    .texture
+                    .width()
+                    .saturating_sub(source_region.x_min),
+            );
+        let copy_height = source_region
+            .height()
+            .min(dest_texture.texture.height().saturating_sub(dest_point.1))
+            .min(
+                source_texture
+                    .texture
+                    .height()
+                    .saturating_sub(source_region.y_min),
+            );
+
+        if copy_width == 0 || copy_height == 0 {
+            return None;
+        }
+
+        let dest_region =
+            PixelRegion::for_region(dest_point.0, dest_point.1, copy_width, copy_height);
+
+        let target = TextureTarget {
+            size: wgpu::Extent3d {
+                width: dest_texture.texture.width(),
+                height: dest_texture.texture.height(),
+                depth_or_array_layers: 1,
+            },
+            texture: dest_texture.texture.clone(),
+            format: dest_texture.texture.format(),
+            buffer: None,
+        };
+
+        self.active_frame.command_encoder.copy_texture_to_texture(
+            wgpu::TexelCopyTextureInfo {
+                texture: &source_texture.texture,
+                mip_level: 0,
+                origin: wgpu::Origin3d {
+                    x: source_region.x_min,
+                    y: source_region.y_min,
+                    z: 0,
+                },
+                aspect: Default::default(),
+            },
+            wgpu::TexelCopyTextureInfo {
+                texture: &dest_texture.texture,
+                mip_level: 0,
+                origin: wgpu::Origin3d {
+                    x: dest_point.0,
+                    y: dest_point.1,
+                    z: 0,
+                },
+                aspect: Default::default(),
+            },
+            wgpu::Extent3d {
+                width: copy_width,
+                height: copy_height,
+                depth_or_array_layers: 1,
+            },
+        );
+
+        self.active_frame.maybe_flush(&self.descriptors);
+        Some(self.make_queue_sync_handle(target, None, destination, dest_region))
+    }
+
     fn apply_filter(
         &mut self,
         source: BitmapHandle,
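To make the `RenderOffscreenBatches` doc comment in render/src/backend.rs concrete, here is a small hedged sketch of the allocation behaviour it claims. `smallvec!` and `spilled()` are standard `smallvec` 1.x items and `CommandList::new()` is used by the patch itself; the snippet is illustrative, not part of the diff:

    use ruffle_render::backend::RenderOffscreenBatches;
    use ruffle_render::commands::CommandList;
    use smallvec::smallvec;

    fn main() {
        // The plain `BitmapData.draw()` path hands the backend exactly one
        // batch, which fits the inline slot: no heap allocation.
        let single: RenderOffscreenBatches = smallvec![CommandList::new()];
        assert!(!single.spilled());

        // A deferred sequence of sub-batches (resolve boundaries under MSAA)
        // spills to the heap and behaves like a plain Vec from then on.
        let multi: RenderOffscreenBatches = (0..3).map(|_| CommandList::new()).collect();
        assert!(multi.spilled());
    }

The design choice this illustrates: the common single-draw path pays nothing extra for the new batching API, while multi-batch flushes degrade gracefully to `Vec`-like storage.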