diff --git a/core/src/bitmap/bitmap_data.rs b/core/src/bitmap/bitmap_data.rs
index 29eb34fdc16c..bf959a952025 100644
--- a/core/src/bitmap/bitmap_data.rs
+++ b/core/src/bitmap/bitmap_data.rs
@@ -4,10 +4,12 @@ use crate::display_object::{BoundsMode, DisplayObject, DisplayObjectWeak, TDispl
 use bitflags::bitflags;
 use gc_arena::lock::GcRefLock;
 use gc_arena::{Collect, Gc, Mutation};
-use ruffle_render::backend::RenderBackend;
+use ruffle_render::backend::{RenderBackend, RenderOffscreenBatches};
 use ruffle_render::bitmap::{
     Bitmap, BitmapFormat, BitmapHandle, PixelRegion, PixelSnapping, SyncHandle,
 };
+use ruffle_render::commands::CommandList;
+use ruffle_render::quality::StageQuality;
 use ruffle_wstr::WStr;
 use std::cell::Ref;
 use std::fmt::Debug;
@@ -344,6 +346,42 @@ impl<'gc> BitmapData<'gc> {
         self.0.render(smoothing, context, pixel_snapping);
     }
 
+    pub fn append_gpu_commands(
+        &self,
+        gc: &Mutation<'gc>,
+        renderer: &mut dyn RenderBackend,
+        commands: CommandList,
+        dirty: PixelRegion,
+        quality: StageQuality,
+    ) {
+        self.0
+            .append_gpu_commands(gc, renderer, commands, dirty, quality);
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    pub fn append_render_bitmap(
+        &self,
+        gc: &Mutation<'gc>,
+        renderer: &mut dyn RenderBackend,
+        bitmap: BitmapHandle,
+        transform: ruffle_render::transform::Transform,
+        smoothing: bool,
+        pixel_snapping: PixelSnapping,
+        dirty: PixelRegion,
+        quality: StageQuality,
+    ) {
+        self.0.append_render_bitmap(
+            gc,
+            renderer,
+            bitmap,
+            transform,
+            smoothing,
+            pixel_snapping,
+            dirty,
+            quality,
+        );
+    }
+
     pub fn can_read(&self, read_area: PixelRegion) -> bool {
         self.0.can_read(read_area)
     }
@@ -362,6 +400,16 @@ impl<'gc> BitmapData<'gc> {
     }
 }
 
+/// A sub-batch of queued GPU commands together with the dirty region those
+/// commands cover. Sub-batches within a pending sequence become separate
+/// render passes at flush time.
+type PendingBatch = (CommandList, PixelRegion);
+
+/// The state of `BitmapRawData::pending_gpu_commands`: the list of queued
+/// sub-batches, the union of all their dirty regions, and the stage quality
+/// all sub-batches share.
+type PendingGpuCommands = (Vec<PendingBatch>, PixelRegion, StageQuality);
+
 #[derive(Collect)]
 #[collect(no_drop)]
 pub struct BitmapRawData<'gc> {
@@ -397,6 +445,15 @@ pub struct BitmapRawData<'gc> {
     #[collect(require_static)]
     dirty_state: DirtyState,
 
+    /// Accumulated GPU commands awaiting batch execution. Stored as a
+    /// sequence of sub-batches, each paired with its own dirty region, plus
+    /// the union dirty region for the whole sequence and the shared render
+    /// quality. Each sub-batch becomes a separate render pass at flush time,
+    /// letting callers insert resolve boundaries between overlapping draws
+    /// under MSAA while non-overlapping draws merge into one pass.
+    #[collect(require_static)]
+    pending_gpu_commands: Option<PendingGpuCommands>,
+
     /// Holds an egui texture handle, used for rendering this Bitmap in the debug ui.
     /// This is automatically set to `None` when the texture is updated (either from
    /// marking the CPU side dirty, or from performing a GPU -> CPU sync).
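Aside: a minimal standalone sketch of the sub-batch merge rule that the field's doc comment above describes, with hypothetical `Region` and `Cmds` stand-ins for `PixelRegion` and `CommandList`. Illustrative only, not part of the patch:

    #[derive(Clone, Copy)]
    struct Region {
        x0: u32,
        y0: u32,
        x1: u32,
        y1: u32,
    }

    impl Region {
        fn intersects(self, o: Region) -> bool {
            self.x0 < o.x1 && o.x0 < self.x1 && self.y0 < o.y1 && o.y0 < self.y1
        }
        fn union(&mut self, o: Region) {
            self.x0 = self.x0.min(o.x0);
            self.y0 = self.y0.min(o.y0);
            self.x1 = self.x1.max(o.x1);
            self.y1 = self.y1.max(o.y1);
        }
    }

    /// Stand-in for `CommandList`: an opaque list of draw commands.
    type Cmds = Vec<&'static str>;

    /// Mirror of the append logic: start a new sub-batch (a new render pass
    /// at flush time) only when MSAA is on and the draw overlaps the last
    /// sub-batch; otherwise merge into the last sub-batch in place.
    fn append(batches: &mut Vec<(Cmds, Region)>, cmds: Cmds, dirty: Region, msaa: bool) {
        let needs_boundary =
            msaa && batches.last().is_some_and(|(_, d)| dirty.intersects(*d));
        match batches.last_mut() {
            Some(last) if !needs_boundary => {
                last.0.extend(cmds);
                last.1.union(dirty);
            }
            _ => batches.push((cmds, dirty)),
        }
    }

Under MSAA, a draw overlapping the last sub-batch starts a new sub-batch, which becomes a new render pass with a resolve between the two; every other draw keeps extending the last sub-batch in place.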
@@ -423,9 +480,10 @@ mod wrapper {
     use gc_arena::barrier::Write;
     use gc_arena::lock::GcRefLock;
     use gc_arena::{Collect, Gc, Mutation};
-    use ruffle_render::backend::RenderBackend;
+    use ruffle_render::backend::{RenderBackend, RenderOffscreenBatches};
     use ruffle_render::bitmap::{BitmapHandle, PixelRegion, PixelSnapping};
-    use ruffle_render::commands::CommandHandler;
+    use ruffle_render::commands::{CommandHandler, CommandList};
+    use ruffle_render::quality::StageQuality;
     use std::cell::Ref;
 
     use super::{BitmapRawData, DirtyState, copy_pixels_to_bitmapdata};
@@ -484,6 +542,7 @@
                 avm2_object: None,
                 display_objects: vec![],
                 dirty_state: DirtyState::Clean,
+                pending_gpu_commands: None,
                 #[cfg(feature = "egui")]
                 egui_texture: Default::default(),
             }
@@ -508,6 +567,7 @@
                 display_objects: vec![],
                 // We have no GPU texture, so there's no need to mark as dirty
                 dirty_state: DirtyState::Clean,
+                pending_gpu_commands: None,
                 #[cfg(feature = "egui")]
                 egui_texture: Default::default(),
             }
@@ -523,6 +583,21 @@
             let mut write = unsafe { Write::assume(Gc::as_ref(self.0)) }
                 .unlock()
                 .borrow_mut();
+
+            // Flush any pending GPU commands before syncing - the flush
+            // leaves `dirty_state` as GpuModified, with a sync handle
+            // covering the union of all pending writes.
+            if let Some((batches, dirty, quality)) = write.pending_gpu_commands.take() {
+                let handle = write.bitmap_handle(renderer);
+                let command_lists: RenderOffscreenBatches =
+                    batches.into_iter().map(|(cmds, _)| cmds).collect();
+                if let Some(sync_handle) =
+                    renderer.render_offscreen(handle, command_lists, quality, dirty)
+                {
+                    write.dirty_state = DirtyState::GpuModified(sync_handle, dirty);
+                }
+            }
+
             match std::mem::replace(&mut write.dirty_state, DirtyState::Clean) {
                 DirtyState::GpuModified(sync_handle, bounds) => {
                     renderer
@@ -554,10 +629,121 @@
             renderer: &mut dyn RenderBackend,
         ) -> BitmapHandle {
             let mut bitmap_data = self.0.borrow_mut(gc_context);
+            bitmap_data.flush_pending_gpu_commands(renderer, gc_context);
             bitmap_data.update_dirty_texture(renderer);
             bitmap_data.bitmap_handle(renderer)
         }
 
+        /// Append a `CommandList` to the deferred GPU batch for this bitmap.
+        /// The batch is flushed lazily on the next `bitmap_handle`/`sync` call
+        /// so repeated draws onto the same bitmap submit as one `render_offscreen`
+        /// invocation instead of one per call.
+        pub fn append_gpu_commands(
+            &self,
+            gc: &Mutation<'gc>,
+            renderer: &mut dyn RenderBackend,
+            commands: CommandList,
+            dirty: PixelRegion,
+            quality: StageQuality,
+        ) {
+            let mut data = self.0.borrow_mut(gc);
+            data.update_dirty_texture(renderer);
+
+            // Quality must stay constant inside one pending batch - each
+            // render pass in the sequence shares the same MSAA sample count.
+            if let Some((_, _, existing_quality)) = &data.pending_gpu_commands
+                && *existing_quality != quality
+            {
+                data.flush_pending_gpu_commands(renderer, gc);
+            }
+
+            match &mut data.pending_gpu_commands {
+                Some((batches, union_dirty, _)) => {
+                    // For MSAA, insert a resolve boundary between draws that
+                    // overlap, so the next draw samples the already-resolved
+                    // 8-bit pixels. Only check against the last sub-batch's
+                    // dirty region: a draw that overlaps only earlier,
+                    // already-resolved sub-batches can still merge into the
+                    // current pass.
+                    let needs_resolve_boundary = quality.sample_count() > 1
+                        && batches.last().is_some_and(|(_, d)| dirty.intersects(*d));
+                    if needs_resolve_boundary || batches.is_empty() {
+                        batches.push((commands, dirty));
+                    } else {
+                        let last = batches.last_mut().expect("batches is non-empty");
+                        last.0.commands.extend(commands.commands);
+                        last.1.union(dirty);
+                    }
+                    union_dirty.union(dirty);
+                }
+                None => {
+                    let mut merged = dirty;
+                    if let DirtyState::GpuModified(_, old) = &data.dirty_state {
+                        merged.union(*old);
+                    }
+                    data.pending_gpu_commands = Some((vec![(commands, dirty)], merged, quality));
+                }
+            }
+        }
+
+        /// Append one `render_bitmap` command to the deferred batch. When
+        /// the command can merge into the current sub-batch, no new
+        /// `CommandList` is allocated - the command is pushed onto the
+        /// existing list in place.
+        #[allow(clippy::too_many_arguments)]
+        pub fn append_render_bitmap(
+            &self,
+            gc: &Mutation<'gc>,
+            renderer: &mut dyn RenderBackend,
+            bitmap: BitmapHandle,
+            transform: ruffle_render::transform::Transform,
+            smoothing: bool,
+            pixel_snapping: PixelSnapping,
+            dirty: PixelRegion,
+            quality: StageQuality,
+        ) {
+            let mut data = self.0.borrow_mut(gc);
+            data.update_dirty_texture(renderer);
+
+            if let Some((_, _, existing_quality)) = &data.pending_gpu_commands
+                && *existing_quality != quality
+            {
+                data.flush_pending_gpu_commands(renderer, gc);
+            }
+
+            let command = ruffle_render::commands::Command::RenderBitmap {
+                bitmap,
+                transform,
+                smoothing,
+                pixel_snapping,
+            };
+
+            match &mut data.pending_gpu_commands {
+                Some((batches, union_dirty, _)) => {
+                    let needs_resolve_boundary = quality.sample_count() > 1
+                        && batches.last().is_some_and(|(_, d)| dirty.intersects(*d));
+                    if needs_resolve_boundary || batches.is_empty() {
+                        let mut list = CommandList::new();
+                        list.commands.push(command);
+                        batches.push((list, dirty));
+                    } else {
+                        let last = batches.last_mut().expect("batches is non-empty");
+                        last.0.commands.push(command);
+                        last.1.union(dirty);
+                    }
+                    union_dirty.union(dirty);
+                }
+                None => {
+                    let mut merged = dirty;
+                    if let DirtyState::GpuModified(_, old) = &data.dirty_state {
+                        merged.union(*old);
+                    }
+                    let mut list = CommandList::new();
+                    list.commands.push(command);
+                    data.pending_gpu_commands = Some((vec![(list, dirty)], merged, quality));
+                }
+            }
+        }
+
         /// Provides access to the underlying `BitmapData`.
         /// This should only be used when you will be overwriting the entire
         /// `pixels` vec without reading from it. Cancels any in-progress GPU -> CPU sync.
@@ -567,6 +753,12 @@ mod wrapper {
             mc: &Mutation<'gc>,
         ) -> (GcRefLock<'gc, BitmapRawData<'gc>>, Option<PixelRegion>) {
             let mut write = self.0.borrow_mut(mc);
+            // Caller is about to fully overwrite the CPU pixels; any pending
+            // GPU work would land on a texture that's about to be replaced
+            // by the upcoming CPU->GPU sync, so drop it now to keep the
+            // invariant that `pending_gpu_commands.is_some()` only holds
+            // when `dirty_state` is Clean or GpuModified, never CpuModified.
+            write.discard_pending_gpu_commands();
             let dirty_rect = match write.dirty_state {
                 DirtyState::GpuModified(_, rect) => {
                     write.dirty_state = DirtyState::Clean;
@@ -587,13 +779,19 @@ mod wrapper {
             read_area: PixelRegion,
             renderer: &mut dyn RenderBackend,
         ) -> Ref<'_, BitmapRawData<'gc>> {
-            let needs_update = if let DirtyState::GpuModified(_, area) = self.0.borrow().dirty_state
-            {
+            let data = self.0.borrow();
+            // Pending GPU commands haven't run yet, so the CPU pixels are
+            // stale even if `dirty_state` reads Clean. Force a sync so the
+            // queued work is rendered and resolved back to the CPU before
+            // the caller observes the pixels.
+            let needs_flush = data.pending_gpu_commands.is_some();
+            let needs_sync = if let DirtyState::GpuModified(_, area) = data.dirty_state {
                 area.intersects(read_area)
             } else {
                 false
             };
-            if needs_update {
+            drop(data);
+            if needs_flush || needs_sync {
                 self.sync(renderer);
             }
             self.0.borrow()
         }
@@ -661,11 +859,13 @@ mod wrapper {
             context: &mut RenderContext<'_, 'gc>,
             pixel_snapping: PixelSnapping,
         ) {
-            let mut inner_bitmap_data = self.0.borrow_mut(context.gc());
+            let gc = context.gc();
+            let mut inner_bitmap_data = self.0.borrow_mut(gc);
             if inner_bitmap_data.disposed() {
                 return;
             }
 
+            inner_bitmap_data.flush_pending_gpu_commands(context.renderer, gc);
             // Note - we do a CPU -> GPU sync, but we do *not* do a GPU -> CPU sync
             // (rendering is done on the GPU, so the CPU pixels don't need to be up-to-date).
             inner_bitmap_data.update_dirty_texture(context.renderer);
@@ -680,7 +880,14 @@ mod wrapper {
         }
 
         pub fn can_read(&self, read_area: PixelRegion) -> bool {
-            if let DirtyState::GpuModified(_, area) = self.0.borrow().dirty_state {
+            let data = self.0.borrow();
+            // Pending GPU commands mean the CPU pixels are stale: even if
+            // `dirty_state` reads Clean, a flush would update the GPU texture
+            // and mark GpuModified across a region that may cover `read_area`.
+            if data.pending_gpu_commands.is_some() {
+                return false;
+            }
+            if let DirtyState::GpuModified(_, area) = data.dirty_state {
                 !area.intersects(read_area)
             } else {
                 true
@@ -742,6 +949,7 @@ impl<'gc> BitmapRawData<'gc> {
             avm2_object: None,
             display_objects: vec![],
             dirty_state: DirtyState::Clean,
+            pending_gpu_commands: None,
             #[cfg(feature = "egui")]
             egui_texture: Default::default(),
         }
@@ -762,6 +970,7 @@
             avm2_object: None,
             disposed: false,
             dirty_state: DirtyState::Clean,
+            pending_gpu_commands: None,
             display_objects: vec![],
             #[cfg(feature = "egui")]
             egui_texture: Default::default(),
         }
@@ -830,6 +1039,14 @@
     pub fn set_cpu_dirty(&mut self, gc_context: &Mutation<'gc>, region: PixelRegion) {
         debug_assert!(region.x_max <= self.width);
         debug_assert!(region.y_max <= self.height);
+        // Invariant: a CPU-dirty bitmap has no queued GPU commands. Callers
+        // must clear pending first - either by flushing (via sync/read_area)
+        // or by discarding (via overwrite_cpu_pixels_from_gpu for full
+        // overwrites) - before declaring CPU-side modifications.
+        debug_assert!(
+            self.pending_gpu_commands.is_none(),
+            "set_cpu_dirty called while GPU commands are still pending"
+        );
 
         #[cfg(feature = "egui")]
         self.egui_texture.borrow_mut().take();
@@ -926,6 +1143,33 @@
         }
     }
 
+    /// Drop any accumulated GPU commands without executing them. Used when
+    /// the caller knows the bitmap is about to be fully overwritten (e.g. a
+    /// full-texture `fillRect`) and the pending draws would be wasted work.
+    pub fn discard_pending_gpu_commands(&mut self) {
+        self.pending_gpu_commands = None;
+    }
+
+    /// Submit accumulated GPU commands as a single `render_offscreen` call,
+    /// leaving `dirty_state` marked `GpuModified` across the union of all
+    /// sub-batch dirty regions so a later CPU-side read will sync back.
+    pub fn flush_pending_gpu_commands(
+        &mut self,
+        renderer: &mut dyn RenderBackend,
+        gc: &Mutation<'gc>,
+    ) {
+        if let Some((batches, dirty, quality)) = self.pending_gpu_commands.take() {
+            let handle = self.bitmap_handle(renderer);
+            let command_lists: RenderOffscreenBatches =
+                batches.into_iter().map(|(cmds, _)| cmds).collect();
+            if let Some(sync_handle) =
+                renderer.render_offscreen(handle, command_lists, quality, dirty)
+            {
+                self.set_gpu_dirty(gc, sync_handle, dirty);
+            }
+        }
+    }
+
     fn inform_display_objects(&self, gc_context: &Mutation<'gc>) {
         for object in &self.display_objects {
             if let Some(object) = object.upgrade(gc_context) {
diff --git a/core/src/bitmap/operations.rs b/core/src/bitmap/operations.rs
index c4bbeb0ee1e6..909912700028 100644
--- a/core/src/bitmap/operations.rs
+++ b/core/src/bitmap/operations.rs
@@ -1065,6 +1065,85 @@ pub fn copy_pixels<'gc>(
         return;
     }
 
+    let needs_blend = (source_transparency && !transparency) || merge_alpha;
+    let different_bitmaps = !source_bitmap.ptr_eq(target);
+
+    // Fast path 1: plain replace (no blending) from a GPU-resident source
+    // onto a different target. Uses a raw texture-to-texture copy so we
+    // skip the blocking GPU->CPU readback that `copy_on_cpu` would do.
+    if !needs_blend
+        && different_bitmaps
+        && !source_bitmap.can_read(source_region)
+        && context.renderer.is_offscreen_supported()
+    {
+        // `bitmap_handle` flushes each side's pending batch, so our raw
+        // copy observes the right source pixels and isn't clobbered by a
+        // later `render_offscreen` into the target.
+        let source_handle = source_bitmap.bitmap_handle(context.gc(), context.renderer);
+        let target_handle = target.bitmap_handle(context.gc(), context.renderer);
+        if let Some(sync_handle) = context.renderer.copy_pixels_to_texture(
+            source_handle,
+            source_region,
+            target_handle,
+            (dest_region.x_min, dest_region.y_min),
+        ) {
+            let (target_ref, old_dirty) = target.overwrite_cpu_pixels_from_gpu(context.gc());
+            let mut write = target_ref.borrow_mut(context.gc());
+            let mut dirty = dest_region;
+            if let Some(old) = old_dirty {
+                dirty.union(old);
+            }
+            write.set_gpu_dirty(context.gc(), sync_handle, dirty);
+            return;
+        }
+    }
+
+    // Fast path 2: when the original src_rect covers the full source and
+    // the target's current pixels don't need to come back to the CPU, we
+    // can queue a `render_bitmap` into the pending GPU batch instead of
+    // reading back both sides and blending on the CPU. Using the original
+    // dest_point (before clamp) lets the GPU naturally clip pixels that
+    // fall outside the target texture.
+    let original_covers_full_source = src_min_x <= 0
+        && src_min_y <= 0
+        && src_width >= source_bitmap.width() as i32
+        && src_height >= source_bitmap.height() as i32;
+    if original_covers_full_source
+        && different_bitmaps
+        && !target.can_read(dest_region)
+        && context.renderer.is_offscreen_supported()
+    {
+        let source_handle = source_bitmap.bitmap_handle(context.gc(), context.renderer);
+        let transform = Transform {
+            matrix: Matrix {
+                a: 1.0,
+                b: 0.0,
+                c: 0.0,
+                d: 1.0,
+                tx: Twips::from_pixels(dest_min_x as f64),
+                ty: Twips::from_pixels(dest_min_y as f64),
+            },
+            color_transform: ColorTransform::default(),
+            perspective_projection: None,
+        };
+
+        // copyPixels is a plain blit - MSAA on it just wastes a 4x buffer
+        // and a resolve pass. Use `Low` so contiguous copyPixels runs skip
+        // MSAA entirely; a later vector `draw()` with higher stage quality
+        // will flush this batch and start its own MSAA pass.
+        target.append_render_bitmap(
+            context.gc(),
+            context.renderer,
+            source_handle,
+            transform,
+            false,
+            PixelSnapping::Never,
+            dest_region,
+            StageQuality::Low,
+        );
+        return;
+    }
+
     copy_on_cpu(
         context.gc(),
         context.renderer,
@@ -1072,7 +1151,7 @@ pub fn copy_pixels<'gc>(
         target,
         source_region,
         dest_region,
-        (source_transparency && !transparency) || merge_alpha,
+        needs_blend,
     );
 }
@@ -1592,8 +1671,6 @@ pub fn draw<'gc>(
         render_context.commands.pop_mask();
     }
 
-    let handle = target.bitmap_handle(render_context.gc(), render_context.renderer);
-
     let commands = if blend_mode == BlendMode::Normal {
         render_context.commands
     } else {
@@ -1605,28 +1682,25 @@ pub fn draw<'gc>(
         commands
     };
 
-    let (target, include_dirty_area) = target.overwrite_cpu_pixels_from_gpu(context.gc());
-    let mut write = target.borrow_mut(context.gc());
-    // If we have another dirty area to preserve, expand this to include it
-    if let Some(old) = include_dirty_area {
-        dirty_region.union(old);
-    }
-
     assert!(
         cache_draws.is_empty(),
         "BitmapData.draw() should not use cacheAsBitmap"
     );
-    let image = context
-        .renderer
-        .render_offscreen(handle, commands, quality, dirty_region);
-
-    match image {
-        Some(sync_handle) => {
-            write.set_gpu_dirty(context.gc(), sync_handle, dirty_region);
-            Ok(())
-        }
-        None => Err(BitmapDataDrawError::Unimplemented),
-    }
+
+    // Defer the draw into the target's pending GPU batch instead of calling
+    // `render_offscreen` synchronously. Consecutive `BitmapData.draw()` calls
+    // onto the same target then share one submission at flush time - crucial
+    // for scenes (e.g. cached backgrounds) that do tens of thousands of
+    // draws per frame and would otherwise hit the backend's per-frame draw
+    // cap with fresh command-encoder allocations.
+    target.append_gpu_commands(
+        context.gc(),
+        context.renderer,
+        commands,
+        dirty_region,
+        quality,
+    );
+    Ok(())
 }
 
 pub fn get_vector<'gc>(
diff --git a/render/canvas/src/lib.rs b/render/canvas/src/lib.rs
index 7af736176d91..20fcc3ac8c5a 100644
--- a/render/canvas/src/lib.rs
+++ b/render/canvas/src/lib.rs
@@ -4,7 +4,7 @@ use ruffle_render::backend::{
     BitmapCacheEntry, Context3D, Context3DProfile, PixelBenderOutput, PixelBenderTarget,
-    RenderBackend, ShapeHandle, ShapeHandleImpl, ViewportDimensions,
+    RenderBackend, RenderOffscreenBatches, ShapeHandle, ShapeHandleImpl, ViewportDimensions,
 };
 use ruffle_render::bitmap::{
     Bitmap, BitmapHandle, BitmapHandleImpl, BitmapSource, PixelRegion, PixelSnapping, RgbaBufRead,
@@ -509,7 +509,7 @@ impl RenderBackend for WebCanvasRenderBackend {
     fn render_offscreen(
         &mut self,
         _handle: BitmapHandle,
-        _commands: CommandList,
+        _batches: RenderOffscreenBatches,
         _quality: StageQuality,
         _bounds: PixelRegion,
     ) -> Option<Box<dyn SyncHandle>> {
diff --git a/render/src/backend.rs b/render/src/backend.rs
index 4e424b1c60f3..95bace9bbf1f 100644
--- a/render/src/backend.rs
+++ b/render/src/backend.rs
@@ -9,6 +9,13 @@ use crate::pixel_bender_support::PixelBenderShaderArgument;
 use crate::quality::StageQuality;
 use crate::shape_utils::DistilledShape;
 use ruffle_wstr::{FromWStr, WStr};
+use smallvec::SmallVec;
+
+/// A sequence of command-list sub-batches passed to `render_offscreen`.
+/// Inline size of 1 covers the single-batch call path (e.g. `BitmapData.draw()`)
+/// without a heap allocation; longer sequences from deferred batching spill to
+/// the heap just as a `Vec` would.
+pub type RenderOffscreenBatches = SmallVec<[CommandList; 1]>;
 use std::any::Any;
 use std::borrow::Cow;
 use std::cell::RefCell;
@@ -36,10 +43,16 @@ pub trait RenderBackend: Any {
         bitmap_source: &dyn BitmapSource,
     ) -> ShapeHandle;
 
+    /// Renders the given sequence of command lists onto the texture bound
+    /// to `handle` and returns a sync handle for the resulting write. Each
+    /// entry in `batches` is rendered as its own render pass; with MSAA,
+    /// this means the previous pass's resolve is visible as input to the
+    /// next. A single long batch stays in one pass, so callers that pack
+    /// non-overlapping draws into one batch skip any per-draw resolve.
     fn render_offscreen(
         &mut self,
         handle: BitmapHandle,
-        commands: CommandList,
+        batches: RenderOffscreenBatches,
         quality: StageQuality,
         bounds: PixelRegion,
     ) -> Option<Box<dyn SyncHandle>>;
@@ -70,6 +83,20 @@ pub trait RenderBackend: Any {
         false
     }
 
+    /// Copies a rectangular region from one GPU texture to another without
+    /// CPU readback or blending. Returns a `SyncHandle` the caller can use
+    /// to mark the destination `GpuModified`. Backends without GPU support
+    /// return `None` and the caller should fall back to the CPU copy path.
+    fn copy_pixels_to_texture(
+        &mut self,
+        _source: BitmapHandle,
+        _source_region: PixelRegion,
+        _destination: BitmapHandle,
+        _dest_point: (u32, u32),
+    ) -> Option<Box<dyn SyncHandle>> {
+        None
+    }
+
     fn submit_frame(
         &mut self,
         clear: swf::Color,
diff --git a/render/src/backend/null.rs b/render/src/backend/null.rs
index c8856ddb21a8..4881688a87b5 100644
--- a/render/src/backend/null.rs
+++ b/render/src/backend/null.rs
@@ -3,7 +3,8 @@ use std::num::NonZeroU32;
 use std::sync::Arc;
 
 use crate::backend::{
-    BitmapCacheEntry, RenderBackend, ShapeHandle, ShapeHandleImpl, ViewportDimensions,
+    BitmapCacheEntry, RenderBackend, RenderOffscreenBatches, ShapeHandle, ShapeHandleImpl,
+    ViewportDimensions,
 };
 use crate::bitmap::{
     Bitmap, BitmapHandle, BitmapHandleImpl, BitmapSize, BitmapSource, PixelRegion, RgbaBufRead,
@@ -66,7 +67,7 @@ impl RenderBackend for NullRenderer {
     fn render_offscreen(
         &mut self,
         _handle: BitmapHandle,
-        _commands: CommandList,
+        _batches: RenderOffscreenBatches,
         _quality: StageQuality,
         _bounds: PixelRegion,
     ) -> Option<Box<dyn SyncHandle>> {
diff --git a/render/webgl/src/lib.rs b/render/webgl/src/lib.rs
index 1bb43fa415a4..ae37562eaa28 100644
--- a/render/webgl/src/lib.rs
+++ b/render/webgl/src/lib.rs
@@ -5,7 +5,7 @@ use bytemuck::{Pod, Zeroable};
 
 use ruffle_render::backend::{
     BitmapCacheEntry, Context3D, Context3DProfile, PixelBenderOutput, PixelBenderTarget,
-    RenderBackend, ShapeHandle, ShapeHandleImpl, ViewportDimensions,
+    RenderBackend, RenderOffscreenBatches, ShapeHandle, ShapeHandleImpl, ViewportDimensions,
 };
 use ruffle_render::bitmap::{
     Bitmap, BitmapFormat, BitmapHandle, BitmapHandleImpl, BitmapSource, PixelRegion, PixelSnapping,
@@ -995,7 +995,7 @@ impl RenderBackend for WebGlRenderBackend {
     fn render_offscreen(
         &mut self,
         _handle: BitmapHandle,
-        _commands: CommandList,
+        _batches: RenderOffscreenBatches,
         _quality: StageQuality,
         _bounds: PixelRegion,
     ) -> Option<Box<dyn SyncHandle>> {
diff --git a/render/wgpu/src/backend.rs b/render/wgpu/src/backend.rs
index bd2b7a6a023d..7eeafa5dce3f 100644
--- a/render/wgpu/src/backend.rs
+++ b/render/wgpu/src/backend.rs
@@ -17,7 +17,9 @@ use image::imageops::FilterType;
 use ruffle_render::backend::{
     BitmapCacheEntry, Context3D, Context3DProfile, PixelBenderOutput, PixelBenderTarget,
 };
-use ruffle_render::backend::{RenderBackend, ShapeHandle, ViewportDimensions};
+use ruffle_render::backend::{
+    RenderBackend, RenderOffscreenBatches, ShapeHandle, ViewportDimensions,
+};
 use ruffle_render::bitmap::{
     Bitmap, BitmapFormat, BitmapHandle, BitmapSource, PixelRegion, RgbaBufRead, SyncHandle,
 };
@@ -729,10 +731,14 @@ impl<T: RenderTarget + 'static> RenderBackend for WgpuRenderBackend<T> {
     fn render_offscreen(
         &mut self,
         handle: BitmapHandle,
-        commands: CommandList,
+        batches: RenderOffscreenBatches,
         quality: StageQuality,
         bounds: PixelRegion,
     ) -> Option<Box<dyn SyncHandle>> {
+        if batches.is_empty() {
+            return None;
+        }
+
         let texture = as_texture(&handle);
 
         let extent = wgpu::Extent3d {
@@ -741,17 +747,13 @@
             depth_or_array_layers: 1,
         };
 
-        let mut target = TextureTarget {
+        let target = TextureTarget {
             size: extent,
             texture: texture.texture.clone(),
             format: wgpu::TextureFormat::Rgba8Unorm,
             buffer: None,
         };
 
-        let frame_output = target
-            .get_next_texture()
-            .expect("TextureTargetFrame.get_next_texture is infallible");
-
         let mut surface = Surface::new(
             &self.descriptors,
             quality,
@@ -759,19 +761,30 @@
             texture.texture.height(),
             wgpu::TextureFormat::Rgba8Unorm,
         );
-        surface.draw_commands_and_copy_to(
-            frame_output.view(),
-            RenderTargetMode::FreshWithTexture(target.get_texture()),
-            &self.descriptors,
-            &mut self.active_frame.staging_belt,
-            &self.dynamic_transforms,
-            &mut self.active_frame.command_encoder,
-            &self.meshes,
-            commands,
-            LayerRef::Current,
-            &mut self.offscreen_texture_pool,
-        );
+        for commands in batches {
+            // Each batch is rendered as its own pass against the running
+            // target texture - the previous pass's resolve (if any) is
+            // visible as input to this one.
+            let frame_view = target.texture.create_view(&Default::default());
+            surface.draw_commands_and_copy_to(
+                &frame_view,
+                RenderTargetMode::FreshWithTexture(target.get_texture()),
+                &self.descriptors,
+                &mut self.active_frame.staging_belt,
+                &self.dynamic_transforms,
+                &mut self.active_frame.command_encoder,
+                &self.meshes,
+                commands,
+                LayerRef::Current,
+                &mut self.offscreen_texture_pool,
+            );
+        }
+        // One flush per call covers both the single- and multi-batch paths
+        // with one encoder reset. The frame-scoped OOM valve
+        // (MAX_DRAWS_PER_FLUSH) still protects a caller that submits many
+        // render_offscreen calls without hitting any other flush point -
+        // each call still increments draws_since_flush.
         self.active_frame.maybe_flush(&self.descriptors);
 
         Some(self.make_queue_sync_handle(target, None, handle, bounds))
     }
@@ -793,6 +806,85 @@
         true
     }
 
+    fn copy_pixels_to_texture(
+        &mut self,
+        source: BitmapHandle,
+        source_region: PixelRegion,
+        destination: BitmapHandle,
+        dest_point: (u32, u32),
+    ) -> Option<Box<dyn SyncHandle>> {
+        let source_texture = as_texture(&source);
+        let dest_texture = as_texture(&destination);
+
+        let copy_width = source_region
+            .width()
+            .min(dest_texture.texture.width().saturating_sub(dest_point.0))
+            .min(
+                source_texture
+                    .texture
+                    .width()
+                    .saturating_sub(source_region.x_min),
+            );
+        let copy_height = source_region
+            .height()
+            .min(dest_texture.texture.height().saturating_sub(dest_point.1))
+            .min(
+                source_texture
+                    .texture
+                    .height()
+                    .saturating_sub(source_region.y_min),
+            );
+
+        if copy_width == 0 || copy_height == 0 {
+            return None;
+        }
+
+        let dest_region =
+            PixelRegion::for_region(dest_point.0, dest_point.1, copy_width, copy_height);
+
+        let target = TextureTarget {
+            size: wgpu::Extent3d {
+                width: dest_texture.texture.width(),
+                height: dest_texture.texture.height(),
+                depth_or_array_layers: 1,
+            },
+            texture: dest_texture.texture.clone(),
+            format: dest_texture.texture.format(),
+            buffer: None,
+        };
+
+        self.active_frame.command_encoder.copy_texture_to_texture(
+            wgpu::TexelCopyTextureInfo {
+                texture: &source_texture.texture,
+                mip_level: 0,
+                origin: wgpu::Origin3d {
+                    x: source_region.x_min,
+                    y: source_region.y_min,
+                    z: 0,
+                },
+                aspect: Default::default(),
+            },
+            wgpu::TexelCopyTextureInfo {
+                texture: &dest_texture.texture,
+                mip_level: 0,
+                origin: wgpu::Origin3d {
+                    x: dest_point.0,
+                    y: dest_point.1,
+                    z: 0,
+                },
+                aspect: Default::default(),
+            },
+            wgpu::Extent3d {
+                width: copy_width,
+                height: copy_height,
+                depth_or_array_layers: 1,
+            },
+        );
+
+        self.active_frame.maybe_flush(&self.descriptors);
+        Some(self.make_queue_sync_handle(target, None, destination, dest_region))
+    }
+
     fn apply_filter(
         &mut self,
         source: BitmapHandle,
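To make the `RenderOffscreenBatches` doc comment in render/src/backend.rs concrete, here is a small hedged sketch of the allocation behaviour it claims. `smallvec!` and `spilled()` are standard `smallvec` 1.x items and `CommandList::new()` is used by the patch itself; the snippet is illustrative, not part of the diff:

    use ruffle_render::backend::RenderOffscreenBatches;
    use ruffle_render::commands::CommandList;
    use smallvec::smallvec;

    fn main() {
        // The plain `BitmapData.draw()` path hands the backend exactly one
        // batch, which fits the inline slot: no heap allocation.
        let single: RenderOffscreenBatches = smallvec![CommandList::new()];
        assert!(!single.spilled());

        // A deferred sequence of sub-batches (resolve boundaries under MSAA)
        // spills to the heap and behaves like a plain Vec from then on.
        let multi: RenderOffscreenBatches = (0..3).map(|_| CommandList::new()).collect();
        assert!(multi.spilled());
    }

The design choice this illustrates: the common single-draw path pays nothing extra for the new batching API, while multi-batch flushes degrade gracefully to `Vec`-like storage.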