// Copyright 2023 The Dawn & Tint Authors // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, this // list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // 3. Neither the name of the copyright holder nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "dawn/native/BlitTextureToBuffer.h"

#include <algorithm>
#include <array>
#include <string>
#include <string_view>
#include <utility>

#include "dawn/common/Assert.h"
#include "dawn/native/BindGroup.h"
#include "dawn/native/CommandBuffer.h"
#include "dawn/native/CommandEncoder.h"
#include "dawn/native/CommandValidation.h"
#include "dawn/native/ComputePassEncoder.h"
#include "dawn/native/ComputePipeline.h"
#include "dawn/native/Device.h"
#include "dawn/native/InternalPipelineStore.h"
#include "dawn/native/PhysicalDevice.h"
#include "dawn/native/Queue.h"
#include "dawn/native/Sampler.h"
#include "dawn/native/utils/WGPUHelpers.h"

// Texture-to-buffer ("T2B") copy emulated with a compute pipeline.
// The string_view constants below are shader source fragments; presumably they are concatenated
// per source format / view dimension when the pipeline is built — confirm in
// GetOrCreateTextureToBufferPipeline.
namespace dawn::native {

namespace {

// Compute workgroup dimensions used by the blit shader.
// NOTE(review): presumably must match the dispatch size computed in BlitTextureToBuffer — confirm.
constexpr uint32_t kWorkgroupSizeX = …;
constexpr uint32_t kWorkgroupSizeY = …;

// Destination buffer declaration using u32 elements (used when texels are packed into 4-byte
// words).
constexpr std::string_view kDstBufferU32 = …;

// For DepthFloat32 we can directly use f32 for the buffer array data type as we don't need packing.
constexpr std::string_view kDstBufferF32 = …;

// Source texture declarations / texel loads for float-typed textures, one per view dimension.
constexpr std::string_view kFloatTexture1D = …;
constexpr std::string_view kFloatTexture2D = …;
constexpr std::string_view kFloatTexture2DArray = …;
constexpr std::string_view kFloatTexture3D = …;

// Cube map reference: https://en.wikipedia.org/wiki/Cube_mapping
// Function converting texel coord to sample st coord for cube texture.
constexpr std::string_view kCubeCoordCommon = …;

constexpr std::string_view kFloatTextureCube = …;

// Source texture declarations / texel loads for unsigned-integer-typed textures.
constexpr std::string_view kUintTexture = …;
constexpr std::string_view kUintTextureArray = …;

// textureSampleLevel doesn't support texture_cube<u32>
// Use textureGather as a workaround.
// Always choose the texel with the smallest coord (stored in w component).
// Since this is only used for Stencil8 (1 channel), we only care about component idx == 0.
constexpr std::string_view kUintTextureCube = …;

// Helpers that encode one 4-channel 8-bit texel (unorm / snorm) into a single u32.
constexpr std::string_view kEncodeRGBA8UnormInU32 = …;
constexpr std::string_view kEncodeRGBA8SnormInU32 = …;

// Stores and swizzles bgra8unorm texel values and converts them to u32.
constexpr std::string_view kEncodeBGRA8UnormInU32 = …;

constexpr std::string_view kEncodeRG16FloatInU32 = …;

// Each thread is responsible for reading (packTexelCount) texels and packing them into a 4-byte
// u32.
constexpr std::string_view kCommonHead = …;
constexpr std::string_view kCommonStart = …;
constexpr std::string_view kCommonEnd = …;

constexpr std::string_view kPackStencil8ToU32 = …;

// Color format R8Snorm and RG8Snorm T2B copy doesn't require offset to be multiple of 4 bytes,
// making it more complicated than other formats.
// TODO(dawn:1886): potentially separate "middle of the image" case
// and "on the edge" case into different shaders and passes for better performance.
constexpr std::string_view kNonMultipleOf4OffsetStart = …;

// R8snorm: texelByte = 1; each thread reads 1 ~ 4 texels.
// Different scenarios are listed below:
//
// * In the middle of the row: reads 4 texels
// | x | x+1 | x+2 | x+3 |
//
// * At the edge of the row: when offset % 4 > 0
// - when copyWidth % bytesPerRow == 0 (compact row), read 4 texels
//     e.g. offset = 1; copyWidth = 256;
//     | 255,y-1 | 0,y | 1,y | 2,y |
// - when copyWidth % bytesPerRow > 0 || rowsPerImage > copyHeight (sparse row / sparse image)
//   One more thread is added to the end of each row,
//   reads 1 ~ 3 texels, reads dst buf values
//     e.g. offset = 1; copyWidth = 128; mask = 0xffffff00;
//     | 127,y-1 | b | b | b |
// - when copyWidth % bytesPerRow > 0 && copyWidth + offset % 4 > bytesPerRow (special case)
//   reads 1 ~ 3 texels, reads dst buf values; mask = 0x0000ff00;
//     e.g. offset = 1; copyWidth = 255;
//     | 254,y-1 | b | 0,y | 1,y |
//
// * At the start of the whole copy:
// - when offset % 4 == 0, reads 4 texels
// - when offset % 4 > 0, reads 1 ~ 3 texels, reads dst buf values
//     e.g. offset = 1; mask = 0x000000ff;
//     | b | 0 | 1 | 2 |
//     e.g. offset = 1, copyWidth = 2; mask = 0xff0000ff;
//     | b | 0 | 1 | b |
//
// * At the end of the whole copy:
// - reads 1 ~ 4 texels, reads dst buf values;
//     e.g. offset = 0; copyWidth = 256;
//     | 252 | 253 | 254 | 255 |
//     e.g. offset = 1; copyWidth = 256; mask = 0xffffff00;
//     | 255 | b | b | b |
//
// In the diagrams above, "b" denotes a byte whose existing destination-buffer value is kept
// (selected by the mask).
constexpr std::string_view kPackR8ToU32 = …;

// RG8snorm: texelByte = 2; each thread reads 1 ~ 2 texels.
// Different scenarios are listed below:
//
// * In the middle of the row: reads 2 texels
// | x | x+1 |
//
// * At the edge of the row: when offset % 4 > 0
// - when copyWidth % bytesPerRow == 0 (compact row), read 2 texels
//     e.g. offset = 2; copyWidth = 128;
//     | 127,y-1 | 0,y |
// - when copyWidth % bytesPerRow > 0 || rowsPerImage > copyHeight (sparse row / sparse image)
//   One more thread is added to the end of each row,
//   reads 1 texel, reads dst buf values
//     e.g. offset = 1; copyWidth = 64; mask = 0xffff0000;
//     | 63,y-1 | b |
//     NOTE(review): with texelByte = 2 one would expect offset = 2 in this example — confirm.
//
// * At the start of the whole copy:
// - when offset % 4 == 0, reads 2 texels
// - when offset % 4 > 0, reads 1 texel, reads dst buf values
//     e.g. offset = 2; mask = 0x0000ffff;
//     | b | 0 |
//
// * At the end of the whole copy:
// - reads 1 ~ 2 texels, reads dst buf values;
//     e.g. offset = 0; copyWidth = 128;
//     | 126 | 127 |
//     e.g. offset = 1; copyWidth = 128; mask = 0xffff0000;
//     | 127 | b |
//     NOTE(review): with texelByte = 2 one would expect offset = 2 in this example — confirm.
constexpr std::string_view kPackRG8ToU32 = …;

// R16: texelByte = 2; each thread reads 1 ~ 2 texels.
// General packing algorithm is similar to kPackRG8ToU32.
constexpr std::string_view kPackR16ToU32 = …;

constexpr std::string_view kPackRG16ToU32 = …;

// Load RGBA16 and pack to 2 uint4_t
constexpr std::string_view kLoadRGBA16ToU32 = …;

// ShaderF16 extension is only enabled by GL_AMD_gpu_shader_half_float for GL
// so we should not use it generally for the emulation.
// As a result we are using f32 and array<u32> to do all the math and byte manipulation.
// If we had a 2-byte scalar type (f16, u16), writing to the storage buffer would be a bit
// easier.
constexpr std::string_view kPackDepth16UnormToU32 = …;

// Storing rgba texel values
// later called by encodeVectorInU32General to convert to u32.
constexpr std::string_view kPackRGBAToU32 = …;

// Storing rgb9e5ufloat texel values
// In this format float is represented as
// 2^(exponent - bias) * (mantissa / 2^numMantissaBits)
// Packing algorithm is from:
// https://registry.khronos.org/OpenGL/extensions/EXT/EXT_texture_shared_exponent.txt
//
// Note: there are multiple byte patterns that could represent the same value in this format.
// e.g.
// 0x0a090807 and 0x0412100e both unpack to
// [8.344650268554688e-7, 0.000015735626220703125, 0.000015497207641601562]
// So the bytes copied via blit could be different.
constexpr std::string_view kEncodeRGB9E5UfloatInU32 = …;

// Directly loading R32Float values into dst_buf
// No bit manipulation and packing is needed.
constexpr std::string_view kLoadR32Float = …;

// Returns the compute pipeline used to blit the texture described by `src`, viewed with
// `viewDimension`, into a buffer; or an error.
// NOTE(review): the name and the InternalPipelineStore.h include suggest that pipelines are
// cached and reused — confirm what the cache key is (format? view dimension?).
ResultOrError<Ref<ComputePipelineBase>> GetOrCreateTextureToBufferPipeline(
    DeviceBase* device,
    const TextureCopy& src,
    wgpu::TextureViewDimension viewDimension) { … }

}  // anonymous namespace

// Returns whether `format` is handled by the texture-to-buffer blit path.
// NOTE(review): presumably callers check this before calling BlitTextureToBuffer — confirm.
bool IsFormatSupportedByTextureToBufferBlit(wgpu::TextureFormat format) { … }

// Performs the texture-to-buffer copy of `copyExtent` texels from `src` to `dst` via the blit
// path, using `commandEncoder`. Returns an error on failure.
// NOTE(review): presumably this records a compute pass on `commandEncoder` rather than submitting
// work immediately — confirm.
MaybeError BlitTextureToBuffer(DeviceBase* device,
                               CommandEncoder* commandEncoder,
                               const TextureCopy& src,
                               const BufferCopy& dst,
                               const Extent3D& copyExtent) { … }

}  // namespace dawn::native