// Mirror of https://github.com/stenzek/duckstation.git (synced 2025-06-06 19:45:33 +00:00).
// 2146 lines, 76 KiB, C++.
// SPDX-FileCopyrightText: 2019-2025 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
//
// NOTE: Some parts of this file have more permissive licenses. They are marked appropriately.
//
|
|
|
|
#include "gpu_hw_shadergen.h"
|
|
|
|
#include "common/assert.h"
|
|
|
|
// Constructs the hardware-renderer shader generator.
// The shader language handed to the ShaderGen base is derived from render_api through
// GetShaderLanguageForAPI(); the two capability flags are forwarded to the base unchanged.
GPU_HW_ShaderGen::GPU_HW_ShaderGen(RenderAPI render_api, bool supports_dual_source_blend,
                                   bool supports_framebuffer_fetch)
  : ShaderGen(render_api, GetShaderLanguageForAPI(render_api), supports_dual_source_blend, supports_framebuffer_fetch)
{
}
|
|
|
|
// Defaulted destructor, defined out-of-line in this translation unit.
GPU_HW_ShaderGen::~GPU_HW_ShaderGen() = default;
|
|
|
|
void GPU_HW_ShaderGen::WriteColorConversionFunctions(std::stringstream& ss) const
|
|
{
|
|
ss << R"(
|
|
uint RGBA8ToRGBA5551(float4 v)
|
|
{
|
|
uint r = uint(roundEven(v.r * 31.0));
|
|
uint g = uint(roundEven(v.g * 31.0));
|
|
uint b = uint(roundEven(v.b * 31.0));
|
|
uint a = (v.a != 0.0) ? 1u : 0u;
|
|
return (r) | (g << 5) | (b << 10) | (a << 15);
|
|
}
|
|
|
|
float4 RGBA5551ToRGBA8(uint v)
|
|
{
|
|
uint r = (v & 31u);
|
|
uint g = ((v >> 5) & 31u);
|
|
uint b = ((v >> 10) & 31u);
|
|
uint a = ((v >> 15) & 1u);
|
|
|
|
return float4(float(r) / 31.0, float(g) / 31.0, float(b) / 31.0, float(a));
|
|
}
|
|
)";
|
|
}
|
|
|
|
void GPU_HW_ShaderGen::WriteBatchUniformBuffer(std::stringstream& ss) const
|
|
{
|
|
DeclareUniformBuffer(ss,
|
|
{"uint2 u_texture_window_and", "uint2 u_texture_window_or", "float u_src_alpha_factor",
|
|
"float u_dst_alpha_factor", "uint u_interlaced_displayed_field",
|
|
"bool u_set_mask_while_drawing", "float u_resolution_scale", "float u_rcp_resolution_scale",
|
|
"float u_resolution_scale_minus_one"},
|
|
false);
|
|
}
|
|
|
|
std::string GPU_HW_ShaderGen::GenerateScreenVertexShader() const
|
|
{
|
|
std::stringstream ss;
|
|
WriteHeader(ss);
|
|
DeclareVertexEntryPoint(ss, {"float2 a_pos", "float2 a_tex0"}, 0, 1, {}, false, "", false, false, false);
|
|
ss << R"(
|
|
{
|
|
// Depth set to 1 for PGXP depth buffer.
|
|
v_pos = float4(a_pos, 1.0f, 1.0f);
|
|
v_tex0 = a_tex0;
|
|
|
|
// NDC space Y flip in Vulkan.
|
|
#if API_OPENGL || API_OPENGL_ES || API_VULKAN
|
|
v_pos.y = -v_pos.y;
|
|
#endif
|
|
}
|
|
)";
|
|
|
|
return std::move(ss).str();
|
|
}
|
|
|
|
std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool upscaled, bool msaa, bool per_sample_shading,
|
|
bool textured, bool palette, bool page_texture, bool uv_limits,
|
|
bool force_round_texcoords, bool pgxp_depth,
|
|
bool disable_color_perspective) const
|
|
{
|
|
std::stringstream ss;
|
|
WriteHeader(ss);
|
|
DefineMacro(ss, "TEXTURED", textured);
|
|
DefineMacro(ss, "PALETTE", palette);
|
|
DefineMacro(ss, "PAGE_TEXTURE", page_texture);
|
|
DefineMacro(ss, "UV_LIMITS", uv_limits);
|
|
DefineMacro(ss, "FORCE_ROUND_TEXCOORDS", force_round_texcoords);
|
|
DefineMacro(ss, "PGXP_DEPTH", pgxp_depth);
|
|
DefineMacro(ss, "UPSCALED", upscaled);
|
|
|
|
WriteBatchUniformBuffer(ss);
|
|
|
|
if (textured && page_texture)
|
|
{
|
|
if (uv_limits)
|
|
{
|
|
DeclareVertexEntryPoint(
|
|
ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage", "float4 a_uv_limits"}, 1, 1,
|
|
{{"nointerpolation", "float4 v_uv_limits"}}, false, "", msaa, per_sample_shading, disable_color_perspective);
|
|
}
|
|
else
|
|
{
|
|
DeclareVertexEntryPoint(ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage"}, 1, 1, {},
|
|
false, "", msaa, per_sample_shading, disable_color_perspective);
|
|
}
|
|
}
|
|
else if (textured)
|
|
{
|
|
if (uv_limits)
|
|
{
|
|
DeclareVertexEntryPoint(
|
|
ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage", "float4 a_uv_limits"}, 1, 1,
|
|
{{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"},
|
|
{"nointerpolation", "float4 v_uv_limits"}},
|
|
false, "", msaa, per_sample_shading, disable_color_perspective);
|
|
}
|
|
else
|
|
{
|
|
DeclareVertexEntryPoint(ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage"}, 1, 1,
|
|
{{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"}}, false, "", msaa,
|
|
per_sample_shading, disable_color_perspective);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
DeclareVertexEntryPoint(ss, {"float4 a_pos", "float4 a_col0"}, 1, 0, {}, false, "", msaa, per_sample_shading,
|
|
disable_color_perspective);
|
|
}
|
|
|
|
ss << R"(
|
|
{
|
|
// Offset the vertex position by 0.5 to ensure correct interpolation of texture coordinates
|
|
// at 1x resolution scale. This doesn't work at >1x, we adjust the texture coordinates before
|
|
// uploading there instead.
|
|
float vertex_offset = (UPSCALED == 0) ? 0.5 : 0.0;
|
|
|
|
// 0..+1023 -> -1..1
|
|
float pos_x = ((a_pos.x + vertex_offset) / 512.0) - 1.0;
|
|
float pos_y = ((a_pos.y + vertex_offset) / -256.0) + 1.0;
|
|
|
|
#if PGXP_DEPTH
|
|
// Ignore mask Z when using PGXP depth.
|
|
float pos_z = a_pos.w;
|
|
float pos_w = a_pos.w;
|
|
#else
|
|
float pos_z = a_pos.z;
|
|
float pos_w = a_pos.w;
|
|
#endif
|
|
|
|
#if API_OPENGL || API_OPENGL_ES
|
|
// 0..1 to -1..1 depth range.
|
|
pos_z = (pos_z * 2.0) - 1.0;
|
|
#endif
|
|
|
|
// NDC space Y flip in Vulkan.
|
|
#if API_OPENGL || API_OPENGL_ES || API_VULKAN
|
|
pos_y = -pos_y;
|
|
#endif
|
|
|
|
v_pos = float4(pos_x * pos_w, pos_y * pos_w, pos_z * pos_w, pos_w);
|
|
|
|
v_col0 = a_col0;
|
|
#if TEXTURED
|
|
v_tex0 = float2(uint2(a_texcoord & 0xFFFFu, a_texcoord >> 16));
|
|
#if !PALETTE && !PAGE_TEXTURE
|
|
v_tex0 *= u_resolution_scale;
|
|
#endif
|
|
|
|
#if !PAGE_TEXTURE
|
|
// base_x,base_y,palette_x,palette_y
|
|
v_texpage.x = (a_texpage & 15u) * 64u;
|
|
v_texpage.y = ((a_texpage >> 4) & 1u) * 256u;
|
|
#if PALETTE
|
|
v_texpage.z = ((a_texpage >> 16) & 63u) * 16u;
|
|
v_texpage.w = ((a_texpage >> 22) & 511u);
|
|
#endif
|
|
#endif
|
|
|
|
#if UV_LIMITS
|
|
v_uv_limits = a_uv_limits * 255.0;
|
|
|
|
#if FORCE_ROUND_TEXCOORDS && PALETTE
|
|
// Add 0.5 to the upper bounds when upscaling, to work around interpolation differences.
|
|
// Limited to force-round-texcoord hack, to avoid breaking other games.
|
|
v_uv_limits.zw += 0.5;
|
|
#elif !PAGE_TEXTURE && !PALETTE
|
|
// Treat coordinates as being in upscaled space, and extend the UV range to all "upscaled"
|
|
// pixels. This means 1-pixel-high polygon-based framebuffer effects won't be downsampled.
|
|
// (e.g. Mega Man Legends 2 haze effect)
|
|
v_uv_limits *= u_resolution_scale;
|
|
v_uv_limits.zw += u_resolution_scale_minus_one;
|
|
#endif
|
|
#endif
|
|
#endif
|
|
}
|
|
)";
|
|
|
|
return std::move(ss).str();
|
|
}
|
|
|
|
void GPU_HW_ShaderGen::WriteBatchTextureFilter(std::stringstream& ss, GPUTextureFilter texture_filter) const
|
|
{
|
|
// JINC2 and xBRZ shaders originally from beetle-psx, modified to support filtering mask channel.
|
|
if (texture_filter == GPUTextureFilter::Bilinear || texture_filter == GPUTextureFilter::BilinearBinAlpha)
|
|
{
|
|
DefineMacro(ss, "BINALPHA", texture_filter == GPUTextureFilter::BilinearBinAlpha);
|
|
ss << R"(
|
|
void FilteredSampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords, float4 uv_limits,
|
|
out float4 texcol, out float ialpha)
|
|
{
|
|
// Compute the coordinates of the four texels we will be interpolating between.
|
|
// Clamp this to the triangle texture coordinates.
|
|
float2 texel_top_left = frac(coords) - float2(0.5, 0.5);
|
|
float2 texel_offset = sign(texel_top_left);
|
|
float4 fcoords = max(coords.xyxy + float4(0.0, 0.0, texel_offset.x, texel_offset.y),
|
|
float4(0.0, 0.0, 0.0, 0.0));
|
|
|
|
// Load four texels.
|
|
float4 s00 = SampleFromVRAM(texpage, clamp(fcoords.xy, uv_limits.xy, uv_limits.zw));
|
|
float4 s10 = SampleFromVRAM(texpage, clamp(fcoords.zy, uv_limits.xy, uv_limits.zw));
|
|
float4 s01 = SampleFromVRAM(texpage, clamp(fcoords.xw, uv_limits.xy, uv_limits.zw));
|
|
float4 s11 = SampleFromVRAM(texpage, clamp(fcoords.zw, uv_limits.xy, uv_limits.zw));
|
|
|
|
// Compute alpha from how many texels aren't pixel color 0000h.
|
|
float a00 = float(VECTOR_NEQ(s00, TRANSPARENT_PIXEL_COLOR));
|
|
float a10 = float(VECTOR_NEQ(s10, TRANSPARENT_PIXEL_COLOR));
|
|
float a01 = float(VECTOR_NEQ(s01, TRANSPARENT_PIXEL_COLOR));
|
|
float a11 = float(VECTOR_NEQ(s11, TRANSPARENT_PIXEL_COLOR));
|
|
|
|
// Bilinearly interpolate.
|
|
float2 weights = abs(texel_top_left);
|
|
texcol = lerp(lerp(s00, s10, weights.x), lerp(s01, s11, weights.x), weights.y);
|
|
ialpha = lerp(lerp(a00, a10, weights.x), lerp(a01, a11, weights.x), weights.y);
|
|
|
|
// Compensate for partially transparent sampling.
|
|
if (ialpha > 0.0)
|
|
texcol.rgb /= float3(ialpha, ialpha, ialpha);
|
|
|
|
#if BINALPHA
|
|
ialpha = (ialpha >= 0.5) ? 1.0 : 0.0;
|
|
#endif
|
|
}
|
|
)";
|
|
}
|
|
else if (texture_filter == GPUTextureFilter::JINC2 || texture_filter == GPUTextureFilter::JINC2BinAlpha)
|
|
{
|
|
/*
|
|
Hyllian's jinc windowed-jinc 2-lobe sharper with anti-ringing Shader
|
|
|
|
Copyright (C) 2011-2016 Hyllian/Jararaca - sergiogdb@gmail.com
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
DefineMacro(ss, "BINALPHA", texture_filter == GPUTextureFilter::JINC2BinAlpha);
|
|
ss << R"(
|
|
CONSTANT float JINC2_WINDOW_SINC = 0.44;
|
|
CONSTANT float JINC2_SINC = 0.82;
|
|
CONSTANT float JINC2_AR_STRENGTH = 0.8;
|
|
|
|
CONSTANT float halfpi = 1.5707963267948966192313216916398;
|
|
CONSTANT float pi = 3.1415926535897932384626433832795;
|
|
CONSTANT float wa = 1.382300768;
|
|
CONSTANT float wb = 2.576105976;
|
|
|
|
// Calculates the distance between two points
|
|
float d(float2 pt1, float2 pt2)
|
|
{
|
|
float2 v = pt2 - pt1;
|
|
return sqrt(dot(v,v));
|
|
}
|
|
|
|
float min4(float a, float b, float c, float d)
|
|
{
|
|
return min(a, min(b, min(c, d)));
|
|
}
|
|
|
|
float4 min4(float4 a, float4 b, float4 c, float4 d)
|
|
{
|
|
return min(a, min(b, min(c, d)));
|
|
}
|
|
|
|
float max4(float a, float b, float c, float d)
|
|
{
|
|
return max(a, max(b, max(c, d)));
|
|
}
|
|
|
|
float4 max4(float4 a, float4 b, float4 c, float4 d)
|
|
{
|
|
return max(a, max(b, max(c, d)));
|
|
}
|
|
|
|
float4 resampler(float4 x)
|
|
{
|
|
float4 res;
|
|
|
|
// res = (x==float4(0.0, 0.0, 0.0, 0.0)) ? float4(wa*wb) : sin(x*wa)*sin(x*wb)/(x*x);
|
|
// Need to use mix(.., equal(..)) since we want zero check to be component wise
|
|
res = lerp(sin(x*wa)*sin(x*wb)/(x*x), float4(wa*wb, wa*wb, wa*wb, wa*wb), VECTOR_COMP_EQ(x,float4(0.0, 0.0, 0.0, 0.0)));
|
|
|
|
return res;
|
|
}
|
|
|
|
void FilteredSampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords, float4 uv_limits,
|
|
out float4 texcol, out float ialpha)
|
|
{
|
|
float4 weights[4];
|
|
|
|
float2 dx = float2(1.0, 0.0);
|
|
float2 dy = float2(0.0, 1.0);
|
|
|
|
float2 pc = coords.xy;
|
|
|
|
float2 tc = (floor(pc-float2(0.5,0.5))+float2(0.5,0.5));
|
|
|
|
weights[0] = resampler(float4(d(pc, tc -dx -dy), d(pc, tc -dy), d(pc, tc +dx -dy), d(pc, tc+2.0*dx -dy)));
|
|
weights[1] = resampler(float4(d(pc, tc -dx ), d(pc, tc ), d(pc, tc +dx ), d(pc, tc+2.0*dx )));
|
|
weights[2] = resampler(float4(d(pc, tc -dx +dy), d(pc, tc +dy), d(pc, tc +dx +dy), d(pc, tc+2.0*dx +dy)));
|
|
weights[3] = resampler(float4(d(pc, tc -dx+2.0*dy), d(pc, tc +2.0*dy), d(pc, tc +dx+2.0*dy), d(pc, tc+2.0*dx+2.0*dy)));
|
|
|
|
dx = dx;
|
|
dy = dy;
|
|
tc = tc;
|
|
|
|
#define sample_texel(coords) SampleFromVRAM(texpage, clamp((coords), uv_limits.xy, uv_limits.zw))
|
|
|
|
float4 c00 = sample_texel(tc -dx -dy);
|
|
float a00 = float(VECTOR_NEQ(c00, TRANSPARENT_PIXEL_COLOR));
|
|
float4 c10 = sample_texel(tc -dy);
|
|
float a10 = float(VECTOR_NEQ(c10, TRANSPARENT_PIXEL_COLOR));
|
|
float4 c20 = sample_texel(tc +dx -dy);
|
|
float a20 = float(VECTOR_NEQ(c20, TRANSPARENT_PIXEL_COLOR));
|
|
float4 c30 = sample_texel(tc+2.0*dx -dy);
|
|
float a30 = float(VECTOR_NEQ(c30, TRANSPARENT_PIXEL_COLOR));
|
|
float4 c01 = sample_texel(tc -dx );
|
|
float a01 = float(VECTOR_NEQ(c01, TRANSPARENT_PIXEL_COLOR));
|
|
float4 c11 = sample_texel(tc );
|
|
float a11 = float(VECTOR_NEQ(c11, TRANSPARENT_PIXEL_COLOR));
|
|
float4 c21 = sample_texel(tc +dx );
|
|
float a21 = float(VECTOR_NEQ(c21, TRANSPARENT_PIXEL_COLOR));
|
|
float4 c31 = sample_texel(tc+2.0*dx );
|
|
float a31 = float(VECTOR_NEQ(c31, TRANSPARENT_PIXEL_COLOR));
|
|
float4 c02 = sample_texel(tc -dx +dy);
|
|
float a02 = float(VECTOR_NEQ(c02, TRANSPARENT_PIXEL_COLOR));
|
|
float4 c12 = sample_texel(tc +dy);
|
|
float a12 = float(VECTOR_NEQ(c12, TRANSPARENT_PIXEL_COLOR));
|
|
float4 c22 = sample_texel(tc +dx +dy);
|
|
float a22 = float(VECTOR_NEQ(c22, TRANSPARENT_PIXEL_COLOR));
|
|
float4 c32 = sample_texel(tc+2.0*dx +dy);
|
|
float a32 = float(VECTOR_NEQ(c32, TRANSPARENT_PIXEL_COLOR));
|
|
float4 c03 = sample_texel(tc -dx+2.0*dy);
|
|
float a03 = float(VECTOR_NEQ(c03, TRANSPARENT_PIXEL_COLOR));
|
|
float4 c13 = sample_texel(tc +2.0*dy);
|
|
float a13 = float(VECTOR_NEQ(c13, TRANSPARENT_PIXEL_COLOR));
|
|
float4 c23 = sample_texel(tc +dx+2.0*dy);
|
|
float a23 = float(VECTOR_NEQ(c23, TRANSPARENT_PIXEL_COLOR));
|
|
float4 c33 = sample_texel(tc+2.0*dx+2.0*dy);
|
|
float a33 = float(VECTOR_NEQ(c33, TRANSPARENT_PIXEL_COLOR));
|
|
|
|
#undef sample_texel
|
|
|
|
// Get min/max samples
|
|
float4 min_sample = min4(c11, c21, c12, c22);
|
|
float min_sample_alpha = min4(a11, a21, a12, a22);
|
|
float4 max_sample = max4(c11, c21, c12, c22);
|
|
float max_sample_alpha = max4(a11, a21, a12, a22);
|
|
|
|
float4 color;
|
|
color = float4(dot(weights[0], float4(c00.x, c10.x, c20.x, c30.x)), dot(weights[0], float4(c00.y, c10.y, c20.y, c30.y)), dot(weights[0], float4(c00.z, c10.z, c20.z, c30.z)), dot(weights[0], float4(c00.w, c10.w, c20.w, c30.w)));
|
|
color+= float4(dot(weights[1], float4(c01.x, c11.x, c21.x, c31.x)), dot(weights[1], float4(c01.y, c11.y, c21.y, c31.y)), dot(weights[1], float4(c01.z, c11.z, c21.z, c31.z)), dot(weights[1], float4(c01.w, c11.w, c21.w, c31.w)));
|
|
color+= float4(dot(weights[2], float4(c02.x, c12.x, c22.x, c32.x)), dot(weights[2], float4(c02.y, c12.y, c22.y, c32.y)), dot(weights[2], float4(c02.z, c12.z, c22.z, c32.z)), dot(weights[2], float4(c02.w, c12.w, c22.w, c32.w)));
|
|
color+= float4(dot(weights[3], float4(c03.x, c13.x, c23.x, c33.x)), dot(weights[3], float4(c03.y, c13.y, c23.y, c33.y)), dot(weights[3], float4(c03.z, c13.z, c23.z, c33.z)), dot(weights[3], float4(c03.w, c13.w, c23.w, c33.w)));
|
|
color = color/(dot(weights[0], float4(1,1,1,1)) + dot(weights[1], float4(1,1,1,1)) + dot(weights[2], float4(1,1,1,1)) + dot(weights[3], float4(1,1,1,1)));
|
|
|
|
float alpha;
|
|
alpha = dot(weights[0], float4(a00, a10, a20, a30));
|
|
alpha+= dot(weights[1], float4(a01, a11, a21, a31));
|
|
alpha+= dot(weights[2], float4(a02, a12, a22, a32));
|
|
alpha+= dot(weights[3], float4(a03, a13, a23, a33));
|
|
//alpha = alpha/(weights[0].w + weights[1].w + weights[2].w + weights[3].w);
|
|
alpha = alpha/(dot(weights[0], float4(1,1,1,1)) + dot(weights[1], float4(1,1,1,1)) + dot(weights[2], float4(1,1,1,1)) + dot(weights[3], float4(1,1,1,1)));
|
|
|
|
// Anti-ringing
|
|
float4 aux = color;
|
|
float aux_alpha = alpha;
|
|
color = clamp(color, min_sample, max_sample);
|
|
alpha = clamp(alpha, min_sample_alpha, max_sample_alpha);
|
|
color = lerp(aux, color, JINC2_AR_STRENGTH);
|
|
alpha = lerp(aux_alpha, alpha, JINC2_AR_STRENGTH);
|
|
|
|
// final sum and weight normalization
|
|
ialpha = alpha;
|
|
texcol = color;
|
|
|
|
// Compensate for partially transparent sampling.
|
|
if (ialpha > 0.0)
|
|
texcol.rgb /= float3(ialpha, ialpha, ialpha);
|
|
|
|
#if BINALPHA
|
|
ialpha = (ialpha >= 0.5) ? 1.0 : 0.0;
|
|
#endif
|
|
}
|
|
)";
|
|
}
|
|
else if (texture_filter == GPUTextureFilter::xBR || texture_filter == GPUTextureFilter::xBRBinAlpha)
|
|
{
|
|
/*
|
|
Hyllian's xBR-vertex code and texel mapping
|
|
|
|
Copyright (C) 2011/2016 Hyllian - sergiogdb@gmail.com
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
DefineMacro(ss, "BINALPHA", texture_filter == GPUTextureFilter::xBRBinAlpha);
|
|
ss << R"(
|
|
CONSTANT int BLEND_NONE = 0;
|
|
CONSTANT int BLEND_NORMAL = 1;
|
|
CONSTANT int BLEND_DOMINANT = 2;
|
|
CONSTANT float LUMINANCE_WEIGHT = 1.0;
|
|
CONSTANT float EQUAL_COLOR_TOLERANCE = 0.1176470588235294;
|
|
CONSTANT float STEEP_DIRECTION_THRESHOLD = 2.2;
|
|
CONSTANT float DOMINANT_DIRECTION_THRESHOLD = 3.6;
|
|
CONSTANT float4 w = float4(0.2627, 0.6780, 0.0593, 0.5);
|
|
|
|
float DistYCbCr(float4 pixA, float4 pixB)
|
|
{
|
|
const float scaleB = 0.5 / (1.0 - w.b);
|
|
const float scaleR = 0.5 / (1.0 - w.r);
|
|
float4 diff = pixA - pixB;
|
|
float Y = dot(diff, w);
|
|
float Cb = scaleB * (diff.b - Y);
|
|
float Cr = scaleR * (diff.r - Y);
|
|
|
|
return sqrt(((LUMINANCE_WEIGHT * Y) * (LUMINANCE_WEIGHT * Y)) + (Cb * Cb) + (Cr * Cr));
|
|
}
|
|
|
|
bool IsPixEqual(const float4 pixA, const float4 pixB)
|
|
{
|
|
return (DistYCbCr(pixA, pixB) < EQUAL_COLOR_TOLERANCE);
|
|
}
|
|
|
|
float get_left_ratio(float2 center, float2 origin, float2 direction, float2 scale)
|
|
{
|
|
float2 P0 = center - origin;
|
|
float2 proj = direction * (dot(P0, direction) / dot(direction, direction));
|
|
float2 distv = P0 - proj;
|
|
float2 orth = float2(-direction.y, direction.x);
|
|
float side = sign(dot(P0, orth));
|
|
float v = side * length(distv * scale);
|
|
|
|
// return step(0, v);
|
|
return smoothstep(-sqrt(2.0)/2.0, sqrt(2.0)/2.0, v);
|
|
}
|
|
|
|
#define P(coord, xoffs, yoffs) SampleFromVRAM(texpage, clamp(coords + float2((xoffs), (yoffs)), uv_limits.xy, uv_limits.zw))
|
|
|
|
void FilteredSampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords, float4 uv_limits,
|
|
out float4 texcol, out float ialpha)
|
|
{
|
|
//---------------------------------------
|
|
// Input Pixel Mapping: -|x|x|x|-
|
|
// x|A|B|C|x
|
|
// x|D|E|F|x
|
|
// x|G|H|I|x
|
|
// -|x|x|x|-
|
|
|
|
float2 scale = float2(8.0, 8.0);
|
|
float2 pos = frac(coords.xy) - float2(0.5, 0.5);
|
|
float2 coord = coords.xy - pos;
|
|
|
|
float4 A = P(coord, -1,-1);
|
|
float Aw = A.w;
|
|
A.w = float(VECTOR_NEQ(A, TRANSPARENT_PIXEL_COLOR));
|
|
float4 B = P(coord, 0,-1);
|
|
float Bw = B.w;
|
|
B.w = float(VECTOR_NEQ(B, TRANSPARENT_PIXEL_COLOR));
|
|
float4 C = P(coord, 1,-1);
|
|
float Cw = C.w;
|
|
C.w = float(VECTOR_NEQ(C, TRANSPARENT_PIXEL_COLOR));
|
|
float4 D = P(coord, -1, 0);
|
|
float Dw = D.w;
|
|
D.w = float(VECTOR_NEQ(D, TRANSPARENT_PIXEL_COLOR));
|
|
float4 E = P(coord, 0, 0);
|
|
float Ew = E.w;
|
|
E.w = float(VECTOR_NEQ(E, TRANSPARENT_PIXEL_COLOR));
|
|
float4 F = P(coord, 1, 0);
|
|
float Fw = F.w;
|
|
F.w = float(VECTOR_NEQ(F, TRANSPARENT_PIXEL_COLOR));
|
|
float4 G = P(coord, -1, 1);
|
|
float Gw = G.w;
|
|
G.w = float(VECTOR_NEQ(G, TRANSPARENT_PIXEL_COLOR));
|
|
float4 H = P(coord, 0, 1);
|
|
float Hw = H.w;
|
|
H.w = float(VECTOR_NEQ(H, TRANSPARENT_PIXEL_COLOR));
|
|
float4 I = P(coord, 1, 1);
|
|
float Iw = I.w;
|
|
I.w = float(VECTOR_NEQ(H, TRANSPARENT_PIXEL_COLOR));
|
|
|
|
// blendResult Mapping: x|y|
|
|
// w|z|
|
|
int4 blendResult = int4(BLEND_NONE,BLEND_NONE,BLEND_NONE,BLEND_NONE);
|
|
|
|
// Preprocess corners
|
|
// Pixel Tap Mapping: -|-|-|-|-
|
|
// -|-|B|C|-
|
|
// -|D|E|F|x
|
|
// -|G|H|I|x
|
|
// -|-|x|x|-
|
|
if (!((VECTOR_EQ(E,F) && VECTOR_EQ(H,I)) || (VECTOR_EQ(E,H) && VECTOR_EQ(F,I))))
|
|
{
|
|
float dist_H_F = DistYCbCr(G, E) + DistYCbCr(E, C) + DistYCbCr(P(coord, 0,2), I) + DistYCbCr(I, P(coord, 2,0)) + (4.0 * DistYCbCr(H, F));
|
|
float dist_E_I = DistYCbCr(D, H) + DistYCbCr(H, P(coord, 1,2)) + DistYCbCr(B, F) + DistYCbCr(F, P(coord, 2,1)) + (4.0 * DistYCbCr(E, I));
|
|
bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_H_F) < dist_E_I;
|
|
blendResult.z = ((dist_H_F < dist_E_I) && VECTOR_NEQ(E,F) && VECTOR_NEQ(E,H)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
|
|
}
|
|
|
|
|
|
// Pixel Tap Mapping: -|-|-|-|-
|
|
// -|A|B|-|-
|
|
// x|D|E|F|-
|
|
// x|G|H|I|-
|
|
// -|x|x|-|-
|
|
if (!((VECTOR_EQ(D,E) && VECTOR_EQ(G,H)) || (VECTOR_EQ(D,G) && VECTOR_EQ(E,H))))
|
|
{
|
|
float dist_G_E = DistYCbCr(P(coord, -2,1) , D) + DistYCbCr(D, B) + DistYCbCr(P(coord, -1,2), H) + DistYCbCr(H, F) + (4.0 * DistYCbCr(G, E));
|
|
float dist_D_H = DistYCbCr(P(coord, -2,0) , G) + DistYCbCr(G, P(coord, 0,2)) + DistYCbCr(A, E) + DistYCbCr(E, I) + (4.0 * DistYCbCr(D, H));
|
|
bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_D_H) < dist_G_E;
|
|
blendResult.w = ((dist_G_E > dist_D_H) && VECTOR_NEQ(E,D) && VECTOR_NEQ(E,H)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
|
|
}
|
|
|
|
// Pixel Tap Mapping: -|-|x|x|-
|
|
// -|A|B|C|x
|
|
// -|D|E|F|x
|
|
// -|-|H|I|-
|
|
// -|-|-|-|-
|
|
if (!((VECTOR_EQ(B,C) && VECTOR_EQ(E,F)) || (VECTOR_EQ(B,E) && VECTOR_EQ(C,F))))
|
|
{
|
|
float dist_E_C = DistYCbCr(D, B) + DistYCbCr(B, P(coord, 1,-2)) + DistYCbCr(H, F) + DistYCbCr(F, P(coord, 2,-1)) + (4.0 * DistYCbCr(E, C));
|
|
float dist_B_F = DistYCbCr(A, E) + DistYCbCr(E, I) + DistYCbCr(P(coord, 0,-2), C) + DistYCbCr(C, P(coord, 2,0)) + (4.0 * DistYCbCr(B, F));
|
|
bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_B_F) < dist_E_C;
|
|
blendResult.y = ((dist_E_C > dist_B_F) && VECTOR_NEQ(E,B) && VECTOR_NEQ(E,F)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
|
|
}
|
|
|
|
// Pixel Tap Mapping: -|x|x|-|-
|
|
// x|A|B|C|-
|
|
// x|D|E|F|-
|
|
// -|G|H|-|-
|
|
// -|-|-|-|-
|
|
if (!((VECTOR_EQ(A,B) && VECTOR_EQ(D,E)) || (VECTOR_EQ(A,D) && VECTOR_EQ(B,E))))
|
|
{
|
|
float dist_D_B = DistYCbCr(P(coord, -2,0), A) + DistYCbCr(A, P(coord, 0,-2)) + DistYCbCr(G, E) + DistYCbCr(E, C) + (4.0 * DistYCbCr(D, B));
|
|
float dist_A_E = DistYCbCr(P(coord, -2,-1), D) + DistYCbCr(D, H) + DistYCbCr(P(coord, -1,-2), B) + DistYCbCr(B, F) + (4.0 * DistYCbCr(A, E));
|
|
bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_D_B) < dist_A_E;
|
|
blendResult.x = ((dist_D_B < dist_A_E) && VECTOR_NEQ(E,D) && VECTOR_NEQ(E,B)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
|
|
}
|
|
|
|
float4 res = E;
|
|
float resW = Ew;
|
|
|
|
// Pixel Tap Mapping: -|-|-|-|-
|
|
// -|-|B|C|-
|
|
// -|D|E|F|x
|
|
// -|G|H|I|x
|
|
// -|-|x|x|-
|
|
if(blendResult.z != BLEND_NONE)
|
|
{
|
|
float dist_F_G = DistYCbCr(F, G);
|
|
float dist_H_C = DistYCbCr(H, C);
|
|
bool doLineBlend = (blendResult.z == BLEND_DOMINANT ||
|
|
!((blendResult.y != BLEND_NONE && !IsPixEqual(E, G)) || (blendResult.w != BLEND_NONE && !IsPixEqual(E, C)) ||
|
|
(IsPixEqual(G, H) && IsPixEqual(H, I) && IsPixEqual(I, F) && IsPixEqual(F, C) && !IsPixEqual(E, I))));
|
|
|
|
float2 origin = float2(0.0, 1.0 / sqrt(2.0));
|
|
float2 direction = float2(1.0, -1.0);
|
|
if(doLineBlend)
|
|
{
|
|
bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_F_G <= dist_H_C) && VECTOR_NEQ(E,G) && VECTOR_NEQ(D,G);
|
|
bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_H_C <= dist_F_G) && VECTOR_NEQ(E,C) && VECTOR_NEQ(B,C);
|
|
origin = haveShallowLine? float2(0.0, 0.25) : float2(0.0, 0.5);
|
|
direction.x += haveShallowLine? 1.0: 0.0;
|
|
direction.y -= haveSteepLine? 1.0: 0.0;
|
|
}
|
|
|
|
float4 blendPix = lerp(H,F, step(DistYCbCr(E, F), DistYCbCr(E, H)));
|
|
float blendW = lerp(Hw,Fw, step(DistYCbCr(E, F), DistYCbCr(E, H)));
|
|
res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale));
|
|
resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale));
|
|
}
|
|
|
|
// Pixel Tap Mapping: -|-|-|-|-
|
|
// -|A|B|-|-
|
|
// x|D|E|F|-
|
|
// x|G|H|I|-
|
|
// -|x|x|-|-
|
|
if(blendResult.w != BLEND_NONE)
|
|
{
|
|
float dist_H_A = DistYCbCr(H, A);
|
|
float dist_D_I = DistYCbCr(D, I);
|
|
bool doLineBlend = (blendResult.w == BLEND_DOMINANT ||
|
|
!((blendResult.z != BLEND_NONE && !IsPixEqual(E, A)) || (blendResult.x != BLEND_NONE && !IsPixEqual(E, I)) ||
|
|
(IsPixEqual(A, D) && IsPixEqual(D, G) && IsPixEqual(G, H) && IsPixEqual(H, I) && !IsPixEqual(E, G))));
|
|
|
|
float2 origin = float2(-1.0 / sqrt(2.0), 0.0);
|
|
float2 direction = float2(1.0, 1.0);
|
|
if(doLineBlend)
|
|
{
|
|
bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_H_A <= dist_D_I) && VECTOR_NEQ(E,A) && VECTOR_NEQ(B,A);
|
|
bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_D_I <= dist_H_A) && VECTOR_NEQ(E,I) && VECTOR_NEQ(F,I);
|
|
origin = haveShallowLine? float2(-0.25, 0.0) : float2(-0.5, 0.0);
|
|
direction.y += haveShallowLine? 1.0: 0.0;
|
|
direction.x += haveSteepLine? 1.0: 0.0;
|
|
}
|
|
origin = origin;
|
|
direction = direction;
|
|
|
|
float4 blendPix = lerp(H,D, step(DistYCbCr(E, D), DistYCbCr(E, H)));
|
|
float blendW = lerp(Hw,Dw, step(DistYCbCr(E, D), DistYCbCr(E, H)));
|
|
res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale));
|
|
resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale));
|
|
}
|
|
|
|
// Pixel Tap Mapping: -|-|x|x|-
|
|
// -|A|B|C|x
|
|
// -|D|E|F|x
|
|
// -|-|H|I|-
|
|
// -|-|-|-|-
|
|
if(blendResult.y != BLEND_NONE)
|
|
{
|
|
float dist_B_I = DistYCbCr(B, I);
|
|
float dist_F_A = DistYCbCr(F, A);
|
|
bool doLineBlend = (blendResult.y == BLEND_DOMINANT ||
|
|
!((blendResult.x != BLEND_NONE && !IsPixEqual(E, I)) || (blendResult.z != BLEND_NONE && !IsPixEqual(E, A)) ||
|
|
(IsPixEqual(I, F) && IsPixEqual(F, C) && IsPixEqual(C, B) && IsPixEqual(B, A) && !IsPixEqual(E, C))));
|
|
|
|
float2 origin = float2(1.0 / sqrt(2.0), 0.0);
|
|
float2 direction = float2(-1.0, -1.0);
|
|
|
|
if(doLineBlend)
|
|
{
|
|
bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_B_I <= dist_F_A) && VECTOR_NEQ(E,I) && VECTOR_NEQ(H,I);
|
|
bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_F_A <= dist_B_I) && VECTOR_NEQ(E,A) && VECTOR_NEQ(D,A);
|
|
origin = haveShallowLine? float2(0.25, 0.0) : float2(0.5, 0.0);
|
|
direction.y -= haveShallowLine? 1.0: 0.0;
|
|
direction.x -= haveSteepLine? 1.0: 0.0;
|
|
}
|
|
|
|
float4 blendPix = lerp(F,B, step(DistYCbCr(E, B), DistYCbCr(E, F)));
|
|
float blendW = lerp(Fw,Bw, step(DistYCbCr(E, B), DistYCbCr(E, F)));
|
|
res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale));
|
|
resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale));
|
|
}
|
|
|
|
// Pixel Tap Mapping: -|x|x|-|-
|
|
// x|A|B|C|-
|
|
// x|D|E|F|-
|
|
// -|G|H|-|-
|
|
// -|-|-|-|-
|
|
if(blendResult.x != BLEND_NONE)
|
|
{
|
|
float dist_D_C = DistYCbCr(D, C);
|
|
float dist_B_G = DistYCbCr(B, G);
|
|
bool doLineBlend = (blendResult.x == BLEND_DOMINANT ||
|
|
!((blendResult.w != BLEND_NONE && !IsPixEqual(E, C)) || (blendResult.y != BLEND_NONE && !IsPixEqual(E, G)) ||
|
|
(IsPixEqual(C, B) && IsPixEqual(B, A) && IsPixEqual(A, D) && IsPixEqual(D, G) && !IsPixEqual(E, A))));
|
|
|
|
float2 origin = float2(0.0, -1.0 / sqrt(2.0));
|
|
float2 direction = float2(-1.0, 1.0);
|
|
if(doLineBlend)
|
|
{
|
|
bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_D_C <= dist_B_G) && VECTOR_NEQ(E,C) && VECTOR_NEQ(F,C);
|
|
bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_B_G <= dist_D_C) && VECTOR_NEQ(E,G) && VECTOR_NEQ(H,G);
|
|
origin = haveShallowLine? float2(0.0, -0.25) : float2(0.0, -0.5);
|
|
direction.x -= haveShallowLine? 1.0: 0.0;
|
|
direction.y += haveSteepLine? 1.0: 0.0;
|
|
}
|
|
|
|
float4 blendPix = lerp(D,B, step(DistYCbCr(E, B), DistYCbCr(E, D)));
|
|
float blendW = lerp(Dw,Bw, step(DistYCbCr(E, B), DistYCbCr(E, D)));
|
|
res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale));
|
|
resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale));
|
|
}
|
|
|
|
ialpha = res.w;
|
|
texcol = float4(res.xyz, resW);
|
|
|
|
// Compensate for partially transparent sampling.
|
|
if (ialpha > 0.0)
|
|
texcol.rgb /= float3(ialpha, ialpha, ialpha);
|
|
|
|
#if BINALPHA
|
|
ialpha = (ialpha >= 0.5) ? 1.0 : 0.0;
|
|
#endif
|
|
}
|
|
|
|
#undef P
|
|
|
|
)";
|
|
}
|
|
else if (texture_filter == GPUTextureFilter::MMPX)
|
|
{
|
|
ss << "#define src(xoffs, yoffs) packUnorm4x8(SampleFromVRAM(texpage, clamp(bcoords + float2((xoffs), (yoffs)), "
|
|
"uv_limits.xy, uv_limits.zw)))\n";
|
|
|
|
/*
|
|
* This part of the shader is from MMPX.glc from https://casual-effects.com/research/McGuire2021PixelArt/index.html
|
|
* Copyright 2020 Morgan McGuire & Mara Gagiu.
|
|
* Provided under the Open Source MIT license https://opensource.org/licenses/MIT
|
|
*/
|
|
ss << R"(
|
|
uint luma(uint C) {
|
|
uint alpha = (C & 0xFF000000u) >> 24;
|
|
return (((C & 0x00FF0000u) >> 16) + ((C & 0x0000FF00u) >> 8) + (C & 0x000000FFu) + 1u) * (256u - alpha);
|
|
}
|
|
|
|
bool all_eq2(uint B, uint A0, uint A1) {
|
|
return ((B ^ A0) | (B ^ A1)) == 0u;
|
|
}
|
|
|
|
bool all_eq3(uint B, uint A0, uint A1, uint A2) {
|
|
return ((B ^ A0) | (B ^ A1) | (B ^ A2)) == 0u;
|
|
}
|
|
|
|
bool all_eq4(uint B, uint A0, uint A1, uint A2, uint A3) {
|
|
return ((B ^ A0) | (B ^ A1) | (B ^ A2) | (B ^ A3)) == 0u;
|
|
}
|
|
|
|
bool any_eq3(uint B, uint A0, uint A1, uint A2) {
|
|
return B == A0 || B == A1 || B == A2;
|
|
}
|
|
|
|
bool none_eq2(uint B, uint A0, uint A1) {
|
|
return (B != A0) && (B != A1);
|
|
}
|
|
|
|
bool none_eq4(uint B, uint A0, uint A1, uint A2, uint A3) {
|
|
return B != A0 && B != A1 && B != A2 && B != A3;
|
|
}
|
|
|
|
void FilteredSampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords, float4 uv_limits, out float4 texcol, out float ialpha)
|
|
{
|
|
float2 bcoords = floor(coords);
|
|
|
|
uint A = src(-1, -1), B = src(+0, -1), C = src(+1, -1);
|
|
uint D = src(-1, +0), E = src(+0, +0), F = src(+1, +0);
|
|
uint G = src(-1, +1), H = src(+0, +1), I = src(+1, +1);
|
|
|
|
uint J = E, K = E, L = E, M = E;
|
|
|
|
if (((A ^ E) | (B ^ E) | (C ^ E) | (D ^ E) | (F ^ E) | (G ^ E) | (H ^ E) | (I ^ E)) != 0u) {
|
|
uint P = src(+0, -2), S = src(+0, +2);
|
|
uint Q = src(-2, +0), R = src(+2, +0);
|
|
uint Bl = luma(B), Dl = luma(D), El = luma(E), Fl = luma(F), Hl = luma(H);
|
|
|
|
// 1:1 slope rules
|
|
if ((D == B && D != H && D != F) && (El >= Dl || E == A) && any_eq3(E, A, C, G) && ((El < Dl) || A != D || E != P || E != Q)) J = D;
|
|
if ((B == F && B != D && B != H) && (El >= Bl || E == C) && any_eq3(E, A, C, I) && ((El < Bl) || C != B || E != P || E != R)) K = B;
|
|
if ((H == D && H != F && H != B) && (El >= Hl || E == G) && any_eq3(E, A, G, I) && ((El < Hl) || G != H || E != S || E != Q)) L = H;
|
|
if ((F == H && F != B && F != D) && (El >= Fl || E == I) && any_eq3(E, C, G, I) && ((El < Fl) || I != H || E != R || E != S)) M = F;
|
|
|
|
// Intersection rules
|
|
if ((E != F && all_eq4(E, C, I, D, Q) && all_eq2(F, B, H)) && (F != src(+3, +0))) K = M = F;
|
|
if ((E != D && all_eq4(E, A, G, F, R) && all_eq2(D, B, H)) && (D != src(-3, +0))) J = L = D;
|
|
if ((E != H && all_eq4(E, G, I, B, P) && all_eq2(H, D, F)) && (H != src(+0, +3))) L = M = H;
|
|
if ((E != B && all_eq4(E, A, C, H, S) && all_eq2(B, D, F)) && (B != src(+0, -3))) J = K = B;
|
|
if (Bl < El && all_eq4(E, G, H, I, S) && none_eq4(E, A, D, C, F)) J = K = B;
|
|
if (Hl < El && all_eq4(E, A, B, C, P) && none_eq4(E, D, G, I, F)) L = M = H;
|
|
if (Fl < El && all_eq4(E, A, D, G, Q) && none_eq4(E, B, C, I, H)) K = M = F;
|
|
if (Dl < El && all_eq4(E, C, F, I, R) && none_eq4(E, B, A, G, H)) J = L = D;
|
|
|
|
// 2:1 slope rules
|
|
if (H != B) {
|
|
if (H != A && H != E && H != C) {
|
|
if (all_eq3(H, G, F, R) && none_eq2(H, D, src(+2, -1))) L = M;
|
|
if (all_eq3(H, I, D, Q) && none_eq2(H, F, src(-2, -1))) M = L;
|
|
}
|
|
|
|
if (B != I && B != G && B != E) {
|
|
if (all_eq3(B, A, F, R) && none_eq2(B, D, src(+2, +1))) J = K;
|
|
if (all_eq3(B, C, D, Q) && none_eq2(B, F, src(-2, +1))) K = J;
|
|
}
|
|
} // H !== B
|
|
|
|
if (F != D) {
|
|
if (D != I && D != E && D != C) {
|
|
if (all_eq3(D, A, H, S) && none_eq2(D, B, src(+1, +2))) J = L;
|
|
if (all_eq3(D, G, B, P) && none_eq2(D, H, src(+1, -2))) L = J;
|
|
}
|
|
|
|
if (F != E && F != A && F != G) {
|
|
if (all_eq3(F, C, H, S) && none_eq2(F, B, src(-1, +2))) K = M;
|
|
if (all_eq3(F, I, B, P) && none_eq2(F, H, src(-1, -2))) M = K;
|
|
}
|
|
} // F !== D
|
|
} // not constant
|
|
|
|
// select quadrant based on fractional part of texture coordinates
|
|
float2 fpart = frac(coords);
|
|
uint res = (fpart.x < 0.5f) ? ((fpart.y < 0.5f) ? J : L) : ((fpart.y < 0.5f) ? K : M);
|
|
|
|
ialpha = float(res != 0u);
|
|
texcol = unpackUnorm4x8(res);
|
|
}
|
|
|
|
#undef src
|
|
)";
|
|
}
|
|
else if (texture_filter == GPUTextureFilter::Scale2x)
|
|
{
|
|
// Based on https://www.scale2x.it/algorithm
|
|
ss << R"(
|
|
#define src(xoffs, yoffs) packUnorm4x8(SampleFromVRAM(texpage, clamp(bcoords + float2((xoffs), (yoffs)), uv_limits.xy, uv_limits.zw)))
|
|
|
|
void FilteredSampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords, float4 uv_limits, out float4 texcol, out float ialpha)
|
|
{
|
|
float2 bcoords = floor(coords);
|
|
|
|
uint E = src(+0, +0);
|
|
uint B = src(+0, - 1);
|
|
uint D = src(-1, +0);
|
|
uint F = src(+1, +0);
|
|
uint H = src(+0, +1);
|
|
|
|
uint J = (D == B && B != F && D != H) ? D : E;
|
|
uint K = (B == F && D != F && H != F) ? F : E;
|
|
uint L = (H == D && F != D && B != D) ? D : E;
|
|
uint M = (H == F && D != H && B != F) ? F : E;
|
|
|
|
// select quadrant based on fractional part of texture coordinates
|
|
float2 fpart = frac(coords);
|
|
uint res = (fpart.x < 0.5f) ? ((fpart.y < 0.5f) ? J : L) : ((fpart.y < 0.5f) ? K : M);
|
|
|
|
ialpha = float(res != 0u);
|
|
texcol = unpackUnorm4x8(res);
|
|
}
|
|
|
|
#undef src
|
|
)";
|
|
}
|
|
else if (texture_filter == GPUTextureFilter::Scale3x)
|
|
{
|
|
// Based on https://www.scale2x.it/algorithm
|
|
ss << R"(
|
|
#define src(xoffs, yoffs) packUnorm4x8(SampleFromVRAM(texpage, clamp(bcoords + float2((xoffs), (yoffs)), uv_limits.xy, uv_limits.zw)))
|
|
|
|
void FilteredSampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords, float4 uv_limits, out float4 texcol, out float ialpha)
|
|
{
|
|
float2 bcoords = floor(coords);
|
|
|
|
uint E = src(+0, +0);
|
|
uint B = src(+0, -1);
|
|
uint D = src(-1, +0);
|
|
uint F = src(+1, +0);
|
|
uint H = src(+0, +1);
|
|
|
|
uint res = E;
|
|
if (B != H && D != F) {
|
|
uint A = src(-1, -1);
|
|
uint C = src(+1, -1);
|
|
uint G = src(-1, +1);
|
|
uint I = src(+1, +1);
|
|
|
|
uint E0 = (D == B) ? D : E;
|
|
uint E1 = (D == B && E != C) || (B == F && E != A) ? B : E;
|
|
uint E2 = (B == F) ? F : E;
|
|
uint E3 = (D == B && E != G) || (D == H && E != A) ? D : E;
|
|
uint E4 = E;
|
|
uint E5 = (B == F && E != I) || (H == F && E != C) ? F : E;
|
|
uint E6 = (D == H) ? D : E;
|
|
uint E7 = (D == H && E != I) || (H == F && E != G) ? H : E;
|
|
uint E8 = (H == F) ? F : E;
|
|
|
|
// select quadrant based on fractional part of texture coordinates
|
|
float2 fpart = frac(coords);
|
|
uint R0, R1, R2;
|
|
if (fpart.y < 0.34f) {
|
|
R0 = E0;
|
|
R1 = E1;
|
|
R2 = E2;
|
|
} else if (fpart.y < 0.67f) {
|
|
R0 = E3;
|
|
R1 = E4;
|
|
R2 = E5;
|
|
} else {
|
|
R0 = E6;
|
|
R1 = E7;
|
|
R2 = E8;
|
|
}
|
|
|
|
res = (fpart.x < 0.34f) ? R0 : ((fpart.x < 0.67f) ? R1 : R2);
|
|
}
|
|
|
|
ialpha = float(res != 0u);
|
|
texcol = unpackUnorm4x8(res);
|
|
}
|
|
|
|
#undef src
|
|
)";
|
|
}
|
|
}
|
|
|
|
// Builds the source of the fragment shader used for batched draws.
//
// Every argument is baked into the generated source, either as a preprocessor macro (via DefineMacro) or as an
// argument to WriteHeader()/DeclareFragmentEntryPoint(), so each parameter combination yields a distinct shader:
//   render_mode               - selects TRANSPARENCY, TRANSPARENCY_ONLY_OPAQUE, TRANSPARENCY_ONLY_TRANSPARENT and
//                               SHADER_BLENDING.
//   transparency              - emitted as TRANSPARENCY_MODE; must be Disabled unless render_mode is ShaderBlend.
//   texture_mode              - selects TEXTURED, PALETTE, PALETTE_4_BIT, PALETTE_8_BIT and PAGE_TEXTURE.
//   texture_filtering         - anything other than Nearest sets TEXTURE_FILTERING and emits the filter function
//                               through WriteBatchTextureFilter().
//   upscaled                  - UPSCALED.
//   msaa, per_sample_shading,
//   disable_color_perspective - forwarded to DeclareFragmentEntryPoint().
//   uv_limits                 - UV_LIMITS; also adds the v_uv_limits input when textured.
//   force_round_texcoords     - FORCE_ROUND_TEXCOORDS.
//   true_color                - TRUE_COLOR; must not be combined with dithering.
//   dithering,
//   scaled_dithering          - DITHERING and DITHERING_SCALED (the latter only when dithering is set).
//   interlacing,
//   scaled_interlacing        - INTERLACING and INTERLACING_SCALED (the latter only when interlacing is set).
//   check_mask                - CHECK_MASK_BIT.
//   write_mask_as_depth       - WRITE_MASK_AS_DEPTH; also forwarded to DeclareFragmentEntryPoint().
//   use_rov, use_rov_depth,
//   rov_depth_test,
//   rov_depth_write           - USE_ROV, USE_ROV_DEPTH, ROV_DEPTH_TEST, ROV_DEPTH_WRITE; the depth test/write flags
//                               require both use_rov and use_rov_depth.
//
// Returns the complete shader source as a string.
std::string GPU_HW_ShaderGen::GenerateBatchFragmentShader(
  GPU_HW::BatchRenderMode render_mode, GPUTransparencyMode transparency, GPU_HW::BatchTextureMode texture_mode,
  GPUTextureFilter texture_filtering, bool upscaled, bool msaa, bool per_sample_shading, bool uv_limits,
  bool force_round_texcoords, bool true_color, bool dithering, bool scaled_dithering, bool disable_color_perspective,
  bool interlacing, bool scaled_interlacing, bool check_mask, bool write_mask_as_depth, bool use_rov,
  bool use_rov_depth, bool rov_depth_test, bool rov_depth_write) const
{
  DebugAssert(!true_color || !dithering); // Should not be doing dithering+true color.

  DebugAssert(transparency == GPUTransparencyMode::Disabled || render_mode == GPU_HW::BatchRenderMode::ShaderBlend);
  DebugAssert((!rov_depth_test && !rov_depth_write) || (use_rov && use_rov_depth));

  const bool textured = (texture_mode != GPU_HW::BatchTextureMode::Disabled);
  const bool palette =
    (texture_mode == GPU_HW::BatchTextureMode::Palette4Bit || texture_mode == GPU_HW::BatchTextureMode::Palette8Bit);
  const bool page_texture = (texture_mode == GPU_HW::BatchTextureMode::PageTexture);
  const bool shader_blending = (render_mode == GPU_HW::BatchRenderMode::ShaderBlend);

  // The second colour output is only declared when blending is left to the fixed-function blender (no shader
  // blending, no ROV), the device supports it, and either a transparent render mode or texture filtering is in use.
  const bool use_dual_source = (!shader_blending && !use_rov && m_supports_dual_source_blend &&
                                ((render_mode != GPU_HW::BatchRenderMode::TransparencyDisabled &&
                                  render_mode != GPU_HW::BatchRenderMode::OnlyOpaque) ||
                                 texture_filtering != GPUTextureFilter::Nearest));

  std::stringstream ss;
  WriteHeader(ss, use_rov, shader_blending && !use_rov, use_dual_source);
  DefineMacro(ss, "TRANSPARENCY", render_mode != GPU_HW::BatchRenderMode::TransparencyDisabled);
  DefineMacro(ss, "TRANSPARENCY_ONLY_OPAQUE", render_mode == GPU_HW::BatchRenderMode::OnlyOpaque);
  DefineMacro(ss, "TRANSPARENCY_ONLY_TRANSPARENT", render_mode == GPU_HW::BatchRenderMode::OnlyTransparent);
  DefineMacro(ss, "TRANSPARENCY_MODE", static_cast<s32>(transparency));
  DefineMacro(ss, "SHADER_BLENDING", shader_blending);
  DefineMacro(ss, "CHECK_MASK_BIT", check_mask);
  DefineMacro(ss, "TEXTURED", textured);
  DefineMacro(ss, "PALETTE", palette);
  DefineMacro(ss, "PALETTE_4_BIT", texture_mode == GPU_HW::BatchTextureMode::Palette4Bit);
  DefineMacro(ss, "PALETTE_8_BIT", texture_mode == GPU_HW::BatchTextureMode::Palette8Bit);
  DefineMacro(ss, "PAGE_TEXTURE", page_texture);
  DefineMacro(ss, "DITHERING", dithering);
  DefineMacro(ss, "DITHERING_SCALED", dithering && scaled_dithering);
  DefineMacro(ss, "INTERLACING", interlacing);
  DefineMacro(ss, "INTERLACING_SCALED", interlacing && scaled_interlacing);
  DefineMacro(ss, "TRUE_COLOR", true_color);
  DefineMacro(ss, "TEXTURE_FILTERING", texture_filtering != GPUTextureFilter::Nearest);
  DefineMacro(ss, "UV_LIMITS", uv_limits);
  DefineMacro(ss, "USE_ROV", use_rov);
  DefineMacro(ss, "USE_ROV_DEPTH", use_rov_depth);
  DefineMacro(ss, "ROV_DEPTH_TEST", rov_depth_test);
  DefineMacro(ss, "ROV_DEPTH_WRITE", rov_depth_write);
  DefineMacro(ss, "USE_DUAL_SOURCE", use_dual_source);
  DefineMacro(ss, "WRITE_MASK_AS_DEPTH", write_mask_as_depth);
  DefineMacro(ss, "FORCE_ROUND_TEXCOORDS", force_round_texcoords);
  DefineMacro(ss, "UPSCALED", upscaled);

  // Used for converting to normalized coordinates for sampling.
  ss << "CONSTANT float2 RCP_VRAM_SIZE = float2(1.0 / float(" << VRAM_WIDTH << "), 1.0 / float(" << VRAM_HEIGHT
     << "));\n";

  WriteColorConversionFunctions(ss);
  WriteBatchUniformBuffer(ss);
  DeclareTexture(ss, "samp0", 0);

  if (use_rov)
  {
    DeclareImage(ss, "rov_color", 0);
    if (use_rov_depth)
      DeclareImage(ss, "rov_depth", 1, true);
  }

  // Emit the 4x4 dither matrix as a constant array; the array syntax differs between GLSL and the other language.
  if (m_glsl)
    ss << "CONSTANT int[16] s_dither_values = int[16]( ";
  else
    ss << "CONSTANT int s_dither_values[] = {";
  for (u32 i = 0; i < 16; i++)
  {
    if (i > 0)
      ss << ", ";
    ss << DITHER_MATRIX[i / 4][i % 4];
  }
  if (m_glsl)
    ss << " );\n";
  else
    ss << "};\n";

  ss << R"(
uint3 ApplyDithering(uint2 coord, uint3 icol)
{
  #if (DITHERING_SCALED != 0 || UPSCALED == 0)
    uint2 fc = coord & uint2(3u, 3u);
  #else
    uint2 fc = uint2(float2(coord) * u_rcp_resolution_scale) & uint2(3u, 3u);
  #endif
  int offset = s_dither_values[fc.y * 4u + fc.x];
  return uint3(clamp((int3(icol) + offset) >> 3, 0, 31));
}

#if TEXTURED
CONSTANT float4 TRANSPARENT_PIXEL_COLOR = float4(0.0, 0.0, 0.0, 0.0);

#if PALETTE
  #define TEXPAGE_VALUE uint4
#else
  #define TEXPAGE_VALUE uint2
#endif

uint2 ApplyTextureWindow(uint2 coords)
{
  uint x = (uint(coords.x) & u_texture_window_and.x) | u_texture_window_or.x;
  uint y = (uint(coords.y) & u_texture_window_and.y) | u_texture_window_or.y;
  return uint2(x, y);
}

uint2 FloatToIntegerCoords(float2 coords)
{
  // With the vertex offset applied at 1x resolution scale, we want to round the texture coordinates.
  // Floor them otherwise, as it currently breaks when upscaling as the vertex offset is not applied.
  return uint2((UPSCALED == 0 || FORCE_ROUND_TEXCOORDS != 0) ? roundEven(coords) : floor(coords));
}

#if PAGE_TEXTURE

float4 SampleFromPageTexture(float2 coords)
{
  // Cached textures.
  uint2 icoord = ApplyTextureWindow(FloatToIntegerCoords(coords));
#if UPSCALED
  float2 fpart = frac(coords);
  coords = (float2(icoord) + fpart);
#else
  // Drop fractional part.
  coords = float2(icoord);
#endif

  // Normalize.
  coords = coords * (1.0f / 256.0f);
  return SAMPLE_TEXTURE(samp0, coords);
}

#endif

#if !PAGE_TEXTURE || TEXTURE_FILTERING

float4 SampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords)
{
  #if PAGE_TEXTURE
    return SampleFromPageTexture(coords);
  #elif PALETTE
    uint2 icoord = ApplyTextureWindow(FloatToIntegerCoords(coords));

    uint2 vicoord;
    #if PALETTE_4_BIT
      // 4bit will never wrap, since it's in the last texpage row.
      vicoord = uint2(texpage.x + (icoord.x / 4u), texpage.y + icoord.y);
    #elif PALETTE_8_BIT
      // 8bit can wrap in the X direction.
      vicoord = uint2((texpage.x + (icoord.x / 2u)) & 0x3FFu, texpage.y + icoord.y);
    #endif

    // load colour/palette
    // use texelFetch()/load for native resolution to work around point sampling precision
    // in some drivers, such as older AMD and Mali Midgard
    #if !UPSCALED
      float4 texel = LOAD_TEXTURE(samp0, int2(vicoord), 0);
    #else
      float4 texel = SAMPLE_TEXTURE_LEVEL(samp0, float2(vicoord) * RCP_VRAM_SIZE, 0.0);
    #endif
    uint vram_value = RGBA8ToRGBA5551(texel);

    // apply palette
    #if PALETTE_4_BIT
      uint subpixel = icoord.x & 3u;
      uint palette_index = (vram_value >> (subpixel * 4u)) & 0x0Fu;
      uint2 palette_icoord = uint2((texpage.z + palette_index), texpage.w);
    #elif PALETTE_8_BIT
      // can only wrap in X direction for 8-bit, 4-bit will fit in texpage size.
      uint subpixel = icoord.x & 1u;
      uint palette_index = (vram_value >> (subpixel * 8u)) & 0xFFu;
      uint2 palette_icoord = uint2(((texpage.z + palette_index) & 0x3FFu), texpage.w);
    #endif

    #if !UPSCALED
      return LOAD_TEXTURE(samp0, int2(palette_icoord), 0);
    #else
      return SAMPLE_TEXTURE_LEVEL(samp0, float2(palette_icoord) * RCP_VRAM_SIZE, 0.0);
    #endif
  #else
    // Direct texturing - usually render-to-texture effects.
    #if !UPSCALED
      uint2 icoord = ApplyTextureWindow(FloatToIntegerCoords(coords));
      uint2 vicoord = (texpage.xy + icoord) & uint2(1023, 511);
      return LOAD_TEXTURE(samp0, int2(vicoord), 0);
    #else
      // Coordinates are already upscaled, we need to downscale them to apply the texture
      // window, then re-upscale/offset. We can't round here, because it could result in
      // going outside of the texture window.
      float2 ncoords = coords * u_rcp_resolution_scale;
      float2 nfpart = frac(ncoords);
      uint2 nicoord = ApplyTextureWindow(uint2(floor(ncoords)));
      uint2 nvicoord = (texpage.xy + nicoord) & uint2(1023, 511);
      ncoords = (float2(nvicoord) + nfpart);
      return SAMPLE_TEXTURE_LEVEL(samp0, ncoords * RCP_VRAM_SIZE, 0.0);
    #endif
  #endif
}

#endif // !PAGE_TEXTURE || TEXTURE_FILTERING

#endif // TEXTURED
)";

  // ROV shaders write through the image instead of a colour output; dual-source needs two outputs.
  const u32 num_fragment_outputs = use_rov ? 0 : (use_dual_source ? 2 : 1);
  if (textured && page_texture)
  {
    if (texture_filtering != GPUTextureFilter::Nearest)
      WriteBatchTextureFilter(ss, texture_filtering);

    // Page textures take no v_texpage input, only the optional UV limits.
    if (uv_limits)
    {
      DeclareFragmentEntryPoint(ss, 1, 1, {{"nointerpolation", "float4 v_uv_limits"}}, true, num_fragment_outputs,
                                use_dual_source, write_mask_as_depth, msaa, per_sample_shading, false,
                                disable_color_perspective, shader_blending && !use_rov, use_rov);
    }
    else
    {
      DeclareFragmentEntryPoint(ss, 1, 1, {}, true, num_fragment_outputs, use_dual_source, write_mask_as_depth, msaa,
                                per_sample_shading, false, disable_color_perspective, shader_blending && !use_rov,
                                use_rov);
    }
  }
  else if (textured)
  {
    if (texture_filtering != GPUTextureFilter::Nearest)
      WriteBatchTextureFilter(ss, texture_filtering);

    // v_texpage is a uint4 for paletted modes and a uint2 otherwise, matching TEXPAGE_VALUE in the shader.
    if (uv_limits)
    {
      DeclareFragmentEntryPoint(ss, 1, 1,
                                {{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"},
                                 {"nointerpolation", "float4 v_uv_limits"}},
                                true, num_fragment_outputs, use_dual_source, write_mask_as_depth, msaa,
                                per_sample_shading, false, disable_color_perspective, shader_blending && !use_rov,
                                use_rov);
    }
    else
    {
      DeclareFragmentEntryPoint(ss, 1, 1, {{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"}}, true,
                                num_fragment_outputs, use_dual_source, write_mask_as_depth, msaa, per_sample_shading,
                                false, disable_color_perspective, shader_blending && !use_rov, use_rov);
    }
  }
  else
  {
    DeclareFragmentEntryPoint(ss, 1, 0, {}, true, num_fragment_outputs, use_dual_source, write_mask_as_depth, msaa,
                              per_sample_shading, false, disable_color_perspective, shader_blending && !use_rov,
                              use_rov);
  }

  ss << R"(
{
  uint3 vertcol = uint3(v_col0.rgb * float3(255.0, 255.0, 255.0));
  uint2 fragpos = uint2(v_pos.xy);

  bool semitransparent;
  uint3 icolor;
  float ialpha;
  float oalpha;

  #if INTERLACING
    #if INTERLACING_SCALED || !UPSCALED
      if ((fragpos.y & 1u) == u_interlaced_displayed_field)
        discard;
    #else
      if ((uint(v_pos.y * u_rcp_resolution_scale) & 1u) == u_interlaced_displayed_field)
        discard;
    #endif
  #endif

  #if TEXTURED
    float4 texcol;
    #if PAGE_TEXTURE && !TEXTURE_FILTERING
      #if UV_LIMITS
        texcol = SampleFromPageTexture(clamp(v_tex0, v_uv_limits.xy, v_uv_limits.zw));
      #else
        texcol = SampleFromPageTexture(v_tex0);
      #endif
      if (VECTOR_EQ(texcol, TRANSPARENT_PIXEL_COLOR))
        discard;

      ialpha = 1.0;
    #elif TEXTURE_FILTERING
      #if PAGE_TEXTURE
        FilteredSampleFromVRAM(VECTOR_BROADCAST(TEXPAGE_VALUE, 0u), v_tex0, v_uv_limits, texcol, ialpha);
      #else
        FilteredSampleFromVRAM(v_texpage, v_tex0, v_uv_limits, texcol, ialpha);
      #endif
      if (ialpha < 0.5)
        discard;
    #else
      #if UV_LIMITS
        texcol = SampleFromVRAM(v_texpage, clamp(v_tex0, v_uv_limits.xy, v_uv_limits.zw));
      #else
        texcol = SampleFromVRAM(v_texpage, v_tex0);
      #endif
      if (VECTOR_EQ(texcol, TRANSPARENT_PIXEL_COLOR))
        discard;

      ialpha = 1.0;
    #endif

    semitransparent = (texcol.a >= 0.5);

    // If not using true color, truncate the framebuffer colors to 5-bit.
    #if !TRUE_COLOR
      icolor = uint3(texcol.rgb * float3(255.0, 255.0, 255.0)) >> 3;
      icolor = (icolor * vertcol) >> 4;
      #if DITHERING
        icolor = ApplyDithering(fragpos, icolor);
      #else
        icolor = min(icolor >> 3, uint3(31u, 31u, 31u));
      #endif
    #else
      icolor = uint3(texcol.rgb * float3(255.0, 255.0, 255.0));
      icolor = (icolor * vertcol) >> 7;
      icolor = min(icolor, uint3(255u, 255u, 255u));
    #endif

    // Compute output alpha (mask bit)
    oalpha = float(u_set_mask_while_drawing ? 1 : int(semitransparent));
  #else
    // All pixels are semitransparent for untextured polygons.
    semitransparent = true;
    icolor = vertcol;
    ialpha = 1.0;

    #if DITHERING
      icolor = ApplyDithering(fragpos, icolor);
    #else
      #if !TRUE_COLOR
        icolor >>= 3;
      #endif
    #endif

    // However, the mask bit is cleared if set mask bit is false.
    oalpha = float(u_set_mask_while_drawing);
  #endif

#if SHADER_BLENDING
  #if USE_ROV
    BEGIN_ROV_REGION;
    float4 bg_col = ROV_LOAD(rov_color, fragpos);
    float4 o_col0;
    bool discarded = false;

    #if ROV_DEPTH_TEST
      float bg_depth = ROV_LOAD(rov_depth, fragpos).r;
      discarded = (v_pos.z > bg_depth);
    #endif
    #if CHECK_MASK_BIT
      discarded = discarded || (bg_col.a != 0.0);
    #endif
  #else
    float4 bg_col = LAST_FRAG_COLOR;
    #if CHECK_MASK_BIT
      if (bg_col.a != 0.0)
        discard;
    #endif
  #endif

  // Work in normalized space for true colour, matches HW blend.
  float4 fg_col = float4(float3(icolor), oalpha);
  #if TRUE_COLOR
    fg_col.rgb /= 255.0;
  #elif TRANSPARENCY // rgb not used in check-mask only
    bg_col.rgb = roundEven(bg_col.rgb * 31.0);
  #endif

  #if TEXTURE_FILTERING
    #if TRANSPARENCY_MODE == 0 || TRANSPARENCY_MODE == 3
      bg_col.rgb /= ialpha;
    #endif
    fg_col.rgb *= ialpha;
  #endif

  o_col0.a = fg_col.a;
  #if TRANSPARENCY_MODE == 0 // Half BG + Half FG.
    o_col0.rgb = (bg_col.rgb * 0.5) + (fg_col.rgb * 0.5);
  #elif TRANSPARENCY_MODE == 1 // BG + FG
    o_col0.rgb = bg_col.rgb + fg_col.rgb;
  #elif TRANSPARENCY_MODE == 2 // BG - FG
    o_col0.rgb = bg_col.rgb - fg_col.rgb;
  #elif TRANSPARENCY_MODE == 3 // BG + 1/4 FG.
    o_col0.rgb = bg_col.rgb + (fg_col.rgb * 0.25);
  #else
    o_col0.rgb = fg_col.rgb;
  #endif

  // 16-bit truncation.
  #if !TRUE_COLOR && TRANSPARENCY
    o_col0.rgb = floor(o_col0.rgb);
  #endif

  #if TRANSPARENCY
    // If pixel isn't marked as semitransparent, replace with previous colour.
    o_col0 = semitransparent ? o_col0 : fg_col;
  #endif

  // Normalize for non-true-color.
  #if !TRUE_COLOR
    o_col0.rgb /= 31.0;
  #endif

  #if USE_ROV
    if (!discarded)
    {
      ROV_STORE(rov_color, fragpos, o_col0);
      #if USE_ROV_DEPTH && ROV_DEPTH_WRITE
        ROV_STORE(rov_depth, fragpos, float4(v_pos.z, 0.0, 0.0, 0.0));
      #endif
    }
    END_ROV_REGION;
  #endif
#else
  // Premultiply alpha so we don't need to use a colour output for it.
  float premultiply_alpha = ialpha;
  #if TRANSPARENCY
    premultiply_alpha = ialpha * (semitransparent ? u_src_alpha_factor : 1.0);
  #endif

  float3 color;
  #if !TRUE_COLOR
    // We want to apply the alpha before the truncation to 16-bit, otherwise we'll be passing a 32-bit precision color
    // into the blend unit, which can cause a small amount of error to accumulate.
    color = floor(float3(icolor) * premultiply_alpha) / 31.0;
  #else
    // True color is actually simpler here since we want to preserve the precision.
    color = (float3(icolor) * premultiply_alpha) / 255.0;
  #endif

  #if TRANSPARENCY && TEXTURED
    // Apply semitransparency. If not a semitransparent texel, destination alpha is ignored.
    if (semitransparent)
    {
      #if USE_DUAL_SOURCE
        o_col0 = float4(color, oalpha);
        o_col1 = float4(0.0, 0.0, 0.0, u_dst_alpha_factor / ialpha);
      #else
        o_col0 = float4(color, oalpha);
      #endif

      #if WRITE_MASK_AS_DEPTH
        o_depth = oalpha * v_pos.z;
      #endif

      #if TRANSPARENCY_ONLY_OPAQUE
        discard;
      #endif
    }
    else
    {
      #if USE_DUAL_SOURCE
        o_col0 = float4(color, oalpha);
        o_col1 = float4(0.0, 0.0, 0.0, 1.0 - ialpha);
      #else
        o_col0 = float4(color, oalpha);
      #endif

      #if WRITE_MASK_AS_DEPTH
        o_depth = oalpha * v_pos.z;
      #endif

      #if TRANSPARENCY_ONLY_TRANSPARENT
        discard;
      #endif
    }
  #elif TRANSPARENCY
    // We shouldn't be rendering opaque geometry only when untextured, so no need to test/discard here.
    #if USE_DUAL_SOURCE
      o_col0 = float4(color, oalpha);
      o_col1 = float4(0.0, 0.0, 0.0, u_dst_alpha_factor / ialpha);
    #else
      o_col0 = float4(color, oalpha);
    #endif

    #if WRITE_MASK_AS_DEPTH
      o_depth = oalpha * v_pos.z;
    #endif
  #else
    // Non-transparency won't enable blending so we can write the mask here regardless.
    o_col0 = float4(color, oalpha);

    #if USE_DUAL_SOURCE
      o_col1 = float4(0.0, 0.0, 0.0, 1.0 - ialpha);
    #endif

    #if WRITE_MASK_AS_DEPTH
      o_depth = oalpha * v_pos.z;
    #endif
  #endif
#endif
}
)";

  return std::move(ss).str();
}
|
|
|
|
// Builds the fragment shader that extracts a region of VRAM into a colour target, and optionally a depth target.
//
//   resolution_scale - emitted as the RESOLUTION_SCALE constant, which also scales VRAM_SIZE.
//   multisamples     - emitted as MULTISAMPLES; values above 1 enable the MULTISAMPLING path, which averages all
//                      samples of a texel when loading.
//   color_24bit      - defines COLOR_24BIT: the output colour is rebuilt from adjacent 16-bit texels by
//                      SampleVRAM24() instead of being loaded directly.
//   depth_buffer     - defines DEPTH_BUFFER: declares a second texture (samp1) and a second output that receives
//                      the value returned by LoadDepth().
//
// Returns the complete shader source as a string.
std::string GPU_HW_ShaderGen::GenerateVRAMExtractFragmentShader(u32 resolution_scale, u32 multisamples,
                                                                bool color_24bit, bool depth_buffer) const
{
  const bool msaa = (multisamples > 1);

  std::stringstream ss;
  WriteHeader(ss);
  WriteColorConversionFunctions(ss);

  DefineMacro(ss, "COLOR_24BIT", color_24bit);
  DefineMacro(ss, "DEPTH_BUFFER", depth_buffer);
  DefineMacro(ss, "MULTISAMPLING", msaa);
  ss << "CONSTANT uint RESOLUTION_SCALE = " << resolution_scale << "u;\n";
  ss << "CONSTANT uint2 VRAM_SIZE = uint2(" << VRAM_WIDTH << ", " << VRAM_HEIGHT << ") * RESOLUTION_SCALE;\n";
  ss << "CONSTANT uint MULTISAMPLES = " << multisamples << "u;\n";

  DeclareUniformBuffer(ss, {"uint2 u_vram_offset", "float u_skip_x", "float u_line_skip"}, true);
  DeclareTexture(ss, "samp0", 0, msaa);
  if (depth_buffer)
    DeclareTexture(ss, "samp1", 1, msaa);

  ss << R"(
float4 LoadVRAM(int2 coords)
{
#if MULTISAMPLING
  float4 value = LOAD_TEXTURE_MS(samp0, coords, 0u);
  FOR_UNROLL (uint sample_index = 1u; sample_index < MULTISAMPLES; sample_index++)
    value += LOAD_TEXTURE_MS(samp0, coords, sample_index);
  value /= float(MULTISAMPLES);
  return value;
#else
  return LOAD_TEXTURE(samp0, coords, 0);
#endif
}

#if DEPTH_BUFFER
float LoadDepth(int2 coords)
{
  // Need to duplicate because different types in different languages...
#if MULTISAMPLING
  float value = LOAD_TEXTURE_MS(samp1, coords, 0u).r;
  FOR_UNROLL (uint sample_index = 1u; sample_index < MULTISAMPLES; sample_index++)
    value += LOAD_TEXTURE_MS(samp1, coords, sample_index).r;
  value /= float(MULTISAMPLES);
  return value;
#else
  return LOAD_TEXTURE(samp1, coords, 0).r;
#endif
}
#endif

float3 SampleVRAM24(uint2 icoords)
{
  // load adjacent 16-bit texels
  uint2 clamp_size = uint2(1024, 512);

  // relative to start of scanout
  uint2 vram_coords = u_vram_offset + uint2((icoords.x * 3u) / 2u, icoords.y);
  uint s0 = RGBA8ToRGBA5551(LoadVRAM(int2((vram_coords % clamp_size) * RESOLUTION_SCALE)));
  uint s1 = RGBA8ToRGBA5551(LoadVRAM(int2(((vram_coords + uint2(1, 0)) % clamp_size) * RESOLUTION_SCALE)));

  // select which part of the combined 16-bit texels we are currently shading
  uint s1s0 = ((s1 << 16) | s0) >> ((icoords.x & 1u) * 8u);

  // extract components and normalize
  return float3(float(s1s0 & 0xFFu) / 255.0, float((s1s0 >> 8u) & 0xFFu) / 255.0,
                float((s1s0 >> 16u) & 0xFFu) / 255.0);
}
)";

  // One colour output, plus a second output carrying depth when requested.
  DeclareFragmentEntryPoint(ss, 0, 1, {}, true, depth_buffer ? 2 : 1);
  ss << R"(
{
  // Have to floor because SV_Position is at the pixel center.
  float2 v_pos_floored = floor(v_pos.xy);
  uint2 icoords = uint2(v_pos_floored.x + u_skip_x, v_pos_floored.y * u_line_skip);
  int2 wrapped_coords = int2((icoords + u_vram_offset) % VRAM_SIZE);

  #if COLOR_24BIT
    o_col0 = float4(SampleVRAM24(icoords), 1.0);
  #else
    o_col0 = float4(LoadVRAM(wrapped_coords).rgb, 1.0);
  #endif

  #if DEPTH_BUFFER
    o_col1 = float4(LoadDepth(wrapped_coords), 0.0, 0.0, 0.0);
  #endif
}
)";

  return std::move(ss).str();
}
|
|
|
|
std::string GPU_HW_ShaderGen::GenerateVRAMReplacementBlitFragmentShader() const
|
|
{
|
|
std::stringstream ss;
|
|
WriteHeader(ss);
|
|
DeclareTexture(ss, "samp0", 0);
|
|
DeclareFragmentEntryPoint(ss, 0, 1);
|
|
|
|
ss << R"(
|
|
{
|
|
o_col0 = SAMPLE_TEXTURE(samp0, v_tex0);
|
|
}
|
|
)";
|
|
|
|
return std::move(ss).str();
|
|
}
|
|
|
|
std::string GPU_HW_ShaderGen::GenerateWireframeGeometryShader() const
|
|
{
|
|
std::stringstream ss;
|
|
WriteHeader(ss);
|
|
|
|
if (m_glsl)
|
|
{
|
|
ss << R"(
|
|
layout(triangles) in;
|
|
layout(line_strip, max_vertices = 6) out;
|
|
|
|
void main()
|
|
{
|
|
gl_Position = gl_in[0].gl_Position;
|
|
EmitVertex();
|
|
gl_Position = gl_in[1].gl_Position;
|
|
EmitVertex();
|
|
EndPrimitive();
|
|
gl_Position = gl_in[1].gl_Position;
|
|
EmitVertex();
|
|
gl_Position = gl_in[2].gl_Position;
|
|
EmitVertex();
|
|
EndPrimitive();
|
|
gl_Position = gl_in[2].gl_Position;
|
|
EmitVertex();
|
|
gl_Position = gl_in[0].gl_Position;
|
|
EmitVertex();
|
|
EndPrimitive();
|
|
}
|
|
)";
|
|
}
|
|
else
|
|
{
|
|
ss << R"(
|
|
struct GSInput
|
|
{
|
|
float4 col0 : COLOR0;
|
|
float4 pos : SV_Position;
|
|
};
|
|
|
|
struct GSOutput
|
|
{
|
|
float4 pos : SV_Position;
|
|
};
|
|
|
|
GSOutput GetVertex(GSInput vi)
|
|
{
|
|
GSOutput vo;
|
|
vo.pos = vi.pos;
|
|
return vo;
|
|
}
|
|
|
|
[maxvertexcount(6)]
|
|
void main(triangle GSInput input[3], inout LineStream<GSOutput> output)
|
|
{
|
|
output.Append(GetVertex(input[0]));
|
|
output.Append(GetVertex(input[1]));
|
|
output.RestartStrip();
|
|
|
|
output.Append(GetVertex(input[1]));
|
|
output.Append(GetVertex(input[2]));
|
|
output.RestartStrip();
|
|
|
|
output.Append(GetVertex(input[2]));
|
|
output.Append(GetVertex(input[0]));
|
|
output.RestartStrip();
|
|
}
|
|
)";
|
|
}
|
|
|
|
return std::move(ss).str();
|
|
}
|
|
|
|
std::string GPU_HW_ShaderGen::GenerateWireframeFragmentShader() const
|
|
{
|
|
std::stringstream ss;
|
|
WriteHeader(ss);
|
|
|
|
DeclareFragmentEntryPoint(ss, 0, 0);
|
|
ss << R"(
|
|
{
|
|
o_col0 = float4(1.0, 1.0, 1.0, 0.5);
|
|
}
|
|
)";
|
|
|
|
return std::move(ss).str();
|
|
}
|
|
|
|
// Builds the fragment shader used to read VRAM back as packed 16-bit pixels.
//
// Each output pixel encodes two horizontally adjacent 16-bit (RGBA5551) VRAM pixels: the left pixel's low/high
// bytes go to the first two channels and the right pixel's low/high bytes to the last two.
//
//   resolution_scale - emitted as RESOLUTION_SCALE; when it is not 1, each pixel is produced by averaging a
//                      RESOLUTION_SCALE x RESOLUTION_SCALE block of texels before conversion.
//   multisamples     - emitted as MULTISAMPLES; values above 1 enable the MULTISAMPLING path, which averages all
//                      samples of a texel when loading.
//
// Returns the complete shader source as a string.
std::string GPU_HW_ShaderGen::GenerateVRAMReadFragmentShader(u32 resolution_scale, u32 multisamples) const
{
  const bool msaa = (multisamples > 1);

  std::stringstream ss;
  WriteHeader(ss);
  WriteColorConversionFunctions(ss);

  DefineMacro(ss, "MULTISAMPLING", msaa);
  ss << "CONSTANT uint RESOLUTION_SCALE = " << resolution_scale << "u;\n";
  ss << "CONSTANT uint MULTISAMPLES = " << multisamples << "u;\n";

  DeclareUniformBuffer(ss, {"uint2 u_base_coords", "uint2 u_size"}, true);
  DeclareTexture(ss, "samp0", 0, msaa);

  ss << R"(
float4 LoadVRAM(int2 coords)
{
#if MULTISAMPLING
  float4 value = LOAD_TEXTURE_MS(samp0, coords, 0u);
  FOR_UNROLL (uint sample_index = 1u; sample_index < MULTISAMPLES; sample_index++)
    value += LOAD_TEXTURE_MS(samp0, coords, sample_index);
  value /= float(MULTISAMPLES);
  return value;
#else
  return LOAD_TEXTURE(samp0, coords, 0);
#endif
}

uint SampleVRAM(uint2 coords)
{
  if (RESOLUTION_SCALE == 1u)
    return RGBA8ToRGBA5551(LoadVRAM(int2(coords)));

  // Box filter for downsampling.
  float4 value = float4(0.0, 0.0, 0.0, 0.0);
  uint2 base_coords = coords * uint2(RESOLUTION_SCALE, RESOLUTION_SCALE);
  for (uint offset_x = 0u; offset_x < RESOLUTION_SCALE; offset_x++)
  {
    for (uint offset_y = 0u; offset_y < RESOLUTION_SCALE; offset_y++)
      value += LoadVRAM(int2(base_coords + uint2(offset_x, offset_y)));
  }
  value /= float(RESOLUTION_SCALE * RESOLUTION_SCALE);
  return RGBA8ToRGBA5551(value);
}
)";

  DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1);
  ss << R"(
{
  uint2 sample_coords = uint2(uint(v_pos.x) * 2u, uint(v_pos.y));
  sample_coords += u_base_coords;

  // We're encoding as 32-bit, so the output width is halved and we pack two 16-bit pixels in one 32-bit pixel.
  uint left = SampleVRAM(sample_coords);
  uint right = SampleVRAM(uint2(sample_coords.x + 1u, sample_coords.y));

  o_col0 = float4(float(left & 0xFFu), float((left >> 8) & 0xFFu),
                  float(right & 0xFFu), float((right >> 8) & 0xFFu))
            / float4(255.0, 255.0, 255.0, 255.0);
})";

  return std::move(ss).str();
}
|
|
|
|
std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader(bool use_buffer, bool use_ssbo, bool write_mask_as_depth,
|
|
bool write_depth_as_rt) const
|
|
{
|
|
Assert(!write_mask_as_depth || (write_mask_as_depth != write_depth_as_rt));
|
|
|
|
std::stringstream ss;
|
|
WriteHeader(ss);
|
|
WriteColorConversionFunctions(ss);
|
|
|
|
DefineMacro(ss, "WRITE_MASK_AS_DEPTH", write_mask_as_depth);
|
|
DefineMacro(ss, "WRITE_DEPTH_AS_RT", write_depth_as_rt);
|
|
DefineMacro(ss, "USE_BUFFER", use_buffer);
|
|
|
|
ss << "CONSTANT float2 VRAM_SIZE = float2(" << VRAM_WIDTH << ".0, " << VRAM_HEIGHT << ".0);\n";
|
|
|
|
DeclareUniformBuffer(ss,
|
|
{"float2 u_base_coords", "float2 u_end_coords", "float2 u_size", "float u_resolution_scale",
|
|
"uint u_buffer_base_offset", "uint u_mask_or_bits", "float u_depth_value"},
|
|
true);
|
|
|
|
if (!use_buffer)
|
|
{
|
|
DeclareTexture(ss, "samp0", 0, false, true, true);
|
|
}
|
|
else if (use_ssbo && m_glsl)
|
|
{
|
|
ss << "layout(std430";
|
|
if (IsVulkan())
|
|
ss << ", set = 0, binding = 0";
|
|
else if (IsMetal())
|
|
ss << ", set = 0, binding = 1";
|
|
else if (m_use_glsl_binding_layout)
|
|
ss << ", binding = 0";
|
|
|
|
ss << ") readonly restrict buffer SSBO {\n";
|
|
ss << " uint ssbo_data[];\n";
|
|
ss << "};\n\n";
|
|
|
|
ss << "#define GET_VALUE(buffer_offset) (ssbo_data[(buffer_offset) / 2u] >> (((buffer_offset) % 2u) * 16u))\n\n";
|
|
}
|
|
else
|
|
{
|
|
DeclareTextureBuffer(ss, "samp0", 0, true, true);
|
|
ss << "#define GET_VALUE(buffer_offset) (LOAD_TEXTURE_BUFFER(samp0, int(buffer_offset)).r)\n\n";
|
|
}
|
|
|
|
DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1 + BoolToUInt32(write_depth_as_rt), false, write_mask_as_depth);
|
|
ss << R"(
|
|
{
|
|
float2 coords = floor(v_pos.xy / u_resolution_scale);
|
|
|
|
// make sure it's not oversized and out of range
|
|
if ((coords.x < u_base_coords.x && coords.x >= u_end_coords.x) ||
|
|
(coords.y < u_base_coords.y && coords.y >= u_end_coords.y))
|
|
{
|
|
discard;
|
|
}
|
|
|
|
// find offset from the start of the row/column
|
|
float2 offset;
|
|
offset.x = (coords.x < u_base_coords.x) ? (VRAM_SIZE.x - u_base_coords.x + coords.x) : (coords.x - u_base_coords.x);
|
|
offset.y = (coords.y < u_base_coords.y) ? (VRAM_SIZE.y - u_base_coords.y + coords.y) : (coords.y - u_base_coords.y);
|
|
|
|
#if !USE_BUFFER
|
|
uint value = LOAD_TEXTURE(samp0, int2(offset), 0).x;
|
|
#else
|
|
uint buffer_offset = u_buffer_base_offset + uint((offset.y * u_size.x) + offset.x);
|
|
uint value = GET_VALUE(buffer_offset) | u_mask_or_bits;
|
|
#endif
|
|
|
|
o_col0 = RGBA5551ToRGBA8(value);
|
|
#if WRITE_MASK_AS_DEPTH
|
|
o_depth = (o_col0.a == 1.0) ? u_depth_value : 0.0;
|
|
#elif WRITE_DEPTH_AS_RT
|
|
o_col1 = float4(1.0f, 0.0f, 0.0f, 0.0f);
|
|
#endif
|
|
})";
|
|
|
|
return std::move(ss).str();
|
|
}
|
|
|
|
std::string GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader(bool write_mask_as_depth, bool write_depth_as_rt) const
|
|
{
|
|
Assert(!write_mask_as_depth || (write_mask_as_depth != write_depth_as_rt));
|
|
|
|
// TODO: This won't currently work because we can't bind the texture to both the shader and framebuffer.
|
|
const bool msaa = false;
|
|
|
|
std::stringstream ss;
|
|
WriteHeader(ss);
|
|
DefineMacro(ss, "WRITE_MASK_AS_DEPTH", write_mask_as_depth);
|
|
DefineMacro(ss, "WRITE_DEPTH_AS_RT", write_depth_as_rt);
|
|
DefineMacro(ss, "MSAA_COPY", msaa);
|
|
|
|
DeclareUniformBuffer(ss,
|
|
{"float2 u_src_coords", "float2 u_dst_coords", "float2 u_end_coords", "float2 u_vram_size",
|
|
"float u_resolution_scale", "bool u_set_mask_bit", "float u_depth_value"},
|
|
true);
|
|
|
|
DeclareTexture(ss, "samp0", 0, msaa);
|
|
DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1 + BoolToUInt32(write_depth_as_rt), false, write_mask_as_depth, false,
|
|
false, msaa);
|
|
ss << R"(
|
|
{
|
|
float2 dst_coords = floor(v_pos.xy);
|
|
|
|
// make sure it's not oversized and out of range
|
|
if ((dst_coords.x < u_dst_coords.x && dst_coords.x >= u_end_coords.x) ||
|
|
(dst_coords.y < u_dst_coords.y && dst_coords.y >= u_end_coords.y))
|
|
{
|
|
discard;
|
|
}
|
|
|
|
// find offset from the start of the row/column
|
|
float2 offset;
|
|
offset.x = (dst_coords.x < u_dst_coords.x) ? (u_vram_size.x - u_dst_coords.x + dst_coords.x) : (dst_coords.x - u_dst_coords.x);
|
|
offset.y = (dst_coords.y < u_dst_coords.y) ? (u_vram_size.y - u_dst_coords.y + dst_coords.y) : (dst_coords.y - u_dst_coords.y);
|
|
|
|
// find the source coordinates to copy from
|
|
float2 offset_coords = u_src_coords + offset;
|
|
float2 src_coords = offset_coords - (floor(offset_coords / u_vram_size) * u_vram_size);
|
|
|
|
// sample and apply mask bit
|
|
#if MSAA_COPY
|
|
float4 color = LOAD_TEXTURE_MS(samp0, int2(src_coords), f_sample_index);
|
|
#else
|
|
float4 color = LOAD_TEXTURE(samp0, int2(src_coords), 0);
|
|
#endif
|
|
o_col0 = float4(color.xyz, u_set_mask_bit ? 1.0 : color.a);
|
|
#if WRITE_MASK_AS_DEPTH
|
|
o_depth = (u_set_mask_bit ? 1.0f : ((o_col0.a == 1.0) ? u_depth_value : 0.0));
|
|
#elif WRITE_DEPTH_AS_RT
|
|
o_col1 = float4(1.0f, 0.0f, 0.0f, 0.0f);
|
|
#endif
|
|
})";
|
|
|
|
return std::move(ss).str();
|
|
}
|
|
|
|
std::string GPU_HW_ShaderGen::GenerateVRAMFillFragmentShader(bool wrapped, bool interlaced, bool write_mask_as_depth,
|
|
bool write_depth_as_rt) const
|
|
{
|
|
Assert(!write_mask_as_depth || (write_mask_as_depth != write_depth_as_rt));
|
|
|
|
std::stringstream ss;
|
|
WriteHeader(ss);
|
|
DefineMacro(ss, "WRITE_MASK_AS_DEPTH", write_mask_as_depth);
|
|
DefineMacro(ss, "WRITE_DEPTH_AS_RT", write_depth_as_rt);
|
|
DefineMacro(ss, "WRAPPED", wrapped);
|
|
DefineMacro(ss, "INTERLACED", interlaced);
|
|
|
|
DeclareUniformBuffer(
|
|
ss, {"uint2 u_dst_coords", "uint2 u_end_coords", "float4 u_fill_color", "uint u_interlaced_displayed_field"}, true);
|
|
|
|
DeclareFragmentEntryPoint(ss, 0, 1, {}, interlaced || wrapped, 1 + BoolToUInt32(write_depth_as_rt), false,
|
|
write_mask_as_depth, false, false, false);
|
|
ss << R"(
|
|
{
|
|
#if INTERLACED || WRAPPED
|
|
uint2 dst_coords = uint2(v_pos.xy);
|
|
#endif
|
|
|
|
#if INTERLACED
|
|
if ((dst_coords.y & 1u) == u_interlaced_displayed_field)
|
|
discard;
|
|
#endif
|
|
|
|
#if WRAPPED
|
|
// make sure it's not oversized and out of range
|
|
if ((dst_coords.x < u_dst_coords.x && dst_coords.x >= u_end_coords.x) ||
|
|
(dst_coords.y < u_dst_coords.y && dst_coords.y >= u_end_coords.y))
|
|
{
|
|
discard;
|
|
}
|
|
#endif
|
|
|
|
o_col0 = u_fill_color;
|
|
#if WRITE_MASK_AS_DEPTH
|
|
o_depth = u_fill_color.a;
|
|
#elif WRITE_DEPTH_AS_RT
|
|
o_col1 = float4(1.0f, 0.0f, 0.0f, 0.0f);
|
|
#endif
|
|
})";
|
|
|
|
return std::move(ss).str();
|
|
}
|
|
|
|
std::string GPU_HW_ShaderGen::GenerateVRAMUpdateDepthFragmentShader(bool msaa) const
|
|
{
|
|
std::stringstream ss;
|
|
WriteHeader(ss);
|
|
DefineMacro(ss, "MULTISAMPLING", msaa);
|
|
DeclareTexture(ss, "samp0", 0, msaa);
|
|
DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 0, false, true, false, false, msaa);
|
|
|
|
ss << R"(
|
|
{
|
|
#if MULTISAMPLING
|
|
o_depth = LOAD_TEXTURE_MS(samp0, int2(v_pos.xy), f_sample_index).a;
|
|
#else
|
|
o_depth = LOAD_TEXTURE(samp0, int2(v_pos.xy), 0).a;
|
|
#endif
|
|
}
|
|
)";
|
|
|
|
return std::move(ss).str();
|
|
}
|
|
|
|
std::string GPU_HW_ShaderGen::GenerateVRAMClearDepthFragmentShader(bool write_depth_as_rt) const
|
|
{
|
|
std::stringstream ss;
|
|
WriteHeader(ss);
|
|
DefineMacro(ss, "WRITE_DEPTH_AS_RT", write_depth_as_rt);
|
|
DeclareFragmentEntryPoint(ss, 0, 1, {}, false, BoolToUInt32(write_depth_as_rt), false, false, false, false, false);
|
|
|
|
ss << R"(
|
|
{
|
|
#if WRITE_DEPTH_AS_RT
|
|
o_col0 = float4(1.0f, 0.0f, 0.0f, 0.0f);
|
|
#endif
|
|
}
|
|
)";
|
|
|
|
return std::move(ss).str();
|
|
}
|
|
|
|
void GPU_HW_ShaderGen::WriteAdaptiveDownsampleUniformBuffer(std::stringstream& ss) const
|
|
{
|
|
DeclareUniformBuffer(ss, {"float2 u_uv_min", "float2 u_uv_max", "float2 u_pixel_size", "float u_lod"}, true);
|
|
}
|
|
|
|
std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleVertexShader() const
|
|
{
|
|
std::stringstream ss;
|
|
WriteHeader(ss);
|
|
WriteAdaptiveDownsampleUniformBuffer(ss);
|
|
DeclareVertexEntryPoint(ss, {}, 0, 1, {}, true);
|
|
ss << R"(
|
|
{
|
|
v_tex0 = float2(float((v_id << 1) & 2u), float(v_id & 2u));
|
|
v_pos = float4(v_tex0 * float2(2.0f, -2.0f) + float2(-1.0f, 1.0f), 0.0f, 1.0f);
|
|
v_tex0 = u_uv_min + (u_uv_max - u_uv_min) * v_tex0;
|
|
#if API_OPENGL || API_OPENGL_ES || API_VULKAN
|
|
v_pos.y = -v_pos.y;
|
|
#endif
|
|
}
|
|
)";
|
|
return std::move(ss).str();
|
|
}
|
|
|
|
std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleMipFragmentShader() const
|
|
{
|
|
std::stringstream ss;
|
|
WriteHeader(ss);
|
|
WriteAdaptiveDownsampleUniformBuffer(ss);
|
|
DeclareTexture(ss, "samp0", 0, false);
|
|
DeclareFragmentEntryPoint(ss, 0, 1);
|
|
ss << R"(
|
|
{
|
|
// Gather 4 samples for bilinear filtering.
|
|
float2 uv = v_tex0 - u_pixel_size; // * 0.25 done on CPU
|
|
float4 c00 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(0, 0));
|
|
float4 c01 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(0, 1));
|
|
float4 c10 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(1, 0));
|
|
float4 c11 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(1, 1));
|
|
float3 cavg = (c00.rgb + c01.rgb + c10.rgb + c11.rgb) * 0.25;
|
|
|
|
// Compute variance between pixels with logarithmic scaling to aggressively reduce along the edges.
|
|
float variance =
|
|
1.0 - log2(1000.0 * (dot(c00.rgb - cavg.rgb, c00.rgb - cavg.rgb) + dot(c01.rgb - cavg, c01.rgb - cavg) +
|
|
dot(c10.rgb - cavg.rgb, c10.rgb - cavg.rgb) + dot(c11.rgb - cavg, c11.rgb - cavg)) +
|
|
1.0);
|
|
|
|
// Write variance to the alpha channel, weighted by the previous LOD's variance.
|
|
// There's no variance in the first LOD.
|
|
float aavg = (c00.a + c01.a + c10.a + c11.a) * 0.25;
|
|
o_col0.rgb = cavg.rgb;
|
|
o_col0.a = variance * ((u_lod == 0.0) ? 1.0 : aavg);
|
|
}
|
|
)";
|
|
|
|
return std::move(ss).str();
|
|
}
|
|
|
|
std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleBlurFragmentShader() const
|
|
{
|
|
std::stringstream ss;
|
|
WriteHeader(ss);
|
|
WriteColorConversionFunctions(ss);
|
|
WriteAdaptiveDownsampleUniformBuffer(ss);
|
|
DeclareTexture(ss, "samp0", 0, false);
|
|
DeclareFragmentEntryPoint(ss, 0, 1);
|
|
ss << R"(
|
|
{
|
|
// Bog standard blur kernel unrolled for speed:
|
|
// [ 0.0625, 0.125, 0.0625
|
|
// 0.125, 0.25, 0.125
|
|
// 0.0625, 0.125, 0.0625 ]
|
|
//
|
|
// Can't use offset for sampling here, because we need to clamp, and the source texture is larger.
|
|
//
|
|
#define KERNEL_SAMPLE(weight, xoff, yoff) \
|
|
(weight) * SAMPLE_TEXTURE_LEVEL( \
|
|
samp0, clamp((v_tex0 + float2(float(xoff), float(yoff)) * u_pixel_size), u_uv_min, u_uv_max), 0.0) \
|
|
.a
|
|
float blur = KERNEL_SAMPLE(0.0625, -1, -1);
|
|
blur += KERNEL_SAMPLE(0.0625, 1, -1);
|
|
blur += KERNEL_SAMPLE(0.0625, -1, 1);
|
|
blur += KERNEL_SAMPLE(0.0625, 1, 1);
|
|
blur += KERNEL_SAMPLE(0.125, 0, -1);
|
|
blur += KERNEL_SAMPLE(0.125, -1, 0);
|
|
blur += KERNEL_SAMPLE(0.125, 1, 0);
|
|
blur += KERNEL_SAMPLE(0.125, 0, 1);
|
|
blur += KERNEL_SAMPLE(0.25, 0, 0);
|
|
o_col0 = float4(blur, blur, blur, blur);
|
|
}
|
|
)";
|
|
|
|
return std::move(ss).str();
|
|
}
|
|
|
|
std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleCompositeFragmentShader() const
|
|
{
|
|
std::stringstream ss;
|
|
WriteHeader(ss);
|
|
WriteAdaptiveDownsampleUniformBuffer(ss);
|
|
DeclareTexture(ss, "samp0", 0, false);
|
|
DeclareTexture(ss, "samp1", 1, false);
|
|
DeclareFragmentEntryPoint(ss, 0, 1, {}, true);
|
|
ss << R"(
|
|
{
|
|
// Sample the mip level determined by the weight texture. samp0 is trilinear, so it will blend between levels.
|
|
o_col0 = float4(SAMPLE_TEXTURE_LEVEL(samp0, v_tex0, SAMPLE_TEXTURE(samp1, v_tex0).r * u_lod).rgb, 1.0);
|
|
}
|
|
)";
|
|
|
|
return std::move(ss).str();
|
|
}
|
|
|
|
// Generates the fragment shader that box-filters `samp0` down by an integer factor.
//
//   factor - side length of the square of source texels averaged per output pixel; emitted into the
//            shader as the constant FACTOR.
//
// Each output pixel averages the RGB of the FACTOR x FACTOR texels starting at
// u_base_coords + pixel * FACTOR, and is written with alpha 1.0. Returns the complete shader source.
std::string GPU_HW_ShaderGen::GenerateBoxSampleDownsampleFragmentShader(u32 factor) const
{
  std::stringstream code;

  WriteHeader(code);
  DeclareUniformBuffer(code, {"uint2 u_base_coords"}, true);
  DeclareTexture(code, "samp0", 0, false);

  // The factor becomes a shader constant, so it bounds both loops in the generated source.
  code << "CONSTANT uint FACTOR = ";
  code << factor;
  code << "u;\n";

  DeclareFragmentEntryPoint(code, 0, 1, {}, true);
  code << R"(
{
  float3 color = float3(0.0, 0.0, 0.0);
  uint2 base_coords = u_base_coords + uint2(v_pos.xy) * uint2(FACTOR, FACTOR);
  for (uint offset_x = 0u; offset_x < FACTOR; offset_x++)
  {
    for (uint offset_y = 0u; offset_y < FACTOR; offset_y++)
      color += LOAD_TEXTURE(samp0, int2(base_coords + uint2(offset_x, offset_y)), 0).rgb;
  }
  color /= float(FACTOR * FACTOR);
  o_col0 = float4(color, 1.0);
}
)";

  return std::move(code).str();
}
|
|
|
|
std::string GPU_HW_ShaderGen::GenerateReplacementMergeFragmentShader(bool replacement, bool semitransparent,
|
|
bool bilinear_filter) const
|
|
{
|
|
std::stringstream ss;
|
|
WriteHeader(ss);
|
|
DefineMacro(ss, "REPLACEMENT", replacement);
|
|
DefineMacro(ss, "SEMITRANSPARENT", semitransparent);
|
|
DefineMacro(ss, "BILINEAR_FILTER", bilinear_filter);
|
|
DeclareUniformBuffer(ss, {"float4 u_src_rect", "float4 u_texture_size"}, true);
|
|
DeclareTexture(ss, "samp0", 0);
|
|
DeclareFragmentEntryPoint(ss, 0, 1);
|
|
|
|
ss << R"(
|
|
{
|
|
float2 start_coords = u_src_rect.xy + v_tex0 * u_src_rect.zw;
|
|
|
|
#if BILINEAR_FILTER
|
|
// Compute the coordinates of the four texels we will be interpolating between.
|
|
// Clamp this to the triangle texture coordinates.
|
|
float2 coords = start_coords * u_texture_size.xy;
|
|
float2 texel_top_left = frac(coords) - float2(0.5, 0.5);
|
|
float2 texel_offset = sign(texel_top_left);
|
|
float4 fcoords = max(coords.xyxy + float4(0.0, 0.0, texel_offset.x, texel_offset.y),
|
|
float4(0.0, 0.0, 0.0, 0.0)) * u_texture_size.zwzw;
|
|
|
|
// Load four texels.
|
|
float4 s00 = SAMPLE_TEXTURE_LEVEL(samp0, fcoords.xy, 0.0);
|
|
float4 s10 = SAMPLE_TEXTURE_LEVEL(samp0, fcoords.zy, 0.0);
|
|
float4 s01 = SAMPLE_TEXTURE_LEVEL(samp0, fcoords.xw, 0.0);
|
|
float4 s11 = SAMPLE_TEXTURE_LEVEL(samp0, fcoords.zw, 0.0);
|
|
|
|
// Bilinearly interpolate.
|
|
float2 weights = abs(texel_top_left);
|
|
float4 color = lerp(lerp(s00, s10, weights.x), lerp(s01, s11, weights.x), weights.y);
|
|
float orig_alpha = float(color.a > 0.0);
|
|
|
|
#if !SEMITRANSPARENT
|
|
// Compute alpha from how many texels aren't pixel color 0000h.
|
|
float a00 = float(VECTOR_NEQ(s00, float4(0.0, 0.0, 0.0, 0.0)));
|
|
float a10 = float(VECTOR_NEQ(s10, float4(0.0, 0.0, 0.0, 0.0)));
|
|
float a01 = float(VECTOR_NEQ(s01, float4(0.0, 0.0, 0.0, 0.0)));
|
|
float a11 = float(VECTOR_NEQ(s11, float4(0.0, 0.0, 0.0, 0.0)));
|
|
color.a = lerp(lerp(a00, a10, weights.x), lerp(a01, a11, weights.x), weights.y);
|
|
|
|
// Compensate for partially transparent sampling.
|
|
color.rgb /= (color.a != 0.0) ? color.a : 1.0;
|
|
|
|
// Use binary alpha.
|
|
color.a = (color.a >= 0.5) ? 1.0 : 0.0;
|
|
#endif
|
|
#else
|
|
float4 color = SAMPLE_TEXTURE_LEVEL(samp0, start_coords, 0.0);
|
|
float orig_alpha = color.a;
|
|
#endif
|
|
o_col0.rgb = color.rgb;
|
|
|
|
// Alpha processing.
|
|
#if REPLACEMENT
|
|
#if SEMITRANSPARENT
|
|
// Map anything not 255 to 1 for semitransparent, otherwise zero for opaque.
|
|
o_col0.a = (color.a <= 0.95f) ? 1.0f : 0.0f;
|
|
o_col0.a = VECTOR_EQ(color, float4(0.0, 0.0, 0.0, 0.0)) ? 0.0f : o_col0.a;
|
|
#else
|
|
// Map anything with an alpha below 0.5 to transparent.
|
|
// Leave (0,0,0,0) as 0000 for opaque replacements for cutout alpha.
|
|
float alpha = float(color.a >= 0.5);
|
|
o_col0.rgb = lerp(float3(0.0, 0.0, 0.0), o_col0.rgb, alpha);
|
|
|
|
// We can't simply clear the alpha channel unconditionally here, because that
|
|
// would result in any black pixels with zero alpha being transparency-culled.
|
|
// Instead, we set it to a minimum value (2/255 in case of rounding error, I
|
|
// don't trust drivers here) so that transparent polygons in the source still
|
|
// set bit 15 to zero in the framebuffer, but are not transparency-culled.
|
|
// Silent Hill needs it to be zero, I'm not aware of anything that needs
|
|
// specific values yet. If it did, we'd need a different dumping technique.
|
|
o_col0.a = lerp(0.0, 2.0 / 255.0, alpha);
|
|
#endif
|
|
#else
|
|
// Preserve original bit 15 for non-replacements.
|
|
o_col0.a = orig_alpha;
|
|
#endif
|
|
}
|
|
)";
|
|
|
|
return std::move(ss).str();
|
|
} |