2024-02-28 19:08:20 -08:00

777 lines
20 KiB
HLSL

/*
Copyright (C) 2003 Ryan A. Nunn
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
//
// ** 2xSaI **
//
// 2xSaI algorithm is Copyright (c) 1999-2001 by Derek Liauw Kie Fa.
//
// 2xSaI is free under GPL
//
// Adapted by Ryan A. Nunn into Pixel and Vertex Shaders to be used with the
// Scaling2D entry in the Beyond3D/ATI Shader Competition
//
// Download original C/ASM code here: http://elektron.its.tudelft.nl/~dalikifa/
//
// Notes about the creation of the shaders
//
// The differences from the C/ASM code are quite substantial, which would be
// plainly obvious to anyone who has worked with the C/ASM versions of these
// algorithms.
//
// The reasons for this are as follows:
// 1) Can only output a single pixel at a time (Note MRT wouldn't be useful)
// 2) 'Small' instruction count limits
// 3) No program flow control
//
// As such, the algorithm couldn't be simply converted into pixel shaders
// because multiple nested if statements were used. While this may be fairly
// fast on a CPU, it is very unsuitable for a pixel shader. So I decided to do
// things differently. I decided that I should instead write boolean
// expressions for each of the possible values for each product, and convert
// those into pixel shaders with a final combining stage
//
// An example of one of the expressions is:
// useColA = (AD && BC && BD) || (((AC && AF && BJ && !BE) || (AD && AE && BL)) && !BC);
//
// That expression represents which colours much and must not be the same in
// order for product0 to be colorA
//
// After doing all that, I had managed to convert the algorithm from working
// out all the products at the same time, to working out each product
// individually. The algorithm would now be suitable for conversion into pixel
// shaders.
//
#include "Scaling.inc"
// The name of this effect
string name : NAME = "2xSaI";
float scaling : SCALING = 2.0;
//
// Techniques
//
// combineTechnique: Final combine steps. Outputs to destination frame buffer
string combineTechique : COMBINETECHNIQUE = "Combine_2xSaI";
// preprocessTechnique: PreProcessing steps. Outputs to WorkingTexture
string preprocessTechique : PREPROCESSTECHNIQUE = "Preprocess_2xSaI";
//
// Textures and Samplers
//
//
// Texture Gen Shaders
//
//
// Vertex Shader Output
//
// Texel Locations
//
// -1 0 1 2
// -1 I E F J
// 0 G A B K
// 1 H C D L
// 2 M N O P
struct VS_OUTPUT_PRODUCT0
{
float4 Position : POSITION;
float2 colorA : TEXCOORD0;
float2 colorB : TEXCOORD1;
float2 colorC : TEXCOORD2;
float2 colorD : TEXCOORD3;
float2 colorE : TEXCOORD4;
float2 colorF : TEXCOORD5;
float2 colorH : TEXCOORD6;
float2 colorJ : TEXCOORD7;
};
struct VS_OUTPUT_PRODUCT1
{
float4 Position : POSITION;
float2 colorA : TEXCOORD0;
float2 colorC : TEXCOORD1;
float2 colorB : TEXCOORD2;
float2 colorD : TEXCOORD3;
float2 colorG : TEXCOORD4;
float2 colorH : TEXCOORD5;
float2 colorF : TEXCOORD6;
float2 colorM : TEXCOORD7;
};
struct VS_OUTPUT_PRODUCT2_PASS0
{
float4 Position : POSITION;
float2 colorA : TEXCOORD0;
float2 colorB : TEXCOORD1;
float2 colorC : TEXCOORD2;
float2 colorD : TEXCOORD3;
};
struct VS_OUTPUT_PRODUCT2_PASS1
{
float4 Position : POSITION;
float2 colorA : TEXCOORD0;
float2 colorB : TEXCOORD1;
float2 colorG : TEXCOORD2;
float2 colorE : TEXCOORD3;
float2 colorK : TEXCOORD4;
float2 colorF : TEXCOORD5;
};
struct VS_OUTPUT_PRODUCT2_PASS2
{
float4 Position : POSITION;
float2 colorA : TEXCOORD0;
float2 colorB : TEXCOORD1;
float2 colorL : TEXCOORD2;
float2 colorO : TEXCOORD3;
float2 colorH : TEXCOORD4;
float2 colorN : TEXCOORD5;
};
struct VS_OUTPUT_COMBINE
{
float4 Position : POSITION;
float2 colorA : TEXCOORD0;
float2 colorB : TEXCOORD1;
float2 colorC : TEXCOORD2;
float2 colorD : TEXCOORD3;
float2 Selector : TEXCOORD4;
};
//
// Vertex Shaders
//
VS_OUTPUT_PRODUCT0 VS_Product0(
float3 Position : POSITION,
float2 TexCoord : TEXCOORD0)
{
VS_OUTPUT_PRODUCT0 Out = (VS_OUTPUT_PRODUCT0)0;
Out.Position = mul(float4(Position, 1), WorldViewProjection); // Matrix multipliy
Out.colorA = TexCoord;
Out.colorB = TexCoord + TexelSize * float2( 1, 0);
Out.colorC = TexCoord + TexelSize * float2( 0, 1);
Out.colorD = TexCoord + TexelSize * float2( 1, 1);
Out.colorF = TexCoord + TexelSize * float2( 1,-1);
Out.colorH = TexCoord + TexelSize * float2(-1, 1);
Out.colorE = TexCoord + TexelSize * float2( 0,-1);
Out.colorJ = TexCoord + TexelSize * float2( 2,-1);
return Out;
}
VS_OUTPUT_PRODUCT1 VS_Product1(
float3 Position : POSITION,
float2 TexCoord : TEXCOORD0)
{
VS_OUTPUT_PRODUCT1 Out = (VS_OUTPUT_PRODUCT1)0;
Out.Position = mul(float4(Position, 1), WorldViewProjection); // Matrix multipliy
Out.colorA = TexCoord;
Out.colorB = TexCoord + TexelSize * float2( 1, 0);
Out.colorC = TexCoord + TexelSize * float2( 0, 1);
Out.colorD = TexCoord + TexelSize * float2( 1, 1);
Out.colorF = TexCoord + TexelSize * float2( 1,-1);
Out.colorH = TexCoord + TexelSize * float2(-1, 1);
Out.colorG = TexCoord + TexelSize * float2(-1, 0);
Out.colorM = TexCoord + TexelSize * float2(-1, 2);
return Out;
}
VS_OUTPUT_PRODUCT2_PASS0 VS_Product2_Pass0(
float3 Position : POSITION,
float2 TexCoord : TEXCOORD0)
{
VS_OUTPUT_PRODUCT2_PASS0 Out = (VS_OUTPUT_PRODUCT2_PASS0)0;
Out.Position = mul(float4(Position, 1), WorldViewProjection); // Matrix multipliy
Out.colorA = TexCoord;
Out.colorB = TexCoord + TexelSize * float2( 1, 0);
Out.colorC = TexCoord + TexelSize * float2( 0, 1);
Out.colorD = TexCoord + TexelSize * float2( 1, 1);
return Out;
}
VS_OUTPUT_PRODUCT2_PASS1 VS_Product2_Pass1 (
float3 Position : POSITION,
float2 TexCoord : TEXCOORD0)
{
VS_OUTPUT_PRODUCT2_PASS1 Out = (VS_OUTPUT_PRODUCT2_PASS1)0;
Out.Position = mul(float4(Position, 1), WorldViewProjection); // Matrix multipliy
Out.colorA = TexCoord;
Out.colorB = TexCoord + TexelSize * float2( 1, 0);
Out.colorG = TexCoord + TexelSize * float2(-1, 0);
Out.colorE = TexCoord + TexelSize * float2( 0,-1);
Out.colorK = TexCoord + TexelSize * float2( 2, 0);
Out.colorF = TexCoord + TexelSize * float2( 1,-1);
return Out;
}
VS_OUTPUT_PRODUCT2_PASS2 VS_Product2_Pass2 (
float3 Position : POSITION,
float2 TexCoord : TEXCOORD0)
{
VS_OUTPUT_PRODUCT2_PASS2 Out = (VS_OUTPUT_PRODUCT2_PASS2)0;
Out.Position = mul(float4(Position, 1), WorldViewProjection); // Matrix multipliy
Out.colorA = TexCoord;
Out.colorB = TexCoord + TexelSize * float2( 1, 0);
Out.colorL = TexCoord + TexelSize * float2( 2, 1);
Out.colorO = TexCoord + TexelSize * float2( 1, 2);
Out.colorH = TexCoord + TexelSize * float2(-1, 1);
Out.colorN = TexCoord + TexelSize * float2( 0, 2);
return Out;
}
VS_OUTPUT_COMBINE VS_Combine(
float3 Position : POSITION,
float2 TexCoord : TEXCOORD0)
{
VS_OUTPUT_COMBINE Out = (VS_OUTPUT_COMBINE)0;
Out.Position = mul(float4(Position, 1), WorldViewProjection); // Matrix multipliy
Out.colorA = TexCoord;
Out.colorB = TexCoord + TexelSize * float2( 1, 0);
Out.colorC = TexCoord + TexelSize * float2( 0, 1);
Out.colorD = TexCoord + TexelSize * float2( 1, 1);
Out.Selector = TexCoord*SourceDims;
return Out;
}
//
// Pixel Shaders
//
//
// This pixel shader is used for the product0 and product1 preprocesses steps.
// By purely changing the vertex shader, does the effect of this pixel shader change.
// What happens is the texture coords become mirrored across the xy diagonal.
//
// This has been hand coded to reduce instruction counts so this can run in 1 pass, per
// product. This code is likely to run very poorly on NV3x hardware.
//
PIXELSHADER PS_Product0_1 = asm
{
ps_2_0
// Some opcode 'aliases' for bools
#define or add_sat
#define and mul
#define and_or mad_sat
// Same thing, but partial precision (using mad_sat_pp is causing 'issues')
#define or_pp add_sat_pp
#define and_pp mul_pp
#define and_or_pp mad_sat
//
// Constants
//
def c0, 0, 1, 0.35, 0.7
#define zero c0.r
#define one c0.g
#define ret_B c0.b
#define ret_A c0.a
//def c2, TexelSize // Set by the effect
#define TexelSize c2
//
// Samplers
//
// The only sampler we need. It is set by the effect
dcl_2d s0 // SourceSampler
//
// Texture coords. These match VS_OUTPUT_PRODUCT0
//
dcl t0.xy // input.colorA (-TexelSize -> colorI)
dcl t1.xy // input.colorB (+TexelSize -> colorL)
dcl t2.xy // input.colorC
dcl t3.xy // input.colorD
dcl t4.xy // input.colorE
dcl t5.xy // input.colorF
dcl t6.xy // input.colorH
dcl t7.xy // input.colorJ
//
// The code
//
// Sample the first 6 textures (ABCDEF)
texld_pp r5, t0, s0 // colorA
texld_pp r6, t1, s0 // colorB
texld_pp r7, t2, s0 // colorC
texld_pp r8, t3, s0 // colorD
texld_pp r9, t4, s0 // colorE
texld_pp r10, t5, s0 // colorF
//
// Note that when we do the initial comparisons, we actually check to see
// if any texels are different, rather than all same. This allows for an
// optimization, since it requires less instructions (3 less per 4 comparisons)
//
// The sub finds the differences
// the dp3 will make the diffs all positive, and then combine them into a
// single component of the register.
//
//
// R2 Register. Contains A==C, B==C, A==D, B==D
//
#define AC r2.a
#define BC r2.r
#define AD r2.g
#define BD r2.b
// Calc A!=C
sub_pp r0, r5, r7 // 0
dp4_pp AC, r0, r0 // 1
// Calc B!=C
sub_pp r0, r6, r7 // 2
dp4_pp BC, r0, r0 // 3
// Calc A!=D
sub_pp r0, r5, r8 // 4
dp4_pp AD, r0, r0 // 5
// Calc B!=D
sub_pp r0, r6, r8 // 6
dp4_pp BD, r0, r0 // 7
// Invert register: r2 = !r2
cmp_pp r2, -r2, one, zero // 8
//
// R3 Register. Contains A==E, B==E, A==F, B==F
//
#define AE r3.a
#define BE r3.r
#define AF r3.g
#define BF r3.b
// Calc A!=E
sub_pp r0, r5, r9 // 9
dp4_pp AE, r0, r0 // 10
// Calc B!=E
sub_pp r0, r6, r9 // 11
dp4_pp BE, r0, r0 // 12
// Calc A!=F
sub_pp r0, r5, r10 // 13
dp4_pp AF, r0, r0 // 14
// Calc B!=F
sub_pp r0, r6, r10 // 15
dp4_pp BF, r0, r0 // 16
// Invert register: r3 = !r3
cmp_pp r3, -r3, one, zero // 17
// Sample final 4 textures (colorH, colorJ, colorI, and colorL)
sub r0.xy, t0, TexelSize // 18
add r1.xy, t1, TexelSize // 19
texld_pp r7, t6, s0 // colorH
texld_pp r8, t7, s0 // colorJ
texld_pp r9, r0, s0 // colorI -> input.colorA-texelSize
texld_pp r10, r1, s0 // colorL -> input.colorB+texelSize
//
// R4 Register. Contains A==H, B==J, A==I, B==L
//
#define AH r4.a
#define BJ r4.r
#define AI r4.g
#define BL r4.b
// Calc A!=H
sub_pp r0, r5, r7 // 20
dp4_pp AH, r0, r0 // 21
// Calc B!=J
sub_pp r0, r6, r8 // 22
dp4_pp BJ, r0, r0 // 23
// Calc A!=I
sub_pp r0, r5, r9 // 24
dp4_pp AI, r0, r0 // 25
// Calc B!=L
sub_pp r0, r6, r10 // 26
dp4_pp BL, r0, r0 // 27
// Invert register: r4 = !r4
cmp_pp r4, -r4, one, zero // 28
//
// useColB = r5.rgb =
// ((BE && BD && AI && !AF) || (BC && BF && AH)) && !AD
//
// useColA = r5.a =
// (AD && BC && BD) || (((AC && AF && BJ && !BE) || (AD && AE && BL)) && !BC)
//
// As can easily be seen, mostly the same operations need to be done for
// calculating useColA and useColB, so the following code 'might' co-issue.
// So, I'll be nice, and keep useColB in the vector pipe, and usecolA in the
// scaler pipe. It would only get rid of 5 instructions, but that is a lot if
// this is repeating hundreds of thousands of times
//
// B: BC && BF
// A: AD && AE
and_pp r6.rgb, BC, BF // 29
and_pp r6.a, AD, AE // 30
// B: (BC && BF && AH)
// A: (AD && AE && BL)
and_pp r6.rgb, r6, AH // 31
and_pp r6.a, r6, BL // 32
// B: BE && BD
// A: AC && AF
and_pp r5.rgb, BE, BD // 33
and_pp r5.a, AC, AF // 34
// B: AI && !AF
// A: BJ && !BE
cmp_pp r7.rgb, -AF, AI, zero // 35
cmp_pp r7.a, -BE, BJ, zero // 36
// B: ((BE && BD && AI && !AF) || (BC && BF && AH))
// A: ((AC && AF && BJ && !BE) || (AD && AE && BL))
and_or_pp r5, r5, r7, r6 // 37
// B: ((BE && BD && AI && !AF) || (BC && BF && AH)) && !AD
// A: (((AC && AF && BJ && !BE) || (AD && AE && BL)) && !BC)
cmp_pp r5.rgb, -AD, r5, zero // 38
cmp_pp r5.a, -BC, r5, zero // 39
// A: AD && BC
and_pp r6.a, AD, BC // 40
// A: (AD && BC && BD) || (((AC && AF && BJ && !BE) || (AD && AE && BL)) && !BC)
and_or_pp r5.a, r6.a, BD, r5.a // 41
//
// We return ret_B, if only useColB is set.
// We return ret_A, if useColA is set.
// We return zero, if neither is set.
//
mul_pp r5.rgb, r5, ret_B // 42
cmp_pp r5, -r5.a, r5.b, ret_A // 43
mov_pp oC0, r5 // 44
#undef AC
#undef BC
#undef AD
#undef BD
#undef AE
#undef BE
#undef AF
#undef BF
#undef AH
#undef BJ
#undef AI
#undef BL
#undef one
#undef zero
#undef ret_A
#undef ret_B
#undef TexelSize
#undef or
#undef and
#undef and_or
#undef or_pp
#undef and_pp
#undef and_or_pp
};
//
// Calcuating product2 is quite complex. This function does basic calculations
// to see if r is required, and if not, then what should be done instead
//
float4 PS_Product2_Pass0 ( in VS_OUTPUT_PRODUCT2_PASS0 input ) : COLOR
{
float4 colorA = tex2D(SourceSampler, input.colorA);
float4 colorB = tex2D(SourceSampler, input.colorB);
float4 colorC = tex2D(SourceSampler, input.colorC);
float4 colorD = tex2D(SourceSampler, input.colorD);
bool AD = all(colorA == colorD);
bool BC = all(colorB == colorC);
bool AB = all(colorA == colorB);
float4 ret = 0; // Q
if (AD && !BC) ret += 1.0; // A
if (BC && !AD) ret += 0.3; // B
if (AD && BC)
{
ret = 0.6; // R
if (AB) ret = 1.0; // A
}
return ret;
}
//
// Modified GetResult code (see normal 2xSaI code for original).
//
// Difference is so we end up outputting a number in the range 0 to 1
// instead of -4 to +4 like normal 2xSaI code calculates. Each GetResult
// function outputs a number from 0 to 0.25. These are all added together
//
inline float GetResult1(float4 A, float4 B, float4 C, float4 D)
{
float x = 0;
float y = 0;
float r = 0.125;
bool AC = all(A==C);
bool AD = all(A==D);
bool BC = all(B==C);
bool BD = all(B==D);
x = AC;
x += AD;
y = BC && !AC;
y += BD && !AD;
if (x < 1.5) r+=0.125;
if (y < 1.5) r-=0.125;
return r;
}
//
// Same as above, except returns -r
//
inline float GetResult2(float4 A, float4 B, float4 C, float4 D)
{
float x = 0;
float y = 0;
float r = 0.125;
bool AC = all(A==C);
bool AD = all(A==D);
bool BC = all(B==C);
bool BD = all(B==D);
x = AC;
x += AD;
y = BC && !AC;
y += BD && !AD;
if (x < 1.5) r-=0.125;
if (y < 1.5) r+=0.125;
return r;
}
//
// We calculate 'r' in 2 passes. This is the first pass.
//
// Both passes output a number from 0 to 0.5. Additive blending is used
// for the second pass, creating a number in the 0 to 1 range
//
float4 PS_Product2_Pass1 ( in VS_OUTPUT_PRODUCT2_PASS1 input ) : COLOR
{
float4 colorA = tex2D(SourceSampler, input.colorA);
float4 colorB = tex2D(SourceSampler, input.colorB);
float4 colorG = tex2D(SourceSampler, input.colorG);
float4 colorE = tex2D(SourceSampler, input.colorE);
float4 colorK = tex2D(SourceSampler, input.colorK);
float4 colorF = tex2D(SourceSampler, input.colorF);
float r = 0;
r += GetResult1 (colorA, colorB, colorG, colorE);
r += GetResult2 (colorB, colorA, colorK, colorF);
return r;
}
//
// and this is the second pass
//
float4 PS_Product2_Pass2 ( in VS_OUTPUT_PRODUCT2_PASS2 input ) : COLOR
{
float4 colorA = tex2D(SourceSampler, input.colorA);
float4 colorB = tex2D(SourceSampler, input.colorB);
float4 colorL = tex2D(SourceSampler, input.colorL);
float4 colorO = tex2D(SourceSampler, input.colorO);
float4 colorH = tex2D(SourceSampler, input.colorH);
float4 colorN = tex2D(SourceSampler, input.colorN);
float r = 0;
r += GetResult2 (colorB, colorA, colorH, colorN);
r += GetResult1 (colorA, colorB, colorL, colorO);
return r;
}
//
// Final combine stage. Outputs to framebuffer.
//
// Uses preprocess information, and information about what pixel to output
// to various interpolations of the colours where required.
//
float4 PS_Combine ( in VS_OUTPUT_COMBINE input ) : COLOR
{
float4 selector = tex2D(ModuloSampler, input.Selector);
float4 working = tex2D(WorkingSampler, input.colorA);
float4 colorA = tex2D(SourceSampler, input.colorA);
float4 colorB = tex2D(SourceSampler, input.colorB);
float4 colorC = tex2D(SourceSampler, input.colorC);
float4 colorD = tex2D(SourceSampler, input.colorD);
float4 ABCD = (colorA+colorB+colorC+colorD)/4;
float4 product0 = colorA;
if (working.r < 0.6) product0 = colorB;
if (working.r < 0.3) product0 = (colorA+colorB)/2;
float4 product1 = colorA;
if (working.g < 0.6) product1 = colorC;
if (working.g < 0.3) product1 = (colorA+colorC)/2;
bool AD = all(colorA == colorD);
bool BC = all(colorB == colorC);
bool AB = all(colorA == colorB);
float4 product2 = ABCD;
if (working.a > 0.55) product2 = colorA;
if (working.a < 0.45) product2 = colorB;
if (working.b < 0.5) product2 = colorB;
if (working.b < 0.25) product2 = ABCD;
if (working.b > 0.75) product2 = colorA;
float4 ret = colorA;
if (selector.x >= 0.5) ret = product0;
if (selector.y >= 0.5) ret = product1;
if (selector.x >= 0.5 && selector.y >= 0.5) ret = product2;
return ret;
}
technique Preprocess_2xSaI
{
pass prod0
{
// shaders
VertexShader = compile vs_1_1 VS_Product0();
PixelShader = (PS_Product0_1);
Sampler[0] = (SourceSampler);
PixelShaderConstant[2] = (TexelSize);
AlphaBlendEnable = FALSE;
ColorWriteEnable = RED;
SRGBWRITEENABLE = FALSE;
}
pass prod1
{
// shaders
VertexShader = compile vs_1_1 VS_Product1();
PixelShader = (PS_Product0_1);
Sampler[0] = (SourceSampler);
PixelShaderConstant[2] = (TexelSize);
AlphaBlendEnable = FALSE;
ColorWriteEnable = GREEN;
SRGBWRITEENABLE = FALSE;
}
pass prod2_pass0
{
// shaders
VertexShader = compile vs_1_1 VS_Product2_Pass0();
PixelShader = compile ps_2_0 PS_Product2_Pass0();
ColorWriteEnable = BLUE;
AlphaBlendEnable = FALSE;
SRGBWRITEENABLE = FALSE;
}
pass prod2_pass1
{
// shaders
VertexShader = compile vs_1_1 VS_Product2_Pass1();
PixelShader = compile ps_2_0 PS_Product2_Pass1();
ColorWriteEnable = ALPHA;
AlphaBlendEnable = FALSE;
SRGBWRITEENABLE = FALSE;
}
pass prod2_pass2
{
// shaders
VertexShader = compile vs_1_1 VS_Product2_Pass2();
PixelShader = compile ps_2_0 PS_Product2_Pass2();
ColorWriteEnable = ALPHA;
AlphaBlendEnable = TRUE;
SrcBlend = ONE;
DestBlend = ONE;
SRGBWRITEENABLE = FALSE;
}
}
technique Combine_2xSaI
{
pass P0
{
// shaders
VertexShader = compile vs_1_1 VS_Combine();
PixelShader = compile ps_2_0 PS_Combine();
AlphaBlendEnable = FALSE;
ColorWriteEnable = RED|GREEN|BLUE|ALPHA;
SRGBWRITEENABLE = FALSE;
}
}