378 lines
18 KiB
C++
378 lines
18 KiB
C++
/* ========================================================================
|
|
$File: $
|
|
$Date: $
|
|
$Revision: $
|
|
$Creator: Casey Muratori $
|
|
$Notice: (C) Copyright 2015 by Molly Rocket, Inc. All Rights Reserved. $
|
|
======================================================================== */
|
|
|
|
// The IGNORED_* names are plain aliases for the regular timing macros
// (TIMED_FUNCTION / TIMED_BLOCK are not defined in this view).
#define IGNORED_TIMED_FUNCTION TIMED_FUNCTION
|
|
#define IGNORED_TIMED_BLOCK TIMED_BLOCK
|
|
|
|
// `internal` is defined as empty before handmade.h is pulled in.
// NOTE(review): presumably this keeps anything tagged `internal` in the
// header from getting its usual linkage in this file - confirm against
// handmade.h / the build setup.
#define internal
|
|
#include "handmade.h"
|
|
|
|
// The real iacaMarks.h include is compiled out (#if 0); instead the two
// marker macros used inside the pixel loop are defined to expand to nothing.
// Flip the condition to 1 to build with the real markers.
#if 0
|
|
#include <iacaMarks.h>
|
|
#else
|
|
#define IACA_VC64_START
|
|
#define IACA_VC64_END
|
|
#endif
|
|
|
|
void
|
|
DrawRectangleQuickly(loaded_bitmap *Buffer, v2 Origin, v2 XAxis, v2 YAxis, v4 Color,
|
|
loaded_bitmap *Texture, real32 PixelsToMeters,
|
|
rectangle2i ClipRect, bool32 Even)
|
|
{
|
|
IGNORED_TIMED_FUNCTION();
|
|
|
|
// NOTE(casey): Premultiply color up front
|
|
Color.rgb *= Color.a;
|
|
|
|
real32 XAxisLength = Length(XAxis);
|
|
real32 YAxisLength = Length(YAxis);
|
|
|
|
v2 NxAxis = (YAxisLength / XAxisLength) * XAxis;
|
|
v2 NyAxis = (XAxisLength / YAxisLength) * YAxis;
|
|
|
|
// NOTE(casey): NzScale could be a parameter if we want people to
|
|
// have control over the amount of scaling in the Z direction
|
|
// that the normals appear to have.
|
|
real32 NzScale = 0.5f*(XAxisLength + YAxisLength);
|
|
|
|
real32 InvXAxisLengthSq = 1.0f / LengthSq(XAxis);
|
|
real32 InvYAxisLengthSq = 1.0f / LengthSq(YAxis);
|
|
|
|
rectangle2i FillRect = InvertedInfinityRectangle2i();
|
|
|
|
v2 P[4] = {Origin, Origin + XAxis, Origin + XAxis + YAxis, Origin + YAxis};
|
|
for(int PIndex = 0;
|
|
PIndex < ArrayCount(P);
|
|
++PIndex)
|
|
{
|
|
v2 TestP = P[PIndex];
|
|
int FloorX = FloorReal32ToInt32(TestP.x);
|
|
int CeilX = CeilReal32ToInt32(TestP.x) + 1;
|
|
int FloorY = FloorReal32ToInt32(TestP.y);
|
|
int CeilY = CeilReal32ToInt32(TestP.y) + 1;
|
|
|
|
if(FillRect.MinX > FloorX) {FillRect.MinX = FloorX;}
|
|
if(FillRect.MinY > FloorY) {FillRect.MinY = FloorY;}
|
|
if(FillRect.MaxX < CeilX) {FillRect.MaxX = CeilX;}
|
|
if(FillRect.MaxY < CeilY) {FillRect.MaxY = CeilY;}
|
|
}
|
|
|
|
// rectangle2i ClipRect = {0, 0, WidthMax, HeightMax};
|
|
// rectangle2i ClipRect = {128, 128, 256, 256};
|
|
FillRect = Intersect(ClipRect, FillRect);
|
|
if(!Even == (FillRect.MinY & 1))
|
|
{
|
|
FillRect.MinY += 1;
|
|
}
|
|
|
|
if(HasArea(FillRect))
|
|
{
|
|
__m128i StartClipMask = _mm_set1_epi8(-1);
|
|
__m128i EndClipMask = _mm_set1_epi8(-1);
|
|
|
|
__m128i StartClipMasks[] =
|
|
{
|
|
_mm_slli_si128(StartClipMask, 0*4),
|
|
_mm_slli_si128(StartClipMask, 1*4),
|
|
_mm_slli_si128(StartClipMask, 2*4),
|
|
_mm_slli_si128(StartClipMask, 3*4),
|
|
};
|
|
|
|
__m128i EndClipMasks[] =
|
|
{
|
|
_mm_srli_si128(EndClipMask, 0*4),
|
|
_mm_srli_si128(EndClipMask, 3*4),
|
|
_mm_srli_si128(EndClipMask, 2*4),
|
|
_mm_srli_si128(EndClipMask, 1*4),
|
|
};
|
|
|
|
if(FillRect.MinX & 3)
|
|
{
|
|
StartClipMask = StartClipMasks[FillRect.MinX & 3];
|
|
FillRect.MinX = FillRect.MinX & ~3;
|
|
}
|
|
|
|
if(FillRect.MaxX & 3)
|
|
{
|
|
EndClipMask = EndClipMasks[FillRect.MaxX & 3];
|
|
FillRect.MaxX = (FillRect.MaxX & ~3) + 4;
|
|
}
|
|
|
|
v2 nXAxis = InvXAxisLengthSq*XAxis;
|
|
v2 nYAxis = InvYAxisLengthSq*YAxis;
|
|
|
|
real32 Inv255 = 1.0f / 255.0f;
|
|
__m128 Inv255_4x = _mm_set1_ps(Inv255);
|
|
real32 One255 = 255.0f;
|
|
|
|
__m128 One = _mm_set1_ps(1.0f);
|
|
__m128 Half = _mm_set1_ps(0.5f);
|
|
__m128 Four_4x = _mm_set1_ps(4.0f);
|
|
__m128 One255_4x = _mm_set1_ps(255.0f);
|
|
__m128 Zero = _mm_set1_ps(0.0f);
|
|
__m128i MaskFF = _mm_set1_epi32(0xFF);
|
|
__m128i MaskFFFF = _mm_set1_epi32(0xFFFF);
|
|
__m128i MaskFF00FF = _mm_set1_epi32(0x00FF00FF);
|
|
__m128 Colorr_4x = _mm_set1_ps(Color.r);
|
|
__m128 Colorg_4x = _mm_set1_ps(Color.g);
|
|
__m128 Colorb_4x = _mm_set1_ps(Color.b);
|
|
__m128 Colora_4x = _mm_set1_ps(Color.a);
|
|
__m128 nXAxisx_4x = _mm_set1_ps(nXAxis.x);
|
|
__m128 nXAxisy_4x = _mm_set1_ps(nXAxis.y);
|
|
__m128 nYAxisx_4x = _mm_set1_ps(nYAxis.x);
|
|
__m128 nYAxisy_4x = _mm_set1_ps(nYAxis.y);
|
|
__m128 Originx_4x = _mm_set1_ps(Origin.x);
|
|
__m128 Originy_4x = _mm_set1_ps(Origin.y);
|
|
__m128 MaxColorValue = _mm_set1_ps(255.0f*255.0f);
|
|
__m128i TexturePitch_4x = _mm_set1_epi32(Texture->Pitch);
|
|
|
|
__m128 WidthM2 = _mm_set1_ps((real32)(Texture->Width - 2));
|
|
__m128 HeightM2 = _mm_set1_ps((real32)(Texture->Height - 2));
|
|
|
|
uint8 *Row = ((uint8 *)Buffer->Memory +
|
|
FillRect.MinX*BITMAP_BYTES_PER_PIXEL +
|
|
FillRect.MinY*Buffer->Pitch);
|
|
int32 RowAdvance = 2*Buffer->Pitch;
|
|
|
|
void *TextureMemory = Texture->Memory;
|
|
int32 TexturePitch = Texture->Pitch;
|
|
|
|
int MinY = FillRect.MinY;
|
|
int MaxY = FillRect.MaxY;
|
|
int MinX = FillRect.MinX;
|
|
int MaxX = FillRect.MaxX;
|
|
|
|
IGNORED_TIMED_BLOCK(PixelFill, GetClampedRectArea(FillRect) / 2);
|
|
for(int Y = MinY;
|
|
Y < MaxY;
|
|
Y += 2)
|
|
{
|
|
__m128 PixelPy = _mm_set1_ps((real32)Y);
|
|
PixelPy = _mm_sub_ps(PixelPy, Originy_4x);
|
|
__m128 PynX = _mm_mul_ps(PixelPy, nXAxisy_4x);
|
|
__m128 PynY = _mm_mul_ps(PixelPy, nYAxisy_4x);
|
|
|
|
__m128 PixelPx = _mm_set_ps((real32)(MinX + 3),
|
|
(real32)(MinX + 2),
|
|
(real32)(MinX + 1),
|
|
(real32)(MinX + 0));
|
|
PixelPx = _mm_sub_ps(PixelPx, Originx_4x);
|
|
|
|
__m128i ClipMask = StartClipMask;
|
|
|
|
uint32 *Pixel = (uint32 *)Row;
|
|
for(int XI = MinX;
|
|
XI < MaxX;
|
|
XI += 4)
|
|
{
|
|
#define mmSquare(a) _mm_mul_ps(a, a)
|
|
#define M(a, i) ((float *)&(a))[i]
|
|
#define Mi(a, i) ((uint32 *)&(a))[i]
|
|
|
|
|
|
IACA_VC64_START;
|
|
__m128 U = _mm_add_ps(_mm_mul_ps(PixelPx, nXAxisx_4x), PynX);
|
|
__m128 V = _mm_add_ps(_mm_mul_ps(PixelPx, nYAxisx_4x), PynY);
|
|
|
|
__m128i WriteMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero),
|
|
_mm_cmple_ps(U, One)),
|
|
_mm_and_ps(_mm_cmpge_ps(V, Zero),
|
|
_mm_cmple_ps(V, One))));
|
|
WriteMask = _mm_and_si128(WriteMask, ClipMask);
|
|
|
|
// TODO(casey): Later, re-check if this helps
|
|
// if(_mm_movemask_epi8(WriteMask))
|
|
{
|
|
__m128i OriginalDest = _mm_load_si128((__m128i *)Pixel);
|
|
|
|
U = _mm_min_ps(_mm_max_ps(U, Zero), One);
|
|
V = _mm_min_ps(_mm_max_ps(V, Zero), One);
|
|
|
|
// NOTE(casey): Bias texture coordinates to start
|
|
// on the boundary between the 0,0 and 1,1 pixels.
|
|
__m128 tX = _mm_add_ps(_mm_mul_ps(U, WidthM2), Half);
|
|
__m128 tY = _mm_add_ps(_mm_mul_ps(V, HeightM2), Half);
|
|
|
|
__m128i FetchX_4x = _mm_cvttps_epi32(tX);
|
|
__m128i FetchY_4x = _mm_cvttps_epi32(tY);
|
|
|
|
__m128 fX = _mm_sub_ps(tX, _mm_cvtepi32_ps(FetchX_4x));
|
|
__m128 fY = _mm_sub_ps(tY, _mm_cvtepi32_ps(FetchY_4x));
|
|
|
|
FetchX_4x = _mm_slli_epi32(FetchX_4x, 2);
|
|
FetchY_4x = _mm_or_si128(_mm_mullo_epi16(FetchY_4x, TexturePitch_4x),
|
|
_mm_slli_epi32(_mm_mulhi_epi16(FetchY_4x, TexturePitch_4x), 16));
|
|
__m128i Fetch_4x = _mm_add_epi32(FetchX_4x, FetchY_4x);
|
|
|
|
int32 Fetch0 = Mi(Fetch_4x, 0);
|
|
int32 Fetch1 = Mi(Fetch_4x, 1);
|
|
int32 Fetch2 = Mi(Fetch_4x, 2);
|
|
int32 Fetch3 = Mi(Fetch_4x, 3);
|
|
|
|
uint8 *TexelPtr0 = ((uint8 *)TextureMemory) + Fetch0;
|
|
uint8 *TexelPtr1 = ((uint8 *)TextureMemory) + Fetch1;
|
|
uint8 *TexelPtr2 = ((uint8 *)TextureMemory) + Fetch2;
|
|
uint8 *TexelPtr3 = ((uint8 *)TextureMemory) + Fetch3;
|
|
|
|
__m128i SampleA = _mm_setr_epi32(*(uint32 *)(TexelPtr0),
|
|
*(uint32 *)(TexelPtr1),
|
|
*(uint32 *)(TexelPtr2),
|
|
*(uint32 *)(TexelPtr3));
|
|
|
|
__m128i SampleB = _mm_setr_epi32(*(uint32 *)(TexelPtr0 + sizeof(uint32)),
|
|
*(uint32 *)(TexelPtr1 + sizeof(uint32)),
|
|
*(uint32 *)(TexelPtr2 + sizeof(uint32)),
|
|
*(uint32 *)(TexelPtr3 + sizeof(uint32)));
|
|
|
|
__m128i SampleC = _mm_setr_epi32(*(uint32 *)(TexelPtr0 + TexturePitch),
|
|
*(uint32 *)(TexelPtr1 + TexturePitch),
|
|
*(uint32 *)(TexelPtr2 + TexturePitch),
|
|
*(uint32 *)(TexelPtr3 + TexturePitch));
|
|
|
|
__m128i SampleD = _mm_setr_epi32(*(uint32 *)(TexelPtr0 + TexturePitch + sizeof(uint32)),
|
|
*(uint32 *)(TexelPtr1 + TexturePitch + sizeof(uint32)),
|
|
*(uint32 *)(TexelPtr2 + TexturePitch + sizeof(uint32)),
|
|
*(uint32 *)(TexelPtr3 + TexturePitch + sizeof(uint32)));
|
|
|
|
// NOTE(casey): Unpack bilinear samples
|
|
__m128i TexelArb = _mm_and_si128(SampleA, MaskFF00FF);
|
|
__m128i TexelAag = _mm_and_si128(_mm_srli_epi32(SampleA, 8), MaskFF00FF);
|
|
TexelArb = _mm_mullo_epi16(TexelArb, TexelArb);
|
|
__m128 TexelAa = _mm_cvtepi32_ps(_mm_srli_epi32(TexelAag, 16));
|
|
TexelAag = _mm_mullo_epi16(TexelAag, TexelAag);
|
|
|
|
__m128i TexelBrb = _mm_and_si128(SampleB, MaskFF00FF);
|
|
__m128i TexelBag = _mm_and_si128(_mm_srli_epi32(SampleB, 8), MaskFF00FF);
|
|
TexelBrb = _mm_mullo_epi16(TexelBrb, TexelBrb);
|
|
__m128 TexelBa = _mm_cvtepi32_ps(_mm_srli_epi32(TexelBag, 16));
|
|
TexelBag = _mm_mullo_epi16(TexelBag, TexelBag);
|
|
|
|
__m128i TexelCrb = _mm_and_si128(SampleC, MaskFF00FF);
|
|
__m128i TexelCag = _mm_and_si128(_mm_srli_epi32(SampleC, 8), MaskFF00FF);
|
|
TexelCrb = _mm_mullo_epi16(TexelCrb, TexelCrb);
|
|
__m128 TexelCa = _mm_cvtepi32_ps(_mm_srli_epi32(TexelCag, 16));
|
|
TexelCag = _mm_mullo_epi16(TexelCag, TexelCag);
|
|
|
|
__m128i TexelDrb = _mm_and_si128(SampleD, MaskFF00FF);
|
|
__m128i TexelDag = _mm_and_si128(_mm_srli_epi32(SampleD, 8), MaskFF00FF);
|
|
TexelDrb = _mm_mullo_epi16(TexelDrb, TexelDrb);
|
|
__m128 TexelDa = _mm_cvtepi32_ps(_mm_srli_epi32(TexelDag, 16));
|
|
TexelDag = _mm_mullo_epi16(TexelDag, TexelDag);
|
|
|
|
// NOTE(casey): Load destination
|
|
__m128 Destb = _mm_cvtepi32_ps(_mm_and_si128(OriginalDest, MaskFF));
|
|
__m128 Destg = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(OriginalDest, 8), MaskFF));
|
|
__m128 Destr = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(OriginalDest, 16), MaskFF));
|
|
__m128 Desta = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(OriginalDest, 24), MaskFF));
|
|
|
|
// NOTE(casey): Convert texture from 0-255 sRGB to "linear" 0-1 brightness space
|
|
__m128 TexelAr = _mm_cvtepi32_ps(_mm_srli_epi32(TexelArb, 16));
|
|
__m128 TexelAg = _mm_cvtepi32_ps(_mm_and_si128(TexelAag, MaskFFFF));
|
|
__m128 TexelAb = _mm_cvtepi32_ps(_mm_and_si128(TexelArb, MaskFFFF));
|
|
|
|
__m128 TexelBr = _mm_cvtepi32_ps(_mm_srli_epi32(TexelBrb, 16));
|
|
__m128 TexelBg = _mm_cvtepi32_ps(_mm_and_si128(TexelBag, MaskFFFF));
|
|
__m128 TexelBb = _mm_cvtepi32_ps(_mm_and_si128(TexelBrb, MaskFFFF));
|
|
|
|
__m128 TexelCr = _mm_cvtepi32_ps(_mm_srli_epi32(TexelCrb, 16));
|
|
__m128 TexelCg = _mm_cvtepi32_ps(_mm_and_si128(TexelCag, MaskFFFF));
|
|
__m128 TexelCb = _mm_cvtepi32_ps(_mm_and_si128(TexelCrb, MaskFFFF));
|
|
|
|
__m128 TexelDr = _mm_cvtepi32_ps(_mm_srli_epi32(TexelDrb, 16));
|
|
__m128 TexelDg = _mm_cvtepi32_ps(_mm_and_si128(TexelDag, MaskFFFF));
|
|
__m128 TexelDb = _mm_cvtepi32_ps(_mm_and_si128(TexelDrb, MaskFFFF));
|
|
|
|
// NOTE(casey): Bilinear texture blend
|
|
__m128 ifX = _mm_sub_ps(One, fX);
|
|
__m128 ifY = _mm_sub_ps(One, fY);
|
|
|
|
__m128 l0 = _mm_mul_ps(ifY, ifX);
|
|
__m128 l1 = _mm_mul_ps(ifY, fX);
|
|
__m128 l2 = _mm_mul_ps(fY, ifX);
|
|
__m128 l3 = _mm_mul_ps(fY, fX);
|
|
|
|
__m128 Texelr = _mm_add_ps(_mm_add_ps(_mm_mul_ps(l0, TexelAr), _mm_mul_ps(l1, TexelBr)),
|
|
_mm_add_ps(_mm_mul_ps(l2, TexelCr), _mm_mul_ps(l3, TexelDr)));
|
|
__m128 Texelg = _mm_add_ps(_mm_add_ps(_mm_mul_ps(l0, TexelAg), _mm_mul_ps(l1, TexelBg)),
|
|
_mm_add_ps(_mm_mul_ps(l2, TexelCg), _mm_mul_ps(l3, TexelDg)));
|
|
__m128 Texelb = _mm_add_ps(_mm_add_ps(_mm_mul_ps(l0, TexelAb), _mm_mul_ps(l1, TexelBb)),
|
|
_mm_add_ps(_mm_mul_ps(l2, TexelCb), _mm_mul_ps(l3, TexelDb)));
|
|
__m128 Texela = _mm_add_ps(_mm_add_ps(_mm_mul_ps(l0, TexelAa), _mm_mul_ps(l1, TexelBa)),
|
|
_mm_add_ps(_mm_mul_ps(l2, TexelCa), _mm_mul_ps(l3, TexelDa)));
|
|
|
|
// NOTE(casey): Modulate by incoming color
|
|
Texelr = _mm_mul_ps(Texelr, Colorr_4x);
|
|
Texelg = _mm_mul_ps(Texelg, Colorg_4x);
|
|
Texelb = _mm_mul_ps(Texelb, Colorb_4x);
|
|
Texela = _mm_mul_ps(Texela, Colora_4x);
|
|
|
|
Texelr = _mm_min_ps(_mm_max_ps(Texelr, Zero), MaxColorValue);
|
|
Texelg = _mm_min_ps(_mm_max_ps(Texelg, Zero), MaxColorValue);
|
|
Texelb = _mm_min_ps(_mm_max_ps(Texelb, Zero), MaxColorValue);
|
|
|
|
// NOTE(casey): Go from sRGB to "linear" brightness space
|
|
Destr = mmSquare(Destr);
|
|
Destg = mmSquare(Destg);
|
|
Destb = mmSquare(Destb);
|
|
|
|
// NOTE(casey): Destination blend
|
|
__m128 InvTexelA = _mm_sub_ps(One, _mm_mul_ps(Inv255_4x, Texela));
|
|
__m128 Blendedr = _mm_add_ps(_mm_mul_ps(InvTexelA, Destr), Texelr);
|
|
__m128 Blendedg = _mm_add_ps(_mm_mul_ps(InvTexelA, Destg), Texelg);
|
|
__m128 Blendedb = _mm_add_ps(_mm_mul_ps(InvTexelA, Destb), Texelb);
|
|
__m128 Blendeda = _mm_add_ps(_mm_mul_ps(InvTexelA, Desta), Texela);
|
|
|
|
// NOTE(casey): Go from "linear" 0-1 brightness space to sRGB 0-255
|
|
#if 1
|
|
Blendedr = _mm_mul_ps(Blendedr, _mm_rsqrt_ps(Blendedr));
|
|
Blendedg = _mm_mul_ps(Blendedg, _mm_rsqrt_ps(Blendedg));
|
|
Blendedb = _mm_mul_ps(Blendedb, _mm_rsqrt_ps(Blendedb));
|
|
#else
|
|
Blendedr = _mm_sqrt_ps(Blendedr);
|
|
Blendedg = _mm_sqrt_ps(Blendedg);
|
|
Blendedb = _mm_sqrt_ps(Blendedb);
|
|
#endif
|
|
Blendeda = Blendeda;
|
|
|
|
__m128i Intr = _mm_cvtps_epi32(Blendedr);
|
|
__m128i Intg = _mm_cvtps_epi32(Blendedg);
|
|
__m128i Intb = _mm_cvtps_epi32(Blendedb);
|
|
__m128i Inta = _mm_cvtps_epi32(Blendeda);
|
|
|
|
__m128i Sr = _mm_slli_epi32(Intr, 16);
|
|
__m128i Sg = _mm_slli_epi32(Intg, 8);
|
|
__m128i Sb = Intb;
|
|
__m128i Sa = _mm_slli_epi32(Inta, 24);
|
|
|
|
__m128i Out = _mm_or_si128(_mm_or_si128(Sr, Sg), _mm_or_si128(Sb, Sa));
|
|
|
|
__m128i MaskedOut = _mm_or_si128(_mm_and_si128(WriteMask, Out),
|
|
_mm_andnot_si128(WriteMask, OriginalDest));
|
|
_mm_store_si128((__m128i *)Pixel, MaskedOut);
|
|
}
|
|
|
|
PixelPx = _mm_add_ps(PixelPx, Four_4x);
|
|
Pixel += 4;
|
|
|
|
if((XI + 8) < MaxX)
|
|
{
|
|
ClipMask = _mm_set1_epi8(-1);
|
|
}
|
|
else
|
|
{
|
|
ClipMask = EndClipMask;
|
|
}
|
|
|
|
IACA_VC64_END;
|
|
}
|
|
|
|
Row += RowAdvance;
|
|
}
|
|
}
|
|
}
|