/* ======================================================================== $File: $ $Date: $ $Revision: $ $Creator: Casey Muratori $ $Notice: (C) Copyright 2015 by Molly Rocket, Inc. All Rights Reserved. $ ======================================================================== */ #define IGNORED_TIMED_FUNCTION TIMED_FUNCTION #define IGNORED_TIMED_BLOCK TIMED_BLOCK #define internal #include "handmade.h" #if 0 #include #else #define IACA_VC64_START #define IACA_VC64_END #endif void DrawRectangleQuickly(loaded_bitmap *Buffer, v2 Origin, v2 XAxis, v2 YAxis, v4 Color, loaded_bitmap *Texture, real32 PixelsToMeters, rectangle2i ClipRect, bool32 Even) { IGNORED_TIMED_FUNCTION(); // NOTE(casey): Premultiply color up front Color.rgb *= Color.a; real32 XAxisLength = Length(XAxis); real32 YAxisLength = Length(YAxis); v2 NxAxis = (YAxisLength / XAxisLength) * XAxis; v2 NyAxis = (XAxisLength / YAxisLength) * YAxis; // NOTE(casey): NzScale could be a parameter if we want people to // have control over the amount of scaling in the Z direction // that the normals appear to have. real32 NzScale = 0.5f*(XAxisLength + YAxisLength); real32 InvXAxisLengthSq = 1.0f / LengthSq(XAxis); real32 InvYAxisLengthSq = 1.0f / LengthSq(YAxis); rectangle2i FillRect = InvertedInfinityRectangle2i(); v2 P[4] = {Origin, Origin + XAxis, Origin + XAxis + YAxis, Origin + YAxis}; for(int PIndex = 0; PIndex < ArrayCount(P); ++PIndex) { v2 TestP = P[PIndex]; int FloorX = FloorReal32ToInt32(TestP.x); int CeilX = CeilReal32ToInt32(TestP.x) + 1; int FloorY = FloorReal32ToInt32(TestP.y); int CeilY = CeilReal32ToInt32(TestP.y) + 1; if(FillRect.MinX > FloorX) {FillRect.MinX = FloorX;} if(FillRect.MinY > FloorY) {FillRect.MinY = FloorY;} if(FillRect.MaxX < CeilX) {FillRect.MaxX = CeilX;} if(FillRect.MaxY < CeilY) {FillRect.MaxY = CeilY;} } // rectangle2i ClipRect = {0, 0, WidthMax, HeightMax}; // rectangle2i ClipRect = {128, 128, 256, 256}; FillRect = Intersect(ClipRect, FillRect); if(!Even == (FillRect.MinY & 1)) { FillRect.MinY += 1; } if(HasArea(FillRect)) { __m128i StartClipMask = _mm_set1_epi8(-1); __m128i EndClipMask = _mm_set1_epi8(-1); __m128i StartClipMasks[] = { _mm_slli_si128(StartClipMask, 0*4), _mm_slli_si128(StartClipMask, 1*4), _mm_slli_si128(StartClipMask, 2*4), _mm_slli_si128(StartClipMask, 3*4), }; __m128i EndClipMasks[] = { _mm_srli_si128(EndClipMask, 0*4), _mm_srli_si128(EndClipMask, 3*4), _mm_srli_si128(EndClipMask, 2*4), _mm_srli_si128(EndClipMask, 1*4), }; if(FillRect.MinX & 3) { StartClipMask = StartClipMasks[FillRect.MinX & 3]; FillRect.MinX = FillRect.MinX & ~3; } if(FillRect.MaxX & 3) { EndClipMask = EndClipMasks[FillRect.MaxX & 3]; FillRect.MaxX = (FillRect.MaxX & ~3) + 4; } v2 nXAxis = InvXAxisLengthSq*XAxis; v2 nYAxis = InvYAxisLengthSq*YAxis; real32 Inv255 = 1.0f / 255.0f; __m128 Inv255_4x = _mm_set1_ps(Inv255); real32 One255 = 255.0f; __m128 One = _mm_set1_ps(1.0f); __m128 Half = _mm_set1_ps(0.5f); __m128 Four_4x = _mm_set1_ps(4.0f); __m128 One255_4x = _mm_set1_ps(255.0f); __m128 Zero = _mm_set1_ps(0.0f); __m128i MaskFF = _mm_set1_epi32(0xFF); __m128i MaskFFFF = _mm_set1_epi32(0xFFFF); __m128i MaskFF00FF = _mm_set1_epi32(0x00FF00FF); __m128 Colorr_4x = _mm_set1_ps(Color.r); __m128 Colorg_4x = _mm_set1_ps(Color.g); __m128 Colorb_4x = _mm_set1_ps(Color.b); __m128 Colora_4x = _mm_set1_ps(Color.a); __m128 nXAxisx_4x = _mm_set1_ps(nXAxis.x); __m128 nXAxisy_4x = _mm_set1_ps(nXAxis.y); __m128 nYAxisx_4x = _mm_set1_ps(nYAxis.x); __m128 nYAxisy_4x = _mm_set1_ps(nYAxis.y); __m128 Originx_4x = _mm_set1_ps(Origin.x); __m128 Originy_4x = _mm_set1_ps(Origin.y); __m128 MaxColorValue = _mm_set1_ps(255.0f*255.0f); __m128i TexturePitch_4x = _mm_set1_epi32(Texture->Pitch); __m128 WidthM2 = _mm_set1_ps((real32)(Texture->Width - 2)); __m128 HeightM2 = _mm_set1_ps((real32)(Texture->Height - 2)); uint8 *Row = ((uint8 *)Buffer->Memory + FillRect.MinX*BITMAP_BYTES_PER_PIXEL + FillRect.MinY*Buffer->Pitch); int32 RowAdvance = 2*Buffer->Pitch; void *TextureMemory = Texture->Memory; int32 TexturePitch = Texture->Pitch; int MinY = FillRect.MinY; int MaxY = FillRect.MaxY; int MinX = FillRect.MinX; int MaxX = FillRect.MaxX; IGNORED_TIMED_BLOCK(PixelFill, GetClampedRectArea(FillRect) / 2); for(int Y = MinY; Y < MaxY; Y += 2) { __m128 PixelPy = _mm_set1_ps((real32)Y); PixelPy = _mm_sub_ps(PixelPy, Originy_4x); __m128 PynX = _mm_mul_ps(PixelPy, nXAxisy_4x); __m128 PynY = _mm_mul_ps(PixelPy, nYAxisy_4x); __m128 PixelPx = _mm_set_ps((real32)(MinX + 3), (real32)(MinX + 2), (real32)(MinX + 1), (real32)(MinX + 0)); PixelPx = _mm_sub_ps(PixelPx, Originx_4x); __m128i ClipMask = StartClipMask; uint32 *Pixel = (uint32 *)Row; for(int XI = MinX; XI < MaxX; XI += 4) { #define mmSquare(a) _mm_mul_ps(a, a) #define M(a, i) ((float *)&(a))[i] #define Mi(a, i) ((uint32 *)&(a))[i] IACA_VC64_START; __m128 U = _mm_add_ps(_mm_mul_ps(PixelPx, nXAxisx_4x), PynX); __m128 V = _mm_add_ps(_mm_mul_ps(PixelPx, nYAxisx_4x), PynY); __m128i WriteMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmple_ps(U, One)), _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmple_ps(V, One)))); WriteMask = _mm_and_si128(WriteMask, ClipMask); // TODO(casey): Later, re-check if this helps // if(_mm_movemask_epi8(WriteMask)) { __m128i OriginalDest = _mm_load_si128((__m128i *)Pixel); U = _mm_min_ps(_mm_max_ps(U, Zero), One); V = _mm_min_ps(_mm_max_ps(V, Zero), One); // NOTE(casey): Bias texture coordinates to start // on the boundary between the 0,0 and 1,1 pixels. __m128 tX = _mm_add_ps(_mm_mul_ps(U, WidthM2), Half); __m128 tY = _mm_add_ps(_mm_mul_ps(V, HeightM2), Half); __m128i FetchX_4x = _mm_cvttps_epi32(tX); __m128i FetchY_4x = _mm_cvttps_epi32(tY); __m128 fX = _mm_sub_ps(tX, _mm_cvtepi32_ps(FetchX_4x)); __m128 fY = _mm_sub_ps(tY, _mm_cvtepi32_ps(FetchY_4x)); FetchX_4x = _mm_slli_epi32(FetchX_4x, 2); FetchY_4x = _mm_or_si128(_mm_mullo_epi16(FetchY_4x, TexturePitch_4x), _mm_slli_epi32(_mm_mulhi_epi16(FetchY_4x, TexturePitch_4x), 16)); __m128i Fetch_4x = _mm_add_epi32(FetchX_4x, FetchY_4x); int32 Fetch0 = Mi(Fetch_4x, 0); int32 Fetch1 = Mi(Fetch_4x, 1); int32 Fetch2 = Mi(Fetch_4x, 2); int32 Fetch3 = Mi(Fetch_4x, 3); uint8 *TexelPtr0 = ((uint8 *)TextureMemory) + Fetch0; uint8 *TexelPtr1 = ((uint8 *)TextureMemory) + Fetch1; uint8 *TexelPtr2 = ((uint8 *)TextureMemory) + Fetch2; uint8 *TexelPtr3 = ((uint8 *)TextureMemory) + Fetch3; __m128i SampleA = _mm_setr_epi32(*(uint32 *)(TexelPtr0), *(uint32 *)(TexelPtr1), *(uint32 *)(TexelPtr2), *(uint32 *)(TexelPtr3)); __m128i SampleB = _mm_setr_epi32(*(uint32 *)(TexelPtr0 + sizeof(uint32)), *(uint32 *)(TexelPtr1 + sizeof(uint32)), *(uint32 *)(TexelPtr2 + sizeof(uint32)), *(uint32 *)(TexelPtr3 + sizeof(uint32))); __m128i SampleC = _mm_setr_epi32(*(uint32 *)(TexelPtr0 + TexturePitch), *(uint32 *)(TexelPtr1 + TexturePitch), *(uint32 *)(TexelPtr2 + TexturePitch), *(uint32 *)(TexelPtr3 + TexturePitch)); __m128i SampleD = _mm_setr_epi32(*(uint32 *)(TexelPtr0 + TexturePitch + sizeof(uint32)), *(uint32 *)(TexelPtr1 + TexturePitch + sizeof(uint32)), *(uint32 *)(TexelPtr2 + TexturePitch + sizeof(uint32)), *(uint32 *)(TexelPtr3 + TexturePitch + sizeof(uint32))); // NOTE(casey): Unpack bilinear samples __m128i TexelArb = _mm_and_si128(SampleA, MaskFF00FF); __m128i TexelAag = _mm_and_si128(_mm_srli_epi32(SampleA, 8), MaskFF00FF); TexelArb = _mm_mullo_epi16(TexelArb, TexelArb); __m128 TexelAa = _mm_cvtepi32_ps(_mm_srli_epi32(TexelAag, 16)); TexelAag = _mm_mullo_epi16(TexelAag, TexelAag); __m128i TexelBrb = _mm_and_si128(SampleB, MaskFF00FF); __m128i TexelBag = _mm_and_si128(_mm_srli_epi32(SampleB, 8), MaskFF00FF); TexelBrb = _mm_mullo_epi16(TexelBrb, TexelBrb); __m128 TexelBa = _mm_cvtepi32_ps(_mm_srli_epi32(TexelBag, 16)); TexelBag = _mm_mullo_epi16(TexelBag, TexelBag); __m128i TexelCrb = _mm_and_si128(SampleC, MaskFF00FF); __m128i TexelCag = _mm_and_si128(_mm_srli_epi32(SampleC, 8), MaskFF00FF); TexelCrb = _mm_mullo_epi16(TexelCrb, TexelCrb); __m128 TexelCa = _mm_cvtepi32_ps(_mm_srli_epi32(TexelCag, 16)); TexelCag = _mm_mullo_epi16(TexelCag, TexelCag); __m128i TexelDrb = _mm_and_si128(SampleD, MaskFF00FF); __m128i TexelDag = _mm_and_si128(_mm_srli_epi32(SampleD, 8), MaskFF00FF); TexelDrb = _mm_mullo_epi16(TexelDrb, TexelDrb); __m128 TexelDa = _mm_cvtepi32_ps(_mm_srli_epi32(TexelDag, 16)); TexelDag = _mm_mullo_epi16(TexelDag, TexelDag); // NOTE(casey): Load destination __m128 Destb = _mm_cvtepi32_ps(_mm_and_si128(OriginalDest, MaskFF)); __m128 Destg = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(OriginalDest, 8), MaskFF)); __m128 Destr = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(OriginalDest, 16), MaskFF)); __m128 Desta = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(OriginalDest, 24), MaskFF)); // NOTE(casey): Convert texture from 0-255 sRGB to "linear" 0-1 brightness space __m128 TexelAr = _mm_cvtepi32_ps(_mm_srli_epi32(TexelArb, 16)); __m128 TexelAg = _mm_cvtepi32_ps(_mm_and_si128(TexelAag, MaskFFFF)); __m128 TexelAb = _mm_cvtepi32_ps(_mm_and_si128(TexelArb, MaskFFFF)); __m128 TexelBr = _mm_cvtepi32_ps(_mm_srli_epi32(TexelBrb, 16)); __m128 TexelBg = _mm_cvtepi32_ps(_mm_and_si128(TexelBag, MaskFFFF)); __m128 TexelBb = _mm_cvtepi32_ps(_mm_and_si128(TexelBrb, MaskFFFF)); __m128 TexelCr = _mm_cvtepi32_ps(_mm_srli_epi32(TexelCrb, 16)); __m128 TexelCg = _mm_cvtepi32_ps(_mm_and_si128(TexelCag, MaskFFFF)); __m128 TexelCb = _mm_cvtepi32_ps(_mm_and_si128(TexelCrb, MaskFFFF)); __m128 TexelDr = _mm_cvtepi32_ps(_mm_srli_epi32(TexelDrb, 16)); __m128 TexelDg = _mm_cvtepi32_ps(_mm_and_si128(TexelDag, MaskFFFF)); __m128 TexelDb = _mm_cvtepi32_ps(_mm_and_si128(TexelDrb, MaskFFFF)); // NOTE(casey): Bilinear texture blend __m128 ifX = _mm_sub_ps(One, fX); __m128 ifY = _mm_sub_ps(One, fY); __m128 l0 = _mm_mul_ps(ifY, ifX); __m128 l1 = _mm_mul_ps(ifY, fX); __m128 l2 = _mm_mul_ps(fY, ifX); __m128 l3 = _mm_mul_ps(fY, fX); __m128 Texelr = _mm_add_ps(_mm_add_ps(_mm_mul_ps(l0, TexelAr), _mm_mul_ps(l1, TexelBr)), _mm_add_ps(_mm_mul_ps(l2, TexelCr), _mm_mul_ps(l3, TexelDr))); __m128 Texelg = _mm_add_ps(_mm_add_ps(_mm_mul_ps(l0, TexelAg), _mm_mul_ps(l1, TexelBg)), _mm_add_ps(_mm_mul_ps(l2, TexelCg), _mm_mul_ps(l3, TexelDg))); __m128 Texelb = _mm_add_ps(_mm_add_ps(_mm_mul_ps(l0, TexelAb), _mm_mul_ps(l1, TexelBb)), _mm_add_ps(_mm_mul_ps(l2, TexelCb), _mm_mul_ps(l3, TexelDb))); __m128 Texela = _mm_add_ps(_mm_add_ps(_mm_mul_ps(l0, TexelAa), _mm_mul_ps(l1, TexelBa)), _mm_add_ps(_mm_mul_ps(l2, TexelCa), _mm_mul_ps(l3, TexelDa))); // NOTE(casey): Modulate by incoming color Texelr = _mm_mul_ps(Texelr, Colorr_4x); Texelg = _mm_mul_ps(Texelg, Colorg_4x); Texelb = _mm_mul_ps(Texelb, Colorb_4x); Texela = _mm_mul_ps(Texela, Colora_4x); Texelr = _mm_min_ps(_mm_max_ps(Texelr, Zero), MaxColorValue); Texelg = _mm_min_ps(_mm_max_ps(Texelg, Zero), MaxColorValue); Texelb = _mm_min_ps(_mm_max_ps(Texelb, Zero), MaxColorValue); // NOTE(casey): Go from sRGB to "linear" brightness space Destr = mmSquare(Destr); Destg = mmSquare(Destg); Destb = mmSquare(Destb); // NOTE(casey): Destination blend __m128 InvTexelA = _mm_sub_ps(One, _mm_mul_ps(Inv255_4x, Texela)); __m128 Blendedr = _mm_add_ps(_mm_mul_ps(InvTexelA, Destr), Texelr); __m128 Blendedg = _mm_add_ps(_mm_mul_ps(InvTexelA, Destg), Texelg); __m128 Blendedb = _mm_add_ps(_mm_mul_ps(InvTexelA, Destb), Texelb); __m128 Blendeda = _mm_add_ps(_mm_mul_ps(InvTexelA, Desta), Texela); // NOTE(casey): Go from "linear" 0-1 brightness space to sRGB 0-255 #if 1 Blendedr = _mm_mul_ps(Blendedr, _mm_rsqrt_ps(Blendedr)); Blendedg = _mm_mul_ps(Blendedg, _mm_rsqrt_ps(Blendedg)); Blendedb = _mm_mul_ps(Blendedb, _mm_rsqrt_ps(Blendedb)); #else Blendedr = _mm_sqrt_ps(Blendedr); Blendedg = _mm_sqrt_ps(Blendedg); Blendedb = _mm_sqrt_ps(Blendedb); #endif Blendeda = Blendeda; __m128i Intr = _mm_cvtps_epi32(Blendedr); __m128i Intg = _mm_cvtps_epi32(Blendedg); __m128i Intb = _mm_cvtps_epi32(Blendedb); __m128i Inta = _mm_cvtps_epi32(Blendeda); __m128i Sr = _mm_slli_epi32(Intr, 16); __m128i Sg = _mm_slli_epi32(Intg, 8); __m128i Sb = Intb; __m128i Sa = _mm_slli_epi32(Inta, 24); __m128i Out = _mm_or_si128(_mm_or_si128(Sr, Sg), _mm_or_si128(Sb, Sa)); __m128i MaskedOut = _mm_or_si128(_mm_and_si128(WriteMask, Out), _mm_andnot_si128(WriteMask, OriginalDest)); _mm_store_si128((__m128i *)Pixel, MaskedOut); } PixelPx = _mm_add_ps(PixelPx, Four_4x); Pixel += 4; if((XI + 8) < MaxX) { ClipMask = _mm_set1_epi8(-1); } else { ClipMask = EndClipMask; } IACA_VC64_END; } Row += RowAdvance; } } }