/****************************************************************************
 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file pa.h
 *
 * @brief Definitions for primitive assembly.
 *        N primitives are assembled at a time, where N is the SIMD width.
 *        A state machine, specific to a given topology, drives the
 *        assembly of vertices into triangles.
 *
 ******************************************************************************/
#pragma once

#include "frontend.h"

struct PA_STATE
{
#if USE_SIMD16_FRONTEND
    enum
    {
        SIMD_WIDTH      = KNOB_SIMD16_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 4
    };

    typedef simd16mask    SIMDMASK;
    typedef simd16scalar  SIMDSCALAR;
    typedef simd16vector  SIMDVECTOR;
    typedef simd16vertex  SIMDVERTEX;
    typedef simd16scalari SIMDSCALARI;
#else
    enum
    {
        SIMD_WIDTH      = KNOB_SIMD_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 3
    };

    typedef simdmask    SIMDMASK;
    typedef simdscalar  SIMDSCALAR;
    typedef simdvector  SIMDVECTOR;
    typedef simdvertex  SIMDVERTEX;
    typedef simdscalari SIMDSCALARI;
#endif

    DRAW_CONTEXT* pDC{nullptr};         // draw context
    uint8_t*      pStreamBase{nullptr}; // vertex stream
    uint32_t      streamSizeInVerts{0}; // total size of the input stream in verts
    uint32_t      vertexStride{0};      // stride of a vertex in simdvector units

    // The topology the binner will use. In some cases the FE changes the topology from the API
    // state.
    PRIMITIVE_TOPOLOGY binTopology{TOP_UNKNOWN};

#if ENABLE_AVX512_SIMD16
    bool useAlternateOffset{false};
#endif

    bool     viewportArrayActive{false};
    bool     rtArrayActive{false};
    uint32_t numVertsPerPrim{0};

    PA_STATE() {}
    PA_STATE(DRAW_CONTEXT* in_pDC,
             uint8_t*      in_pStreamBase,
             uint32_t      in_streamSizeInVerts,
             uint32_t      in_vertexStride,
             uint32_t      in_numVertsPerPrim) :
        pDC(in_pDC),
        pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts),
        vertexStride(in_vertexStride), numVertsPerPrim(in_numVertsPerPrim)
    {
    }

    virtual bool        HasWork()                                    = 0;
    virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
#if ENABLE_AVX512_SIMD16
    virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
#endif
    virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
#if ENABLE_AVX512_SIMD16
    virtual bool Assemble(uint32_t slot, simd16vector verts[]) = 0;
#endif
    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
    virtual bool NextPrim()                                                             = 0;
    virtual SIMDVERTEX& GetNextVsOutput()                                               = 0;
    virtual bool        GetNextStreamOutput()                                           = 0;
    virtual SIMDMASK&   GetNextVsIndices()                                              = 0;
    virtual uint32_t    NumPrims()                                                      = 0;
    virtual void        Reset()                                                         = 0;
    virtual SIMDSCALARI GetPrimID(uint32_t startID)                                     = 0;
};
// The Optimized PA is a state machine that assembles triangles from vertex shader simd
// output. Here is the sequence:
//    1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
//    2. Execute PA function to assemble and bin triangles.
//        a. The PA function is a set of functions that collectively make up the
//           state machine for a given topology.
//            1. We use a state index to track which PA function to call.
//        b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
//            1. We call these the current and previous simd vertex.
//            2. The SSE simd is 4-wide, which is not a multiple of the 3 verts needed for
//               triangles. In order to assemble the second triangle of a triangle list, we'll
//               need the last vertex from the previous simd and the first 2 vertices from the
//               current simd.
//            3. At times the PA can assemble multiple triangles from the 2 simd vertices.
//
// This optimized PA is not cut aware, so it should only be used for non-indexed draws or
// draws without cuts.
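// A minimal usage sketch (illustrative only, not part of the original header) of how a
// frontend loop might drive a PA through the abstract interface above; the real driver is
// the frontend's ProcessDraw loop. DrivePaSketch and posSlot are hypothetical names, and
// the vertex shader invocation and binner call are elided placeholders.
#if 0
void DrivePaSketch(PA_STATE& pa, uint32_t posSlot)
{
    simdvector prim[MAX_NUM_VERTS_PER_PRIM];
    while (pa.HasWork())
    {
        // ... run the VS here, writing SIMD_WIDTH vertices into pa.GetNextVsOutput() ...

        // assemble and bin as many prims as the buffered simd vertices allow
        do
        {
            if (pa.Assemble(posSlot, prim))
            {
                // prim[v].v[c] now holds component c of vertex v for NumPrims() prims
                // ... bin prim[] here ...
            }
        } while (pa.NextPrim());
    }
}
#endif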
struct PA_STATE_OPT : public PA_STATE
{
    uint32_t numPrims{0};         // Total number of primitives for draw.
    uint32_t numPrimsComplete{0}; // Total number of complete primitives.

    uint32_t numSimdPrims{0}; // Number of prims in current simd.

    uint32_t       cur{0};   // index to current VS output.
    uint32_t       prev{0};  // index to prev VS output. Not really needed in the state.
    const uint32_t first{0}; // index to first VS output. Used for tri fan and line loop.

    uint32_t counter{0};   // state counter
    bool     reset{false}; // reset state

    uint32_t    primIDIncr{0}; // how much to increment for each vector (typically vector / {1, 2})
    SIMDSCALARI primID;

    typedef bool (*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
    typedef bool (*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
    typedef void (*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa,
                                       uint32_t      slot,
                                       uint32_t      primIndex,
                                       simd4scalar   verts[]);

    PFN_PA_FUNC pfnPaFunc{nullptr}; // PA state machine function for assembling 4 triangles.
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{nullptr};
#endif
    PFN_PA_SINGLE_FUNC pfnPaSingleFunc{
        nullptr};                        // PA state machine function for assembling single triangle.
    PFN_PA_FUNC pfnPaFuncReset{nullptr}; // initial state to set on reset
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{nullptr};
#endif

    // state used to advance the PA when Next is called
    PFN_PA_FUNC pfnPaNextFunc{nullptr};
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{nullptr};
#endif
    uint32_t nextNumSimdPrims{0};
    uint32_t nextNumPrimsIncrement{0};
    bool     nextReset{false};
    bool     isStreaming{false};

    SIMDMASK junkIndices{0}; // temporary index store for unused virtual function

    PA_STATE_OPT() {}
    PA_STATE_OPT(DRAW_CONTEXT*      pDC,
                 uint32_t           numPrims,
                 uint8_t*           pStream,
                 uint32_t           streamSizeInVerts,
                 uint32_t           vertexStride,
                 bool               in_isStreaming,
                 uint32_t           numVertsPerPrim,
                 PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);

    bool HasWork() { return (this->numPrimsComplete < this->numPrims) ? true : false; }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(slot < vertexStride);
        uint32_t    offset     = index * vertexStride + slot;
        simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
        return vertexSlot;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(slot < vertexStride);
        uint32_t      offset     = index * vertexStride + slot;
        simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
        return vertexSlot;
    }

#endif
    // Assembles 4 triangles. Each simdvector is a single vertex from 4
    // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
    bool Assemble(uint32_t slot, simdvector verts[]) { return this->pfnPaFunc(*this, slot, verts); }

#if ENABLE_AVX512_SIMD16
    bool Assemble(uint32_t slot, simd16vector verts[])
    {
        return this->pfnPaFunc_simd16(*this, slot, verts);
    }

#endif
    // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    {
        return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
    }

    bool NextPrim()
    {
        this->pfnPaFunc = this->pfnPaNextFunc;
#if ENABLE_AVX512_SIMD16
        this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
#endif
        this->numSimdPrims = this->nextNumSimdPrims;
        this->numPrimsComplete += this->nextNumPrimsIncrement;
        this->reset = this->nextReset;

        if (this->isStreaming)
        {
            this->reset = false;
        }

        bool morePrims = false;

        if (this->numSimdPrims > 0)
        {
            morePrims = true;
            this->numSimdPrims--;
        }
        else
        {
            this->counter = (this->reset) ? 0 : (this->counter + 1);
            this->reset   = false;
        }

        if (!HasWork())
        {
            morePrims = false; // no more to do
        }

        return morePrims;
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH;

        // increment cur and prev indices
        if (counter < numSimdVerts)
        {
            // prev undefined for first state
            prev = cur;
            cur  = counter;
        }
        else
        {
            // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in
            // the buffer
            uint32_t temp = prev;

            prev = cur;
            cur  = temp;
        }

        SWR_ASSERT(cur < numSimdVerts);
        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];

        return *(SIMDVERTEX*)pVertex;
    }
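    // Worked trace (illustrative, assuming a 4-simd-vert store): successive calls with
    // counter = 0,1,2,3,4,5,... produce (prev,cur) = (0,0),(0,1),(1,2),(2,3),(3,2),(2,3),...
    // Once the store is full, the last two simd vertex slots ping-pong while the earlier
    // slots stay intact for topologies that reuse them (tri fan / line loop via 'first').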
    SIMDMASK& GetNextVsIndices()
    {
        // unused in optimized PA, pass tmp buffer back
        return junkIndices;
    }

    bool GetNextStreamOutput()
    {
        this->prev = this->cur;
        this->cur  = this->counter;

        return HasWork();
    }

    uint32_t NumPrims()
    {
        return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims)
                   ? (SIMD_WIDTH -
                      (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims))
                   : SIMD_WIDTH;
    }
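    // Worked example (illustrative): with SIMD_WIDTH == 8, numPrims == 21, and
    // numPrimsComplete == 16 with a full batch pending (nextNumPrimsIncrement == 8),
    // NumPrims() returns 8 - (16 + 8 - 21) = 5 valid prims in the final simd.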
    void SetNextState(PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
                      PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
                      uint32_t                         numSimdPrims      = 0,
                      uint32_t                         numPrimsIncrement = 0,
                      bool                             reset             = false)
    {
        this->pfnPaNextFunc         = pfnPaNextFunc;
        this->nextNumSimdPrims      = numSimdPrims;
        this->nextNumPrimsIncrement = numPrimsIncrement;
        this->nextReset             = reset;

        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    }

#if ENABLE_AVX512_SIMD16
    void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
                             PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
                             PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
                             uint32_t                         numSimdPrims      = 0,
                             uint32_t                         numPrimsIncrement = 0,
                             bool                             reset             = false)
    {
        this->pfnPaNextFunc_simd16  = pfnPaNextFunc_simd16;
        this->pfnPaNextFunc         = pfnPaNextFunc;
        this->nextNumSimdPrims      = numSimdPrims;
        this->nextNumPrimsIncrement = numPrimsIncrement;
        this->nextReset             = reset;

        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    }

#endif
    void Reset()
    {
#if ENABLE_AVX512_SIMD16
        useAlternateOffset = false;

#endif
        this->pfnPaFunc = this->pfnPaFuncReset;
#if ENABLE_AVX512_SIMD16
        this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
#endif
        this->numPrimsComplete = 0;
        this->numSimdPrims     = 0;
        this->cur              = 0;
        this->prev             = 0;
        this->counter          = 0;
        this->reset            = false;
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(
            this->primID,
            _simd16_set1_epi32(startID +
                               this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
#else
        return _simd_add_epi32(
            this->primID,
            _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
#endif
    }
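    // Worked example (illustrative, assuming SIMD_WIDTH == 8, primIDIncr == 8, and
    // primID initialized to lanes {0..7}): after 16 completed prims, GetPrimID(startID)
    // returns startID + 8 * (16 / 8) + {0..7} = startID + 16 + {0..7} for the third batch.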
};

// helper C wrappers to avoid having to rewrite all the PA topology state functions
INLINE void SetNextPaState(PA_STATE_OPT&                    pa,
                           PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
                           PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
                           uint32_t                         numSimdPrims      = 0,
                           uint32_t                         numPrimsIncrement = 0,
                           bool                             reset             = false)
{
    return pa.SetNextState(
        pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#if ENABLE_AVX512_SIMD16
INLINE void SetNextPaState_simd16(PA_STATE_OPT&                    pa,
                                  PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
                                  PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
                                  PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
                                  uint32_t                         numSimdPrims      = 0,
                                  uint32_t                         numPrimsIncrement = 0,
                                  bool                             reset             = false)
{
    return pa.SetNextState_simd16(pfnPaNextFunc_simd16,
                                  pfnPaNextFunc,
                                  pfnPaNextSingleFunc,
                                  numSimdPrims,
                                  numPrimsIncrement,
                                  reset);
}

#endif
INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    return pa.GetSimdVector(index, slot);
}

#if ENABLE_AVX512_SIMD16
INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    return pa.GetSimdVector_simd16(index, slot);
}

#endif
// Cut-aware primitive assembler.
struct PA_STATE_CUT : public PA_STATE
{
    SIMDMASK* pCutIndices{nullptr};     // cut indices buffer, 1 bit per vertex
    uint32_t  numVerts{0};              // number of vertices available in buffer store
    uint32_t  numAttribs{0};            // number of attributes
    int32_t   numRemainingVerts{0};     // number of verts remaining to be assembled
    uint32_t  numVertsToAssemble{0};    // total number of verts to assemble for the draw
#if ENABLE_AVX512_SIMD16
    OSALIGNSIMD16(uint32_t)
    indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
#else
    OSALIGNSIMD(uint32_t)
    indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
#endif
    SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
    uint32_t    numPrimsAssembled{0};   // number of primitives that are fully assembled
    uint32_t    headVertex{0};          // current unused vertex slot in vertex buffer store
    uint32_t    tailVertex{0};          // beginning vertex currently assembling
    uint32_t    curVertex{0};           // current unprocessed vertex
    uint32_t    startPrimId{0};         // starting prim id
    SIMDSCALARI vPrimId;                // vector of prim ID
    bool        needOffsets{false};     // need to compute gather offsets for current SIMD
    uint32_t    vertsPerPrim{0};
    bool        processCutVerts{
        false}; // vertex indices with cuts should be processed as normal, otherwise they
                // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
                // while the GS sends valid verts for every index.

    simdvector junkVector; // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
#endif

    // Topology state tracking
    uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
    uint32_t curIndex{0};
    bool     reverseWinding{false}; // indicates reverse winding for strips
    int32_t  adjExtraVert{0};       // extra vert used for tristrip w/ adj

    typedef void (PA_STATE_CUT::*PFN_PA_FUNC)(uint32_t vert, bool finish);
    PFN_PA_FUNC pfnPa{nullptr}; // per-topology function that processes a single vert

    PA_STATE_CUT() {}
    PA_STATE_CUT(DRAW_CONTEXT*      pDC,
                 uint8_t*           in_pStream,
                 uint32_t           in_streamSizeInVerts,
                 uint32_t           in_vertexStride,
                 SIMDMASK*          in_pIndices,
                 uint32_t           in_numVerts,
                 uint32_t           in_numAttribs,
                 PRIMITIVE_TOPOLOGY topo,
                 bool               in_processCutVerts,
                 uint32_t           in_numVertsPerPrim) :
        PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride, in_numVertsPerPrim)
    {
        numVerts        = in_streamSizeInVerts;
        numAttribs      = in_numAttribs;
        binTopology     = topo;
        needOffsets     = false;
        processCutVerts = in_processCutVerts;

        numVertsToAssemble = numRemainingVerts = in_numVerts;
        numPrimsAssembled                      = 0;
        headVertex = tailVertex = curVertex = 0;

        curIndex    = 0;
        pCutIndices = in_pIndices;
        memset(indices, 0, sizeof(indices));
#if USE_SIMD16_FRONTEND
        vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
        vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
        reverseWinding = false;
        adjExtraVert   = -1;

        bool gsEnabled = pDC->pState->state.gsState.gsEnable;
        vertsPerPrim   = NumVertsPerPrim(topo, gsEnabled);

        switch (topo)
        {
        case TOP_TRIANGLE_LIST:
            pfnPa = &PA_STATE_CUT::ProcessVertTriList;
            break;
        case TOP_TRI_LIST_ADJ:
            pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj
                              : &PA_STATE_CUT::ProcessVertTriListAdjNoGs;
            break;
        case TOP_TRIANGLE_STRIP:
            pfnPa = &PA_STATE_CUT::ProcessVertTriStrip;
            break;
        case TOP_TRI_STRIP_ADJ:
            if (gsEnabled)
            {
                pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>;
            }
            else
            {
                pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>;
            }
            break;

        case TOP_POINT_LIST:
            pfnPa = &PA_STATE_CUT::ProcessVertPointList;
            break;
        case TOP_LINE_LIST:
            pfnPa = &PA_STATE_CUT::ProcessVertLineList;
            break;
        case TOP_LINE_LIST_ADJ:
            pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj
                              : &PA_STATE_CUT::ProcessVertLineListAdjNoGs;
            break;
        case TOP_LINE_STRIP:
            pfnPa = &PA_STATE_CUT::ProcessVertLineStrip;
            break;
        case TOP_LISTSTRIP_ADJ:
            pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj
                              : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs;
            break;
        case TOP_RECT_LIST:
            pfnPa = &PA_STATE_CUT::ProcessVertRectList;
            break;
        default:
            assert(0 && "Unimplemented topology");
        }
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
        this->headVertex     = (this->headVertex + SIMD_WIDTH) % this->numVerts;
        this->needOffsets    = true;
        SIMDVECTOR* pVertex  = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];

        return *(SIMDVERTEX*)pVertex;
    }

    SIMDMASK& GetNextVsIndices()
    {
        uint32_t  vertexIndex  = this->headVertex / SIMD_WIDTH;
        SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
        return *pCurCutIndex;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        // unused
        SWR_ASSERT(0 && "Not implemented");
        return junkVector;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        // unused
        SWR_ASSERT(0 && "Not implemented");
        return junkVector_simd16;
    }

#endif
    bool GetNextStreamOutput()
    {
        this->headVertex += SIMD_WIDTH;
        this->needOffsets = true;
        return HasWork();
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
#endif
    }

    void Reset()
    {
#if ENABLE_AVX512_SIMD16
        useAlternateOffset = false;

#endif
        this->numRemainingVerts = this->numVertsToAssemble;
        this->numPrimsAssembled = 0;
        this->curIndex          = 0;
        this->curVertex         = 0;
        this->tailVertex        = 0;
        this->headVertex        = 0;
        this->reverseWinding    = false;
        this->adjExtraVert      = -1;
#if USE_SIMD16_FRONTEND
        this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
        this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
    }

    bool HasWork() { return this->numRemainingVerts > 0 || this->adjExtraVert != -1; }

    bool IsVertexStoreFull()
    {
        return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
    }

    void RestartTopology()
    {
        this->curIndex       = 0;
        this->reverseWinding = false;
        this->adjExtraVert   = -1;
    }

    bool IsCutIndex(uint32_t vertex)
    {
        uint32_t vertexIndex  = vertex / SIMD_WIDTH;
        uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
        return CheckBit(this->pCutIndices[vertexIndex], vertexOffset);
    }
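    // Worked example (illustrative, SIMD_WIDTH == 8): vertex 19 lives in simd batch
    // 19 / 8 == 2 at lane 19 & 7 == 3, so IsCutIndex(19) tests bit 3 of pCutIndices[2];
    // a set bit marks a cut (primitive restart) at that vertex.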
    // iterates across the unprocessed verts until we hit the end or we
    // have assembled SIMD prims
    void ProcessVerts()
    {
        while (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0 &&
               this->curVertex != this->headVertex)
        {
            // if cut index, restart topology
            if (IsCutIndex(this->curVertex))
            {
                if (this->processCutVerts)
                {
                    (this->*pfnPa)(this->curVertex, false);
                }

                // finish off tri strip w/ adj before restarting topo
                if (this->adjExtraVert != -1)
                {
                    (this->*pfnPa)(this->curVertex, true);
                }

                RestartTopology();
            }
            else
            {
                (this->*pfnPa)(this->curVertex, false);
            }

            this->curVertex++;
            if (this->curVertex >= this->numVerts)
            {
                this->curVertex = 0;
            }
            this->numRemainingVerts--;
        }

        // special case last primitive for tri strip w/ adj
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 &&
            this->adjExtraVert != -1)
        {
            (this->*pfnPa)(this->curVertex, true);
        }
    }

    void Advance()
    {
        // done with current batch
        // advance tail to the current unsubmitted vertex
        this->tailVertex        = this->curVertex;
        this->numPrimsAssembled = 0;
#if USE_SIMD16_FRONTEND
        this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
#else
        this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
#endif
    }

    bool NextPrim()
    {
        // if we've assembled enough prims, we can advance to the next set of verts
        if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
        {
            Advance();
        }
        return false;
    }

    void ComputeOffsets()
    {
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            uint32_t    vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
            SIMDSCALARI vIndices          = *(SIMDSCALARI*)&this->indices[v][0];

            // step to simdvertex batch
            const uint32_t simdShift = SIMD_WIDTH_LOG2;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] =
                _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
#else
            SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] =
                _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
#endif

            // step to index
            const uint32_t simdMask = SIMD_WIDTH - 1;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
            this->vOffsets[v]        = _simd16_add_epi32(
                this->vOffsets[v],
                _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
#else
            SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
            this->vOffsets[v]        = _simd_add_epi32(
                this->vOffsets[v],
                _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
#endif
        }
    }
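    // Scalar sketch of the per-lane math above (illustrative):
    //   batch  = index >> SIMD_WIDTH_LOG2;                 // which simd vertex block
    //   lane   = index & (SIMD_WIDTH - 1);                 // slot within that block
    //   offset = batch * vertexStride * sizeof(SIMDVECTOR) // byte step to the block
    //          + lane * sizeof(float);                     // byte step to the lane's x
    // Assemble() later adds slot * sizeof(SIMDVECTOR) per attribute and steps between
    // components by SIMD_WIDTH floats.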
    bool Assemble(uint32_t slot, simdvector* verts)
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an
        // assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);

                // Assigning to a temporary first to avoid an MSVC 2017 compiler bug
                simdscalar t = useAlternateOffset ? _simd16_extract_ps(temp, 1)
                                                  : _simd16_extract_ps(temp, 0);

                verts[v].v[c] = t;
#else
                verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        // compute the implied 4th vertex, v3
        if (this->binTopology == TOP_RECT_LIST)
        {
            for (uint32_t c = 0; c < 4; ++c)
            {
                // v1, v3 = v1 + v2 - v0, v2
                // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2]
                simdscalar temp = _simd_add_ps(verts[0].v[c], verts[2].v[c]);
                temp            = _simd_sub_ps(temp, verts[1].v[c]);

                // the odd lanes hold the second triangle of each rectangle
                verts[1].v[c] = _simd_blend_ps(verts[1].v[c], temp, 0xAA); // 1010 1010
            }
        }
        return true;
    }

#if ENABLE_AVX512_SIMD16
    bool Assemble(uint32_t slot, simd16vector verts[])
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an
        // assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
#else
                verts[v].v[c] = _simd16_insert_ps(
                    _simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        // compute the implied 4th vertex, v3
        if (this->binTopology == TOP_RECT_LIST)
        {
            for (uint32_t c = 0; c < 4; ++c)
            {
                // v1, v3 = v1 + v2 - v0, v2
                // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2]
                simd16scalar temp = _simd16_add_ps(verts[0].v[c], verts[2].v[c]);
                temp              = _simd16_sub_ps(temp, verts[1].v[c]);
                verts[1].v[c] =
                    _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010
            }
        }
        return true;
    }

#endif
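    // Worked example (illustrative): a rect with v0 = (0,0), v1 = (4,0), v2 = (0,2)
    // implies v3 = v1 + v2 - v0 = (4,2); the blend masks above select the odd lanes,
    // which hold the second triangle of each rectangle.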
    void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
    {
        // move to slot
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
#if USE_SIMD16_FRONTEND
            uint32_t offset =
                useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
#else
            uint32_t offset = pOffset[triIndex];
#endif
            offset += sizeof(SIMDVECTOR) * slot;
            float* pVert = (float*)&tri[v];
            for (uint32_t c = 0; c < 4; ++c)
            {
                float* pComponent = (float*)(this->pStreamBase + offset);
                pVert[c]          = *pComponent;
                offset += SIMD_WIDTH * sizeof(float);
            }
        }

        // compute the implied 4th vertex, v3
        if ((this->binTopology == TOP_RECT_LIST) && (triIndex % 2 == 1))
        {
            // v1, v3 = v1 + v2 - v0, v2
            // v1 stored in tri[0], v0 stored in tri[1], v2 stored in tri[2]
            float* pVert0 = (float*)&tri[1];
            float* pVert1 = (float*)&tri[0];
            float* pVert2 = (float*)&tri[2];
            float* pVert3 = (float*)&tri[1];
            for (uint32_t c = 0; c < 4; ++c)
            {
                pVert3[c] = pVert1[c] + pVert2[c] - pVert0[c];
            }
        }
    }

    uint32_t NumPrims() { return this->numPrimsAssembled; }

    // Per-topology functions
    void ProcessVertTriStrip(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 3)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            if (reverseWinding)
            {
                this->indices[1][this->numPrimsAssembled] = this->vert[2];
                this->indices[2][this->numPrimsAssembled] = this->vert[1];
            }
            else
            {
                this->indices[1][this->numPrimsAssembled] = this->vert[1];
                this->indices[2][this->numPrimsAssembled] = this->vert[2];
            }

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0]  = this->vert[1];
            this->vert[1]  = this->vert[2];
            this->curIndex = 2;
            this->reverseWinding ^= 1;
        }
    }
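    // Worked trace (illustrative): strip verts 0,1,2,3,4 produce gather indices
    // (0,1,2), (1,3,2), (2,3,4); reverseWinding swaps the last two indices on every
    // other triangle so all assembled tris keep a consistent facing.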
    template <bool gsEnabled>
    void AssembleTriStripAdj()
    {
        if (!gsEnabled)
        {
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[4];

            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];

            this->vert[4] = this->vert[2];
            this->vert[2] = this->vert[1];
        }
        else
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];
            this->indices[4][this->numPrimsAssembled] = this->vert[4];
            this->indices[5][this->numPrimsAssembled] = this->vert[5];
        }
        this->numPrimsAssembled++;
    }

    template <bool gsEnabled>
    void ProcessVertTriStripAdj(uint32_t index, bool finish)
    {
        // handle last primitive of tristrip
        if (finish && this->adjExtraVert != -1)
        {
            this->vert[3] = this->adjExtraVert;
            AssembleTriStripAdj<gsEnabled>();
            this->adjExtraVert = -1;
            return;
        }

        switch (this->curIndex)
        {
        case 0:
        case 1:
        case 2:
        case 4:
            this->vert[this->curIndex] = index;
            this->curIndex++;
            break;
        case 3:
            this->vert[5] = index;
            this->curIndex++;
            break;
        case 5:
            if (this->adjExtraVert == -1)
            {
                this->adjExtraVert = index;
            }
            else
            {
                this->vert[3] = index;
                if (!gsEnabled)
                {
                    AssembleTriStripAdj<gsEnabled>();

                    uint32_t nextTri[6];
                    if (this->reverseWinding)
                    {
                        nextTri[0] = this->vert[4];
                        nextTri[1] = this->vert[0];
                        nextTri[2] = this->vert[2];
                        nextTri[4] = this->vert[3];
                        nextTri[5] = this->adjExtraVert;
                    }
                    else
                    {
                        nextTri[0] = this->vert[2];
                        nextTri[1] = this->adjExtraVert;
                        nextTri[2] = this->vert[3];
                        nextTri[4] = this->vert[4];
                        nextTri[5] = this->vert[0];
                    }
                    for (uint32_t i = 0; i < 6; ++i)
                    {
                        this->vert[i] = nextTri[i];
                    }

                    this->adjExtraVert = -1;
                    this->reverseWinding ^= 1;
                }
                else
                {
                    this->curIndex++;
                }
            }
            break;
        case 6:
            SWR_ASSERT(this->adjExtraVert != -1, "Algorithm failure!");
            AssembleTriStripAdj<gsEnabled>();
            {
                uint32_t nextTri[6];
                if (this->reverseWinding)
                {
                    nextTri[0] = this->vert[4];
                    nextTri[1] = this->vert[0];
                    nextTri[2] = this->vert[2];
                    nextTri[4] = this->vert[3];
                    nextTri[5] = this->adjExtraVert;
                }
                else
                {
                    nextTri[0] = this->vert[2];
                    nextTri[1] = this->adjExtraVert;
                    nextTri[2] = this->vert[3];
                    nextTri[4] = this->vert[4];
                    nextTri[5] = this->vert[0];
                }
                for (uint32_t i = 0; i < 6; ++i)
                {
                    this->vert[i] = nextTri[i];
                }
                this->reverseWinding ^= 1;

                this->adjExtraVert = index;
                this->curIndex--;
            }
            break;
        }
    }

    void ProcessVertTriList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 3)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }

    void ProcessVertTriListAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 6)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];
            this->indices[4][this->numPrimsAssembled] = this->vert[4];
            this->indices[5][this->numPrimsAssembled] = this->vert[5];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }

    void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 6)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];
            this->indices[2][this->numPrimsAssembled] = this->vert[4];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }
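    // Illustrative note: without a GS, the *NoGs variants bin only the interior
    // vertices of an adjacency primitive, e.g. tri-list-adj verts {0..5} reduce to
    // the actual triangle (0, 2, 4) and line-list-adj verts {0..3} to the line (1, 2).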
    void ProcessVertLineList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 2)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertLineStrip(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 2)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0]  = this->vert[1];
            this->curIndex = 1;
        }
    }

    void ProcessVertLineStripAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0]  = this->vert[1];
            this->vert[1]  = this->vert[2];
            this->vert[2]  = this->vert[3];
            this->curIndex = 3;
        }
    }

    void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[1];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0]  = this->vert[1];
            this->vert[1]  = this->vert[2];
            this->vert[2]  = this->vert[3];
            this->curIndex = 3;
        }
    }

    void ProcessVertLineListAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[1];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertPointList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 1)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertRectList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 3)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];

            // second triangle in the rectangle
            // v1, v3 = v1 + v2 - v0, v2
            this->indices[0][this->numPrimsAssembled + 1] = this->vert[1];
            this->indices[1][this->numPrimsAssembled + 1] = this->vert[0];
            this->indices[2][this->numPrimsAssembled + 1] = this->vert[2];

            // increment numPrimsAssembled
            this->numPrimsAssembled += 2;

            // set up next prim state
            this->curIndex = 0;
        }
    }
};
// Primitive Assembly for data output from the DomainShader.
struct PA_TESS : PA_STATE
{
    PA_TESS(DRAW_CONTEXT*      in_pDC,
            const SIMDSCALAR*  in_pVertData,
            uint32_t           in_attributeStrideInVectors,
            uint32_t           in_vertexStride,
            uint32_t           in_numAttributes,
            uint32_t* (&in_ppIndices)[3],
            uint32_t           in_numPrims,
            PRIMITIVE_TOPOLOGY in_binTopology,
            uint32_t           numVertsPerPrim) :
        PA_STATE(in_pDC, nullptr, 0, in_vertexStride, numVertsPerPrim),
        m_pVertexData(in_pVertData), m_attributeStrideInVectors(in_attributeStrideInVectors),
        m_numAttributes(in_numAttributes), m_numPrims(in_numPrims)
    {
#if USE_SIMD16_FRONTEND
        m_vPrimId = _simd16_setzero_si();
#else
        m_vPrimId = _simd_setzero_si();
#endif
        binTopology    = in_binTopology;
        m_ppIndices[0] = in_ppIndices[0];
        m_ppIndices[1] = in_ppIndices[1];
        m_ppIndices[2] = in_ppIndices[2];

        switch (binTopology)
        {
        case TOP_POINT_LIST:
            m_numVertsPerPrim = 1;
            break;
        case TOP_LINE_LIST:
            m_numVertsPerPrim = 2;
            break;
        case TOP_TRIANGLE_LIST:
            m_numVertsPerPrim = 3;
            break;
        default:
            SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
            break;
        }
    }

    bool HasWork() { return m_numPrims != 0; }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector_simd16;
    }

#endif
    static SIMDSCALARI GenPrimMask(uint32_t numPrims)
    {
        SWR_ASSERT(numPrims <= SIMD_WIDTH);
#if USE_SIMD16_FRONTEND
        static const OSALIGNLINE(int32_t)
            maskGen[SIMD_WIDTH * 2] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                       -1, -1, -1, -1, -1, 0,  0,  0,  0,  0,  0,
                                       0,  0,  0,  0,  0,  0,  0,  0,  0,  0};

        return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#else
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] = {-1, -1, -1, -1, -1, -1, -1, -1,
                                                                     0,  0,  0,  0,  0,  0,  0,  0};

        return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#endif
    }
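    // Worked example (illustrative, non-simd16 build, SIMD_WIDTH == 8): GenPrimMask(3)
    // loads from &maskGen[8 - 3], yielding {-1, -1, -1, 0, 0, 0, 0, 0}, i.e. gather
    // lanes 0..2 enabled and the tail lanes masked off.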
    bool Assemble(uint32_t slot, simdvector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib =
            (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif
            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp =
                    _simd16_mask_i32gather_ps(_simd16_setzero_ps(),
                                              pBase,
                                              indices,
                                              _simd16_castsi_ps(mask),
                                              4 /* gcc doesn't like sizeof(float) */);

                verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1)
                                                   : _simd16_extract_ps(temp, 0);
#else
                verts[i].v[c] = _simd_mask_i32gather_ps(_simd_setzero_ps(),
                                                        pBase,
                                                        indices,
                                                        _simd_castsi_ps(mask),
                                                        4); // gcc doesn't like sizeof(float)
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    bool Assemble(uint32_t slot, simd16vector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib =
            (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif
            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[i].v[c] =
                    _simd16_mask_i32gather_ps(_simd16_setzero_ps(),
                                              pBase,
                                              indices,
                                              _simd16_castsi_ps(mask),
                                              4 /* gcc doesn't like sizeof(float) */);
#else
                simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(),
                                                          pBase,
                                                          indices,
                                                          _simd_castsi_ps(mask),
                                                          4 /* gcc doesn't like sizeof(float) */);

                verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);
        SWR_ASSERT(primIndex < PA_TESS::NumPrims());

        const float* pVertDataBase =
            (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2]
                                                : m_ppIndices[i][primIndex];
#else
            uint32_t index = m_ppIndices[i][primIndex];
#endif
            const float* pVertData = pVertDataBase;
            float*       pVert     = (float*)&verts[i];

            for (uint32_t c = 0; c < 4; ++c)
            {
                pVert[c] = pVertData[index];
                pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }
    }
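    // Layout sketch (illustrative): tess vertex data is stored attribute-major SoA.
    // Each attribute slot spans m_attributeStrideInVectors simd vectors per component,
    // so component c of lane 'index' for attribute 'slot' sits at
    //   slot * m_attributeStrideInVectors * 4 vectors from m_pVertexData, plus
    //   c * m_attributeStrideInVectors * SIMD_WIDTH + index floats.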
    bool NextPrim()
    {
        uint32_t numPrims = PA_TESS::NumPrims();

        m_numPrims -= numPrims;
        m_ppIndices[0] += numPrims;
        m_ppIndices[1] += numPrims;
        m_ppIndices[2] += numPrims;

        return HasWork();
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        SWR_NOT_IMPL;
        return junkVertex;
    }

    bool GetNextStreamOutput()
    {
        SWR_NOT_IMPL;
        return false;
    }

    SIMDMASK& GetNextVsIndices()
    {
        SWR_NOT_IMPL;
        return junkIndices;
    }

    uint32_t NumPrims() { return std::min<uint32_t>(m_numPrims, SIMD_WIDTH); }

    void Reset() { SWR_NOT_IMPL; }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
#endif
    }

private:
    const SIMDSCALAR* m_pVertexData              = nullptr;
    uint32_t          m_attributeStrideInVectors = 0;
    uint32_t          m_numAttributes            = 0;
    uint32_t          m_numPrims                 = 0;
    uint32_t*         m_ppIndices[3];

    uint32_t m_numVertsPerPrim = 0;

    SIMDSCALARI m_vPrimId;

    simdvector junkVector; // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
#endif
    SIMDVERTEX junkVertex;  // junk SIMDVERTEX for unimplemented API
    SIMDMASK   junkIndices; // temporary index store for unused virtual function
};

// Primitive Assembler factory class, responsible for creating and initializing the correct
// assembler based on state.
template <typename IsIndexedT, typename IsCutIndexEnabledT>
struct PA_FACTORY
{
    PA_FACTORY(DRAW_CONTEXT*         pDC,
               PRIMITIVE_TOPOLOGY    in_topo,
               uint32_t              numVerts,
               PA_STATE::SIMDVERTEX* pVertexStore,
               uint32_t              vertexStoreSize,
               uint32_t              vertexStride,
               uint32_t              numVertsPerPrim) :
        topo(in_topo)
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        const API_STATE& state = GetApiState(pDC);
        if ((IsIndexedT::value && IsCutIndexEnabledT::value &&
             (topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST || topo == TOP_LINE_LIST ||
              topo == TOP_LINE_STRIP || topo == TOP_TRIANGLE_LIST)) ||

            // non-indexed draws with adjacency topologies must use cut-aware PA until we add
            // support for them in the optimized PA
            (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ ||
             topo == TOP_TRI_STRIP_ADJ))
        {
            memset(&indexStore, 0, sizeof(indexStore));
            uint32_t numAttribs = state.feNumAttributes;

            new (&this->paCut) PA_STATE_CUT(pDC,
                                            reinterpret_cast<uint8_t*>(pVertexStore),
                                            vertexStoreSize * PA_STATE::SIMD_WIDTH,
                                            vertexStride,
                                            &this->indexStore[0],
                                            numVerts,
                                            numAttribs,
                                            state.topology,
                                            false,
                                            numVertsPerPrim);
            cutPA = true;
        }
        else
#endif
        {
            uint32_t numPrims = GetNumPrims(in_topo, numVerts);
            new (&this->paOpt) PA_STATE_OPT(pDC,
                                            numPrims,
                                            reinterpret_cast<uint8_t*>(pVertexStore),
                                            vertexStoreSize * PA_STATE::SIMD_WIDTH,
                                            vertexStride,
                                            false,
                                            numVertsPerPrim);
            cutPA = false;
        }
    }

    PA_STATE& GetPA()
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        if (cutPA)
        {
            return this->paCut;
        }
        else
#endif
        {
            return this->paOpt;
        }
    }

    PA_STATE_OPT paOpt;
    PA_STATE_CUT paCut;

    bool cutPA{false};

    PRIMITIVE_TOPOLOGY topo{TOP_UNKNOWN};

    PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
};
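// A minimal usage sketch (illustrative only, not part of the original header).
// IsIndexedT/IsCutIndexEnabledT are types exposing a static bool 'value', as used by
// the constructor above; BuildPaSketch and its parameter names are assumptions.
#if 0
template <typename IsIndexedT, typename IsCutIndexEnabledT>
void BuildPaSketch(DRAW_CONTEXT*         pDC,
                   const API_STATE&      state,
                   uint32_t              numVerts,
                   PA_STATE::SIMDVERTEX* pVertexStore,
                   uint32_t              vertexStoreSize,
                   uint32_t              vertexStride,
                   uint32_t              numVertsPerPrim)
{
    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(
        pDC, state.topology, numVerts, pVertexStore, vertexStoreSize, vertexStride,
        numVertsPerPrim);

    // returns the cut-aware PA for cut-index or adjacency draws, else the optimized PA
    PA_STATE& pa = paFactory.GetPA();
    // ... drive 'pa' as in the sketch near the top of this file ...
}
#endif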