"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp" (16 Sep 2020, 14736 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "streamout_jit.cpp", see the Fossies "Dox" file reference documentation.

    1 /****************************************************************************
    2  * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
    3  *
    4  * Permission is hereby granted, free of charge, to any person obtaining a
    5  * copy of this software and associated documentation files (the "Software"),
    6  * to deal in the Software without restriction, including without limitation
    7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
    8  * and/or sell copies of the Software, and to permit persons to whom the
    9  * Software is furnished to do so, subject to the following conditions:
   10  *
   11  * The above copyright notice and this permission notice (including the next
   12  * paragraph) shall be included in all copies or substantial portions of the
   13  * Software.
   14  *
   15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
   18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
   20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
   21  * IN THE SOFTWARE.
   22  *
   23  * @file streamout_jit.cpp
   24  *
   25  * @brief Implementation of the streamout jitter
   26  *
   27  * Notes:
   28  *
   29  ******************************************************************************/
   30 #include "jit_pch.hpp"
   31 #include "builder_gfx_mem.h"
   32 #include "jit_api.h"
   33 #include "streamout_jit.h"
   34 #include "gen_state_llvm.h"
   35 #include "functionpasses/passes.h"
   36 
   37 using namespace llvm;
   38 using namespace SwrJit;
   39 
   40 //////////////////////////////////////////////////////////////////////////
   41 /// Interface to Jitting a fetch shader
   42 //////////////////////////////////////////////////////////////////////////
   43 struct StreamOutJit : public BuilderGfxMem
   44 {
   45     StreamOutJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr){};
   46 
   47     // returns pointer to SWR_STREAMOUT_BUFFER
   48     Value* getSOBuffer(Value* pSoCtx, uint32_t buffer)
   49     {
   50         return LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer});
   51     }
   52 
   53     //////////////////////////////////////////////////////////////////////////
   54     // @brief checks if streamout buffer is oob
   55     // @return <i1> true/false
   56     Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer)
   57     {
   58         Value* returnMask = C(false);
   59 
   60         Value* pBuf = getSOBuffer(pSoCtx, buffer);
   61 
   62         // load enable
   63         // @todo bool data types should generate <i1> llvm type
   64         Value* enabled = TRUNC(LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_enable}), IRB()->getInt1Ty());
   65 
   66         // load buffer size
   67         Value* bufferSize = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_bufferSize});
   68 
   69         // load current streamOffset
   70         Value* streamOffset = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});
   71 
   72         // load buffer pitch
   73         Value* pitch = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pitch});
   74 
   75         // buffer is considered oob if in use in a decl but not enabled
   76         returnMask = OR(returnMask, NOT(enabled));
   77 
   78         // buffer is oob if cannot fit a prims worth of verts
   79         Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim)));
   80         returnMask       = OR(returnMask, ICMP_SGT(newOffset, bufferSize));
   81 
   82         return returnMask;
   83     }
   84 
   85     //////////////////////////////////////////////////////////////////////////
   86     // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector,
   87     //        packing the active mask bits
   88     //        ex. bitmask 0011 -> (0, 1, 0, 0)
   89     //            bitmask 1000 -> (3, 0, 0, 0)
   90     //            bitmask 1100 -> (2, 3, 0, 0)
   91     Value* PackMask(uint32_t bitmask)
   92     {
   93         std::vector<Constant*> indices(4, C(0));
   94         DWORD                  index;
   95         uint32_t               elem = 0;
   96         while (_BitScanForward(&index, bitmask))
   97         {
   98             indices[elem++] = C((int)index);
   99             bitmask &= ~(1 << index);
  100         }
  101 
  102         return ConstantVector::get(indices);
  103     }
  104 
  105     //////////////////////////////////////////////////////////////////////////
  106     // @brief convert scalar bitmask to <4xfloat> bitmask
  107     Value* ToMask(uint32_t bitmask)
  108     {
  109         std::vector<Constant*> indices;
  110         for (uint32_t i = 0; i < 4; ++i)
  111         {
  112             if (bitmask & (1 << i))
  113             {
  114                 indices.push_back(C(true));
  115             }
  116             else
  117             {
  118                 indices.push_back(C(false));
  119             }
  120         }
  121         return ConstantVector::get(indices);
  122     }
  123 
  124     //////////////////////////////////////////////////////////////////////////
  125     // @brief processes a single decl from the streamout stream. Reads 4 components from the input
  126     //        stream and writes N components to the output buffer given the componentMask or if
  127     //        a hole, just increments the buffer pointer
  128     // @param pStream - pointer to current attribute
  129     // @param pOutBuffers - pointers to the current location of each output buffer
  130     // @param decl - input decl
  131     void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl)
  132     {
  133         uint32_t numComponents = _mm_popcnt_u32(decl.componentMask);
  134         uint32_t packedMask    = (1 << numComponents) - 1;
  135         if (!decl.hole)
  136         {
  137             // increment stream pointer to correct slot
  138             Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot));
  139 
  140             // load 4 components from stream
  141             Type* simd4Ty    = VectorType::get(IRB()->getFloatTy(), 4);
  142             Type* simd4PtrTy = PointerType::get(simd4Ty, 0);
  143             pAttrib          = BITCAST(pAttrib, simd4PtrTy);
  144             Value* vattrib   = LOAD(pAttrib);
  145 
  146             // shuffle/pack enabled components
  147             Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask));
  148 
  149             // store to output buffer
  150             // cast SO buffer to i8*, needed by maskstore
  151             Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(simd4Ty, 0));
  152 
  153             // cast input to <4xfloat>
  154             Value* src = BITCAST(vpackedAttrib, simd4Ty);
  155 
  156             // cast mask to <4xi1>
  157             Value* mask = ToMask(packedMask);
  158             MASKED_STORE(src, pOut, 4, mask, PointerType::get(simd4Ty, 0), MEM_CLIENT::GFX_MEM_CLIENT_STREAMOUT);
  159         }
  160 
  161         // increment SO buffer
  162         pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents));
  163     }
  164 
  165     //////////////////////////////////////////////////////////////////////////
  166     // @brief builds a single vertex worth of data for the given stream
  167     // @param streamState - state for this stream
  168     // @param pCurVertex - pointer to src stream vertex data
  169     // @param pOutBuffer - pointers to up to 4 SO buffers
  170     void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4])
  171     {
  172         for (uint32_t d = 0; d < streamState.numDecls; ++d)
  173         {
  174             const STREAMOUT_DECL& decl = streamState.decl[d];
  175             buildDecl(pCurVertex, pOutBuffer, decl);
  176         }
  177     }
  178 
  179     void buildStream(const STREAMOUT_COMPILE_STATE& state,
  180                      const STREAMOUT_STREAM&        streamState,
  181                      Value*                         pSoCtx,
  182                      BasicBlock*                    returnBB,
  183                      Function*                      soFunc)
  184     {
  185         // get list of active SO buffers
  186         std::unordered_set<uint32_t> activeSOBuffers;
  187         for (uint32_t d = 0; d < streamState.numDecls; ++d)
  188         {
  189             const STREAMOUT_DECL& decl = streamState.decl[d];
  190             activeSOBuffers.insert(decl.bufferIndex);
  191         }
  192 
  193         // always increment numPrimStorageNeeded
  194         Value* numPrimStorageNeeded = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded});
  195         numPrimStorageNeeded        = ADD(numPrimStorageNeeded, C(1));
  196         STORE(numPrimStorageNeeded, pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded});
  197 
  198         // check OOB on active SO buffers.  If any buffer is out of bound, don't write
  199         // the primitive to any buffer
  200         Value* oobMask = C(false);
  201         for (uint32_t buffer : activeSOBuffers)
  202         {
  203             oobMask = OR(oobMask, oob(state, pSoCtx, buffer));
  204         }
  205 
  206         BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc);
  207 
  208         // early out if OOB
  209         COND_BR(oobMask, returnBB, validBB);
  210 
  211         IRB()->SetInsertPoint(validBB);
  212 
  213         Value* numPrimsWritten = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimsWritten});
  214         numPrimsWritten        = ADD(numPrimsWritten, C(1));
  215         STORE(numPrimsWritten, pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimsWritten});
  216 
  217         // compute start pointer for each output buffer
  218         Value* pOutBuffer[4];
  219         Value* pOutBufferStartVertex[4];
  220         Value* outBufferPitch[4];
  221         for (uint32_t b : activeSOBuffers)
  222         {
  223             Value* pBuf              = getSOBuffer(pSoCtx, b);
  224             Value* pData             = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pBuffer});
  225             Value* streamOffset      = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});
  226             pOutBuffer[b] = GEP(pData, streamOffset, PointerType::get(IRB()->getInt32Ty(), 0));
  227             pOutBufferStartVertex[b] = pOutBuffer[b];
  228 
  229             outBufferPitch[b] = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pitch});
  230         }
  231 
  232         // loop over the vertices of the prim
  233         Value* pStreamData = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_pPrimData});
  234         for (uint32_t v = 0; v < state.numVertsPerPrim; ++v)
  235         {
  236             buildVertex(streamState, pStreamData, pOutBuffer);
  237 
  238             // increment stream and output buffer pointers
  239             // stream verts are always 32*4 dwords apart
  240             pStreamData = GEP(pStreamData, C(SWR_VTX_NUM_SLOTS * 4));
  241 
  242             // output buffers offset using pitch in buffer state
  243             for (uint32_t b : activeSOBuffers)
  244             {
  245                 pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]);
  246                 pOutBuffer[b]            = pOutBufferStartVertex[b];
  247             }
  248         }
  249 
  250         // update each active buffer's streamOffset
  251         for (uint32_t b : activeSOBuffers)
  252         {
  253             Value* pBuf         = getSOBuffer(pSoCtx, b);
  254             Value* streamOffset = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});
  255             streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b]));
  256             STORE(streamOffset, pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});
  257         }
  258     }
  259 
  260     Function* Create(const STREAMOUT_COMPILE_STATE& state)
  261     {
  262         std::stringstream fnName("SO_",
  263                                  std::ios_base::in | std::ios_base::out | std::ios_base::ate);
  264         fnName << ComputeCRC(0, &state, sizeof(state));
  265 
  266         std::vector<Type*> args{
  267             mInt8PtrTy,
  268             mInt8PtrTy,
  269             PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
  270         };
  271 
  272         FunctionType* fTy    = FunctionType::get(IRB()->getVoidTy(), args, false);
  273         Function*     soFunc = Function::Create(
  274             fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
  275 
  276         soFunc->getParent()->setModuleIdentifier(soFunc->getName());
  277 
  278         // create return basic block
  279         BasicBlock* entry    = BasicBlock::Create(JM()->mContext, "entry", soFunc);
  280         BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc);
  281 
  282         IRB()->SetInsertPoint(entry);
  283 
  284         // arguments
  285         auto   argitr = soFunc->arg_begin();
  286 
  287         Value* privateContext = &*argitr++;
  288         privateContext->setName("privateContext");
  289         SetPrivateContext(privateContext);
  290 
  291         mpWorkerData = &*argitr;
  292         ++argitr;
  293         mpWorkerData->setName("pWorkerData");
  294 
  295         Value* pSoCtx = &*argitr++;
  296         pSoCtx->setName("pSoCtx");
  297 
  298         const STREAMOUT_STREAM& streamState = state.stream;
  299         buildStream(state, streamState, pSoCtx, returnBB, soFunc);
  300 
  301         BR(returnBB);
  302 
  303         IRB()->SetInsertPoint(returnBB);
  304         RET_VOID();
  305 
  306         JitManager::DumpToFile(soFunc, "SoFunc");
  307 
  308         ::FunctionPassManager passes(JM()->mpCurrentModule);
  309 
  310         passes.add(createBreakCriticalEdgesPass());
  311         passes.add(createCFGSimplificationPass());
  312         passes.add(createEarlyCSEPass());
  313         passes.add(createPromoteMemoryToRegisterPass());
  314         passes.add(createCFGSimplificationPass());
  315         passes.add(createEarlyCSEPass());
  316         passes.add(createInstructionCombiningPass());
  317         passes.add(createConstantPropagationPass());
  318         passes.add(createSCCPPass());
  319         passes.add(createAggressiveDCEPass());
  320 
  321         passes.add(createLowerX86Pass(this));
  322 
  323         passes.run(*soFunc);
  324 
  325         JitManager::DumpToFile(soFunc, "SoFunc_optimized");
  326 
  327 
  328         return soFunc;
  329     }
  330 };
  331 
  332 //////////////////////////////////////////////////////////////////////////
  333 /// @brief JITs from streamout shader IR
  334 /// @param hJitMgr - JitManager handle
  335 /// @param func   - LLVM function IR
  336 /// @return PFN_SO_FUNC - pointer to SOS function
  337 PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc)
  338 {
  339     llvm::Function* func    = (llvm::Function*)hFunc;
  340     JitManager*     pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
  341     PFN_SO_FUNC     pfnStreamOut;
  342     pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
  343     // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
  344     // add new IR to the module
  345     pJitMgr->mIsModuleFinalized = true;
  346 
  347     pJitMgr->DumpAsm(func, "SoFunc_optimized");
  348 
  349 
  350     return pfnStreamOut;
  351 }
  352 
  353 //////////////////////////////////////////////////////////////////////////
  354 /// @brief JIT compiles streamout shader
  355 /// @param hJitMgr - JitManager handle
  356 /// @param state   - SO state to build function from
  357 extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE                         hJitMgr,
  358                                                    const STREAMOUT_COMPILE_STATE& state)
  359 {
  360     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
  361 
  362     STREAMOUT_COMPILE_STATE soState = state;
  363     if (soState.offsetAttribs)
  364     {
  365         for (uint32_t i = 0; i < soState.stream.numDecls; ++i)
  366         {
  367             soState.stream.decl[i].attribSlot -= soState.offsetAttribs;
  368         }
  369     }
  370 
  371     pJitMgr->SetupNewModule();
  372 
  373     StreamOutJit theJit(pJitMgr);
  374     HANDLE       hFunc = theJit.Create(soState);
  375 
  376     return JitStreamoutFunc(hJitMgr, hFunc);
  377 }