"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/gallium/drivers/swr/rasterizer/core/threads.cpp" (16 Sep 2020, 48750 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "threads.cpp" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 20.1.7_vs_20.1.8.

/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 ****************************************************************************/

#include <stdio.h>
#include <thread>
#include <algorithm>
#include <float.h>
#include <vector>
#include <utility>
#include <fstream>
#include <string>

#if defined(__linux__) || defined(__gnu_linux__) || defined(__APPLE__)
#include <pthread.h>
#include <sched.h>
#include <unistd.h>
#endif

#ifdef __APPLE__
#include <sys/types.h>
#include <sys/sysctl.h>
#endif

#include "common/os.h"
#include "core/api.h"
#include "context.h"
#include "frontend.h"
#include "backend.h"
#include "rasterizer.h"
#include "rdtsc_core.h"
#include "tilemgr.h"
#include "tileset.h"


// ThreadId
struct Core
{
    uint32_t              procGroup = 0;
    std::vector<uint32_t> threadIds;
};

struct NumaNode
{
    uint32_t          numaId;
    std::vector<Core> cores;
};

typedef std::vector<NumaNode> CPUNumaNodes;
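// For example, a two-socket machine with four cores per socket and two HW
// threads per core would populate two NumaNode entries, each holding four
// Core entries with two threadIds apiece (assuming a symmetric topology).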

void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
{
    out_nodes.clear();
    out_numThreadsPerProcGroup = 0;

#if defined(_WIN32)

    std::vector<KAFFINITY> threadMaskPerProcGroup;

    static std::mutex           m;
    std::lock_guard<std::mutex> l(m);

    DWORD bufSize = 0;

    BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
    SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);

    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem =
        (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
    SWR_ASSERT(pBufferMem);

    ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
    SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");

    uint32_t                                 count   = bufSize / pBufferMem->Size;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;

    for (uint32_t i = 0; i < count; ++i)
    {
        SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
        for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
        {
            auto&    gmask     = pBuffer->Processor.GroupMask[g];
            uint32_t threadId  = 0;
            uint32_t procGroup = gmask.Group;

            Core* pCore = nullptr;

            while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
            {
                // clear mask
                KAFFINITY threadMask = KAFFINITY(1) << threadId;
                gmask.Mask &= ~threadMask;

                if (procGroup >= threadMaskPerProcGroup.size())
                {
                    threadMaskPerProcGroup.resize(procGroup + 1);
                }

                if (threadMaskPerProcGroup[procGroup] & threadMask)
                {
                    // Already seen this mask.  This means that we are in 32-bit mode and
                    // have seen more than 32 HW threads for this procGroup.
                    // Don't use it.
#if defined(_WIN64)
                    SWR_INVALID("Shouldn't get here in 64-bit mode");
#endif
                    continue;
                }

                threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId);

                // Find Numa Node
                uint32_t         numaId  = 0;
                PROCESSOR_NUMBER procNum = {};
                procNum.Group            = WORD(procGroup);
                procNum.Number           = UCHAR(threadId);

                ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
                SWR_ASSERT(ret);

                // Store data
                if (out_nodes.size() <= numaId)
                {
                    out_nodes.resize(numaId + 1);
                }
                auto& numaNode  = out_nodes[numaId];
                numaNode.numaId = numaId;

                if (nullptr == pCore)
                {
                    numaNode.cores.push_back(Core());
                    pCore            = &numaNode.cores.back();
                    pCore->procGroup = procGroup;
                }
                pCore->threadIds.push_back(threadId);
                if (procGroup == 0)
                {
                    out_numThreadsPerProcGroup++;
                }
            }
        }
        pBuffer = PtrAdd(pBuffer, pBuffer->Size);
    }

    free(pBufferMem);

#elif defined(__linux__) || defined(__gnu_linux__)

    // Parse /proc/cpuinfo to get full topology
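    // Each logical CPU is described by a record such as (exact fields vary by
    // kernel and architecture):
    //   processor   : 3
    //   physical id : 0
    //   core id     : 1
    // "processor" is the logical CPU id, "physical id" the package/NUMA id,
    // and "core id" the core within that package.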
    std::ifstream input("/proc/cpuinfo");
    std::string   line;
    char*         c;
    uint32_t      procId = uint32_t(-1);
    uint32_t      coreId = uint32_t(-1);
    uint32_t      physId = uint32_t(-1);

    while (std::getline(input, line))
    {
        if (line.find("processor") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            procId          = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("core id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            coreId          = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("physical id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            physId          = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
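        // A blank line terminates one processor's record; commit the ids
        // gathered above into the topology.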
        if (line.length() == 0)
        {
            if (physId + 1 > out_nodes.size())
                out_nodes.resize(physId + 1);
            auto& numaNode  = out_nodes[physId];
            numaNode.numaId = physId;

            if (coreId + 1 > numaNode.cores.size())
                numaNode.cores.resize(coreId + 1);
            auto& core     = numaNode.cores[coreId];
            core.procGroup = coreId;
            core.threadIds.push_back(procId);
        }
    }

    out_numThreadsPerProcGroup = 0;
    for (auto& node : out_nodes)
    {
        for (auto& core : node.cores)
        {
            out_numThreadsPerProcGroup += core.threadIds.size();
        }
    }

#elif defined(__APPLE__)

    auto numProcessors  = 0;
    auto numCores       = 0;
    auto numPhysicalIds = 0;

    int    value;
    size_t size = sizeof(value);

    int result = sysctlbyname("hw.packages", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numPhysicalIds = value;

    result = sysctlbyname("hw.logicalcpu", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numProcessors = value;

    result = sysctlbyname("hw.physicalcpu", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numCores = value;

    out_nodes.resize(numPhysicalIds);

    for (auto physId = 0; physId < numPhysicalIds; ++physId)
    {
        auto& numaNode = out_nodes[physId];
        auto  procId   = 0;

        numaNode.cores.resize(numCores);

        while (procId < numProcessors)
        {
            for (auto coreId = 0; coreId < numaNode.cores.size(); ++coreId, ++procId)
            {
                auto& core = numaNode.cores[coreId];

                core.procGroup = coreId;
                core.threadIds.push_back(procId);
            }
        }
    }

    out_numThreadsPerProcGroup = 0;

    for (auto& node : out_nodes)
    {
        for (auto& core : node.cores)
        {
            out_numThreadsPerProcGroup += core.threadIds.size();
        }
    }

#else

#error Unsupported platform

#endif

    // Prune empty cores and numa nodes
    for (auto node_it = out_nodes.begin(); node_it != out_nodes.end();)
    {
        // Erase empty cores (first)
        for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end();)
        {
            if (core_it->threadIds.size() == 0)
            {
                core_it = node_it->cores.erase(core_it);
            }
            else
            {
                ++core_it;
            }
        }

        // Erase empty numa nodes (second)
        if (node_it->cores.size() == 0)
        {
            node_it = out_nodes.erase(node_it);
        }
        else
        {
            ++node_it;
        }
    }
}

void bindThread(SWR_CONTEXT* pContext,
                uint32_t     threadId,
                uint32_t     procGroupId   = 0,
                bool         bindProcGroup = false)
{
    // Only bind threads when MAX_WORKER_THREADS isn't set.
    if (pContext->threadInfo.SINGLE_THREADED ||
        (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false))
    {
        return;
    }

#if defined(_WIN32)

    GROUP_AFFINITY affinity = {};
    affinity.Group          = procGroupId;

#if !defined(_WIN64)
    if (threadId >= 32)
    {
        // Hopefully we don't get here.  Logic in CreateThreadPool should prevent this.
        SWR_INVALID("Shouldn't get here");

        // In a 32-bit process on Windows it is impossible to bind
        // to logical processors 32-63 within a processor group.
        // In this case set the mask to 0 and let the system assign
        // the processor.  Hopefully it will make smart choices.
        affinity.Mask = 0;
    }
    else
#endif
    {
        // If MAX_WORKER_THREADS is set, only bind to the proc group,
        // not the individual HW thread.
        if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS)
        {
            affinity.Mask = KAFFINITY(1) << threadId;
        }
        else
        {
            affinity.Mask = KAFFINITY(0);
        }
    }

    if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr))
    {
        SWR_INVALID("Failed to set Thread Affinity");
    }

#elif defined(__linux__) || defined(__gnu_linux__)

    cpu_set_t cpuset;
    pthread_t thread = pthread_self();
    CPU_ZERO(&cpuset);
    CPU_SET(threadId, &cpuset);
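    // On Linux, threadId is the logical CPU id gathered from /proc/cpuinfo's
    // "processor" field above, so it can be used directly as the cpu_set_t index.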

    int err = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
    if (err != 0)
    {
        fprintf(stderr, "pthread_setaffinity_np failure for tid %u: %s\n", threadId, strerror(err));
    }

#endif
}

INLINE
uint32_t GetEnqueuedDraw(SWR_CONTEXT* pContext)
{
    return pContext->dcRing.GetHead();
}

INLINE
DRAW_CONTEXT* GetDC(SWR_CONTEXT* pContext, uint32_t drawId)
{
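    // drawId appears to be 1-based here: drawId 1 maps to ring slot 0.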
    return &pContext->dcRing[(drawId - 1) % pContext->MAX_DRAWS_IN_FLIGHT];
}

INLINE
bool IDComparesLess(uint32_t a, uint32_t b)
{
    // Use signed delta to ensure that wrap-around to 0 is correctly handled.
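    // E.g. with a == 2 and b == 0xFFFFFFFE, int32_t(a - b) == 4, so a compares
    // greater: a is treated as two draws *after* b across the wrap-around.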
    int32_t delta = int32_t(a - b);
    return (delta < 0);
}

// returns true if dependency not met
INLINE
bool CheckDependency(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw)
{
    return pDC->dependent && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
}

bool CheckDependencyFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw)
{
    return pDC->dependentFE && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Update client stats.
INLINE void UpdateClientStats(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    if ((pContext->pfnUpdateStats == nullptr) || (GetApiState(pDC).enableStatsBE == false))
    {
        return;
    }

    DRAW_DYNAMIC_STATE& dynState = pDC->dynState;
    OSALIGNLINE(SWR_STATS) stats{0};

    // Sum up stats across all workers before sending to client.
    for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
    {
        stats.DepthPassCount += dynState.pStats[i].DepthPassCount;
        stats.PsInvocations += dynState.pStats[i].PsInvocations;
        stats.CsInvocations += dynState.pStats[i].CsInvocations;
    }

    pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
}

INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    UpdateClientStats(pContext, workerId, pDC);

    if (pDC->retireCallback.pfnCallbackFunc)
    {
        pDC->retireCallback.pfnCallbackFunc(pDC->retireCallback.userData,
                                            pDC->retireCallback.userData2,
                                            pDC->retireCallback.userData3);

        // Callbacks to external code *could* change floating point control state.
        // Reset our optimal flags.
        SetOptimalVectorCSR();
    }
}

// inlined-only version
INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    int32_t result = static_cast<int32_t>(InterlockedDecrement(&pDC->threadsDone));
    SWR_ASSERT(result >= 0);

    AR_FLUSH(pDC->drawId);

    if (result == 0)
    {
        ExecuteCallbacks(pContext, workerId, pDC);

        // Cleanup memory allocations
        pDC->pArena->Reset(true);
        if (!pDC->isCompute)
        {
            pDC->pTileMgr->initialize();
        }
        if (pDC->cleanupState)
        {
            pDC->pState->pArena->Reset(true);
        }

        _ReadWriteBarrier();

        pContext->dcRing.Dequeue(); // Remove from tail
    }

    return result;
}

// available to other translation modules
int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
{
    return CompleteDrawContextInl(pContext, 0, pDC);
}

INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext,
                                    uint32_t     workerId,
                                    uint32_t&    curDrawBE,
                                    uint32_t&    drawEnqueued)
{
    // increment our current draw id to the first incomplete draw
    drawEnqueued = GetEnqueuedDraw(pContext);
    while (IDComparesLess(curDrawBE, drawEnqueued))
    {
        DRAW_CONTEXT* pDC = &pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT];
        // If it's not compute and FE is not done then break out of the loop.
        if (!pDC->doneFE && !pDC->isCompute)
            break;

        bool isWorkComplete =
            pDC->isCompute ? pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete();

        if (isWorkComplete)
        {
            curDrawBE++;
            CompleteDrawContextInl(pContext, workerId, pDC);
        }
        else
        {
            break;
        }
    }

    // If there are no more incomplete draws then return false.
    return IDComparesLess(curDrawBE, drawEnqueued);
}

//////////////////////////////////////////////////////////////////////////
/// @brief If there is any BE work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker
///                    thread has its own curDrawBE counter and this ensures that each worker
///                    processes all the draws in order.
/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains
///                      its own set, and each time it fails to lock a macrotile because it's
///                      already locked, it adds that tile to the lockedTiles set. As a worker
///                      begins to work on future draws, the lockedTiles set ensures that it
///                      doesn't work on tiles that may still have work pending in a previous
///                      draw. Additionally, the lockedTiles set is a heuristic that can steer
///                      a worker back to the same macrotile that it had been working on in a
///                      previous draw.
/// @returns        true if worker thread should shut down
bool WorkOnFifoBE(SWR_CONTEXT* pContext,
                  uint32_t     workerId,
                  uint32_t&    curDrawBE,
                  TileSet&     lockedTiles,
                  uint32_t     numaNode,
                  uint32_t     numaMask)
{
    bool bShutdown = false;

    // Find the first incomplete draw that has pending work. If no such draw is found then
    // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
    uint32_t drawEnqueued = 0;
    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
    {
        return false;
    }

    uint32_t lastRetiredDraw =
        pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;

    // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
    lockedTiles.clear();

    // Try to work on each draw in order of the available draws in flight.
    //   1. If we're on curDrawBE, we can work on any macrotile that is available.
    //   2. If we're trying to work on draws after curDrawBE, we are restricted to
    //      working on those macrotiles that are known to be complete in the prior draw,
    //      to maintain order. The locked tile set provides the history to ensure this.
    for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
    {
        DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];

        if (pDC->isCompute)
            return false; // We don't look at compute work.

        // First wait for FE to be finished with this draw. This keeps the threading model
        // simple, but if there are lots of bubbles between draws then serializing FE and BE
        // may need to be revisited.
        if (!pDC->doneFE)
            return false;

        // If this draw is dependent on a previous draw then we need to bail.
        if (CheckDependency(pContext, pDC, lastRetiredDraw))
        {
            return false;
        }

        // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
        auto& macroTiles = pDC->pTileMgr->getDirtyTiles();

        for (auto tile : macroTiles)
        {
            uint32_t tileID = tile->mId;

            // Only work on tiles for this numa node
            uint32_t x, y;
            pDC->pTileMgr->getTileIndices(tileID, x, y);
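            // (x ^ y) & numaMask spreads macrotiles across NUMA nodes in a
            // checkerboard pattern. numaMask is numNodes - 1 (see CreateThreadPool),
            // so this only works for power-of-two node counts; the intent is to
            // keep each tile's backend work on a single node.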
            if (((x ^ y) & numaMask) != numaNode)
            {
                _mm_pause();
                continue;
            }

            if (!tile->getNumQueued())
            {
                _mm_pause();
                continue;
            }

            // can only work on this draw if it's not in use by other threads
            if (lockedTiles.get(tileID))
            {
                _mm_pause();
                continue;
            }

            if (tile->tryLock())
            {
                BE_WORK* pWork;

                RDTSC_BEGIN(pContext->pBucketMgr, WorkerFoundWork, pDC->drawId);

                uint32_t numWorkItems = tile->getNumQueued();
                SWR_ASSERT(numWorkItems);

                pWork = tile->peek();
                SWR_ASSERT(pWork);
                if (pWork->type == DRAW)
                {
                    pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID);
                }
                else if (pWork->type == SHUTDOWN)
                {
                    bShutdown = true;
                }

                while ((pWork = tile->peek()) != nullptr)
                {
                    pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
                    tile->dequeue();
                }
                RDTSC_END(pContext->pBucketMgr, WorkerFoundWork, numWorkItems);

                _ReadWriteBarrier();

                pDC->pTileMgr->markTileComplete(tileID);

                // Optimization: If the draw is complete and we're the last one to have worked on it
                // then we can reset the locked list as we know that all previous draws before the
                // next are guaranteed to be complete.
                if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete()))
                {
                    // We can increment the current BE and safely move to next draw since we know
                    // this draw is complete.
                    curDrawBE++;
                    CompleteDrawContextInl(pContext, workerId, pDC);

                    lastRetiredDraw++;

                    lockedTiles.clear();
                    break;
                }

                if (bShutdown)
                {
                    break;
                }
            }
            else
            {
                // This tile is already locked. So let's add it to our locked tiles set. This way we
                // don't try locking this one again.
                lockedTiles.set(tileID);
                _mm_pause();
            }
        }
    }

    return bShutdown;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Called when FE work is complete for this DC.
INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStatsFE)
    {
        SWR_STATS_FE& stats = pDC->dynState.statsFE;

        AR_EVENT(FrontendStatsEvent(pDC->drawId,
                                    stats.IaVertices,
                                    stats.IaPrimitives,
                                    stats.VsInvocations,
                                    stats.HsInvocations,
                                    stats.DsInvocations,
                                    stats.GsInvocations,
                                    stats.GsPrimitives,
                                    stats.CInvocations,
                                    stats.CPrimitives,
                                    stats.SoPrimStorageNeeded[0],
                                    stats.SoPrimStorageNeeded[1],
                                    stats.SoPrimStorageNeeded[2],
                                    stats.SoPrimStorageNeeded[3],
                                    stats.SoNumPrimsWritten[0],
                                    stats.SoNumPrimsWritten[1],
                                    stats.SoNumPrimsWritten[2],
                                    stats.SoNumPrimsWritten[3]));
        AR_EVENT(FrontendDrawEndEvent(pDC->drawId));

        pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats);
    }

    if (pContext->pfnUpdateSoWriteOffset)
    {
        for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i)
        {
            if ((pDC->dynState.SoWriteOffsetDirty[i]) &&
                (pDC->pState->state.soBuffer[i].soWriteEnable))
            {
                pContext->pfnUpdateSoWriteOffset(
                    GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]);
            }
        }
    }

    if (pContext->pfnUpdateStreamOut)
        pContext->pfnUpdateStreamOut(GetPrivateState(pDC), pDC->dynState.soPrims);

    // Ensure all streaming writes are globally visible before marking this FE done
    _mm_mfence();
    pDC->doneFE = true;

    InterlockedDecrement(&pContext->drawsOutstandingFE);
}

void WorkOnFifoFE(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawFE)
{
    // Try to grab the next DC from the ring
    uint32_t drawEnqueued = GetEnqueuedDraw(pContext);
    while (IDComparesLess(curDrawFE, drawEnqueued))
    {
        uint32_t      dcSlot = curDrawFE % pContext->MAX_DRAWS_IN_FLIGHT;
        DRAW_CONTEXT* pDC    = &pContext->dcRing[dcSlot];
        if (pDC->isCompute || pDC->doneFE)
        {
            CompleteDrawContextInl(pContext, workerId, pDC);
            curDrawFE++;
        }
        else
        {
            break;
        }
    }

    uint32_t lastRetiredFE = curDrawFE - 1;
    uint32_t curDraw       = curDrawFE;
    while (IDComparesLess(curDraw, drawEnqueued))
    {
        uint32_t      dcSlot = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
        DRAW_CONTEXT* pDC    = &pContext->dcRing[dcSlot];

        if (!pDC->FeLock && !pDC->isCompute)
        {
            if (CheckDependencyFE(pContext, pDC, lastRetiredFE))
            {
                return;
            }

            uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0);
            if (initial == 0)
            {
                // successfully grabbed the DC, now run the FE
                pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc);

                CompleteDrawFE(pContext, workerId, pDC);
            }
            else
            {
                _mm_pause();
            }
        }
        else
        {
            _mm_pause();
        }

        curDraw++;
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief If there is any compute work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker
///                    thread has its own curDrawBE counter and this ensures that each worker
///                    processes all the draws in order.
void WorkOnCompute(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE)
{
    uint32_t drawEnqueued = 0;
    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
    {
        return;
    }

    uint32_t lastRetiredDraw =
        pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;

    for (uint64_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
    {
        DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
        if (pDC->isCompute == false)
            return;

        // check dependencies
        if (CheckDependency(pContext, pDC, lastRetiredDraw))
        {
            return;
        }

        SWR_ASSERT(pDC->pDispatch != nullptr);
        DispatchQueue& queue = *pDC->pDispatch;

        // Is there any work remaining?
        if (queue.getNumQueued() > 0)
        {
            void*    pSpillFillBuffer = nullptr;
            void*    pScratchSpace    = nullptr;
            uint32_t threadGroupId    = 0;
            while (queue.getWork(threadGroupId))
            {
                queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
                queue.finishedWork();
            }

            // Ensure all streaming writes are globally visible before moving onto the next draw
            _mm_mfence();
        }
    }
}

void BindApiThread(SWR_CONTEXT* pContext, uint32_t apiThreadId)
{
    if (nullptr == pContext)
    {
        return;
    }

    if (apiThreadId >= pContext->threadPool.numReservedThreads)
    {
        if (pContext->threadPool.numReservedThreads)
        {
            const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[0];
            // Just bind to the process group used for API thread 0
            bindThread(pContext, 0, threadData.procGroupId, true);
        }
        return;
    }

    const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[apiThreadId];

    bindThread(
        pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup);
}

template <bool IsFEThread, bool IsBEThread>
DWORD workerThreadMain(LPVOID pData)
{
    THREAD_DATA* pThreadData = (THREAD_DATA*)pData;
    SWR_CONTEXT* pContext    = pThreadData->pContext;
    uint32_t     threadId    = pThreadData->threadId;
    uint32_t     workerId    = pThreadData->workerId;

    bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);

    {
        char threadName[64];
        sprintf_s(threadName,
#if defined(_WIN32)
                  "SWRWorker_%02d_NUMA%d_Core%02d_T%d",
#else
                  // linux pthread name limited to 16 chars (including \0)
                  "w%03d-n%d-c%03d-t%d",
#endif
                  workerId,
                  pThreadData->numaId,
                  pThreadData->coreId,
                  pThreadData->htId);
        SetCurrentThreadName(threadName);
    }

    RDTSC_INIT(pContext->pBucketMgr, threadId);

    // Only the NUMA node index offset from the base node is needed for correct masking
    uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
    uint32_t numaMask = pContext->threadPool.numaMask;

    SetOptimalVectorCSR();

    // Track tiles locked by other threads. If we try to lock a macrotile and find it's already
    // locked then we'll add it to this list so that we don't try to lock it again.
    TileSet lockedTiles;

    // Each worker can work on any of the queued draws as long as certain conditions are met.
    // The data associated with a draw is guaranteed to be active as long as a worker hasn't
    // signaled that it has moved on to the next draw when it determines there is no more work
    // to do. The API thread will not increment the head of the DC ring until all workers have
    // moved past the current head.
    // The logic to determine what to work on is:
    // 1- Try to work on the FE of any draw that is queued. For now there are no dependencies
    //    on the FE work, so any worker can grab any FE and process it in parallel.  Eventually
    //    we'll need dependency tracking to force serialization of FEs.  The worker will try
    //    to pick an FE by atomically incrementing a counter in the SWR context.  It will keep
    //    trying until it reaches the tail.
    // 2- BE work must be done in strict order. We accomplish this today by pulling work off
    //    the oldest draw (i.e. the head) of the dcRing. The worker can determine if there is
    //    any work left by comparing the total # of binned work items and the total # of
    //    completed work items. If they are equal, then there is no more work to do for this
    //    draw, and the worker can safely increment its oldestDraw counter and move on to the
    //    next draw.
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };

    uint32_t curDrawBE = 0;
    uint32_t curDrawFE = 0;

    bool bShutdown = false;

    while (true)
    {
        if (bShutdown && !threadHasWork(curDrawBE))
        {
            break;
        }

  929         uint32_t loop = 0;
  930         while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
  931         {
  932             _mm_pause();
  933         }
  934 
  935         if (!threadHasWork(curDrawBE))
  936         {
  937             lock.lock();
  938 
  939             // check for thread idle condition again under lock
  940             if (threadHasWork(curDrawBE))
  941             {
  942                 lock.unlock();
  943                 continue;
  944             }
  945 
  946             pContext->FifosNotEmpty.wait(lock);
  947             lock.unlock();
  948         }
  949 
  950         if (IsBEThread)
  951         {
  952             RDTSC_BEGIN(pContext->pBucketMgr, WorkerWorkOnFifoBE, 0);
  953             bShutdown |=
  954                 WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
  955             RDTSC_END(pContext->pBucketMgr, WorkerWorkOnFifoBE, 0);
  956 
  957             WorkOnCompute(pContext, workerId, curDrawBE);
  958         }
  959 
  960         if (IsFEThread)
  961         {
  962             WorkOnFifoFE(pContext, workerId, curDrawFE);
  963 
  964             if (!IsBEThread)
  965             {
  966                 curDrawBE = curDrawFE;
  967             }
  968         }
  969     }
  970 
  971     return 0;
  972 }
  973 template <>
  974 DWORD workerThreadMain<false, false>(LPVOID) = delete;
  975 
  976 template <bool IsFEThread, bool IsBEThread>
  977 DWORD workerThreadInit(LPVOID pData)
  978 {
  979 #if defined(_MSC_VER)
  980     __try
  981 #endif // _WIN32
  982     {
  983         return workerThreadMain<IsFEThread, IsBEThread>(pData);
  984     }
  985 
  986 #if defined(_MSC_VER)
  987     __except (EXCEPTION_CONTINUE_SEARCH)
  988     {
  989     }
  990 
  991 #endif // _WIN32
  992 
  993     return 1;
  994 }
  995 template <>
  996 DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
  997 
  998 static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads)
  999 {
 1000     // Initialize DRAW_CONTEXT's per-thread stats
 1001     for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
 1002     {
 1003         pContext->dcRing[dc].dynState.pStats =
 1004             (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
 1005         memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
 1006     }
 1007 }
 1008 
 1009 //////////////////////////////////////////////////////////////////////////
 1010 /// @brief Creates thread pool info but doesn't launch threads.
 1011 /// @param pContext - pointer to context
 1012 /// @param pPool - pointer to thread pool object.
 1013 void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
 1014 {
 1015     CPUNumaNodes nodes;
 1016     uint32_t     numThreadsPerProcGroup = 0;
 1017     CalculateProcessorTopology(nodes, numThreadsPerProcGroup);
 1018     assert(numThreadsPerProcGroup > 0);
 1019 
    // Assumption: for asymmetric topologies, multi-threaded cores will appear
    // in the list before single-threaded cores.  This appears to be true for
    // Windows when the total HW threads is limited to 64.
    uint32_t numHWNodes        = (uint32_t)nodes.size();
    uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
    uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();

#if defined(_WIN32) && !defined(_WIN64)
    if (!pContext->threadInfo.MAX_WORKER_THREADS)
    {
        // Limit 32-bit windows to bindable HW threads only
        if ((numHWCoresPerNode * numHWHyperThreads) > 32)
        {
            numHWCoresPerNode = 32 / numHWHyperThreads;
        }
    }
#endif

    // Calculate num HW threads.  Due to asymmetric topologies, this is not
    // a trivial multiplication.
    uint32_t numHWThreads = 0;
    for (auto const& node : nodes)
    {
        for (auto const& core : node.cores)
        {
            numHWThreads += (uint32_t)core.threadIds.size();
        }
    }

    uint32_t numNodes        = numHWNodes;
    uint32_t numCoresPerNode = numHWCoresPerNode;
    uint32_t numHyperThreads = numHWHyperThreads;

    // Calc used threads per-core
    if (numHyperThreads > pContext->threadInfo.BASE_THREAD)
    {
        numHyperThreads -= pContext->threadInfo.BASE_THREAD;
    }
    else
    {
        SWR_ASSERT(false,
                   "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0",
                   pContext->threadInfo.BASE_THREAD,
                   numHyperThreads);
        pContext->threadInfo.BASE_THREAD = 0;
    }

    if (pContext->threadInfo.MAX_THREADS_PER_CORE)
    {
        numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
    }

    // Prune any cores that don't support the number of threads
    if (numHyperThreads > 1)
    {
        for (auto& node : nodes)
        {
            uint32_t numUsableCores = 0;
            for (auto& core : node.cores)
            {
                numUsableCores += (core.threadIds.size() >= numHyperThreads);
            }
            numCoresPerNode = std::min(numCoresPerNode, numUsableCores);
        }
    }

    // Calc used cores per NUMA node
    if (numCoresPerNode > pContext->threadInfo.BASE_CORE)
    {
        numCoresPerNode -= pContext->threadInfo.BASE_CORE;
    }
    else
    {
        SWR_ASSERT(false,
                   "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0",
                   pContext->threadInfo.BASE_CORE,
                   numCoresPerNode);
        pContext->threadInfo.BASE_CORE = 0;
    }

    if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
    {
        numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
    }

    // Calc used NUMA nodes
    if (numNodes > pContext->threadInfo.BASE_NUMA_NODE)
    {
        numNodes -= pContext->threadInfo.BASE_NUMA_NODE;
    }
    else
    {
        SWR_ASSERT(
            false,
            "Cannot use BASE_NUMA_NODE value: %d, maxNodes: %d, reverting BASE_NUMA_NODE to 0",
            pContext->threadInfo.BASE_NUMA_NODE,
            numNodes);
        pContext->threadInfo.BASE_NUMA_NODE = 0;
    }

    if (pContext->threadInfo.MAX_NUMA_NODES)
    {
        numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
    }

    // Calculate numThreads - at this point everything should be symmetric
    uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
    SWR_REL_ASSERT(numThreads <= numHWThreads);

    uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads;
    uint32_t& numAPIThreadsPerCore  = pContext->apiThreadInfo.numAPIThreadsPerCore;
    uint32_t  numRemovedThreads     = 0;

    if (pContext->threadInfo.SINGLE_THREADED)
    {
        numAPIReservedThreads      = 0;
        numThreads                 = 1;
        pContext->NumWorkerThreads = 1;
        pContext->NumFEThreads     = 1;
        pContext->NumBEThreads     = 1;
        pPool->numThreads          = 0;
    }
    else if (pContext->threadInfo.MAX_WORKER_THREADS)
    {
        numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads);
        pContext->threadInfo.BASE_NUMA_NODE = 0;
        pContext->threadInfo.BASE_CORE      = 0;
        pContext->threadInfo.BASE_THREAD    = 0;
        numAPIReservedThreads               = 0;
    }
    else
    {
        if (numAPIReservedThreads >= numThreads)
        {
            numAPIReservedThreads = 0;
        }
        else if (numAPIReservedThreads)
        {
            numAPIThreadsPerCore = std::min(numAPIThreadsPerCore, numHWHyperThreads);

            if (0 == numAPIThreadsPerCore)
            {
                numAPIThreadsPerCore = numHWHyperThreads;
            }

            numRemovedThreads = numAPIReservedThreads;
            if (numAPIThreadsPerCore == 2 && numHyperThreads == 1)
            {
                // Adjust removed threads to make logic below work
                numRemovedThreads =
                    std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2);
            }

            numThreads -= numRemovedThreads;
        }
    }

    InitPerThreadStats(pContext, numThreads);

    if (pContext->threadInfo.SINGLE_THREADED)
    {
        numAPIReservedThreads = 0;
        numThreads            = 1;
    }

    if (numAPIReservedThreads)
    {
        pPool->pApiThreadData = new (std::nothrow) THREAD_DATA[numAPIReservedThreads];
        SWR_ASSERT(pPool->pApiThreadData);
        if (!pPool->pApiThreadData)
        {
            numAPIReservedThreads = 0;
        }
        else
        {
            memset(pPool->pApiThreadData, 0, sizeof(THREAD_DATA) * numAPIReservedThreads);
        }
    }
    pPool->numReservedThreads = numAPIReservedThreads;

    pPool->numThreads          = numThreads;
    pContext->NumWorkerThreads = pPool->numThreads;

    pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads];
    assert(pPool->pThreadData);
    memset(pPool->pThreadData, 0, sizeof(THREAD_DATA) * pPool->numThreads);
    pPool->numaMask = 0;

    // Allocate worker private data
    pPool->pWorkerPrivateDataArray = nullptr;
    if (pContext->workerPrivateState.perWorkerPrivateStateSize == 0)
    {
        pContext->workerPrivateState.perWorkerPrivateStateSize = sizeof(SWR_WORKER_DATA);
        pContext->workerPrivateState.pfnInitWorkerData         = nullptr;
        pContext->workerPrivateState.pfnFinishWorkerData       = nullptr;
    }

    // initialize contents of SWR_WORKER_DATA
    size_t perWorkerSize =
        AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64);
    size_t totalSize = perWorkerSize * pPool->numThreads;
    if (totalSize)
    {
        pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64);
        SWR_ASSERT(pPool->pWorkerPrivateDataArray);

        void* pWorkerData = pPool->pWorkerPrivateDataArray;
        for (uint32_t i = 0; i < pPool->numThreads; ++i)
        {
            pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
            if (pContext->workerPrivateState.pfnInitWorkerData)
            {
                pContext->workerPrivateState.pfnInitWorkerData(pContext, pWorkerData, i);
            }
            pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
        }
    }

    if (pContext->threadInfo.SINGLE_THREADED)
    {
        return;
    }

    pPool->pThreads = new (std::nothrow) THREAD_PTR[pPool->numThreads];
    assert(pPool->pThreads);

    if (pContext->threadInfo.MAX_WORKER_THREADS)
    {
        bool     bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
        uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup;
        // When MAX_WORKER_THREADS is set we don't bother to bind to specific HW threads,
        // but Windows will still require binding to specific process groups
        for (uint32_t workerId = 0; workerId < numThreads; ++workerId)
        {
            pPool->pThreadData[workerId].workerId           = workerId;
            pPool->pThreadData[workerId].procGroupId        = workerId % numProcGroups;
            pPool->pThreadData[workerId].threadId           = 0;
            pPool->pThreadData[workerId].numaId             = 0;
            pPool->pThreadData[workerId].coreId             = 0;
            pPool->pThreadData[workerId].htId               = 0;
            pPool->pThreadData[workerId].pContext           = pContext;
            pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;

            pContext->NumBEThreads++;
            pContext->NumFEThreads++;
        }
    }
    else
    {
        // numa distribution assumes workers on all nodes
        bool useNuma = true;
        if (numCoresPerNode * numHyperThreads == 1)
        {
            useNuma = false;
        }

        if (useNuma)
        {
            pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
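            // e.g. numNodes == 4 gives numaMask == 0b11, so WorkOnFifoBE's
            // (x ^ y) & numaMask test partitions macrotiles evenly across the 4 nodes.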
        }
        else
        {
            pPool->numaMask = 0;
        }

        uint32_t workerId           = 0;
        uint32_t numReservedThreads = numAPIReservedThreads;
        for (uint32_t n = 0; n < numNodes; ++n)
        {
            if ((n + pContext->threadInfo.BASE_NUMA_NODE) >= nodes.size())
            {
                break;
            }
            auto&    node     = nodes[n + pContext->threadInfo.BASE_NUMA_NODE];
            uint32_t numCores = numCoresPerNode;
            for (uint32_t c = 0; c < numCores; ++c)
            {
                if ((c + pContext->threadInfo.BASE_CORE) >= node.cores.size())
                {
                    break;
                }

                auto& core = node.cores[c + pContext->threadInfo.BASE_CORE];
                for (uint32_t t = 0; t < numHyperThreads; ++t)
                {
                    if ((t + pContext->threadInfo.BASE_THREAD) >= core.threadIds.size())
                    {
                        break;
                    }

                    if (numRemovedThreads)
                    {
                        --numRemovedThreads;
                        assert(numReservedThreads);
                        --numReservedThreads;
                        pPool->pApiThreadData[numReservedThreads].workerId    = 0xFFFFFFFFU;
                        pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
                        pPool->pApiThreadData[numReservedThreads].threadId    = core.threadIds[t];
                        pPool->pApiThreadData[numReservedThreads].numaId =
                            useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
                        pPool->pApiThreadData[numReservedThreads].coreId =
                            c + pContext->threadInfo.BASE_CORE;
                        pPool->pApiThreadData[numReservedThreads].htId =
                            t + pContext->threadInfo.BASE_THREAD;
                        pPool->pApiThreadData[numReservedThreads].pContext           = pContext;
                        pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;

                        if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads)
                        {
                            --numReservedThreads;
                            pPool->pApiThreadData[numReservedThreads].workerId    = 0xFFFFFFFFU;
                            pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
                            pPool->pApiThreadData[numReservedThreads].threadId =
                                core.threadIds[t + 1];
                            pPool->pApiThreadData[numReservedThreads].numaId =
                                useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
                            pPool->pApiThreadData[numReservedThreads].coreId =
                                c + pContext->threadInfo.BASE_CORE;
                            pPool->pApiThreadData[numReservedThreads].htId =
                                t + pContext->threadInfo.BASE_THREAD;
                            pPool->pApiThreadData[numReservedThreads].pContext           = pContext;
                            pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
                        }

                        continue;
                    }

                    SWR_ASSERT(workerId < numThreads);

                    pPool->pThreadData[workerId].workerId    = workerId;
                    pPool->pThreadData[workerId].procGroupId = core.procGroup;
                    pPool->pThreadData[workerId].threadId =
                        core.threadIds[t + pContext->threadInfo.BASE_THREAD];
                    pPool->pThreadData[workerId].numaId =
                        useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
                    pPool->pThreadData[workerId].coreId   = c + pContext->threadInfo.BASE_CORE;
                    pPool->pThreadData[workerId].htId     = t + pContext->threadInfo.BASE_THREAD;
                    pPool->pThreadData[workerId].pContext = pContext;
                    pPool->pThreadData[workerId].forceBindProcGroup = false;

                    pContext->NumBEThreads++;
                    pContext->NumFEThreads++;

                    ++workerId;
                }
            }
        }
        SWR_ASSERT(workerId == pContext->NumWorkerThreads);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Launches worker threads in thread pool.
/// @param pContext - pointer to context
/// @param pPool - pointer to thread pool object.
void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
{
    if (pContext->threadInfo.SINGLE_THREADED)
    {
        return;
    }

    for (uint32_t workerId = 0; workerId < pContext->NumWorkerThreads; ++workerId)
    {
        pPool->pThreads[workerId] =
            new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Destroys thread pool.
/// @param pContext - pointer to context
/// @param pPool - pointer to thread pool object.
void DestroyThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
{
    // Wait for all threads to finish
    SwrWaitForIdle(pContext);

    // Wait for threads to finish and destroy them
    for (uint32_t t = 0; t < pPool->numThreads; ++t)
    {
        if (!pContext->threadInfo.SINGLE_THREADED)
        {
            // Detach from the thread.  Cannot join() due to the possibility (on Windows) of
            // code in some DllMain (DLL_THREAD_DETACH case) blocking the thread until after
            // this returns.
            pPool->pThreads[t]->detach();
            delete (pPool->pThreads[t]);
        }

        if (pContext->workerPrivateState.pfnFinishWorkerData)
        {
            pContext->workerPrivateState.pfnFinishWorkerData(
                pContext, pPool->pThreadData[t].pWorkerPrivateData, t);
        }
    }

    delete[] pPool->pThreads;

    // Clean up data used by threads
    delete[] pPool->pThreadData;
    delete[] pPool->pApiThreadData;

    AlignedFree(pPool->pWorkerPrivateDataArray);
}