"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/freedreno/vulkan/tu_query.c" (16 Sep 2020, 29190 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "tu_query.c" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 20.1.5_vs_20.2.0-rc1.

/*
 * Copyright 2016 Red Hat Inc.
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_private.h"

#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

#include "registers/adreno_pm4.xml.h"
#include "registers/adreno_common.xml.h"
#include "registers/a6xx.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"

#include "tu_cs.h"

#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5

/* Depending on the query type, a slot value may hold up to two integers,
 * e.g. for VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
 *   values[0]: primitives written, values[1]: primitives generated
 */
struct PACKED slot_value {
   uint64_t values[2];
};

struct PACKED query_slot {
   struct slot_value available; /* 0 when unavailable, 1 when available */
   struct slot_value result;
};

struct PACKED occlusion_query_slot {
   struct query_slot common;
   struct slot_value begin;
   struct slot_value end;
};

/* The result of transform feedback queries is two integer values:
 *   common.result.values[0] is the count of primitives written,
 *   common.result.values[1] is the count of primitives generated.
 * In addition, begin/end snapshots for each of the four streams are stored
 * in the begin[4] and end[4] slots below.
 */
struct PACKED primitive_query_slot {
   struct query_slot common;
   struct slot_value begin[4];
   struct slot_value end[4];
};
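
/* For illustration (derived from the PACKED layouts above): a slot_value is
 * 16 bytes, so a query_slot is 32 bytes, an occlusion_query_slot is 64 bytes,
 * and a primitive_query_slot is 32 + 4*16 + 4*16 = 160 bytes. The pool's BO
 * is thus a flat array of queryCount slots, each pool->stride bytes apart.
 */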

/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field, value_index)               \
   ((pool)->bo.iova + (pool)->stride * (query) + offsetof(type, field) + \
         offsetof(struct slot_value, values[value_index]))

#define occlusion_query_iova(pool, query, field)                     \
   query_iova(struct occlusion_query_slot, pool, query, field, 0)

#define primitive_query_iova(pool, query, field, i)                  \
   query_iova(struct primitive_query_slot, pool, query, field, i)

#define query_available_iova(pool, query)                            \
   query_iova(struct query_slot, pool, query, available, 0)

#define query_result_iova(pool, query, i)                            \
   query_iova(struct query_slot, pool, query, result, i)

#define query_is_available(slot) ((slot)->available.values[0])
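
/* As a worked example (hypothetical values): for a pool whose BO sits at
 * IOVA 0x1000 with 64-byte occlusion slots,
 *   occlusion_query_iova(pool, 2, end)
 * expands to
 *   0x1000 + 64 * 2 + offsetof(struct occlusion_query_slot, end)
 *          + offsetof(struct slot_value, values[0])
 *   = 0x1000 + 128 + 48 + 0 = 0x10b0,
 * i.e. the GPU address of slot 2's end counter.
 */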

/*
 * Returns a pointer to a given slot in a query pool.
 */
static void *
slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char *) pool->bo.map + query * pool->stride;
}

VkResult
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t slot_size;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      slot_size = sizeof(struct query_slot);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      slot_size = sizeof(struct primitive_query_slot);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   struct tu_query_pool *pool =
      vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = tu_bo_init_new(device, &pool->bo,
         pCreateInfo->queryCount * slot_size);
   if (result != VK_SUCCESS) {
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}
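
/* For reference, an application reaches the entry point above through the
 * core Vulkan API, e.g. (illustrative values):
 *
 *   VkQueryPoolCreateInfo info = {
 *      .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
 *      .queryType = VK_QUERY_TYPE_OCCLUSION,
 *      .queryCount = 16,
 *   };
 *   VkQueryPool pool;
 *   vkCreateQueryPool(device, &info, NULL, &pool);
 *
 * which allocates one 64-byte occlusion slot per query in a single BO.
 */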

void
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_free2(&device->alloc, pAllocator, pool);
}

static uint32_t
get_result_count(struct tu_query_pool *pool)
{
   switch (pool->type) {
   /* Occlusion and timestamp queries write one integer value */
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return 1;
   /* Transform feedback queries write two integer values */
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return 2;
   default:
      assert(!"Invalid query type");
      return 0;
   }
}

/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler-friendly way instead of busy polling once the patch has landed
    * upstream. */
   struct query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while (os_time_get_nano() < abs_timeout) {
      if (query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device->instance, VK_TIMEOUT);
}

/* Writes a query value to a buffer from the CPU. */
static void
write_query_value_cpu(char *base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t *) (base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t *) (base + (offset * sizeof(uint32_t))) = value;
   }
}

static VkResult
get_query_pool_results(struct tu_device *device,
                       struct tu_query_pool *pool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct query_slot *slot = slot_address(pool, query);
      bool available = query_is_available(slot);
      uint32_t result_count = get_result_count(pool);

      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          *    both not set then no result values are written to pData for
          *    queries that are in the unavailable state at the time of the
          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          *    availability state is still written to pData for those queries
          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      for (uint32_t k = 0; k < result_count; k++) {
         if (available)
            write_query_value_cpu(result_base, k, slot->result.values[k], flags);
         else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
            /* From the Vulkan 1.1.130 spec:
             *
             *   If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
             *   is not set, and the query’s status is unavailable, an
             *   intermediate result value between zero and the final result
             *   value is written to pData for that query.
             *
             * Just return 0 here for simplicity since it's a valid result.
             */
            write_query_value_cpu(result_base, k, 0, flags);
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the query’s
          *    status was available or zero if the status was unavailable.
          */
         write_query_value_cpu(result_base, result_count, available, flags);

      result_base += stride;
   }
   return result;
}
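
/* For example (hypothetical call): reading a transform feedback query with
 * VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT writes three
 * uint64_t values per query into pData:
 *   [0] primitives written, [1] primitives generated, [2] availability,
 * so the caller's stride must be at least 24 bytes in that case.
 */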

VkResult
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return get_query_pool_results(device, pool, firstQuery, queryCount,
                                    dataSize, pData, stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}
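
/* For reference, a typical application-side call into the entry point above,
 * e.g. for an occlusion pool (illustrative values):
 *
 *   uint64_t results[16];
 *   vkGetQueryPoolResults(device, pool, 0, 16, sizeof(results), results,
 *                         sizeof(uint64_t),
 *                         VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
 *
 * With WAIT_BIT set, unavailable queries are polled via wait_for_available()
 * before their results are copied out.
 */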

/* Copies a query value from one buffer to another from the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags)
{
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
         sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);

   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
         CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}

static void
emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                             struct tu_cs *cs,
                             struct tu_query_pool *pool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             struct tu_buffer *buffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
    *    uses of vkCmdResetQueryPool in the same queue, without any additional
    *    synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = query_available_iova(pool, query);
      uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
      uint32_t result_count = get_result_count(pool);

      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      for (uint32_t k = 0; k < result_count; k++) {
         uint64_t result_iova = query_result_iova(pool, query, k);

         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            /* Unconditionally copying the bo->result into the buffer here is
             * valid because we only set bo->result on vkCmdEndQuery. Thus, even
             * if the query is unavailable, this will copy the correct partial
             * value of 0.
             */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
         } else {
            /* Conditionally copy bo->result into the buffer based on whether the
             * query is available.
             *
             * NOTE: For the conditional packets to be executed, CP_COND_EXEC
             * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
             * that 0 < available < 2, aka available == 1.
             */
            tu_cs_reserve(cs, 7 + 6);
            tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
            tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */

            /* Start of conditional execution */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
            /* End of conditional execution */
         }
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              result_count /* offset */, flags);
      }
   }

   tu_bo_list_add(&cmdbuf->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery, queryCount,
                                   buffer, dstOffset, stride, flags);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
}

static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;

      /* Clear the available bit... */
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);

      /* ...and both 64-bit result values. */
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));
      tu_cs_emit_qw(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query, 1));
      tu_cs_emit_qw(cs, 0x0);
   }
}

void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

static void
emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
                     struct tu_query_pool *pool,
                     uint32_t query,
                     uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   /* The WRITE_PRIMITIVE_COUNTS event appears to dump the counters for all
    * four streams starting at the base address (emit_end_xfb_query reads
    * back per-stream values), so stream_id is not needed to position the
    * begin snapshot. */
   uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(begin_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS, false);
}

void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t query,
                           VkQueryControlFlags flags,
                           uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    *    1) Set the slot->end to UINT64_MAX.
    *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *       write the current sample count value into slot->end.
    *    3) Since (2) is asynchronous, wait until slot->end is not equal to
    *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    *    4) Accumulate the results of the query (slot->end - slot->begin) into
    *       slot->result.
    *    5) If vkCmdEndQuery is *not* called from within the scope of a render
    *       pass, set the slot's available bit since the query is now done.
    *    6) If vkCmdEndQuery *is* called from within the scope of a render
    *       pass, we cannot mark it as available yet since the commands in
    *       draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = query_result_iova(pool, query, 0);

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
       * because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}
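
/* As a worked example of step (4) above (hypothetical numbers): if a tile's
 * draws start with a sample counter of 100 (slot->begin) and end at 250
 * (slot->end), the CP_MEM_TO_MEM adds 150 to slot->result. Because draw_cs
 * replays once per tile, these per-tile deltas accumulate into the total
 * sample count for the whole render pass.
 */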

static void
emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
                   struct tu_query_pool *pool,
                   uint32_t query,
                   uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
   uint64_t result_written_iova = query_result_iova(pool, query, 0);
   uint64_t result_generated_iova = query_result_iova(pool, query, 1);
   uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
   uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
   uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
   uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1);
   uint64_t available_iova = query_available_iova(pool, query);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(end_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS, false);

   tu_cs_emit_wfi(cs);
   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS, true);

   /* Set the count of written primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, end_written_iova);
   tu_cs_emit_qw(cs, begin_written_iova);

   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS, true);

   /* Set the count of generated primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, end_generated_iova);
   tu_cs_emit_qw(cs, begin_generated_iova);

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_end_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
                         VkQueryPool queryPool,
                         uint32_t query,
                         uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* There are only four transform feedback streams, so valid stream
       * indices are 0 through 3 (begin[4]/end[4] in primitive_query_slot). */
      assert(index < 4);
      emit_end_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   /* WFI to get a more accurate timestamp */
   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   if (cmd->state.pass) {
      /* TODO: to have useful in-renderpass timestamps:
       * for the sysmem path, we can just emit the timestamp in draw_cs;
       * for a gmem render pass, we would have to do something with
       * accumulation, but it is not clear that would follow the spec.
       */
      tu_finishme("CmdWriteTimestamp in renderpass not accurate");
   }
}