"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/gallium/drivers/r600/evergreen_compute.c" (16 Sep 2020, 45656 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "evergreen_compute.c" see the Fossies "Dox" file reference documentation.

    1 /*
    2  * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
    3  *
    4  * Permission is hereby granted, free of charge, to any person obtaining a
    5  * copy of this software and associated documentation files (the "Software"),
    6  * to deal in the Software without restriction, including without limitation
    7  * on the rights to use, copy, modify, merge, publish, distribute, sub
    8  * license, and/or sell copies of the Software, and to permit persons to whom
    9  * the Software is furnished to do so, subject to the following conditions:
   10  *
   11  * The above copyright notice and this permission notice (including the next
   12  * paragraph) shall be included in all copies or substantial portions of the
   13  * Software.
   14  *
   15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
   18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
   19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
   20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
   21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
   22  *
   23  * Authors:
   24  *      Adam Rak <adam.rak@streamnovation.com>
   25  */
   26 
   27 #ifdef HAVE_OPENCL
   28 #include <gelf.h>
   29 #include <libelf.h>
   30 #endif
   31 #include <stdio.h>
   32 #include <errno.h>
   33 #include "pipe/p_defines.h"
   34 #include "pipe/p_state.h"
   35 #include "pipe/p_context.h"
   36 #include "util/u_blitter.h"
   37 #include "util/list.h"
   38 #include "util/u_transfer.h"
   39 #include "util/u_surface.h"
   40 #include "util/u_pack_color.h"
   41 #include "util/u_memory.h"
   42 #include "util/u_inlines.h"
   43 #include "util/u_framebuffer.h"
   44 #include "tgsi/tgsi_parse.h"
   45 #include "pipebuffer/pb_buffer.h"
   46 #include "evergreend.h"
   47 #include "r600_shader.h"
   48 #include "r600_pipe.h"
   49 #include "r600_formats.h"
   50 #include "evergreen_compute.h"
   51 #include "evergreen_compute_internal.h"
   52 #include "compute_memory_pool.h"
   53 #include "sb/sb_public.h"
   54 #include <inttypes.h>
   55 
   56 /**
    57 RAT0 is for global binding writes.
    58 VTX1 is for global binding reads.
    59 
    60 For writing images: RAT1...
    61 For reading images: TEX2...
    62   TEX2 is paired with RAT1.
    63 
    64 TEX2... consumes the same fetch resources that VTX2... would consume.
    65 
    66 CONST0 and VTX0 are for parameters:
    67   CONST0 binds the smaller input parameter buffer and is used for constant
    68   indexing; it is also constant cached.
    69   VTX0 is for indirect/non-constant indexing, or if the input is bigger than
    70   the constant cache can handle.
    71 
    72 RATs are limited to 12, so we can bind at most 11 textures for writing,
    73 because we reserve RAT0 for global bindings. With byte addressing enabled,
    74 we should reserve another one too => at most 10 image bindings for writing.
    75 
    76 From Nvidia OpenCL:
    77   CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
    78   CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8
    79 
    80 So 10 for writing is enough. 176 is the max for reading according to the docs.
    81 
    82 Writable images should be listed first (id < 10), so their id corresponds to RAT(id+1).
    83 Writable images also consume TEX slots, and VTX slots too, because of linear indexing.
   84 
   85 */
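       /* Editor's summary of the binding layout described above (a sketch
        * based on the code in this file, not authoritative hardware docs):
        *
        *   RAT0        - global memory pool, writes
        *   VTX1        - global memory pool, reads
        *   RAT1...     - writable images (image id corresponds to RAT(id+1))
        *   TEX2...     - readable images (TEX2 pairs with RAT1, and so on)
        *   CONST0/VTX0 - kernel parameters (constant cached / indirect indexing)
        *   VTX2        - constants that LLVM places in the .text segment
        *   VTX3        - kernel parameters again, for dynamic indexing
        *   VTX4...     - user compute resources (see evergreen_set_compute_resources)
        */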
   86 
   87 #ifdef HAVE_OPENCL
   88 static void radeon_shader_binary_init(struct r600_shader_binary *b)
   89 {
   90     memset(b, 0, sizeof(*b));
   91 }
   92 
   93 static void radeon_shader_binary_clean(struct r600_shader_binary *b)
   94 {
   95     if (!b)
   96         return;
   97     FREE(b->code);
   98     FREE(b->config);
   99     FREE(b->rodata);
  100     FREE(b->global_symbol_offsets);
  101     FREE(b->relocs);
  102     FREE(b->disasm_string);
  103 }
  104 #endif
  105 
  106 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
  107                              unsigned size)
  108 {
  109     struct pipe_resource *buffer = NULL;
  110     assert(size);
  111 
  112     buffer = pipe_buffer_create((struct pipe_screen*) screen,
  113                     0, PIPE_USAGE_IMMUTABLE, size);
  114 
  115     return (struct r600_resource *)buffer;
  116 }
  117 
  118 
  119 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
  120                   unsigned id,
  121                   struct r600_resource *bo,
  122                   int start,
  123                   int size)
  124 {
  125     struct pipe_surface rat_templ;
  126     struct r600_surface *surf = NULL;
  127     struct r600_context *rctx = NULL;
  128 
  129     assert(id < 12);
  130     assert((size & 3) == 0);
  131     assert((start & 0xFF) == 0);
  132 
  133     rctx = pipe->ctx;
  134 
  135     COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
  136 
  137     /* Create the RAT surface */
  138     memset(&rat_templ, 0, sizeof(rat_templ));
  139     rat_templ.format = PIPE_FORMAT_R32_UINT;
  140     rat_templ.u.tex.level = 0;
  141     rat_templ.u.tex.first_layer = 0;
  142     rat_templ.u.tex.last_layer = 0;
  143 
   144     /* Add the RAT to the list of color buffers. Drop the old buffer first. */
  145     pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL);
  146     pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
  147         (struct pipe_context *)pipe->ctx,
  148         (struct pipe_resource *)bo, &rat_templ);
  149 
  150     /* Update the number of color buffers */
  151     pipe->ctx->framebuffer.state.nr_cbufs =
  152         MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
  153 
  154     /* Update the cb_target_mask
  155      * XXX: I think this is a potential spot for bugs once we start doing
  156      * GL interop.  cb_target_mask may be modified in the 3D sections
  157      * of this driver. */
  158     pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
  159 
  160     surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
  161     evergreen_init_color_surface_rat(rctx, surf);
  162 }
  163 
  164 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
  165                        unsigned vb_index,
  166                        unsigned offset,
  167                        struct pipe_resource *buffer)
  168 {
  169     struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
  170     struct pipe_vertex_buffer *vb = &state->vb[vb_index];
  171     vb->stride = 1;
  172     vb->buffer_offset = offset;
  173     vb->buffer.resource = buffer;
  174     vb->is_user_buffer = false;
  175 
  176     /* The vertex instructions in the compute shaders use the texture cache,
  177      * so we need to invalidate it. */
  178     rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
  179     state->enabled_mask |= 1 << vb_index;
  180     state->dirty_mask |= 1 << vb_index;
  181     r600_mark_atom_dirty(rctx, &state->atom);
  182 }
  183 
  184 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
  185                          unsigned cb_index,
  186                          unsigned offset,
  187                          unsigned size,
  188                          struct pipe_resource *buffer)
  189 {
  190     struct pipe_constant_buffer cb;
  191     cb.buffer_size = size;
  192     cb.buffer_offset = offset;
  193     cb.buffer = buffer;
  194     cb.user_buffer = NULL;
  195 
  196     rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
  197 }
  198 
  199 /* We need to define these R600 registers here, because we can't include
  200  * evergreend.h and r600d.h.
  201  */
  202 #define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
  203 #define R_028850_SQ_PGM_RESOURCES_PS                 0x028850
  204 
  205 #ifdef HAVE_OPENCL
  206 static void parse_symbol_table(Elf_Data *symbol_table_data,
  207                 const GElf_Shdr *symbol_table_header,
  208                 struct r600_shader_binary *binary)
  209 {
  210     GElf_Sym symbol;
  211     unsigned i = 0;
  212     unsigned symbol_count =
  213         symbol_table_header->sh_size / symbol_table_header->sh_entsize;
  214 
   215     /* We are over-allocating this list, because symbol_count gives the
  216      * total number of symbols, and we will only be filling the list
  217      * with offsets of global symbols.  The memory savings from
  218      * allocating the correct size of this list will be small, and
  219      * I don't think it is worth the cost of pre-computing the number
  220      * of global symbols.
  221      */
  222     binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
  223 
  224     while (gelf_getsym(symbol_table_data, i++, &symbol)) {
  225         unsigned i;
  226         if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
  227             symbol.st_shndx == 0 /* Undefined symbol */) {
  228             continue;
  229         }
  230 
  231         binary->global_symbol_offsets[binary->global_symbol_count] =
  232                     symbol.st_value;
  233 
  234         /* Sort the list using bubble sort.  This list will usually
  235          * be small. */
  236         for (i = binary->global_symbol_count; i > 0; --i) {
  237             uint64_t lhs = binary->global_symbol_offsets[i - 1];
  238             uint64_t rhs = binary->global_symbol_offsets[i];
  239             if (lhs < rhs) {
  240                 break;
  241             }
  242             binary->global_symbol_offsets[i] = lhs;
  243             binary->global_symbol_offsets[i - 1] = rhs;
  244         }
  245         ++binary->global_symbol_count;
  246     }
  247 }
  248 
  249 
  250 static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
  251             unsigned symbol_sh_link,
  252             struct r600_shader_binary *binary)
  253 {
  254     unsigned i;
  255 
  256     if (!relocs || !symbols || !binary->reloc_count) {
  257         return;
  258     }
  259     binary->relocs = CALLOC(binary->reloc_count,
  260             sizeof(struct r600_shader_reloc));
  261     for (i = 0; i < binary->reloc_count; i++) {
  262         GElf_Sym symbol;
  263         GElf_Rel rel;
  264         char *symbol_name;
  265         struct r600_shader_reloc *reloc = &binary->relocs[i];
  266 
  267         gelf_getrel(relocs, i, &rel);
  268         gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
  269         symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
  270 
  271         reloc->offset = rel.r_offset;
  272         strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
  273         reloc->name[sizeof(reloc->name)-1] = 0;
  274     }
  275 }
  276 
  277 static void r600_elf_read(const char *elf_data, unsigned elf_size,
  278          struct r600_shader_binary *binary)
  279 {
  280     char *elf_buffer;
  281     Elf *elf;
  282     Elf_Scn *section = NULL;
  283     Elf_Data *symbols = NULL, *relocs = NULL;
  284     size_t section_str_index;
  285     unsigned symbol_sh_link = 0;
  286 
  287     /* One of the libelf implementations
  288      * (http://www.mr511.de/software/english.htm) requires calling
  289      * elf_version() before elf_memory().
  290      */
  291     elf_version(EV_CURRENT);
  292     elf_buffer = MALLOC(elf_size);
  293     memcpy(elf_buffer, elf_data, elf_size);
  294 
  295     elf = elf_memory(elf_buffer, elf_size);
  296 
  297     elf_getshdrstrndx(elf, &section_str_index);
  298 
  299     while ((section = elf_nextscn(elf, section))) {
  300         const char *name;
  301         Elf_Data *section_data = NULL;
  302         GElf_Shdr section_header;
  303         if (gelf_getshdr(section, &section_header) != &section_header) {
  304             fprintf(stderr, "Failed to read ELF section header\n");
  305             return;
  306         }
  307         name = elf_strptr(elf, section_str_index, section_header.sh_name);
  308         if (!strcmp(name, ".text")) {
  309             section_data = elf_getdata(section, section_data);
  310             binary->code_size = section_data->d_size;
  311             binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
  312             memcpy(binary->code, section_data->d_buf, binary->code_size);
  313         } else if (!strcmp(name, ".AMDGPU.config")) {
  314             section_data = elf_getdata(section, section_data);
  315             binary->config_size = section_data->d_size;
  316             binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
  317             memcpy(binary->config, section_data->d_buf, binary->config_size);
  318         } else if (!strcmp(name, ".AMDGPU.disasm")) {
  319             /* Always read disassembly if it's available. */
  320             section_data = elf_getdata(section, section_data);
  321             binary->disasm_string = strndup(section_data->d_buf,
  322                             section_data->d_size);
  323         } else if (!strncmp(name, ".rodata", 7)) {
  324             section_data = elf_getdata(section, section_data);
  325             binary->rodata_size = section_data->d_size;
  326             binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
  327             memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
  328         } else if (!strncmp(name, ".symtab", 7)) {
  329             symbols = elf_getdata(section, section_data);
  330             symbol_sh_link = section_header.sh_link;
  331             parse_symbol_table(symbols, &section_header, binary);
  332         } else if (!strcmp(name, ".rel.text")) {
  333             relocs = elf_getdata(section, section_data);
  334             binary->reloc_count = section_header.sh_size /
  335                     section_header.sh_entsize;
  336         }
  337     }
  338 
  339     parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
  340 
  341     if (elf){
  342         elf_end(elf);
  343     }
  344     FREE(elf_buffer);
  345 
  346     /* Cache the config size per symbol */
  347     if (binary->global_symbol_count) {
  348         binary->config_size_per_symbol =
  349             binary->config_size / binary->global_symbol_count;
  350     } else {
  351         binary->global_symbol_count = 1;
  352         binary->config_size_per_symbol = binary->config_size;
  353     }
  354 }
  355 
  356 static const unsigned char *r600_shader_binary_config_start(
  357     const struct r600_shader_binary *binary,
  358     uint64_t symbol_offset)
  359 {
  360     unsigned i;
  361     for (i = 0; i < binary->global_symbol_count; ++i) {
  362         if (binary->global_symbol_offsets[i] == symbol_offset) {
  363             unsigned offset = i * binary->config_size_per_symbol;
  364             return binary->config + offset;
  365         }
  366     }
  367     return binary->config;
  368 }
  369 
  370 static void r600_shader_binary_read_config(const struct r600_shader_binary *binary,
  371                        struct r600_bytecode *bc,
  372                        uint64_t symbol_offset,
  373                        boolean *use_kill)
  374 {
  375        unsigned i;
  376        const unsigned char *config =
  377                r600_shader_binary_config_start(binary, symbol_offset);
  378 
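              /* Editor's note: as the loop below assumes, the .AMDGPU.config
               * section is a sequence of (register, value) dword pairs, read
               * 8 bytes at a time in little-endian order. */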
  379        for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
  380                unsigned reg =
  381                        util_le32_to_cpu(*(uint32_t*)(config + i));
  382                unsigned value =
  383                        util_le32_to_cpu(*(uint32_t*)(config + i + 4));
  384                switch (reg) {
  385                /* R600 / R700 */
  386                case R_028850_SQ_PGM_RESOURCES_PS:
  387                case R_028868_SQ_PGM_RESOURCES_VS:
  388                /* Evergreen / Northern Islands */
  389                case R_028844_SQ_PGM_RESOURCES_PS:
  390                case R_028860_SQ_PGM_RESOURCES_VS:
  391                case R_0288D4_SQ_PGM_RESOURCES_LS:
  392                        bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
  393                        bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
  394                        break;
  395                case R_02880C_DB_SHADER_CONTROL:
  396                        *use_kill = G_02880C_KILL_ENABLE(value);
  397                        break;
  398                case R_0288E8_SQ_LDS_ALLOC:
  399                        bc->nlds_dw = value;
  400                        break;
  401                }
  402        }
  403 }
  404 
  405 static unsigned r600_create_shader(struct r600_bytecode *bc,
  406                    const struct r600_shader_binary *binary,
  407                    boolean *use_kill)
  408 
  409 {
  410     assert(binary->code_size % 4 == 0);
  411     bc->bytecode = CALLOC(1, binary->code_size);
  412     memcpy(bc->bytecode, binary->code, binary->code_size);
  413     bc->ndw = binary->code_size / 4;
  414 
  415     r600_shader_binary_read_config(binary, bc, 0, use_kill);
  416     return 0;
  417 }
  418 
  419 #endif
  420 
  421 static void r600_destroy_shader(struct r600_bytecode *bc)
  422 {
  423     FREE(bc->bytecode);
  424 }
  425 
  426 static void *evergreen_create_compute_state(struct pipe_context *ctx,
  427                         const struct pipe_compute_state *cso)
  428 {
  429     struct r600_context *rctx = (struct r600_context *)ctx;
  430     struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
  431 #ifdef HAVE_OPENCL
  432     const struct pipe_binary_program_header *header;
  433     void *p;
  434     boolean use_kill;
  435 #endif
  436 
  437     shader->ctx = rctx;
  438     shader->local_size = cso->req_local_mem;
  439     shader->private_size = cso->req_private_mem;
  440     shader->input_size = cso->req_input_mem;
  441 
  442     shader->ir_type = cso->ir_type;
  443 
  444     if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
  445         shader->ir_type == PIPE_SHADER_IR_NIR) {
  446         shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, cso->ir_type, PIPE_SHADER_COMPUTE);
  447         return shader;
  448     }
  449 #ifdef HAVE_OPENCL
  450     COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
  451     header = cso->prog;
  452     radeon_shader_binary_init(&shader->binary);
  453     r600_elf_read(header->blob, header->num_bytes, &shader->binary);
  454     r600_create_shader(&shader->bc, &shader->binary, &use_kill);
  455 
  456     /* Upload code + ROdata */
  457     shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
  458                             shader->bc.ndw * 4);
  459     p = r600_buffer_map_sync_with_rings(
  460         &rctx->b, shader->code_bo,
  461         PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);
  462     //TODO: use util_memcpy_cpu_to_le32 ?
  463     memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
  464     rctx->b.ws->buffer_unmap(shader->code_bo->buf);
  465 #endif
  466 
  467     return shader;
  468 }
  469 
  470 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
  471 {
  472     struct r600_context *rctx = (struct r600_context *)ctx;
  473     struct r600_pipe_compute *shader = state;
  474 
  475     COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
  476 
  477     if (!shader)
  478         return;
  479 
  480     if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
  481         shader->ir_type == PIPE_SHADER_IR_NIR) {
  482         r600_delete_shader_selector(ctx, shader->sel);
  483     } else {
  484 #ifdef HAVE_OPENCL
  485         radeon_shader_binary_clean(&shader->binary);
  486         pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL);
  487         pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL);
  488 #endif
  489         r600_destroy_shader(&shader->bc);
  490     }
  491     FREE(shader);
  492 }
  493 
  494 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
  495 {
  496     struct r600_context *rctx = (struct r600_context *)ctx;
  497     struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
  498     COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
  499 
  500     if (!state) {
  501         rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
  502         return;
  503     }
  504 
  505     if (cstate->ir_type == PIPE_SHADER_IR_TGSI ||
  506         cstate->ir_type == PIPE_SHADER_IR_NIR) {
  507         bool compute_dirty;
  508         cstate->sel->ir_type = cstate->ir_type;
  509         if (r600_shader_select(ctx, cstate->sel, &compute_dirty))
  510             R600_ERR("Failed to select compute shader\n");
  511     }
  512     
  513     rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
  514 }
  515 
   516 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
   517  * kernel parameters, there are implicit parameters that need to be stored
  518  * in the vertex buffer as well.  Here is how these parameters are organized in
  519  * the buffer:
  520  *
  521  * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
  522  * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
  523  * DWORDS 6-8: Number of work items within each work group in each dimension
  524  *             (x,y,z)
  525  * DWORDS 9+ : Kernel parameters
  526  */
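       /* A minimal sketch of that layout, for illustration only; this
        * hypothetical struct is not used anywhere in the driver:
        *
        *   struct kernel_input_buffer {
        *       uint32_t num_work_groups[3]; // DWORDS 0-2
        *       uint32_t global_size[3];     // DWORDS 3-5
        *       uint32_t local_size[3];      // DWORDS 6-8
        *       uint32_t params[];           // DWORDS 9+, explicit kernel args
        *   };
        */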
  527 static void evergreen_compute_upload_input(struct pipe_context *ctx,
  528                        const struct pipe_grid_info *info)
  529 {
  530     struct r600_context *rctx = (struct r600_context *)ctx;
  531     struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
  532     unsigned i;
  533     /* We need to reserve 9 dwords (36 bytes) for implicit kernel
  534      * parameters.
  535      */
  536     unsigned input_size;
  537     uint32_t *num_work_groups_start;
  538     uint32_t *global_size_start;
  539     uint32_t *local_size_start;
  540     uint32_t *kernel_parameters_start;
  541     struct pipe_box box;
  542     struct pipe_transfer *transfer = NULL;
  543 
  544     if (!shader)
  545         return;
  546     if (shader->input_size == 0) {
  547         return;
  548     }
  549     input_size = shader->input_size + 36;
  550     if (!shader->kernel_param) {
  551         /* Add space for the grid dimensions */
  552         shader->kernel_param = (struct r600_resource *)
  553             pipe_buffer_create(ctx->screen, 0,
  554                     PIPE_USAGE_IMMUTABLE, input_size);
  555     }
  556 
  557     u_box_1d(0, input_size, &box);
  558     num_work_groups_start = ctx->transfer_map(ctx,
  559             (struct pipe_resource*)shader->kernel_param,
  560             0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
  561             &box, &transfer);
  562     global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
  563     local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
  564     kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
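           /* Editor's note: the three pointers above are uint32_t*, so each
            * section is advanced by three dwords; assuming the usual 4-byte
            * uint, (3 * sizeof(uint)) / 4 simply evaluates to 3. */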
  565 
  566     /* Copy the work group size */
  567     memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
  568 
  569     /* Copy the global size */
  570     for (i = 0; i < 3; i++) {
  571         global_size_start[i] = info->grid[i] * info->block[i];
  572     }
  573 
  574     /* Copy the local dimensions */
  575     memcpy(local_size_start, info->block, 3 * sizeof(uint));
  576 
  577     /* Copy the kernel inputs */
  578     memcpy(kernel_parameters_start, info->input, shader->input_size);
  579 
  580     for (i = 0; i < (input_size / 4); i++) {
  581         COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
  582             ((unsigned*)num_work_groups_start)[i]);
  583     }
  584 
  585     ctx->transfer_unmap(ctx, transfer);
  586 
  587     /* ID=0 and ID=3 are reserved for the parameters.
  588      * LLVM will preferably use ID=0, but it does not work for dynamic
  589      * indices. */
  590     evergreen_cs_set_vertex_buffer(rctx, 3, 0,
  591             (struct pipe_resource*)shader->kernel_param);
  592     evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
  593             (struct pipe_resource*)shader->kernel_param);
  594 }
  595 
  596 static void evergreen_emit_dispatch(struct r600_context *rctx,
  597                     const struct pipe_grid_info *info,
  598                     uint32_t indirect_grid[3])
  599 {
  600     int i;
  601     struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
  602     struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
  603     bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
  604     unsigned num_waves;
  605     unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
  606     unsigned wave_divisor = (16 * num_pipes);
  607     int group_size = 1;
  608     int grid_size = 1;
  609     unsigned lds_size = shader->local_size / 4;
  610 
  611     if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
  612         shader->ir_type != PIPE_SHADER_IR_NIR)
  613         lds_size += shader->bc.nlds_dw;
  614     
  615     /* Calculate group_size/grid_size */
  616     for (i = 0; i < 3; i++) {
  617         group_size *= info->block[i];
  618     }
  619 
  620     for (i = 0; i < 3; i++) {
  621         grid_size *= info->grid[i];
  622     }
  623 
   624     /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
  625     num_waves = (info->block[0] * info->block[1] * info->block[2] +
  626             wave_divisor - 1) / wave_divisor;
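           /* Worked example with hypothetical numbers: a 16x16x1 block on a
            * part with 2 quad pipes gives ceil(256 / 32) = 8 wavefronts. */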
  627 
  628     COMPUTE_DBG(rctx->screen, "Using %u pipes, "
  629                 "%u wavefronts per thread block, "
  630                 "allocating %u dwords lds.\n",
  631                 num_pipes, num_waves, lds_size);
  632 
  633     radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
  634 
  635     radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
  636     radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
  637     radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
  638     radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
  639 
  640     radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
  641                                 group_size);
  642 
  643     radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
  644     radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
  645     radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
  646     radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
  647 
  648     if (rctx->b.chip_class < CAYMAN) {
  649         assert(lds_size <= 8192);
  650     } else {
  651         /* Cayman appears to have a slightly smaller limit, see the
  652          * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
  653         assert(lds_size <= 8160);
  654     }
  655 
  656     radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
  657                     lds_size | (num_waves << 14));
  658 
  659     if (info->indirect) {
  660         radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
  661         radeon_emit(cs, indirect_grid[0]);
  662         radeon_emit(cs, indirect_grid[1]);
  663         radeon_emit(cs, indirect_grid[2]);
  664         radeon_emit(cs, 1);
  665     } else {
  666         /* Dispatch packet */
  667         radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
  668         radeon_emit(cs, info->grid[0]);
  669         radeon_emit(cs, info->grid[1]);
  670         radeon_emit(cs, info->grid[2]);
  671         /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
  672         radeon_emit(cs, 1);
  673     }
  674 
  675     if (rctx->is_debug)
  676         eg_trace_emit(rctx);
  677 }
  678 
  679 static void compute_setup_cbs(struct r600_context *rctx)
  680 {
  681     struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
  682     unsigned i;
  683 
  684     /* Emit colorbuffers. */
  685     /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
  686     for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
  687         struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
  688         unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
  689                                (struct r600_resource*)cb->base.texture,
  690                                RADEON_USAGE_READWRITE,
  691                                RADEON_PRIO_SHADER_RW_BUFFER);
  692 
  693         radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
  694         radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
  695         radeon_emit(cs, cb->cb_color_pitch);    /* R_028C64_CB_COLOR0_PITCH */
  696         radeon_emit(cs, cb->cb_color_slice);    /* R_028C68_CB_COLOR0_SLICE */
  697         radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
  698         radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
  699         radeon_emit(cs, cb->cb_color_attrib);   /* R_028C74_CB_COLOR0_ATTRIB */
  700         radeon_emit(cs, cb->cb_color_dim);      /* R_028C78_CB_COLOR0_DIM */
  701 
  702         radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
  703         radeon_emit(cs, reloc);
  704 
  705         radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
  706         radeon_emit(cs, reloc);
  707     }
  708     for (; i < 8 ; i++)
  709         radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
  710                            S_028C70_FORMAT(V_028C70_COLOR_INVALID));
  711     for (; i < 12; i++)
  712         radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
  713                            S_028C70_FORMAT(V_028C70_COLOR_INVALID));
  714 
  715     /* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
  716     radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
  717                        rctx->compute_cb_target_mask);
  718 }
  719 
  720 static void compute_emit_cs(struct r600_context *rctx,
  721                 const struct pipe_grid_info *info)
  722 {
  723     struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
  724     bool compute_dirty = false;
  725     struct r600_pipe_shader *current;
  726     struct r600_shader_atomic combined_atomics[8];
  727     uint8_t atomic_used_mask;
  728     uint32_t indirect_grid[3] = { 0, 0, 0 };
  729 
   730     /* make sure that the gfx ring is the only one active */
  731     if (radeon_emitted(rctx->b.dma.cs, 0)) {
  732         rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
  733     }
  734 
  735     r600_update_compressed_resource_state(rctx, true);
  736 
  737     if (!rctx->cmd_buf_is_compute) {
  738         rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
  739         rctx->cmd_buf_is_compute = true;
  740     }
  741 
   742     if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
  743         rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
  744         if (r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty)) {
  745             R600_ERR("Failed to select compute shader\n");
  746             return;
  747         }
  748         
  749         current = rctx->cs_shader_state.shader->sel->current;
  750         if (compute_dirty) {
  751             rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
  752             r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
  753             r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
  754         }
  755 
  756         bool need_buf_const = current->shader.uses_tex_buffers ||
  757             current->shader.has_txq_cube_array_z_comp;
  758 
  759         if (info->indirect) {
  760             struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
  761             unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_TRANSFER_READ);
  762             unsigned offset = info->indirect_offset / 4;
  763             indirect_grid[0] = data[offset];
  764             indirect_grid[1] = data[offset + 1];
  765             indirect_grid[2] = data[offset + 2];
  766         }
  767         for (int i = 0; i < 3; i++) {
  768             rctx->cs_block_grid_sizes[i] = info->block[i];
  769             rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
  770         }
  771         rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
  772         rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;
  773 
  774         evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
  775         r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));
  776 
  777         if (need_buf_const) {
  778             eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
  779         }
  780         r600_update_driver_const_buffers(rctx, true);
  781 
  782         evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
  783         if (atomic_used_mask) {
  784             radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
  785             radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
  786         }
  787     } else
  788         r600_need_cs_space(rctx, 0, true, 0);
  789 
  790     /* Initialize all the compute-related registers.
  791      *
  792      * See evergreen_init_atom_start_compute_cs() in this file for the list
  793      * of registers initialized by the start_compute_cs_cmd atom.
  794      */
  795     r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
  796 
  797     /* emit config state */
  798     if (rctx->b.chip_class == EVERGREEN) {
   799         if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
  800             rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
  801             radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
  802             radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
  803             radeon_emit(cs, 0);
  804             radeon_emit(cs, 0);
  805             radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
  806         } else
  807             r600_emit_atom(rctx, &rctx->config_state.atom);
  808     }
  809 
  810     rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
  811     r600_flush_emit(rctx);
  812 
  813     if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI &&
  814         rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_NIR) {
  815 
  816         compute_setup_cbs(rctx);
  817 
  818         /* Emit vertex buffer state */
  819         rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
  820         r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
  821     } else {
  822         uint32_t rat_mask;
  823 
  824         rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
  825         radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
  826                            rat_mask);
  827     }
  828 
  829     r600_emit_atom(rctx, &rctx->b.render_cond_atom);
  830 
  831     /* Emit constant buffer state */
  832     r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
  833 
  834     /* Emit sampler state */
  835     r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
  836 
  837     /* Emit sampler view (texture resource) state */
  838     r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
  839 
  840     /* Emit images state */
  841     r600_emit_atom(rctx, &rctx->compute_images.atom);
  842 
  843     /* Emit buffers state */
  844     r600_emit_atom(rctx, &rctx->compute_buffers.atom);
  845 
  846     /* Emit shader state */
  847     r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
  848 
  849     /* Emit dispatch state and dispatch packet */
  850     evergreen_emit_dispatch(rctx, info, indirect_grid);
  851 
  852     /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
  853      */
  854     rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
  855               R600_CONTEXT_INV_VERTEX_CACHE |
  856                   R600_CONTEXT_INV_TEX_CACHE;
  857     r600_flush_emit(rctx);
  858     rctx->b.flags = 0;
  859 
  860     if (rctx->b.chip_class >= CAYMAN) {
  861         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
  862         radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
  863         /* DEALLOC_STATE prevents the GPU from hanging when a
  864          * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
  865          * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
  866          */
  867         radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
  868         radeon_emit(cs, 0);
  869     }
  870     if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
  871         rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR)
  872         evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);
  873 
  874 #if 0
  875     COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
  876     for (i = 0; i < cs->cdw; i++) {
  877         COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
  878     }
  879 #endif
  880 
  881 }
  882 
  883 
  884 /**
  885  * Emit function for r600_cs_shader_state atom
  886  */
  887 void evergreen_emit_cs_shader(struct r600_context *rctx,
  888                   struct r600_atom *atom)
  889 {
  890     struct r600_cs_shader_state *state =
  891                     (struct r600_cs_shader_state*)atom;
  892     struct r600_pipe_compute *shader = state->shader;
  893     struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
  894     uint64_t va;
  895     struct r600_resource *code_bo;
  896     unsigned ngpr, nstack;
  897 
  898     if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
  899         shader->ir_type == PIPE_SHADER_IR_NIR) {
  900         code_bo = shader->sel->current->bo;
  901         va = shader->sel->current->bo->gpu_address;
  902         ngpr = shader->sel->current->shader.bc.ngpr;
  903         nstack = shader->sel->current->shader.bc.nstack;
  904     } else {
  905         code_bo = shader->code_bo;
  906         va = shader->code_bo->gpu_address + state->pc;
  907         ngpr = shader->bc.ngpr;
  908         nstack = shader->bc.nstack;
  909     }
  910 
  911     radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
  912     radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
  913     radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
  914             S_0288D4_NUM_GPRS(ngpr) |
  915             S_0288D4_DX10_CLAMP(1) |
  916             S_0288D4_STACK_SIZE(nstack));
  917     radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
  918 
  919     radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
  920     radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
  921                           code_bo, RADEON_USAGE_READ,
  922                           RADEON_PRIO_SHADER_BINARY));
  923 }
  924 
  925 static void evergreen_launch_grid(struct pipe_context *ctx,
  926                   const struct pipe_grid_info *info)
  927 {
  928     struct r600_context *rctx = (struct r600_context *)ctx;
  929 #ifdef HAVE_OPENCL
  930     struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
  931     boolean use_kill;
  932 
  933     if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
  934         shader->ir_type != PIPE_SHADER_IR_NIR) {
  935         rctx->cs_shader_state.pc = info->pc;
  936         /* Get the config information for this kernel. */
  937         r600_shader_binary_read_config(&shader->binary, &shader->bc,
  938                            info->pc, &use_kill);
  939     } else {
  940         use_kill = false;
  941         rctx->cs_shader_state.pc = 0;
  942     }
  943 #endif
  944 
  945     COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
  946 
  947 
  948     evergreen_compute_upload_input(ctx, info);
  949     compute_emit_cs(rctx, info);
  950 }
  951 
  952 static void evergreen_set_compute_resources(struct pipe_context *ctx,
  953                         unsigned start, unsigned count,
  954                         struct pipe_surface **surfaces)
  955 {
  956     struct r600_context *rctx = (struct r600_context *)ctx;
  957     struct r600_surface **resources = (struct r600_surface **)surfaces;
  958 
  959     COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
  960             start, count);
  961 
  962     for (unsigned i = 0; i < count; i++) {
   963         /* The first four vertex buffers are reserved for parameters and
  964          * global buffers. */
  965         unsigned vtx_id = 4 + i;
  966         if (resources[i]) {
  967             struct r600_resource_global *buffer =
  968                 (struct r600_resource_global*)
  969                 resources[i]->base.texture;
  970             if (resources[i]->base.writable) {
  971                 assert(i+1 < 12);
  972 
  973                 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
  974                 (struct r600_resource *)resources[i]->base.texture,
  975                 buffer->chunk->start_in_dw*4,
  976                 resources[i]->base.texture->width0);
  977             }
  978 
  979             evergreen_cs_set_vertex_buffer(rctx, vtx_id,
  980                     buffer->chunk->start_in_dw * 4,
  981                     resources[i]->base.texture);
  982         }
  983     }
  984 }
  985 
  986 static void evergreen_set_global_binding(struct pipe_context *ctx,
  987                      unsigned first, unsigned n,
  988                      struct pipe_resource **resources,
  989                      uint32_t **handles)
  990 {
  991     struct r600_context *rctx = (struct r600_context *)ctx;
  992     struct compute_memory_pool *pool = rctx->screen->global_pool;
  993     struct r600_resource_global **buffers =
  994         (struct r600_resource_global **)resources;
  995     unsigned i;
  996 
  997     COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
  998             first, n);
  999 
 1000     if (!resources) {
 1001         /* XXX: Unset */
 1002         return;
 1003     }
 1004 
 1005     /* We mark these items for promotion to the pool if they
 1006      * aren't already there */
 1007     for (i = first; i < first + n; i++) {
 1008         struct compute_memory_item *item = buffers[i]->chunk;
 1009 
 1010         if (!is_item_in_pool(item))
 1011             buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
 1012     }
 1013 
 1014     if (compute_memory_finalize_pending(pool, ctx) == -1) {
 1015         /* XXX: Unset */
 1016         return;
 1017     }
 1018 
 1019     for (i = first; i < first + n; i++)
 1020     {
 1021         uint32_t buffer_offset;
 1022         uint32_t handle;
 1023         assert(resources[i]->target == PIPE_BUFFER);
 1024         assert(resources[i]->bind & PIPE_BIND_GLOBAL);
 1025 
 1026         buffer_offset = util_le32_to_cpu(*(handles[i]));
 1027         handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
 1028 
 1029         *(handles[i]) = util_cpu_to_le32(handle);
 1030     }
 1031 
 1032     /* globals for writing */
 1033     evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
 1034     /* globals for reading */
 1035     evergreen_cs_set_vertex_buffer(rctx, 1, 0,
 1036                 (struct pipe_resource*)pool->bo);
 1037 
 1038     /* constants for reading, LLVM puts them in text segment */
 1039     evergreen_cs_set_vertex_buffer(rctx, 2, 0,
 1040                 (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
 1041 }
 1042 
 1043 /**
 1044  * This function initializes all the compute specific registers that need to
 1045  * be initialized for each compute command stream.  Registers that are common
 1046  * to both compute and 3D will be initialized at the beginning of each compute
 1047  * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
 1048  * packet requires that the shader type bit be set, we must initialize all
 1049  * context registers needed for compute in this function.  The registers
 1050  * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 1051  * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 1052  * on the GPU family.
 1053  */
 1054 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
 1055 {
 1056     struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
 1057     int num_threads;
 1058     int num_stack_entries;
 1059 
 1060     /* since all required registers are initialized in the
 1061      * start_compute_cs_cmd atom, we can EMIT_EARLY here.
 1062      */
 1063     r600_init_command_buffer(cb, 256);
 1064     cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
 1065 
 1066     /* We're setting config registers here. */
 1067     r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
 1068     r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
 1069 
 1070     switch (rctx->b.family) {
 1071     case CHIP_CEDAR:
 1072     default:
 1073         num_threads = 128;
 1074         num_stack_entries = 256;
 1075         break;
 1076     case CHIP_REDWOOD:
 1077         num_threads = 128;
 1078         num_stack_entries = 256;
 1079         break;
 1080     case CHIP_JUNIPER:
 1081         num_threads = 128;
 1082         num_stack_entries = 512;
 1083         break;
 1084     case CHIP_CYPRESS:
 1085     case CHIP_HEMLOCK:
 1086         num_threads = 128;
 1087         num_stack_entries = 512;
 1088         break;
 1089     case CHIP_PALM:
 1090         num_threads = 128;
 1091         num_stack_entries = 256;
 1092         break;
 1093     case CHIP_SUMO:
 1094         num_threads = 128;
 1095         num_stack_entries = 256;
 1096         break;
 1097     case CHIP_SUMO2:
 1098         num_threads = 128;
 1099         num_stack_entries = 512;
 1100         break;
 1101     case CHIP_BARTS:
 1102         num_threads = 128;
 1103         num_stack_entries = 512;
 1104         break;
 1105     case CHIP_TURKS:
 1106         num_threads = 128;
 1107         num_stack_entries = 256;
 1108         break;
 1109     case CHIP_CAICOS:
 1110         num_threads = 128;
 1111         num_stack_entries = 256;
 1112         break;
 1113     }
 1114 
 1115     /* The primitive type always needs to be POINTLIST for compute. */
 1116     r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
 1117                         V_008958_DI_PT_POINTLIST);
 1118 
 1119     if (rctx->b.chip_class < CAYMAN) {
 1120 
 1121         /* These registers control which simds can be used by each stage.
 1122          * The default for these registers is 0xffffffff, which means
 1123          * all simds are available for each stage.  It's possible we may
 1124          * want to play around with these in the future, but for now
 1125          * the default value is fine.
 1126          *
 1127          * R_008E20_SQ_STATIC_THREAD_MGMT1
 1128          * R_008E24_SQ_STATIC_THREAD_MGMT2
 1129          * R_008E28_SQ_STATIC_THREAD_MGMT3
 1130          */
 1131 
 1132         /* XXX: We may need to adjust the thread and stack resource
 1133          * values for 3D/compute interop */
 1134 
 1135         r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
 1136 
 1137         /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
 1138          * Set the number of threads used by the PS/VS/GS/ES stage to
 1139          * 0.
 1140          */
 1141         r600_store_value(cb, 0);
 1142 
 1143         /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
 1144          * Set the number of threads used by the CS (aka LS) stage to
 1145          * the maximum number of threads and set the number of threads
 1146          * for the HS stage to 0. */
 1147         r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
 1148 
 1149         /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
 1150          * Set the Control Flow stack entries to 0 for PS/VS stages */
 1151         r600_store_value(cb, 0);
 1152 
 1153         /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
 1154          * Set the Control Flow stack entries to 0 for GS/ES stages */
 1155         r600_store_value(cb, 0);
 1156 
 1157         /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
  1158          * Set the Control Flow stack entries to 0 for the HS stage, and
 1159          * set it to the maximum value for the CS (aka LS) stage. */
 1160         r600_store_value(cb,
 1161             S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
 1162     }
 1163     /* Give the compute shader all the available LDS space.
 1164      * NOTE: This only sets the maximum number of dwords that a compute
 1165      * shader can allocate.  When a shader is executed, we still need to
 1166      * allocate the appropriate amount of LDS dwords using the
 1167      * CM_R_0288E8_SQ_LDS_ALLOC register.
 1168      */
 1169     if (rctx->b.chip_class < CAYMAN) {
 1170         r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
 1171             S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
 1172     } else {
 1173         r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
 1174             S_0286FC_NUM_PS_LDS(0) |
 1175             S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
 1176     }
 1177 
 1178     /* Context Registers */
 1179 
 1180     if (rctx->b.chip_class < CAYMAN) {
 1181         /* workaround for hw issues with dyn gpr - must set all limits
 1182          * to 240 instead of 0, 0x1e == 240 / 8
 1183          */
 1184         r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
 1185                 S_028838_PS_GPRS(0x1e) |
 1186                 S_028838_VS_GPRS(0x1e) |
 1187                 S_028838_GS_GPRS(0x1e) |
 1188                 S_028838_ES_GPRS(0x1e) |
 1189                 S_028838_HS_GPRS(0x1e) |
 1190                 S_028838_LS_GPRS(0x1e));
 1191     }
 1192 
 1193     /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
 1194     r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
 1195         S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
 1196 
 1197     r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
 1198 
 1199     r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
 1200                    S_0286E8_TID_IN_GROUP_ENA(1) |
 1201                    S_0286E8_TGID_ENA(1) |
 1202                    S_0286E8_DISABLE_INDEX_PACK(1));
 1203 
  1204     /* The LOOP_CONST registers are an optimization for loops that allows
  1205      * you to store the initial counter, increment value, and maximum
  1206      * counter value in a register so that the hardware can calculate the
  1207      * correct number of iterations for the loop, so that you don't need
  1208      * to have the loop counter in your shader code.  We don't currently use
  1209      * this optimization, so we must keep track of the counter in the
  1210      * shader and use a break instruction to exit loops.  However, the
  1211      * hardware will still use this register to determine when to exit a
  1212      * loop, so we need to initialize the counter to 0, set the increment
  1213      * value to 1 and the maximum counter value to 4095 (0xfff), which
  1214      * is the maximum value allowed.  This gives us a maximum of 4096
  1215      * iterations for our loops, but hopefully our break instruction will
  1216      * execute some time before the 4096th iteration.
  1217      */
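           /* Editor's note: the value 0x1000FFF used below is consistent with
            * that description: maximum trip count 0xFFF in the low 12 bits,
            * initial counter 0 in the middle bits, and increment 1 in the top
            * byte. Treat this as a reading of the constant, not as register
            * documentation. */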
 1218     eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
 1219 }
 1220 
 1221 void evergreen_init_compute_state_functions(struct r600_context *rctx)
 1222 {
 1223     rctx->b.b.create_compute_state = evergreen_create_compute_state;
 1224     rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
 1225     rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
 1226 //   rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
 1227     rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
 1228     rctx->b.b.set_global_binding = evergreen_set_global_binding;
 1229     rctx->b.b.launch_grid = evergreen_launch_grid;
 1230 
 1231 }
 1232 
 1233 static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
 1234                           struct pipe_resource *resource,
 1235                           unsigned level,
 1236                           unsigned usage,
 1237                           const struct pipe_box *box,
 1238                           struct pipe_transfer **ptransfer)
 1239 {
 1240     struct r600_context *rctx = (struct r600_context*)ctx;
 1241     struct compute_memory_pool *pool = rctx->screen->global_pool;
 1242     struct r600_resource_global* buffer =
 1243         (struct r600_resource_global*)resource;
 1244 
 1245     struct compute_memory_item *item = buffer->chunk;
 1246     struct pipe_resource *dst = NULL;
 1247     unsigned offset = box->x;
 1248 
 1249     if (is_item_in_pool(item)) {
 1250         compute_memory_demote_item(pool, item, ctx);
 1251     }
 1252     else {
 1253         if (item->real_buffer == NULL) {
 1254             item->real_buffer =
 1255                     r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
 1256         }
 1257     }
 1258 
 1259     dst = (struct pipe_resource*)item->real_buffer;
 1260 
 1261     if (usage & PIPE_TRANSFER_READ)
 1262         buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
 1263 
 1264     COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
 1265             "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
 1266             "width = %u, height = %u, depth = %u)\n", level, usage,
 1267             box->x, box->y, box->z, box->width, box->height,
 1268             box->depth);
 1269     COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
 1270         "%u (box.x)\n", item->id, box->x);
 1271 
 1272 
 1273     assert(resource->target == PIPE_BUFFER);
 1274     assert(resource->bind & PIPE_BIND_GLOBAL);
 1275     assert(box->x >= 0);
 1276     assert(box->y == 0);
 1277     assert(box->z == 0);
 1278 
 1279     ///TODO: do it better, mapping is not possible if the pool is too big
 1280     return pipe_buffer_map_range(ctx, dst,
 1281             offset, box->width, usage, ptransfer);
 1282 }
 1283 
 1284 static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
 1285                            struct pipe_transfer *transfer)
 1286 {
 1287     /* struct r600_resource_global are not real resources, they just map
 1288      * to an offset within the compute memory pool.  The function
 1289      * r600_compute_global_transfer_map() maps the memory pool
 1290      * resource rather than the struct r600_resource_global passed to
  1291      * it as an argument and then initializes ptransfer->resource with
  1292      * the memory pool resource (via pipe_buffer_map_range).
  1293      * When transfer_unmap is called, it uses the memory pool's
  1294      * vtable, which calls r600_buffer_transfer_unmap() rather than
 1295      * this function.
 1296      */
 1297     assert (!"This function should not be called");
 1298 }
 1299 
 1300 static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
 1301                               struct pipe_transfer *transfer,
 1302                               const struct pipe_box *box)
 1303 {
 1304     assert(0 && "TODO");
 1305 }
 1306 
 1307 static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
 1308                            struct pipe_resource *res)
 1309 {
 1310     struct r600_resource_global* buffer = NULL;
 1311     struct r600_screen* rscreen = NULL;
 1312 
 1313     assert(res->target == PIPE_BUFFER);
 1314     assert(res->bind & PIPE_BIND_GLOBAL);
 1315 
 1316     buffer = (struct r600_resource_global*)res;
 1317     rscreen = (struct r600_screen*)screen;
 1318 
 1319     compute_memory_free(rscreen->global_pool, buffer->chunk->id);
 1320 
 1321     buffer->chunk = NULL;
 1322     free(res);
 1323 }
 1324 
 1325 static const struct u_resource_vtbl r600_global_buffer_vtbl =
 1326 {
 1327     u_default_resource_get_handle, /* get_handle */
 1328     r600_compute_global_buffer_destroy, /* resource_destroy */
 1329     r600_compute_global_transfer_map, /* transfer_map */
 1330     r600_compute_global_transfer_flush_region,/* transfer_flush_region */
 1331     r600_compute_global_transfer_unmap, /* transfer_unmap */
 1332 };
 1333 
 1334 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
 1335                             const struct pipe_resource *templ)
 1336 {
 1337     struct r600_resource_global* result = NULL;
 1338     struct r600_screen* rscreen = NULL;
 1339     int size_in_dw = 0;
 1340 
 1341     assert(templ->target == PIPE_BUFFER);
 1342     assert(templ->bind & PIPE_BIND_GLOBAL);
 1343     assert(templ->array_size == 1 || templ->array_size == 0);
 1344     assert(templ->depth0 == 1 || templ->depth0 == 0);
 1345     assert(templ->height0 == 1 || templ->height0 == 0);
 1346 
 1347     result = (struct r600_resource_global*)
 1348     CALLOC(sizeof(struct r600_resource_global), 1);
 1349     rscreen = (struct r600_screen*)screen;
 1350 
 1351     COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
 1352     COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
 1353             templ->array_size);
 1354 
 1355     result->base.b.vtbl = &r600_global_buffer_vtbl;
 1356     result->base.b.b = *templ;
 1357     result->base.b.b.screen = screen;
 1358     pipe_reference_init(&result->base.b.b.reference, 1);
 1359 
 1360     size_in_dw = (templ->width0+3) / 4;
 1361 
 1362     result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
 1363 
 1364     if (result->chunk == NULL)
 1365     {
 1366         free(result);
 1367         return NULL;
 1368     }
 1369 
 1370     return &result->base.b.b;
 1371 }