"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/gallium/drivers/freedreno/a2xx/ir2_assemble.c" (16 Sep 2020, 15917 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "ir2_assemble.c" see the Fossies "Dox" file reference documentation.

/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

static unsigned
src_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
    struct ir2_reg_component *comps;
    unsigned swiz = 0;

    switch (src->type) {
    case IR2_SRC_SSA:
    case IR2_SRC_REG:
        break;
    default:
        return src->swizzle;
    }
    /* we need to take into account where the components were allocated */
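    /* e.g. if the register's y component was allocated to hw lane z, a
     * source swizzle selecting y must be rewritten to select z instead
     */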
    comps = get_reg_src(ctx, src)->comp;
    for (int i = 0; i < ncomp; i++) {
        swiz |= swiz_set(comps[swiz_get(src->swizzle, i)].c, i);
    }
    return swiz;
}

/* ALU instructions need to take into account how the output components are allocated */

/* scalar doesn't need to take into account dest swizzle */

static unsigned
alu_swizzle_scalar(struct ir2_context *ctx, struct ir2_src *reg)
{
    /* hardware seems to take from W, but swizzle everywhere just in case */
    return swiz_merge(src_swizzle(ctx, reg, 1), IR2_SWIZZLE_XXXX);
}

static unsigned
alu_swizzle(struct ir2_context *ctx, struct ir2_instr *instr, struct ir2_src *src)
{
    struct ir2_reg_component *comp = get_reg(instr)->comp;
    unsigned swiz0 = src_swizzle(ctx, src, src_ncomp(instr));
    unsigned swiz = 0;

    /* non per component special cases */
    switch (instr->alu.vector_opc) {
    case PRED_SETE_PUSHv ... PRED_SETGTE_PUSHv:
        return alu_swizzle_scalar(ctx, src);
    case DOT2ADDv:
    case DOT3v:
    case DOT4v:
    case CUBEv:
        return swiz0;
    default:
        break;
    }

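    /* remap: logical output component i was allocated to hw lane comp[j].c
     * (a value of 7 appears to mark an unallocated component), so the
     * source swizzle must route component i to that lane
     */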
    for (int i = 0, j = 0; i < dst_ncomp(instr); j++) {
        if (instr->alu.write_mask & 1 << j) {
            if (comp[j].c != 7)
                swiz |= swiz_set(i, comp[j].c);
            i++;
        }
    }
    return swiz_merge(swiz0, swiz);
}

static unsigned
alu_swizzle_scalar2(struct ir2_context *ctx, struct ir2_src *src, unsigned s1)
{
    /* hardware seems to take from ZW, but swizzle everywhere (ABAB) */
    unsigned s0 = swiz_get(src_swizzle(ctx, src, 1), 0);
    return swiz_merge(swiz_set(s0, 0) | swiz_set(s1, 1), IR2_SWIZZLE_XYXY);
}

/* write_mask needs to be transformed by allocation information */
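/* e.g. a write to logical .xy whose values were allocated to hw lanes .zw
 * becomes the hw write mask 0b1100 */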

static unsigned
alu_write_mask(struct ir2_context *ctx, struct ir2_instr *instr)
{
    struct ir2_reg_component *comp = get_reg(instr)->comp;
    unsigned write_mask = 0;

    for (int i = 0; i < 4; i++) {
        if (instr->alu.write_mask & 1 << i)
            write_mask |= 1 << comp[i].c;
    }

    return write_mask;
}

/* fetch instructions can swizzle dest, but src swizzle needs conversion */
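/* (assumption based on the swizzle helpers in ir2_private.h: ALU swizzles
 * are encoded relative to the identity, while fetch swizzles are absolute
 * 2-bit component selects, hence the conversion below) */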

static unsigned
fetch_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
    unsigned alu_swiz = src_swizzle(ctx, src, ncomp);
    unsigned swiz = 0;
    for (int i = 0; i < ncomp; i++)
        swiz |= swiz_get(alu_swiz, i) << i * 2;
    return swiz;
}

static unsigned
fetch_dst_swiz(struct ir2_context *ctx, struct ir2_instr *instr)
{
    struct ir2_reg_component *comp = get_reg(instr)->comp;
    unsigned dst_swiz = 0xfff;
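    /* the dest swizzle uses 3 bits per component, with 7 (0b111) masking a
     * component out: start from all-masked and enable written components */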
    for (int i = 0; i < dst_ncomp(instr); i++) {
        dst_swiz &= ~(7 << comp[i].c * 3);
        dst_swiz |= i << comp[i].c * 3;
    }
    return dst_swiz;
}

/* register / export # for instr */
static unsigned
dst_to_reg(struct ir2_context *ctx, struct ir2_instr *instr)
{
    if (is_export(instr))
        return instr->alu.export;

    return get_reg(instr)->idx;
}

/* register # for src */
static unsigned src_to_reg(struct ir2_context *ctx, struct ir2_src *src)
{
    return get_reg_src(ctx, src)->idx;
}

static unsigned src_reg_byte(struct ir2_context *ctx, struct ir2_src *src)
{
    if (src->type == IR2_SRC_CONST) {
        assert(!src->abs); /* no abs bit for const */
        return src->num;
    }
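    /* register number in the low bits; bit 7 appears to carry the
     * absolute-value modifier */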
    return src_to_reg(ctx, src) | (src->abs ? 0x80 : 0);
}

/* produce the 12 byte binary instruction for a given sched_instr */
static void
fill_instr(struct ir2_context *ctx, struct ir2_sched_instr *sched,
           instr_t *bc, bool *is_fetch)
{
    struct ir2_instr *instr = sched->instr, *instr_s, *instr_v;

    *bc = (instr_t) {};

    if (instr && instr->type == IR2_FETCH) {
        *is_fetch = true;

        bc->fetch.opc = instr->fetch.opc;
        bc->fetch.pred_select = !!instr->pred;
        bc->fetch.pred_condition = instr->pred & 1;

        struct ir2_src *src = instr->src;

        if (instr->fetch.opc == VTX_FETCH) {
            instr_fetch_vtx_t *vtx = &bc->fetch.vtx;

            assert(instr->fetch.vtx.const_idx <= 0x1f);
            assert(instr->fetch.vtx.const_idx_sel <= 0x3);

            vtx->src_reg = src_to_reg(ctx, src);
            vtx->src_swiz = fetch_swizzle(ctx, src, 1);
            vtx->dst_reg = dst_to_reg(ctx, instr);
            vtx->dst_swiz = fetch_dst_swiz(ctx, instr);

            vtx->must_be_one = 1;
            vtx->const_index = instr->fetch.vtx.const_idx;
            vtx->const_index_sel = instr->fetch.vtx.const_idx_sel;

            /* other fields will be patched */

            /* XXX seems like every FETCH but the first has
             * this bit set:
             */
            vtx->reserved3 = instr->idx ? 0x1 : 0x0;
            vtx->reserved0 = instr->idx ? 0x2 : 0x3;
        } else if (instr->fetch.opc == TEX_FETCH) {
            instr_fetch_tex_t *tex = &bc->fetch.tex;

            tex->src_reg = src_to_reg(ctx, src);
            tex->src_swiz = fetch_swizzle(ctx, src, 3);
            tex->dst_reg = dst_to_reg(ctx, instr);
            tex->dst_swiz = fetch_dst_swiz(ctx, instr);
            /* tex->const_idx = patch_fetches */
            tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
            tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
            tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
            tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
            tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
            tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
            tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
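            /* only fragment shaders can use an implicitly computed LOD;
             * a register LOD is used when a second source is present */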
            tex->use_comp_lod = ctx->so->type == MESA_SHADER_FRAGMENT;
            tex->use_reg_lod = instr->src_count == 2;
            tex->sample_location = SAMPLE_CENTER;
            tex->tx_coord_denorm = instr->fetch.tex.is_rect;
        } else if (instr->fetch.opc == TEX_SET_TEX_LOD) {
            instr_fetch_tex_t *tex = &bc->fetch.tex;

            tex->src_reg = src_to_reg(ctx, src);
            tex->src_swiz = fetch_swizzle(ctx, src, 1);
            tex->dst_reg = 0;
            tex->dst_swiz = 0xfff;

            tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
            tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
            tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
            tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
            tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
            tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
            tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
            tex->use_comp_lod = 1;
            tex->use_reg_lod = 0;
            tex->sample_location = SAMPLE_CENTER;
        } else {
            assert(0);
        }
        return;
    }

    instr_v = sched->instr;
    instr_s = sched->instr_s;

    if (instr_v) {
        struct ir2_src src1, src2, *src3;

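        /* with a single operand the boolean index below is 0, so src2
         * simply aliases src[0] */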
        src1 = instr_v->src[0];
        src2 = instr_v->src[instr_v->src_count > 1];
        src3 = instr_v->src_count == 3 ? &instr_v->src[2] : NULL;

        bc->alu.vector_opc = instr_v->alu.vector_opc;
        bc->alu.vector_write_mask = alu_write_mask(ctx, instr_v);
        bc->alu.vector_dest = dst_to_reg(ctx, instr_v);
        bc->alu.vector_clamp = instr_v->alu.saturate;
        bc->alu.export_data = instr_v->alu.export >= 0;

        /* single operand SETEv, use 0.0f as src2 */
        if (instr_v->src_count == 1 &&
            (bc->alu.vector_opc == SETEv ||
            bc->alu.vector_opc == SETNEv ||
            bc->alu.vector_opc == SETGTv ||
            bc->alu.vector_opc == SETGTEv))
            src2 = ir2_zero(ctx);

        /* export32 instr for a20x hw binning has this bit set..
         * it seems to do more than change the base address of constants
         * XXX this is a hack
         */
        bc->alu.relative_addr =
            (bc->alu.export_data && bc->alu.vector_dest == 32);

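        /* srcN_sel appears to select between the register file (1) and
         * the constant file (0) */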
        bc->alu.src1_reg_byte = src_reg_byte(ctx, &src1);
        bc->alu.src1_swiz = alu_swizzle(ctx, instr_v, &src1);
        bc->alu.src1_reg_negate = src1.negate;
        bc->alu.src1_sel = src1.type != IR2_SRC_CONST;

        bc->alu.src2_reg_byte = src_reg_byte(ctx, &src2);
        bc->alu.src2_swiz = alu_swizzle(ctx, instr_v, &src2);
        bc->alu.src2_reg_negate = src2.negate;
        bc->alu.src2_sel = src2.type != IR2_SRC_CONST;

        if (src3) {
            bc->alu.src3_reg_byte = src_reg_byte(ctx, src3);
            bc->alu.src3_swiz = alu_swizzle(ctx, instr_v, src3);
            bc->alu.src3_reg_negate = src3->negate;
            bc->alu.src3_sel = src3->type != IR2_SRC_CONST;
        }

        bc->alu.pred_select = instr_v->pred;
    }

    if (instr_s) {
        struct ir2_src *src = instr_s->src;

        bc->alu.scalar_opc = instr_s->alu.scalar_opc;
        bc->alu.scalar_write_mask = alu_write_mask(ctx, instr_s);
        bc->alu.scalar_dest = dst_to_reg(ctx, instr_s);
        bc->alu.scalar_clamp = instr_s->alu.saturate;
        bc->alu.export_data = instr_s->alu.export >= 0;

        if (instr_s->src_count == 1) {
            bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
            bc->alu.src3_swiz = alu_swizzle_scalar(ctx, src);
            bc->alu.src3_reg_negate = src->negate;
            bc->alu.src3_sel = src->type != IR2_SRC_CONST;
        } else {
            assert(instr_s->src_count == 2);

            bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
            bc->alu.src3_swiz = alu_swizzle_scalar2(ctx, src, instr_s->alu.src1_swizzle);
            bc->alu.src3_reg_negate = src->negate;
            bc->alu.src3_sel = src->type != IR2_SRC_CONST;
        }

        if (instr_v)
            assert(instr_s->pred == instr_v->pred);
        bc->alu.pred_select = instr_s->pred;
    }

    *is_fetch = false;
    return;
}

static unsigned
write_cfs(struct ir2_context *ctx, instr_cf_t *cfs, unsigned cf_idx,
          instr_cf_alloc_t *alloc, instr_cf_exec_t *exec)
{
    assert(exec->count);

    if (alloc)
        cfs[cf_idx++].alloc = *alloc;

    /* remember the memory-export alloc offset for later patching */
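    /* (two 48-bit CF instructions pack into three dwords, hence the
     * cf_idx / 2 * 3 dword offset below) */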
    if (alloc && alloc->buffer_select == SQ_MEMORY &&
        ctx->info->mem_export_ptr == -1)
        ctx->info->mem_export_ptr = cf_idx / 2 * 3;

    cfs[cf_idx++].exec = *exec;
    exec->address += exec->count;
    exec->serialize = 0;
    exec->count = 0;

    return cf_idx;
}

/* assemble the final shader */
void assemble(struct ir2_context *ctx, bool binning)
{
    /* the hw seems to have a limit of 384 (num_cf/2 + num_instr <= 384);
     * the address field is 9 bits, so could the limit actually be 512?
     */
    instr_cf_t cfs[384];
    instr_t bytecode[384], bc;
    unsigned block_addr[128];
    unsigned num_cf = 0;

    /* CF instr state */
    instr_cf_exec_t exec = {.opc = EXEC};
    instr_cf_alloc_t alloc = {.opc = ALLOC};

    int sync_id, sync_id_prev = -1;
    bool is_fetch = false;
    bool need_sync = true;
    bool need_alloc = false;
    unsigned block_idx = 0;

    ctx->info->mem_export_ptr = -1;
    ctx->info->num_fetch_instrs = 0;

    /* a vertex shader always needs to allocate at least one parameter,
     * even if it will never be used
     */
    if (ctx->so->type == MESA_SHADER_VERTEX && ctx->f->inputs_count == 0) {
        alloc.buffer_select = SQ_PARAMETER_PIXEL;
        cfs[num_cf++].alloc = alloc;
    }

    block_addr[0] = 0;

    for (int i = 0, j = 0; j < ctx->instr_sched_count; j++) {
        struct ir2_instr *instr = ctx->instr_sched[j].instr;

        /* catch IR2_CF since it isn't a regular instruction */
        if (instr && instr->type == IR2_CF) {
            assert(!need_alloc); /* XXX */

            /* flush any exec cf before inserting jmp */
            if (exec.count)
                num_cf = write_cfs(ctx, cfs, num_cf, NULL, &exec);

            cfs[num_cf++].jmp_call = (instr_cf_jmp_call_t) {
                .opc = COND_JMP,
                .address = instr->cf.block_idx, /* will be fixed later */
                .force_call = !instr->pred,
                .predicated_jmp = 1,
                .direction = instr->cf.block_idx > instr->block_idx,
                .condition = instr->pred & 1,
            };
            continue;
        }

        /* fill the 3 dwords for the instruction */
        fill_instr(ctx, &ctx->instr_sched[j], &bc, &is_fetch);

        /* we need to sync between ALU/VTX_FETCH/TEX_FETCH types */
        sync_id = 0;
        if (is_fetch)
            sync_id = bc.fetch.opc == VTX_FETCH ? 1 : 2;

        need_sync = sync_id != sync_id_prev;
        sync_id_prev = sync_id;

        unsigned block;
        if (ctx->instr_sched[j].instr)
            block = ctx->instr_sched[j].instr->block_idx;
        else
            block = ctx->instr_sched[j].instr_s->block_idx;
        assert(block_idx <= block);

        /* info for patching */
        if (is_fetch) {
            struct ir2_fetch_info *info =
                &ctx->info->fetch_info[ctx->info->num_fetch_instrs++];
            info->offset = i * 3;   /* add cf offset later */

            if (bc.fetch.opc == VTX_FETCH) {
                info->vtx.dst_swiz = bc.fetch.vtx.dst_swiz;
            } else if (bc.fetch.opc == TEX_FETCH) {
                info->tex.samp_id = instr->fetch.tex.samp_id;
                info->tex.src_swiz = bc.fetch.tex.src_swiz;
            } else {
                ctx->info->num_fetch_instrs--;
            }
        }

        /* exec cf after 6 instr or when switching between fetch / alu */
        if (exec.count == 6 || (exec.count && (need_sync || block != block_idx))) {
            num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
            need_alloc = false;
        }

        /* update block_addrs for jmp patching */
        while (block_idx < block)
            block_addr[++block_idx] = num_cf;

        /* export - fill alloc cf */
        if (!is_fetch && bc.alu.export_data) {
            /* get the export buffer from either vector/scalar dest */
            instr_alloc_type_t buffer =
                export_buf(bc.alu.vector_dest);
            if (bc.alu.scalar_write_mask) {
                if (bc.alu.vector_write_mask)
                    assert(buffer == export_buf(bc.alu.scalar_dest));
                buffer = export_buf(bc.alu.scalar_dest);
            }

            /* flush previous alloc if the buffer changes */
            bool need_new_alloc = buffer != alloc.buffer_select;

            /* memory export always in 32/33 pair, new alloc on 32 */
            if (bc.alu.vector_dest == 32)
                need_new_alloc = true;

            if (need_new_alloc && exec.count) {
                num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
                need_alloc = false;
            }

            need_alloc |= need_new_alloc;

            alloc.size = 0;
            alloc.buffer_select = buffer;

            if (buffer == SQ_PARAMETER_PIXEL && ctx->so->type == MESA_SHADER_VERTEX)
                alloc.size = ctx->f->inputs_count - 1;

            if (buffer == SQ_POSITION)
                alloc.size = ctx->so->writes_psize;
        }

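        /* exec.serialize holds 2 bits per instruction in the exec block:
         * bit 0 is set for fetch instructions, bit 1 when a sync is needed */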
        if (is_fetch)
            exec.serialize |= 0x1 << exec.count * 2;
        if (need_sync)
            exec.serialize |= 0x2 << exec.count * 2;

        need_sync = false;
        exec.count += 1;
        bytecode[i++] = bc;
    }

    /* final exec cf */
    exec.opc = EXEC_END;
    num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);

    /* insert nop to get an even # of CFs */
    if (num_cf % 2)
        cfs[num_cf++] = (instr_cf_t) { .opc = NOP };

    /* patch cf addrs */
    for (int idx = 0; idx < num_cf; idx++) {
        switch (cfs[idx].opc) {
        case NOP:
        case ALLOC:
            break;
        case EXEC:
        case EXEC_END:
            cfs[idx].exec.address += num_cf / 2;
            break;
        case COND_JMP:
            cfs[idx].jmp_call.address = block_addr[cfs[idx].jmp_call.address];
            break;
        default:
            assert(0);
        }
    }

    /* concatenate cfs and alu/fetch */
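    /* a CF pair and an ALU/fetch instruction are both 96 bits (3 dwords);
     * the CF block comes first, followed by the ALU/fetch instructions */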
    uint32_t cfdwords = num_cf / 2 * 3;
    uint32_t alufetchdwords = exec.address * 3;
    uint32_t sizedwords = cfdwords + alufetchdwords;
    uint32_t *dwords = malloc(sizedwords * 4);
    assert(dwords);
    memcpy(dwords, cfs, cfdwords * 4);
    memcpy(&dwords[cfdwords], bytecode, alufetchdwords * 4);

    /* finalize ir2_shader_info */
    ctx->info->dwords = dwords;
    ctx->info->sizedwords = sizedwords;
    for (int i = 0; i < ctx->info->num_fetch_instrs; i++)
        ctx->info->fetch_info[i].offset += cfdwords;

    if (fd_mesa_debug & FD_DBG_DISASM) {
        DBG("disassemble: type=%d", ctx->so->type);
        disasm_a2xx(dwords, sizedwords, 0, ctx->so->type);
    }
}