"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/freedreno/ir3/ir3_ra.c" (16 Sep 2020, 44082 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "ir3_ra.c" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 20.1.5_vs_20.2.0-rc1.

    1 /*
    2  * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
    3  *
    4  * Permission is hereby granted, free of charge, to any person obtaining a
    5  * copy of this software and associated documentation files (the "Software"),
    6  * to deal in the Software without restriction, including without limitation
    7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
    8  * and/or sell copies of the Software, and to permit persons to whom the
    9  * Software is furnished to do so, subject to the following conditions:
   10  *
   11  * The above copyright notice and this permission notice (including the next
   12  * paragraph) shall be included in all copies or substantial portions of the
   13  * Software.
   14  *
   15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
   18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   21  * SOFTWARE.
   22  *
   23  * Authors:
   24  *    Rob Clark <robclark@freedesktop.org>
   25  */
   26 
   27 #include "util/u_math.h"
   28 #include "util/register_allocate.h"
   29 #include "util/ralloc.h"
   30 #include "util/bitset.h"
   31 
   32 #include "ir3.h"
   33 #include "ir3_compiler.h"
   34 #include "ir3_ra.h"
   35 
   36 
   37 #ifdef DEBUG
   38 #define RA_DEBUG (ir3_shader_debug & IR3_DBG_RAMSGS)
   39 #else
   40 #define RA_DEBUG 0
   41 #endif
   42 #define d(fmt, ...) do { if (RA_DEBUG) { \
   43     printf("RA: "fmt"\n", ##__VA_ARGS__); \
   44 } } while (0)
   45 
   46 #define di(instr, fmt, ...) do { if (RA_DEBUG) { \
   47     printf("RA: "fmt": ", ##__VA_ARGS__); \
   48     ir3_print_instr(instr); \
   49 } } while (0)
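
      /* Example usage (messages are only emitted when IR3_DBG_RAMSGS is set):
       *
       *   d("New max_target=%u", ctx->max_target);   -> "RA: New max_target=<n>"
       *   di(instr, "CALC");                          -> "RA: CALC: " + printed instruction
       */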
   50 
   51 /*
   52  * Register Assignment:
   53  *
   54  * Uses the register_allocate util, which implements a graph coloring
   55  * algorithm with interference classes.  To handle the cases where we
   56  * need consecutive registers (for example, texture sample instructions),
   57  * we model these as larger (double/quad/etc) registers which conflict
   58  * with the corresponding registers in other classes.
   59  *
   60  * We additionally create classes for half-regs, which do not conflict
   61  * with the full-reg classes.  We do need at least sizes 1-4 (to deal
   62  * w/ texture sample instructions that output to half-regs).  At the
   63  * moment we don't create the higher order half-reg classes, as half-regs
   64  * frequently do not have enough precision for texture coords at higher
   65  * resolutions.
   66  *
   67  * There are some additional cases that we need to handle specially,
   68  * as the graph coloring algo doesn't understand "partial writes".
   69  * For example, a sequence like:
   70  *
   71  *   add r0.z, ...
   72  *   sam (f32)(xy)r0.x, ...
   73  *   ...
   74  *   sam (f32)(xyzw)r0.w, r0.x, ...  ; 3d texture, so r0.xyz are coord
   75  *
   76  * In this scenario, we treat r0.xyz as a single class size 3 value, which
   77  * is written (from a use/def perspective) at the 'add' instruction, and we
   78  * ignore the subsequent partial writes to r0.xy.  So the 'add r0.z, ...' is the
   79  * defining instruction, as it is the first to partially write r0.xyz.
   80  *
   81  * To address the fragmentation that this can potentially cause, a
   82  * two pass register allocation is used.  After the first pass the
   83  * assignment of scalars is discarded, but the assignment of vecN (for
   84  * N > 1) is used to pre-color in the second pass, which considers
   85  * only scalars.
   86  *
   87  * Arrays of arbitrary size are handled via pre-coloring a consecutive
   88  * sequence of registers.  Additional scalar (single component) reg
   89  * names are allocated starting at ctx->class_base[total_class_count]
   90  * (see arr->base), which are pre-colored.  In the use/def graph direct
   91  * access is treated as a single element use/def, and indirect access
   92  * is treated as use or def of all array elements.  (Only the first
   93  * def is tracked, in case of multiple indirect writes, etc.)
   94  *
   95  * TODO arrays that fit in one of the pre-defined class sizes should
   96  * not need to be pre-colored, but instead could be given a normal
   97  * vreg name.  (Ignoring this for now since it is a good way to work
   98  * out the kinks with arbitrary sized arrays.)
   99  *
  100  * TODO might be easier for debugging to split this into two passes,
  101  * the first assigning vreg names in a way that we could ir3_print()
  102  * the result.
  103  */
  104 
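      /* Concretely: in the first (vector) pass a texture sample writing a vec4
       * gets a single size-4 node, so its components land in consecutive regs;
       * in the second (scalar) pass every component gets its own name, the vecN
       * results are pre-colored from the first-pass assignment (see
       * ra_precolor_assigned()), and plain scalar values are re-assigned.
       */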
  105 
  106 static struct ir3_instruction * name_to_instr(struct ir3_ra_ctx *ctx, unsigned name);
  107 
  108 static bool name_is_array(struct ir3_ra_ctx *ctx, unsigned name);
  109 static struct ir3_array * name_to_array(struct ir3_ra_ctx *ctx, unsigned name);
  110 
  111 /* does it conflict? */
  112 static inline bool
  113 intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
  114 {
  115     return !((a_start >= b_end) || (b_start >= a_end));
  116 }
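
      /* Note that intersects() treats the ranges as half-open, e.g.
       * intersects(0, 4, 4, 8) is false: a value whose last use is at the ip
       * where another value is def'd does not interfere with it.
       */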
  117 
  118 static unsigned
  119 reg_size_for_array(struct ir3_array *arr)
  120 {
  121     if (arr->half)
  122         return DIV_ROUND_UP(arr->length, 2);
  123 
  124     return arr->length;
  125 }
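
      /* reg_size_for_array() is in units of full regs: a half-precision array
       * of length 5 takes DIV_ROUND_UP(5, 2) = 3 slots, since two half elements
       * pack into one full reg.
       */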
  126 
  127 static bool
  128 instr_before(struct ir3_instruction *a, struct ir3_instruction *b)
  129 {
  130     if (a->flags & IR3_INSTR_UNUSED)
  131         return false;
  132     return (a->ip < b->ip);
  133 }
  134 
  135 static struct ir3_instruction *
  136 get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
  137         int *sz, int *off)
  138 {
  139     struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
  140     struct ir3_instruction *d = NULL;
  141 
  142     if (ctx->scalar_pass) {
  143         id->defn = instr;
  144         id->off = 0;
  145         id->sz = 1;     /* considering things as N scalar regs now */
  146     }
  147 
  148     if (id->defn) {
  149         *sz = id->sz;
  150         *off = id->off;
  151         return id->defn;
  152     }
  153 
  154     if (instr->opc == OPC_META_COLLECT) {
  155         /* What about the case where the collect is a subset of an array?
  156          * We would need to find the distance between where the actual array
  157          * starts and the collect..  that probably doesn't happen currently.
  158          */
  159         struct ir3_register *src;
  160         int dsz, doff;
  161 
  162         /* note: don't use foreach_ssa_src as this gets called once
  163          * while assigning regs (which clears SSA flag)
  164          */
  165         foreach_src_n (src, n, instr) {
  166             struct ir3_instruction *dd;
  167             if (!src->instr)
  168                 continue;
  169 
  170             dd = get_definer(ctx, src->instr, &dsz, &doff);
  171 
  172             if ((!d) || instr_before(dd, d)) {
  173                 d = dd;
  174                 *sz = dsz;
  175                 *off = doff - n;
  176             }
  177         }
  178 
  179     } else if (instr->cp.right || instr->cp.left) {
  180         /* this also covers the meta:fo case, which ends up w/ single
  181          * scalar instructions for each component:
  182          */
  183         struct ir3_instruction *f = ir3_neighbor_first(instr);
  184 
  185         /* by definition, the entire sequence forms one linked list
  186          * of single scalar register nodes (even if some of them may
  187          * be splits from a texture sample (for example) instr).  We
  188          * just need to walk the list to find the first element of
  189          * the group defined (lowest ip).
  190          */
  191         int cnt = 0;
  192 
  193         /* need to skip over unused in the group: */
  194         while (f && (f->flags & IR3_INSTR_UNUSED)) {
  195             f = f->cp.right;
  196             cnt++;
  197         }
  198 
  199         while (f) {
  200             if ((!d) || instr_before(f, d))
  201                 d = f;
  202             if (f == instr)
  203                 *off = cnt;
  204             f = f->cp.right;
  205             cnt++;
  206         }
  207 
  208         *sz = cnt;
  209 
  210     } else {
  211         /* second case is looking directly at the instruction which
  212          * produces multiple values (eg, texture sample), rather
  213          * than the split nodes that point back to that instruction.
  214          * This isn't quite right, because it may be part of a larger
  215          * group, such as:
  216          *
  217          *     sam (f32)(xyzw)r0.x, ...
  218          *     add r1.x, ...
  219          *     add r1.y, ...
  220          *     sam (f32)(xyzw)r2.x, r0.w  <-- (r0.w, r1.x, r1.y)
  221          *
  222          * need to come up with a better way to handle that case.
  223          */
  224         if (instr->address) {
  225             *sz = instr->regs[0]->size;
  226         } else {
  227             *sz = util_last_bit(instr->regs[0]->wrmask);
  228         }
  229         *off = 0;
  230         d = instr;
  231     }
  232 
  233     if (d->opc == OPC_META_SPLIT) {
  234         struct ir3_instruction *dd;
  235         int dsz, doff;
  236 
  237         dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff);
  238 
  239         /* by definition, should come before: */
  240         debug_assert(instr_before(dd, d));
  241 
  242         *sz = MAX2(*sz, dsz);
  243 
  244         if (instr->opc == OPC_META_SPLIT)
  245             *off = MAX2(*off, instr->split.off);
  246 
  247         d = dd;
  248     }
  249 
  250     debug_assert(d->opc != OPC_META_SPLIT);
  251 
  252     id->defn = d;
  253     id->sz = *sz;
  254     id->off = *off;
  255 
  256     return d;
  257 }
  258 
  259 static void
  260 ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
  261 {
  262     foreach_instr (instr, &block->instr_list) {
  263         struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
  264         if (instr->regs_count == 0)
  265             continue;
  266         /* couple special cases: */
  267         if (writes_addr0(instr) || writes_addr1(instr) || writes_pred(instr)) {
  268             id->cls = -1;
  269         } else if (instr->regs[0]->flags & IR3_REG_ARRAY) {
  270             id->cls = total_class_count;
  271         } else {
  272             /* and the normal case: */
  273             id->defn = get_definer(ctx, instr, &id->sz, &id->off);
  274             id->cls = ra_size_to_class(id->sz, is_half(id->defn), is_high(id->defn));
  275 
  276             /* this is a bit of duct-tape.. if we have a scenario like:
  277              *
  278              *   sam (f32)(x) out.x, ...
  279              *   sam (f32)(x) out.y, ...
  280              *
  281              * Then the fanout/split meta instructions for the two different
  282              * tex instructions end up grouped as left/right neighbors.  The
  283              * upshot is that when you get_definer() on one of the meta:fo's
  284              * you get the definer as the first sam with sz=2, but when you call
  285              * get_definer() on either of the sam's you get itself as the
  286              * definer with sz=1.
  287              *
  288              * (We actually avoid this scenario exactly, the neighbor links
  289              * prevent one of the output mov's from being eliminated, so this
  290              * hack should be enough.  But probably we need to rethink how we
  291              * find the "defining" instruction.)
  292              *
  293              * TODO how do we figure out offset properly...
  294              */
  295             if (id->defn != instr) {
  296                 struct ir3_ra_instr_data *did = &ctx->instrd[id->defn->ip];
  297                 if (did->sz < id->sz) {
  298                     did->sz = id->sz;
  299                     did->cls = id->cls;
  300                 }
  301             }
  302         }
  303     }
  304 }
  305 
  306 /* give each instruction a name (and ip), and count up the # of names
  307  * of each class
  308  */
  309 static void
  310 ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
  311 {
  312     foreach_instr (instr, &block->instr_list) {
  313         struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
  314 
  315 #ifdef DEBUG
  316         instr->name = ~0;
  317 #endif
  318 
  319         ctx->instr_cnt++;
  320 
  321         if (!writes_gpr(instr))
  322             continue;
  323 
  324         if (id->defn != instr)
  325             continue;
  326 
  327         /* In scalar pass, collect/split don't get their own names,
  328          * but instead inherit them from their src(s):
  329          *
  330          * Possibly we don't need this because of scalar_name(), but
  331          * it does make the ir3_print() dumps easier to read.
  332          */
  333         if (ctx->scalar_pass) {
  334             if (instr->opc == OPC_META_SPLIT) {
  335                 instr->name = instr->regs[1]->instr->name + instr->split.off;
  336                 continue;
  337             }
  338 
  339             if (instr->opc == OPC_META_COLLECT) {
  340                 instr->name = instr->regs[1]->instr->name;
  341                 continue;
  342             }
  343         }
  344 
  345         /* arrays which don't fit in one of the pre-defined class
  346          * sizes are pre-colored:
  347          */
  348         if ((id->cls >= 0) && (id->cls < total_class_count)) {
  349             /* in the scalar pass, we generate a name for each
  350              * scalar component, instr->name is the name of the
  351              * first component.
  352              */
  353             unsigned n = ctx->scalar_pass ? dest_regs(instr) : 1;
  354             instr->name = ctx->class_alloc_count[id->cls];
  355             ctx->class_alloc_count[id->cls] += n;
  356             ctx->alloc_count += n;
  357         }
  358     }
  359 }
  360 
  361 /**
  362  * Set a value for max register target.
  363  *
  364  * Currently this just rounds up to a multiple of full-vec4 (ie. the
  365  * granularity that we configure the hw for.. there is no point in
  366  * using r3.x if you aren't going to make r3.yzw available).  But
  367  * in reality there seem to be multiple thresholds that affect the
  368  * number of waves.. and we should round up the target to the next
  369  * threshold when we round-robin registers, to give postsched more
  370  * options.  When we understand that better, this is where we'd
  371  * implement that.
  372  */
  373 static void
  374 ra_set_register_target(struct ir3_ra_ctx *ctx, unsigned max_target)
  375 {
  376     const unsigned hvec4 = 4;
  377     const unsigned vec4 = 2 * hvec4;
  378 
  379     ctx->max_target = align(max_target, vec4);
  380 
  381     d("New max_target=%u", ctx->max_target);
  382 }
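
      /* The register target is counted in half-reg units (cf. live_size()), so
       * a full vec4 is 8 units; e.g. a computed max of 13 units rounds up to
       * align(13, 8) = 16, i.e. two vec4s' worth of full regs.
       */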
  383 
  384 static int
  385 pick_in_range(BITSET_WORD *regs, unsigned min, unsigned max)
  386 {
  387     for (unsigned i = min; i <= max; i++) {
  388         if (BITSET_TEST(regs, i)) {
  389             return i;
  390         }
  391     }
  392     return -1;
  393 }
  394 
  395 static int
  396 pick_in_range_rev(BITSET_WORD *regs, int min, int max)
  397 {
  398     for (int i = max; i >= min; i--) {
  399         if (BITSET_TEST(regs, i)) {
  400             return i;
  401         }
  402     }
  403     return -1;
  404 }
  405 
  406 /* register selector for the a6xx+ merged register file: */
  407 static unsigned int
  408 ra_select_reg_merged(unsigned int n, BITSET_WORD *regs, void *data)
  409 {
  410     struct ir3_ra_ctx *ctx = data;
  411     unsigned int class = ra_get_node_class(ctx->g, n);
  412     bool half, high;
  413     int sz = ra_class_to_size(class, &half, &high);
  414 
  415     assert (sz > 0);
  416 
  417     /* dimensions within the register class: */
  418     unsigned max_target, start;
  419 
  420     /* the regs bitset will include *all* of the virtual regs, but we lay
  421      * out the different classes consecutively in the virtual register
  422      * space.  So we just need to think about the base offset of a given
  423      * class within the virtual register space, and offset the register
  424      * space we search within by that base offset.
  425      */
  426     unsigned base;
  427 
  428     /* TODO I think eventually we want to round-robin in vector pass
  429      * as well, but needs some more work to calculate # of live vals
  430      * for this.  (Maybe with some work, we could just figure out
  431      * the scalar target and use that, since that is what we care
  432      * about in the end.. but that would mean setting up use-def/
  433      * liveranges for scalar pass before doing vector pass.)
  434      *
  435      * For now, in the vector class, just move assignments for scalar
  436      * vals higher to hopefully prevent them from limiting where vecN
  437      * values can be placed.  Since the scalar values are re-assigned
  438      * in the 2nd pass, we don't really care where they end up in the
  439      * vector pass.
  440      */
  441     if (!ctx->scalar_pass) {
  442         base = ctx->set->gpr_to_ra_reg[class][0];
  443         if (high) {
  444             max_target = HIGH_CLASS_REGS(class - HIGH_OFFSET);
  445         } else if (half) {
  446             max_target = HALF_CLASS_REGS(class - HALF_OFFSET);
  447         } else {
  448             max_target = CLASS_REGS(class);
  449         }
  450 
  451         if ((sz == 1) && !high) {
  452             return pick_in_range_rev(regs, base, base + max_target);
  453         } else {
  454             return pick_in_range(regs, base, base + max_target);
  455         }
  456     } else {
  457         assert(sz == 1);
  458     }
  459 
  460     /* NOTE: this is only used in scalar pass, so the register
  461      * class will be one of the scalar classes (ie. idx==0):
  462      */
  463     base = ctx->set->gpr_to_ra_reg[class][0];
  464     if (high) {
  465         max_target = HIGH_CLASS_REGS(0);
  466         start = 0;
  467     } else if (half) {
  468         max_target = ctx->max_target;
  469         start = ctx->start_search_reg;
  470     } else {
  471         max_target = ctx->max_target / 2;
  472         start = ctx->start_search_reg;
  473     }
  474 
  475     /* For cat4 instructions, if the src reg is already assigned, and
  476      * avail to pick, use it, because this doesn't introduce unnecessary
  477      * dependencies, and it potentially avoids needing (ss) syncs for
  478      * write after read hazards:
  479      */
  480     struct ir3_instruction *instr = name_to_instr(ctx, n);
  481     if (is_sfu(instr)) {
  482         struct ir3_register *src = instr->regs[1];
  483         int src_n;
  484 
  485         if ((src->flags & IR3_REG_ARRAY) && !(src->flags & IR3_REG_RELATIV)) {
  486             struct ir3_array *arr = ir3_lookup_array(ctx->ir, src->array.id);
  487             src_n = arr->base + src->array.offset;
  488         } else {
  489             src_n = scalar_name(ctx, src->instr, 0);
  490         }
  491 
  492         unsigned reg = ra_get_node_reg(ctx->g, src_n);
  493 
  494         /* Check if the src register has been assigned yet: */
  495         if (reg != NO_REG) {
  496             if (BITSET_TEST(regs, reg)) {
  497                 return reg;
  498             }
  499         }
  500     }
  501 
  502     int r = pick_in_range(regs, base + start, base + max_target);
  503     if (r < 0) {
  504         /* wrap-around: */
  505         r = pick_in_range(regs, base, base + start);
  506     }
  507 
  508     if (r < 0) {
  509         /* overflow, we need to increase max_target: */
  510         ra_set_register_target(ctx, ctx->max_target + 1);
  511         return ra_select_reg_merged(n, regs, data);
  512     }
  513 
  514     if (class == ctx->set->half_classes[0]) {
  515         int n = r - base;
  516         ctx->start_search_reg = (n + 1) % ctx->max_target;
  517     } else if (class == ctx->set->classes[0]) {
  518         int n = (r - base) * 2;
  519         ctx->start_search_reg = (n + 1) % ctx->max_target;
  520     }
  521 
  522     return r;
  523 }
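
      /* Note that max_target and start_search_reg are in half-reg units, which
       * is why the full-precision class above scales (r - base) by 2 before
       * updating the round-robin start position.
       */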
  524 
  525 static void
  526 ra_init(struct ir3_ra_ctx *ctx)
  527 {
  528     unsigned n, base;
  529 
  530     ir3_clear_mark(ctx->ir);
  531     n = ir3_count_instructions_ra(ctx->ir);
  532 
  533     ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n);
  534 
  535     foreach_block (block, &ctx->ir->block_list) {
  536         ra_block_find_definers(ctx, block);
  537     }
  538 
  539     foreach_block (block, &ctx->ir->block_list) {
  540         ra_block_name_instructions(ctx, block);
  541     }
  542 
  543     /* figure out the base register name for each class.  The
  544      * actual ra name is class_base[cls] + instr->name;
  545      */
  546     ctx->class_base[0] = 0;
  547     for (unsigned i = 1; i <= total_class_count; i++) {
  548         ctx->class_base[i] = ctx->class_base[i-1] +
  549                 ctx->class_alloc_count[i-1];
  550     }
  551 
  552     /* and vreg names for array elements: */
  553     base = ctx->class_base[total_class_count];
  554     foreach_array (arr, &ctx->ir->array_list) {
  555         arr->base = base;
  556         ctx->class_alloc_count[total_class_count] += reg_size_for_array(arr);
  557         base += reg_size_for_array(arr);
  558     }
  559     ctx->alloc_count += ctx->class_alloc_count[total_class_count];
  560 
  561     /* Add vreg names for r0.xyz */
  562     ctx->r0_xyz_nodes = ctx->alloc_count;
  563     ctx->alloc_count += 3;
  564     ctx->hr0_xyz_nodes = ctx->alloc_count;
  565     ctx->alloc_count += 3;
  566 
  567     ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
  568     ralloc_steal(ctx->g, ctx->instrd);
  569     ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
  570     ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
  571 
  572     /* TODO add selector callback for split (pre-a6xx) register file: */
  573     if (ctx->ir->compiler->gpu_id >= 600) {
  574         ra_set_select_reg_callback(ctx->g, ra_select_reg_merged, ctx);
  575 
  576         if (ctx->scalar_pass) {
  577             ctx->name_to_instr = _mesa_hash_table_create(ctx->g,
  578                     _mesa_hash_int, _mesa_key_int_equal);
  579         }
  580     }
  581 }
  582 
  583 /* Map the name back to instruction: */
  584 static struct ir3_instruction *
  585 name_to_instr(struct ir3_ra_ctx *ctx, unsigned name)
  586 {
  587     assert(!name_is_array(ctx, name));
  588     struct hash_entry *entry = _mesa_hash_table_search(ctx->name_to_instr, &name);
  589     if (entry)
  590         return entry->data;
  591     unreachable("invalid instr name");
  592     return NULL;
  593 }
  594 
  595 static bool
  596 name_is_array(struct ir3_ra_ctx *ctx, unsigned name)
  597 {
  598     return name >= ctx->class_base[total_class_count];
  599 }
  600 
  601 static struct ir3_array *
  602 name_to_array(struct ir3_ra_ctx *ctx, unsigned name)
  603 {
  604     assert(name_is_array(ctx, name));
  605     foreach_array (arr, &ctx->ir->array_list) {
  606         unsigned sz = reg_size_for_array(arr);
  607         if (name < (arr->base + sz))
  608             return arr;
  609     }
  610     unreachable("invalid array name");
  611     return NULL;
  612 }
  613 
  614 static void
  615 ra_destroy(struct ir3_ra_ctx *ctx)
  616 {
  617     ralloc_free(ctx->g);
  618 }
  619 
  620 static void
  621 __def(struct ir3_ra_ctx *ctx, struct ir3_ra_block_data *bd, unsigned name,
  622         struct ir3_instruction *instr)
  623 {
  624     debug_assert(name < ctx->alloc_count);
  625 
  626     /* split/collect do not actually define any real value */
  627     if ((instr->opc == OPC_META_SPLIT) || (instr->opc == OPC_META_COLLECT))
  628         return;
  629 
  630     /* defined on first write: */
  631     if (!ctx->def[name])
  632         ctx->def[name] = instr->ip;
  633     ctx->use[name] = MAX2(ctx->use[name], instr->ip);
  634     BITSET_SET(bd->def, name);
  635 }
  636 
  637 static void
  638 __use(struct ir3_ra_ctx *ctx, struct ir3_ra_block_data *bd, unsigned name,
  639         struct ir3_instruction *instr)
  640 {
  641     debug_assert(name < ctx->alloc_count);
  642     ctx->use[name] = MAX2(ctx->use[name], instr->ip);
  643     if (!BITSET_TEST(bd->def, name))
  644         BITSET_SET(bd->use, name);
  645 }
  646 
  647 static void
  648 ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
  649 {
  650     struct ir3_ra_block_data *bd;
  651     unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
  652 
  653 #define def(name, instr) __def(ctx, bd, name, instr)
  654 #define use(name, instr) __use(ctx, bd, name, instr)
  655 
  656     bd = rzalloc(ctx->g, struct ir3_ra_block_data);
  657 
  658     bd->def     = rzalloc_array(bd, BITSET_WORD, bitset_words);
  659     bd->use     = rzalloc_array(bd, BITSET_WORD, bitset_words);
  660     bd->livein  = rzalloc_array(bd, BITSET_WORD, bitset_words);
  661     bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
  662 
  663     block->data = bd;
  664 
  665     struct ir3_instruction *first_non_input = NULL;
  666     foreach_instr (instr, &block->instr_list) {
  667         if (instr->opc != OPC_META_INPUT) {
  668             first_non_input = instr;
  669             break;
  670         }
  671     }
  672 
  673     foreach_instr (instr, &block->instr_list) {
  674         foreach_def (name, ctx, instr) {
  675             if (name_is_array(ctx, name)) {
  676                 struct ir3_array *arr = name_to_array(ctx, name);
  677 
  678                 arr->start_ip = MIN2(arr->start_ip, instr->ip);
  679                 arr->end_ip = MAX2(arr->end_ip, instr->ip);
  680 
  681                 for (unsigned i = 0; i < arr->length; i++) {
  682                     unsigned name = arr->base + i;
  683                     if(arr->half)
  684                         ra_set_node_class(ctx->g, name, ctx->set->half_classes[0]);
  685                     else
  686                         ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
  687                 }
  688             } else {
  689                 struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
  690                 if (is_high(instr)) {
  691                     ra_set_node_class(ctx->g, name,
  692                             ctx->set->high_classes[id->cls - HIGH_OFFSET]);
  693                 } else if (is_half(instr)) {
  694                     ra_set_node_class(ctx->g, name,
  695                             ctx->set->half_classes[id->cls - HALF_OFFSET]);
  696                 } else {
  697                     ra_set_node_class(ctx->g, name,
  698                             ctx->set->classes[id->cls]);
  699                 }
  700             }
  701 
  702             def(name, instr);
  703 
  704             if ((instr->opc == OPC_META_INPUT) && first_non_input)
  705                 use(name, first_non_input);
  706 
  707             /* Texture instructions with writemasks can be treated as smaller
  708              * vectors (or just scalars!) to allocate knowing that the
  709              * masked-out regs won't be written, but we need to make sure that
  710              * the start of the vector doesn't come before the first register
  711              * or we'll wrap.
  712              */
  713             if (is_tex_or_prefetch(instr)) {
  714                 int writemask_skipped_regs = ffs(instr->regs[0]->wrmask) - 1;
  715                 int r0_xyz = (instr->regs[0]->flags & IR3_REG_HALF) ?
  716                     ctx->hr0_xyz_nodes : ctx->r0_xyz_nodes;
  717                 for (int i = 0; i < writemask_skipped_regs; i++)
  718                     ra_add_node_interference(ctx->g, name, r0_xyz + i);
  719             }
  720         }
  721 
  722         foreach_use (name, ctx, instr) {
  723             if (name_is_array(ctx, name)) {
  724                 struct ir3_array *arr = name_to_array(ctx, name);
  725 
  726                 arr->start_ip = MIN2(arr->start_ip, instr->ip);
  727                 arr->end_ip = MAX2(arr->end_ip, instr->ip);
  728 
  729                 /* NOTE: arrays are not SSA so unconditionally
  730                  * set use bit:
  731                  */
  732                 BITSET_SET(bd->use, name);
  733             }
  734 
  735             use(name, instr);
  736         }
  737 
  738         foreach_name (name, ctx, instr) {
  739             /* split/collect instructions share their names with
  740              * real instructions, so they skip the hashtable:
  741              */
  742             if (ctx->name_to_instr && !((instr->opc == OPC_META_SPLIT) ||
  743                     (instr->opc == OPC_META_COLLECT))) {
  744                 /* this is slightly annoying, we can't just use an
  745                  * integer on the stack
  746                  */
  747                 unsigned *key = ralloc(ctx->name_to_instr, unsigned);
  748                 *key = name;
  749                 debug_assert(!_mesa_hash_table_search(ctx->name_to_instr, key));
  750                 _mesa_hash_table_insert(ctx->name_to_instr, key, instr);
  751             }
  752         }
  753     }
  754 }
  755 
  756 static bool
  757 ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
  758 {
  759     unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
  760     bool progress = false;
  761 
  762     foreach_block (block, &ctx->ir->block_list) {
  763         struct ir3_ra_block_data *bd = block->data;
  764 
  765         /* update livein: */
  766         for (unsigned i = 0; i < bitset_words; i++) {
  767             /* anything used but not def'd within a block is
  768              * by definition a live value coming into the block:
  769              */
  770             BITSET_WORD new_livein =
  771                 (bd->use[i] | (bd->liveout[i] & ~bd->def[i]));
  772 
  773             if (new_livein & ~bd->livein[i]) {
  774                 bd->livein[i] |= new_livein;
  775                 progress = true;
  776             }
  777         }
  778 
  779         /* update liveout: */
  780         for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) {
  781             struct ir3_block *succ = block->successors[j];
  782             struct ir3_ra_block_data *succ_bd;
  783 
  784             if (!succ)
  785                 continue;
  786 
  787             succ_bd = succ->data;
  788 
  789             for (unsigned i = 0; i < bitset_words; i++) {
  790                 /* add anything that is livein in a successor block
  791                  * to our liveout:
  792                  */
  793                 BITSET_WORD new_liveout =
  794                     (succ_bd->livein[i] & ~bd->liveout[i]);
  795 
  796                 if (new_liveout) {
  797                     bd->liveout[i] |= new_liveout;
  798                     progress = true;
  799                 }
  800             }
  801         }
  802     }
  803 
  804     return progress;
  805 }
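
      /* I.e. this is one iteration of the usual backwards dataflow equations:
       *
       *   livein(b)  = use(b) | (liveout(b) & ~def(b))
       *   liveout(b) = union over successors s of livein(s)
       *
       * The caller loops until this stops reporting progress, at which point
       * livein/liveout have reached a fixed point.
       */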
  806 
  807 static void
  808 print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt)
  809 {
  810     bool first = true;
  811     debug_printf("RA:  %s:", name);
  812     for (unsigned i = 0; i < cnt; i++) {
  813         if (BITSET_TEST(bs, i)) {
  814             if (!first)
  815                 debug_printf(",");
  816             debug_printf(" %04u", i);
  817             first = false;
  818         }
  819     }
  820     debug_printf("\n");
  821 }
  822 
  823 /* size of one component of instruction result, ie. half vs full: */
  824 static unsigned
  825 live_size(struct ir3_instruction *instr)
  826 {
  827     if (is_half(instr)) {
  828         return 1;
  829     } else if (is_high(instr)) {
  830         /* doesn't count towards footprint */
  831         return 0;
  832     } else {
  833         return 2;
  834     }
  835 }
  836 
  837 static unsigned
  838 name_size(struct ir3_ra_ctx *ctx, unsigned name)
  839 {
  840     if (name_is_array(ctx, name)) {
  841         struct ir3_array *arr = name_to_array(ctx, name);
  842         return arr->half ? 1 : 2;
  843     } else {
  844         struct ir3_instruction *instr = name_to_instr(ctx, name);
  845         /* in scalar pass, each name represents one scalar value,
  846          * half or full precision
  847          */
  848         return live_size(instr);
  849     }
  850 }
  851 
  852 static unsigned
  853 ra_calc_block_live_values(struct ir3_ra_ctx *ctx, struct ir3_block *block)
  854 {
  855     struct ir3_ra_block_data *bd = block->data;
  856     unsigned name;
  857 
  858     assert(ctx->name_to_instr);
  859 
  860     /* TODO this gets a bit more complicated in non-scalar pass.. but
  861      * possibly a lowball estimate is fine to start with if we do
  862      * round-robin in non-scalar pass?  Maybe we just want to handle
  863      * that in a different fxn?
  864      */
  865     assert(ctx->scalar_pass);
  866 
  867     BITSET_WORD *live =
  868         rzalloc_array(bd, BITSET_WORD, BITSET_WORDS(ctx->alloc_count));
  869 
  870     /* Add the live input values: */
  871     unsigned livein = 0;
  872     BITSET_FOREACH_SET (name, bd->livein, ctx->alloc_count) {
  873         livein += name_size(ctx, name);
  874         BITSET_SET(live, name);
  875     }
  876 
  877     d("---------------------");
  878     d("block%u: LIVEIN: %u", block_id(block), livein);
  879 
  880     unsigned max = livein;
  881     int cur_live = max;
  882 
  883     /* Now that we know the live inputs to the block, iterate the
  884      * instructions adjusting the current # of live values as we
  885      * see their last use:
  886      */
  887     foreach_instr (instr, &block->instr_list) {
  888         if (RA_DEBUG)
  889             print_bitset("LIVE", live, ctx->alloc_count);
  890         di(instr, "CALC");
  891 
  892         unsigned new_live = 0;    /* newly live values */
  893         unsigned new_dead = 0;    /* newly no-longer live values */
  894         unsigned next_dead = 0;   /* newly dead following this instr */
  895 
  896         foreach_def (name, ctx, instr) {
  897             /* NOTE: checking ctx->def filters out things like split/
  898              * collect which are just redefining existing live names
  899              * or array writes to already live array elements:
  900              */
  901             if (ctx->def[name] != instr->ip)
  902                 continue;
  903             new_live += live_size(instr);
  904             d("NEW_LIVE: %u (new_live=%u, use=%u)", name, new_live, ctx->use[name]);
  905             BITSET_SET(live, name);
  906             /* There can be cases where this is *also* the last use
  907              * of a value, for example instructions that write multiple
  908              * values, only some of which are used.  These values are
  909              * dead *after* (rather than during) this instruction.
  910              */
  911             if (ctx->use[name] != instr->ip)
  912                 continue;
  913             next_dead += live_size(instr);
  914             d("NEXT_DEAD: %u (next_dead=%u)", name, next_dead);
  915             BITSET_CLEAR(live, name);
  916         }
  917 
  918         /* To be more resilient against special cases where liverange
  919          * is extended (like first_non_input), rather than using the
  920          * foreach_use() iterator, we iterate the current live values
  921          * instead:
  922          */
  923         BITSET_FOREACH_SET (name, live, ctx->alloc_count) {
  924             /* Is this the last use? */
  925             if (ctx->use[name] != instr->ip)
  926                 continue;
  927             new_dead += name_size(ctx, name);
  928             d("NEW_DEAD: %u (new_dead=%u)", name, new_dead);
  929             BITSET_CLEAR(live, name);
  930         }
  931 
  932         cur_live += new_live;
  933         cur_live -= new_dead;
  934 
  935         assert(cur_live >= 0);
  936         d("CUR_LIVE: %u", cur_live);
  937 
  938         max = MAX2(max, cur_live);
  939 
  940         /* account for written values which are not used later,
  941          * but only after updating max (since they are live for
  942          * one cycle)
  943          */
  944         cur_live -= next_dead;
  945         assert(cur_live >= 0);
  946 
  947         if (RA_DEBUG) {
  948             unsigned cnt = 0;
  949             BITSET_FOREACH_SET (name, live, ctx->alloc_count) {
  950                 cnt += name_size(ctx, name);
  951             }
  952             assert(cur_live == cnt);
  953         }
  954     }
  955 
  956     d("block%u max=%u", block_id(block), max);
  957 
  958     /* the remaining live values should match liveout (for extra sanity testing): */
  959     if (RA_DEBUG) {
  960         unsigned new_dead = 0;
  961         BITSET_FOREACH_SET (name, live, ctx->alloc_count) {
  962             /* Is this the last use? */
  963             if (ctx->use[name] != block->end_ip)
  964                 continue;
  965             new_dead += name_size(ctx, name);
  966             d("NEW_DEAD: %u (new_dead=%u)", name, new_dead);
  967             BITSET_CLEAR(live, name);
  968         }
  969         unsigned liveout = 0;
  970         BITSET_FOREACH_SET (name, bd->liveout, ctx->alloc_count) {
  971             liveout += name_size(ctx, name);
  972             BITSET_CLEAR(live, name);
  973         }
  974 
  975         if (cur_live != liveout) {
  976             print_bitset("LEAKED", live, ctx->alloc_count);
  977             /* TODO there are a few edge cases where live-range extension
  978              * tells us a value is livein, but it is not used by the block
  979              * nor liveout for the block.  Possibly a bug in the liverange
  980              * extension.  But for now leave the assert disabled:
  981             assert(cur_live == liveout);
  982              */
  983         }
  984     }
  985 
  986     ralloc_free(live);
  987 
  988     return max;
  989 }
  990 
  991 static unsigned
  992 ra_calc_max_live_values(struct ir3_ra_ctx *ctx)
  993 {
  994     unsigned max = 0;
  995 
  996     foreach_block (block, &ctx->ir->block_list) {
  997         unsigned block_live = ra_calc_block_live_values(ctx, block);
  998         max = MAX2(max, block_live);
  999     }
 1000 
 1001     return max;
 1002 }
 1003 
 1004 static void
 1005 ra_add_interference(struct ir3_ra_ctx *ctx)
 1006 {
 1007     struct ir3 *ir = ctx->ir;
 1008 
 1009     /* initialize array live ranges: */
 1010     foreach_array (arr, &ir->array_list) {
 1011         arr->start_ip = ~0;
 1012         arr->end_ip = 0;
 1013     }
 1014 
 1015 
 1016     /* set up the r0.xyz precolor regs. */
 1017     for (int i = 0; i < 3; i++) {
 1018         ra_set_node_reg(ctx->g, ctx->r0_xyz_nodes + i, i);
 1019         ra_set_node_reg(ctx->g, ctx->hr0_xyz_nodes + i,
 1020                 ctx->set->first_half_reg + i);
 1021     }
 1022 
 1023     /* compute live ranges (use/def) on a block level, also updating
 1024      * block's def/use bitmasks (used below to calculate per-block
 1025      * livein/liveout):
 1026      */
 1027     foreach_block (block, &ir->block_list) {
 1028         ra_block_compute_live_ranges(ctx, block);
 1029     }
 1030 
 1031     /* update per-block livein/liveout: */
 1032     while (ra_compute_livein_liveout(ctx)) {}
 1033 
 1034     if (RA_DEBUG) {
 1035         d("AFTER LIVEIN/OUT:");
 1036         foreach_block (block, &ir->block_list) {
 1037             struct ir3_ra_block_data *bd = block->data;
 1038             d("block%u:", block_id(block));
 1039             print_bitset("  def", bd->def, ctx->alloc_count);
 1040             print_bitset("  use", bd->use, ctx->alloc_count);
 1041             print_bitset("  l/i", bd->livein, ctx->alloc_count);
 1042             print_bitset("  l/o", bd->liveout, ctx->alloc_count);
 1043         }
 1044         foreach_array (arr, &ir->array_list) {
 1045             d("array%u:", arr->id);
 1046             d("   length:   %u", arr->length);
 1047             d("   start_ip: %u", arr->start_ip);
 1048             d("   end_ip:   %u", arr->end_ip);
 1049         }
 1050         d("INSTRUCTION VREG NAMES:");
 1051         foreach_block (block, &ctx->ir->block_list) {
 1052             foreach_instr (instr, &block->instr_list) {
 1053                 if (!ctx->instrd[instr->ip].defn)
 1054                     continue;
 1055                 if (!writes_gpr(instr))
 1056                     continue;
 1057                 di(instr, "%04u", scalar_name(ctx, instr, 0));
 1058             }
 1059         }
 1060         d("ARRAY VREG NAMES:");
 1061         foreach_array (arr, &ctx->ir->array_list) {
 1062             d("%04u: arr%u", arr->base, arr->id);
 1063         }
 1064     }
 1065 
 1066     /* extend start/end ranges based on livein/liveout info from cfg: */
 1067     foreach_block (block, &ir->block_list) {
 1068         struct ir3_ra_block_data *bd = block->data;
 1069 
 1070         for (unsigned i = 0; i < ctx->alloc_count; i++) {
 1071             if (BITSET_TEST(bd->livein, i)) {
 1072                 ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
 1073                 ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
 1074             }
 1075 
 1076             if (BITSET_TEST(bd->liveout, i)) {
 1077                 ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
 1078                 ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
 1079             }
 1080         }
 1081 
 1082         foreach_array (arr, &ctx->ir->array_list) {
 1083             for (unsigned i = 0; i < arr->length; i++) {
 1084                 if (BITSET_TEST(bd->livein, i + arr->base)) {
 1085                     arr->start_ip = MIN2(arr->start_ip, block->start_ip);
 1086                 }
 1087                 if (BITSET_TEST(bd->liveout, i + arr->base)) {
 1088                     arr->end_ip = MAX2(arr->end_ip, block->end_ip);
 1089                 }
 1090             }
 1091         }
 1092     }
 1093 
 1094     if (ctx->name_to_instr) {
 1095         unsigned max = ra_calc_max_live_values(ctx);
 1096         ra_set_register_target(ctx, max);
 1097     }
 1098 
 1099     for (unsigned i = 0; i < ctx->alloc_count; i++) {
 1100         for (unsigned j = 0; j < ctx->alloc_count; j++) {
 1101             if (intersects(ctx->def[i], ctx->use[i],
 1102                     ctx->def[j], ctx->use[j])) {
 1103                 ra_add_node_interference(ctx->g, i, j);
 1104             }
 1105         }
 1106     }
 1107 }
 1108 
 1109 /* some instructions need fix-up if dst register is half precision: */
 1110 static void fixup_half_instr_dst(struct ir3_instruction *instr)
 1111 {
 1112     switch (opc_cat(instr->opc)) {
 1113     case 1: /* move instructions */
 1114         instr->cat1.dst_type = half_type(instr->cat1.dst_type);
 1115         break;
 1116     case 4:
 1117         switch (instr->opc) {
 1118         case OPC_RSQ:
 1119             instr->opc = OPC_HRSQ;
 1120             break;
 1121         case OPC_LOG2:
 1122             instr->opc = OPC_HLOG2;
 1123             break;
 1124         case OPC_EXP2:
 1125             instr->opc = OPC_HEXP2;
 1126             break;
 1127         default:
 1128             break;
 1129         }
 1130         break;
 1131     case 5:
 1132         instr->cat5.type = half_type(instr->cat5.type);
 1133         break;
 1134     }
 1135 }
 1136 /* some instructions need fix-up if src register is half precision: */
 1137 static void fixup_half_instr_src(struct ir3_instruction *instr)
 1138 {
 1139     switch (instr->opc) {
 1140     case OPC_MOV:
 1141         instr->cat1.src_type = half_type(instr->cat1.src_type);
 1142         break;
 1143     case OPC_MAD_F32:
 1144         instr->opc = OPC_MAD_F16;
 1145         break;
 1146     case OPC_SEL_B32:
 1147         instr->opc = OPC_SEL_B16;
 1148         break;
 1149     case OPC_SEL_S32:
 1150         instr->opc = OPC_SEL_S16;
 1151         break;
 1152     case OPC_SEL_F32:
 1153         instr->opc = OPC_SEL_F16;
 1154         break;
 1155     case OPC_SAD_S32:
 1156         instr->opc = OPC_SAD_S16;
 1157         break;
 1158     default:
 1159         break;
 1160     }
 1161 }
 1162 
 1163 /* NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first
 1164  * array access(es) which do not have any previous access to depend
 1165  * on from a scheduling point of view
 1166  */
 1167 static void
 1168 reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
 1169         struct ir3_instruction *instr)
 1170 {
 1171     struct ir3_ra_instr_data *id;
 1172 
 1173     if (reg->flags & IR3_REG_ARRAY) {
 1174         struct ir3_array *arr =
 1175             ir3_lookup_array(ctx->ir, reg->array.id);
 1176         unsigned name = arr->base + reg->array.offset;
 1177         unsigned r = ra_get_node_reg(ctx->g, name);
 1178         unsigned num = ctx->set->ra_reg_to_gpr[r];
 1179 
 1180         if (reg->flags & IR3_REG_RELATIV) {
 1181             reg->array.offset = num;
 1182         } else {
 1183             reg->num = num;
 1184             reg->flags &= ~IR3_REG_SSA;
 1185         }
 1186 
 1187         reg->flags &= ~IR3_REG_ARRAY;
 1188     } else if ((id = &ctx->instrd[instr->ip]) && id->defn) {
 1189         unsigned first_component = 0;
 1190 
 1191         /* Special case for tex instructions, which may use the wrmask
 1192          * to mask off the first component(s).  In the scalar pass,
 1193          * this means the masked off component(s) are not def'd/use'd,
 1194          * so we get a bogus value when we ask the register_allocate
 1195          * algo to get the assigned reg for the unused/untouched
 1196          * component.  So we need to consider the first used component:
 1197          */
 1198         if (ctx->scalar_pass && is_tex_or_prefetch(id->defn)) {
 1199             unsigned n = ffs(id->defn->regs[0]->wrmask);
 1200             debug_assert(n > 0);
 1201             first_component = n - 1;
 1202         }
 1203 
 1204         unsigned name = scalar_name(ctx, id->defn, first_component);
 1205         unsigned r = ra_get_node_reg(ctx->g, name);
 1206         unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
 1207 
 1208         debug_assert(!(reg->flags & IR3_REG_RELATIV));
 1209 
 1210         debug_assert(num >= first_component);
 1211 
 1212         if (is_high(id->defn))
 1213             num += FIRST_HIGH_REG;
 1214 
 1215         reg->num = num - first_component;
 1216 
 1217         reg->flags &= ~IR3_REG_SSA;
 1218 
 1219         if (is_half(id->defn))
 1220             reg->flags |= IR3_REG_HALF;
 1221     }
 1222 }
 1223 
 1224 /* helper to determine which regs to assign in which pass: */
 1225 static bool
 1226 should_assign(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr)
 1227 {
 1228     if ((instr->opc == OPC_META_SPLIT) &&
 1229             (util_bitcount(instr->regs[1]->wrmask) > 1))
 1230         return !ctx->scalar_pass;
 1231     if ((instr->opc == OPC_META_COLLECT) &&
 1232             (util_bitcount(instr->regs[0]->wrmask) > 1))
 1233         return !ctx->scalar_pass;
 1234     return ctx->scalar_pass;
 1235 }
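
      /* I.e. multi-component split/collect get their registers in the first
       * (vector) pass, and everything else is assigned in the scalar pass.
       */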
 1236 
 1237 static void
 1238 ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 1239 {
 1240     foreach_instr (instr, &block->instr_list) {
 1241         struct ir3_register *reg;
 1242 
 1243         if (writes_gpr(instr)) {
 1244             if (should_assign(ctx, instr)) {
 1245                 reg_assign(ctx, instr->regs[0], instr);
 1246                 if (instr->regs[0]->flags & IR3_REG_HALF)
 1247                     fixup_half_instr_dst(instr);
 1248             }
 1249         }
 1250 
 1251         foreach_src_n (reg, n, instr) {
 1252             struct ir3_instruction *src = reg->instr;
 1253 
 1254             if (src && !should_assign(ctx, src) && !should_assign(ctx, instr))
 1255                 continue;
 1256 
 1257             if (src && should_assign(ctx, instr))
 1258                 reg_assign(ctx, src->regs[0], src);
 1259 
 1260             /* Note: reg->instr could be null for IR3_REG_ARRAY */
 1261             if (src || (reg->flags & IR3_REG_ARRAY))
 1262                 reg_assign(ctx, instr->regs[n+1], src);
 1263 
 1264             if (instr->regs[n+1]->flags & IR3_REG_HALF)
 1265                 fixup_half_instr_src(instr);
 1266         }
 1267     }
 1268 
 1269     /* We need to pre-color outputs for the scalar pass in
 1270      * ra_precolor_assigned(), so we need to actually assign
 1271      * them in the first pass:
 1272      */
 1273     if (!ctx->scalar_pass) {
 1274         struct ir3_instruction *in, *out;
 1275 
 1276         foreach_input (in, ctx->ir) {
 1277             reg_assign(ctx, in->regs[0], in);
 1278         }
 1279         foreach_output (out, ctx->ir) {
 1280             reg_assign(ctx, out->regs[0], out);
 1281         }
 1282     }
 1283 }
 1284 
 1285 static void
 1286 assign_arr_base(struct ir3_ra_ctx *ctx, struct ir3_array *arr,
 1287         struct ir3_instruction **precolor, unsigned nprecolor)
 1288 {
 1289     unsigned base = 0;
 1290 
 1291     /* figure out what else we conflict with which has already
 1292      * been assigned:
 1293      */
 1294 retry:
 1295     foreach_array (arr2, &ctx->ir->array_list) {
 1296         if (arr2 == arr)
 1297             break;
 1298         if (arr2->end_ip == 0)
 1299             continue;
 1300         /* if it intersects with liverange AND register range.. */
 1301         if (intersects(arr->start_ip, arr->end_ip,
 1302                 arr2->start_ip, arr2->end_ip) &&
 1303             intersects(base, base + reg_size_for_array(arr),
 1304                 arr2->reg, arr2->reg + reg_size_for_array(arr2))) {
 1305             base = MAX2(base, arr2->reg + reg_size_for_array(arr2));
 1306             goto retry;
 1307         }
 1308     }
 1309 
 1310     /* also need to not conflict with any pre-assigned inputs: */
 1311     for (unsigned i = 0; i < nprecolor; i++) {
 1312         struct ir3_instruction *instr = precolor[i];
 1313 
 1314         if (!instr || (instr->flags & IR3_INSTR_UNUSED))
 1315             continue;
 1316 
 1317         struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 1318 
 1319         /* only consider the first component: */
 1320         if (id->off > 0)
 1321             continue;
 1322 
 1323         unsigned name = ra_name(ctx, id);
 1324         unsigned regid = instr->regs[0]->num;
 1325 
 1326         /* Check if array intersects with liverange AND register
 1327          * range of the input:
 1328          */
 1329         if (intersects(arr->start_ip, arr->end_ip,
 1330                         ctx->def[name], ctx->use[name]) &&
 1331                 intersects(base, base + reg_size_for_array(arr),
 1332                         regid, regid + class_sizes[id->cls])) {
 1333             base = MAX2(base, regid + class_sizes[id->cls]);
 1334             goto retry;
 1335         }
 1336     }
 1337 
 1338     arr->reg = base;
 1339 }
 1340 
 1341 /* handle pre-colored registers.  This includes "arrays" (which could be of
 1342  * length 1, used for phi webs lowered to registers in nir), as well as
 1343  * special shader input values that need to be pinned to certain registers.
 1344  */
 1345 static void
 1346 ra_precolor(struct ir3_ra_ctx *ctx, struct ir3_instruction **precolor, unsigned nprecolor)
 1347 {
 1348     for (unsigned i = 0; i < nprecolor; i++) {
 1349         if (precolor[i] && !(precolor[i]->flags & IR3_INSTR_UNUSED)) {
 1350             struct ir3_instruction *instr = precolor[i];
 1351 
 1352             if (instr->regs[0]->num == INVALID_REG)
 1353                 continue;
 1354 
 1355             struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 1356 
 1357             debug_assert(!(instr->regs[0]->flags & (IR3_REG_HALF | IR3_REG_HIGH)));
 1358 
 1359             /* only consider the first component: */
 1360             if (id->off > 0)
 1361                 continue;
 1362 
 1363             if (ctx->scalar_pass && !should_assign(ctx, instr))
 1364                 continue;
 1365 
 1366             /* 'base' is in scalar (class 0) but we need to map that to
 1367              * the conflicting register of the appropriate class (ie.
 1368              * input could be vec2/vec3/etc)
 1369              *
 1370              * Note that the higher class (larger than scalar) regs
 1371              * are setup to conflict with others in the same class,
 1372              * so for example, R1 (scalar) is also the first component
 1373              * of D1 (vec2/double):
 1374              *
 1375              *    Single (base) |  Double
 1376              *    --------------+---------------
 1377              *       R0         |  D0
 1378              *       R1         |  D0 D1
 1379              *       R2         |     D1 D2
 1380              *       R3         |        D2
 1381              *           .. and so on..
 1382              */
 1383             unsigned regid = instr->regs[0]->num;
 1384             unsigned reg = ctx->set->gpr_to_ra_reg[id->cls][regid];
 1385             unsigned name = ra_name(ctx, id);
 1386             ra_set_node_reg(ctx->g, name, reg);
 1387         }
 1388     }
 1389 
 1390     /* pre-assign array elements:
 1391      *
 1392      * TODO this is going to need some work for half-precision.. possibly
 1393      * this is easier on a6xx, where we can just divide array size by two?
 1394      * But on a5xx and earlier it will need to track two bases.
 1395      */
 1396     foreach_array (arr, &ctx->ir->array_list) {
 1397 
 1398         if (arr->end_ip == 0)
 1399             continue;
 1400 
 1401         if (!ctx->scalar_pass)
 1402             assign_arr_base(ctx, arr, precolor, nprecolor);
 1403 
 1404         unsigned base = arr->reg;
 1405 
 1406         for (unsigned i = 0; i < arr->length; i++) {
 1407             unsigned name, reg;
 1408 
 1409             if (arr->half) {
 1410                 /* We don't need to do this on generations older than a6xx,
 1411                  * since there's no conflict between full regs and half regs
 1412                  * on them.
 1413                  *
 1414                  * TODO Presumably "base" could start from 0 separately
 1415                  * for half regs of arrays on older generations.
 1416                  */
 1417                 unsigned base_half = base * 2 + i;
 1418                 reg = ctx->set->gpr_to_ra_reg[0+HALF_OFFSET][base_half];
 1419                 base = base_half / 2 + 1;
 1420             } else {
 1421                 reg = ctx->set->gpr_to_ra_reg[0][base++];
 1422             }
 1423 
 1424             name = arr->base + i;
 1425             ra_set_node_reg(ctx->g, name, reg);
 1426         }
 1427     }
 1428 
 1429     if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
 1430         foreach_array (arr, &ctx->ir->array_list) {
 1431             unsigned first = arr->reg;
 1432             unsigned last  = arr->reg + arr->length - 1;
 1433             debug_printf("arr[%d] at r%d.%c->r%d.%c\n", arr->id,
 1434                     (first >> 2), "xyzw"[first & 0x3],
 1435                     (last >> 2), "xyzw"[last & 0x3]);
 1436         }
 1437     }
 1438 }
 1439 
 1440 static void
 1441 precolor(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr)
 1442 {
 1443     struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 1444     unsigned n = dest_regs(instr);
 1445     for (unsigned i = 0; i < n; i++) {
 1446         /* tex instructions actually have a wrmask, and
 1447          * don't touch masked out components.  So we
 1448          * shouldn't precolor them:
 1449          */
 1450         if (is_tex_or_prefetch(instr) &&
 1451                 !(instr->regs[0]->wrmask & (1 << i)))
 1452             continue;
 1453 
 1454         unsigned name = scalar_name(ctx, instr, i);
 1455         unsigned regid = instr->regs[0]->num + i;
 1456 
 1457         if (instr->regs[0]->flags & IR3_REG_HIGH)
 1458             regid -= FIRST_HIGH_REG;
 1459 
 1460         unsigned vreg = ctx->set->gpr_to_ra_reg[id->cls][regid];
 1461         ra_set_node_reg(ctx->g, name, vreg);
 1462     }
 1463 }
 1464 
 1465 /* pre-color non-scalar registers based on the registers assigned in the
 1466  * previous pass.  Do this by actually looking at the fanout instructions.
 1467  */
 1468 static void
 1469 ra_precolor_assigned(struct ir3_ra_ctx *ctx)
 1470 {
 1471     debug_assert(ctx->scalar_pass);
 1472 
 1473     foreach_block (block, &ctx->ir->block_list) {
 1474         foreach_instr (instr, &block->instr_list) {
 1475 
 1476             if (!writes_gpr(instr))
 1477                 continue;
 1478 
 1479             if (should_assign(ctx, instr))
 1480                 continue;
 1481 
 1482             precolor(ctx, instr);
 1483 
 1484             struct ir3_register *src;
 1485             foreach_src (src, instr) {
 1486                 if (!src->instr)
 1487                     continue;
 1488                 precolor(ctx, src->instr);
 1489             }
 1490         }
 1491     }
 1492 }
 1493 
 1494 static int
 1495 ra_alloc(struct ir3_ra_ctx *ctx)
 1496 {
 1497     if (!ra_allocate(ctx->g))
 1498         return -1;
 1499 
 1500     foreach_block (block, &ctx->ir->block_list) {
 1501         ra_block_alloc(ctx, block);
 1502     }
 1503 
 1504     return 0;
 1505 }
 1506 
 1507 /* if we end up with split/collect instructions with non-matching src
 1508  * and dest regs, that means something has gone wrong, which makes
 1509  * this a pretty good sanity check.
 1510  */
 1511 static void
 1512 ra_sanity_check(struct ir3 *ir)
 1513 {
 1514     foreach_block (block, &ir->block_list) {
 1515         foreach_instr (instr, &block->instr_list) {
 1516             if (instr->opc == OPC_META_SPLIT) {
 1517                 struct ir3_register *dst = instr->regs[0];
 1518                 struct ir3_register *src = instr->regs[1];
 1519                 debug_assert(dst->num == (src->num + instr->split.off));
 1520             } else if (instr->opc == OPC_META_COLLECT) {
 1521                 struct ir3_register *dst = instr->regs[0];
 1522                 struct ir3_register *src;
 1523 
 1524                 foreach_src_n (src, n, instr) {
 1525                     debug_assert(dst->num == (src->num - n));
 1526                 }
 1527             }
 1528         }
 1529     }
 1530 }
 1531 
 1532 static int
 1533 ir3_ra_pass(struct ir3_shader_variant *v, struct ir3_instruction **precolor,
 1534         unsigned nprecolor, bool scalar_pass)
 1535 {
 1536     struct ir3_ra_ctx ctx = {
 1537             .v = v,
 1538             .ir = v->ir,
 1539             .set = v->ir->compiler->set,
 1540             .scalar_pass = scalar_pass,
 1541     };
 1542     int ret;
 1543 
 1544     ra_init(&ctx);
 1545     ra_add_interference(&ctx);
 1546     ra_precolor(&ctx, precolor, nprecolor);
 1547     if (scalar_pass)
 1548         ra_precolor_assigned(&ctx);
 1549     ret = ra_alloc(&ctx);
 1550     ra_destroy(&ctx);
 1551 
 1552     return ret;
 1553 }
 1554 
 1555 int
 1556 ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor,
 1557         unsigned nprecolor)
 1558 {
 1559     int ret;
 1560 
 1561     /* First pass, assign the vecN (non-scalar) registers: */
 1562     ret = ir3_ra_pass(v, precolor, nprecolor, false);
 1563     if (ret)
 1564         return ret;
 1565 
 1566     ir3_debug_print(v->ir, "AFTER RA (1st pass)");
 1567 
 1568     /* Second pass, assign the scalar registers: */
 1569     ret = ir3_ra_pass(v, precolor, nprecolor, true);
 1570     if (ret)
 1571         return ret;
 1572 
 1573     ir3_debug_print(v->ir, "AFTER RA (2nd pass)");
 1574 
 1575 #ifdef DEBUG
 1576 #  define SANITY_CHECK DEBUG
 1577 #else
 1578 #  define SANITY_CHECK 0
 1579 #endif
 1580     if (SANITY_CHECK)
 1581         ra_sanity_check(v->ir);
 1582 
 1583     return ret;
 1584 }