"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/gallium/drivers/vc4/vc4_qir_schedule.c" (16 Sep 2020, 26787 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "vc4_qir_schedule.c" see the Fossies "Dox" file reference documentation.

    1 /*
    2  * Copyright © 2010 Intel Corporation
    3  * Copyright © 2014-2015 Broadcom
    4  *
    5  * Permission is hereby granted, free of charge, to any person obtaining a
    6  * copy of this software and associated documentation files (the "Software"),
    7  * to deal in the Software without restriction, including without limitation
    8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
    9  * and/or sell copies of the Software, and to permit persons to whom the
   10  * Software is furnished to do so, subject to the following conditions:
   11  *
   12  * The above copyright notice and this permission notice (including the next
   13  * paragraph) shall be included in all copies or substantial portions of the
   14  * Software.
   15  *
   16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
   19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
   21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
   22  * IN THE SOFTWARE.
   23  */
   24 
   25 /**
   26  * @file vc4_qir_schedule.c
   27  *
   28  * The basic model of the list scheduler is to take a basic block, compute a
   29  * DAG of the dependencies from the bottom up, and make a list of the DAG
   30  * heads.  Heuristically pick a DAG head and schedule (remove) it, then put
   31  * all the parents that are now DAG heads into the list of things to
   32  * schedule.
   33  *
   34  * The goal of scheduling here, before register allocation and conversion to
   35  * QPU instructions, is to reduce register pressure by reordering instructions
    36  * so that values are consumed soon after they are produced, where possible.
   37  */
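
       /* For illustration, the bottom-up list-scheduling model described above
        * boils down to roughly this loop, with pick_best() and emit() standing
        * in for the heuristics and list manipulation done by
        * choose_instruction() and schedule_instructions() below:
        *
        *     while (!list_is_empty(&dag->heads)) {
        *             struct schedule_node *n = pick_best(&dag->heads);
        *             emit(n);                        // place it (bottom-up)
        *             dag_prune_head(dag, &n->dag);   // children may become heads
        *     }
        */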
   38 
   39 #include "vc4_qir.h"
   40 #include "util/dag.h"
   41 
   42 static bool debug;
   43 
   44 struct schedule_node {
   45         struct dag_node dag;
   46         struct list_head link;
   47         struct qinst *inst;
   48 
    49         /* Length of the longest (latency) chain from a DAG head to this
   50          * instruction.
   51          */
   52         uint32_t delay;
   53 
   54         /* Longest time + latency_between(parent, this) of any parent of this
   55          * node.
   56          */
   57         uint32_t unblocked_time;
   58 };
   59 
   60 struct schedule_state {
   61         struct dag *dag;
   62 
   63         uint32_t time;
   64 
   65         uint32_t *temp_writes;
   66 
   67         BITSET_WORD *temp_live;
   68 };
   69 
   70 /* When walking the instructions in reverse, we need to swap before/after in
   71  * add_dep().
   72  */
   73 enum direction { F, R };
   74 
   75 /**
    76  * Marks a dependency between two instructions, that \p after must appear after
   77  * \p before.
   78  *
   79  * Our dependencies are tracked as a DAG.  Since we're scheduling bottom-up,
   80  * the latest instructions with nothing left to schedule are the DAG heads,
   81  * and their inputs are their children.
   82  */
   83 static void
   84 add_dep(enum direction dir,
   85         struct schedule_node *before,
   86         struct schedule_node *after)
   87 {
   88         if (!before || !after)
   89                 return;
   90 
   91         assert(before != after);
   92 
   93         if (dir == R) {
   94                 struct schedule_node *t = before;
   95                 before = after;
   96                 after = t;
   97         }
   98 
   99         dag_add_edge(&after->dag, &before->dag, NULL);
  100 }
  101 
  102 static void
  103 add_write_dep(enum direction dir,
  104               struct schedule_node **before,
  105               struct schedule_node *after)
  106 {
  107         add_dep(dir, *before, after);
  108         *before = after;
  109 }
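
       /* For illustration, on the forward walk a temp written by a, read by b
        * and written again by c would be handled roughly as:
        *
        *     add_write_dep(F, &last_write, a);   // a becomes the last writer
        *     add_dep(F, last_write, b);          // the read b depends on a
        *     add_write_dep(F, &last_write, c);   // c ordered after a, now last
        *
        * where a, b and c are hypothetical schedule_nodes and last_write is
        * the per-temp pointer kept in schedule_setup_state below.
        */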
  110 
  111 struct schedule_setup_state {
  112         struct schedule_node **last_temp_write;
  113         struct schedule_node *last_sf;
  114         struct schedule_node *last_vary_read;
  115         struct schedule_node *last_vpm_read;
  116         struct schedule_node *last_vpm_write;
  117         struct schedule_node *last_tex_coord;
  118         struct schedule_node *last_tex_result;
  119         struct schedule_node *last_tlb;
  120         struct schedule_node *last_uniforms_reset;
  121         enum direction dir;
  122 
   123         /**
  124          * Texture FIFO tracking.  This is done top-to-bottom, and is used to
  125          * track the QOP_TEX_RESULTs and add dependencies on previous ones
  126          * when trying to submit texture coords with TFREQ full or new texture
  127          * fetches with TXRCV full.
  128          */
  129         struct {
  130                 struct schedule_node *node;
  131                 int coords;
  132         } tex_fifo[8];
  133         int tfreq_count; /**< Number of texture coords outstanding. */
  134         int tfrcv_count; /**< Number of texture results outstanding. */
  135         int tex_fifo_pos;
  136 };
  137 
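       /**
        * Adds a dependency on the oldest outstanding QOP_TEX_RESULT and
        * retires its FIFO entry: e.g. with tex_fifo_pos == 2, entry [0] is
        * popped, its coordinate count is credited back to tfreq_count, one
        * TFRCV slot is freed, and the remaining entries shift down by one.
        */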
  138 static void
  139 block_until_tex_result(struct schedule_setup_state *state, struct schedule_node *n)
  140 {
  141         add_dep(state->dir, state->tex_fifo[0].node, n);
  142 
  143         state->tfreq_count -= state->tex_fifo[0].coords;
  144         state->tfrcv_count--;
  145 
  146         memmove(&state->tex_fifo[0],
  147                 &state->tex_fifo[1],
  148                 state->tex_fifo_pos * sizeof(state->tex_fifo[0]));
  149         state->tex_fifo_pos--;
  150 }
  151 
  152 /**
  153  * Common code for dependencies that need to be tracked both forward and
  154  * backward.
  155  *
  156  * This is for things like "all VPM reads have to happen in order."
  157  */
  158 static void
  159 calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
  160 {
  161         struct qinst *inst = n->inst;
  162         enum direction dir = state->dir;
  163 
  164 
   165         /* Add deps for temp register and varying accesses.  Note that we
   166          * ignore uniform accesses, because qir_reorder_uniforms() happens
  167          * after this.
  168          */
  169         for (int i = 0; i < qir_get_nsrc(inst); i++) {
  170                 switch (inst->src[i].file) {
  171                 case QFILE_TEMP:
  172                         add_dep(dir,
  173                                 state->last_temp_write[inst->src[i].index], n);
  174                         break;
  175 
  176                 case QFILE_VARY:
  177                         add_write_dep(dir, &state->last_vary_read, n);
  178                         break;
  179 
  180                 case QFILE_VPM:
  181                         add_write_dep(dir, &state->last_vpm_read, n);
  182                         break;
  183 
  184                 default:
  185                         break;
  186                 }
  187         }
  188 
  189         switch (inst->op) {
  190         case QOP_VARY_ADD_C:
  191                 add_dep(dir, state->last_vary_read, n);
  192                 break;
  193 
  194         case QOP_TEX_RESULT:
  195                 /* Results have to be fetched in order. */
  196                 add_write_dep(dir, &state->last_tex_result, n);
  197                 break;
  198 
  199         case QOP_THRSW:
  200                 /* After a new THRSW, one must collect all texture samples
   201                  * queued since the previous THRSW/program start.  For now, our
   202                  * input has one THRSW in between each texture setup and its
   203                  * result collection, and we just make sure that ordering is
   204                  * maintained.
  205                  */
  206                 add_write_dep(dir, &state->last_tex_coord, n);
  207                 add_write_dep(dir, &state->last_tex_result, n);
  208 
  209                 /* accumulators and flags are lost across thread switches. */
  210                 add_write_dep(dir, &state->last_sf, n);
  211 
  212                 /* Setup, like the varyings, will need to be drained before we
  213                  * thread switch.
  214                  */
  215                 add_write_dep(dir, &state->last_vary_read, n);
  216 
  217                 /* The TLB-locking operations have to stay after the last
  218                  * thread switch.
  219                  */
  220                 add_write_dep(dir, &state->last_tlb, n);
  221                 break;
  222 
  223         case QOP_TLB_COLOR_READ:
  224         case QOP_MS_MASK:
  225                 add_write_dep(dir, &state->last_tlb, n);
  226                 break;
  227 
  228         default:
  229                 break;
  230         }
  231 
  232         switch (inst->dst.file) {
  233         case QFILE_VPM:
  234                 add_write_dep(dir, &state->last_vpm_write, n);
  235                 break;
  236 
  237         case QFILE_TEMP:
  238                 add_write_dep(dir, &state->last_temp_write[inst->dst.index], n);
  239                 break;
  240 
  241         case QFILE_TLB_COLOR_WRITE:
  242         case QFILE_TLB_COLOR_WRITE_MS:
  243         case QFILE_TLB_Z_WRITE:
  244         case QFILE_TLB_STENCIL_SETUP:
  245                 add_write_dep(dir, &state->last_tlb, n);
  246                 break;
  247 
  248         case QFILE_TEX_S_DIRECT:
  249         case QFILE_TEX_S:
  250         case QFILE_TEX_T:
  251         case QFILE_TEX_R:
  252         case QFILE_TEX_B:
   253                 /* Texture setup writes get scheduled in order, because
   254                  * the uniforms they reference have to land in a
   255                  * specific order.
  256                  */
  257                 add_write_dep(dir, &state->last_tex_coord, n);
  258                 break;
  259 
  260         default:
  261                 break;
  262         }
  263 
  264         if (qir_depends_on_flags(inst))
  265                 add_dep(dir, state->last_sf, n);
  266 
  267         if (inst->sf)
  268                 add_write_dep(dir, &state->last_sf, n);
  269 }
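
       /* As a rough example of the above, for a hypothetical sequence
        *
        *     mov  t1, vary        (varying read)
        *     fadd t2, t1, t1      (temp read and write)
        *     mov  tlb_color, t2   (TLB write)
        *
        * the walk orders each temp read after the matching write, keeps the
        * varying read ordered against other varying accesses, and chains the
        * TLB write after any earlier TLB access or thread switch.
        */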
  270 
  271 static void
  272 calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
  273                        struct list_head *schedule_list)
  274 {
  275         struct schedule_setup_state state;
  276 
  277         memset(&state, 0, sizeof(state));
  278         state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
  279                                               c->num_temps);
  280         state.dir = F;
  281 
  282         list_for_each_entry(struct schedule_node, n, schedule_list, link) {
  283                 struct qinst *inst = n->inst;
  284 
  285                 calculate_deps(&state, n);
  286 
  287                 for (int i = 0; i < qir_get_nsrc(inst); i++) {
  288                         switch (inst->src[i].file) {
  289                         case QFILE_UNIF:
  290                                 add_dep(state.dir, state.last_uniforms_reset, n);
  291                                 break;
  292                         default:
  293                                 break;
  294                         }
  295                 }
  296 
  297                 switch (inst->dst.file) {
  298                 case QFILE_TEX_S_DIRECT:
  299                 case QFILE_TEX_S:
  300                 case QFILE_TEX_T:
  301                 case QFILE_TEX_R:
  302                 case QFILE_TEX_B:
  303                         /* From the VC4 spec:
  304                          *
  305                          *     "The TFREQ input FIFO holds two full lots of s,
  306                          *      t, r, b data, plus associated setup data, per
  307                          *      QPU, that is, there are eight data slots. For
  308                          *      each texture request, slots are only consumed
  309                          *      for the components of s, t, r, and b actually
  310                          *      written. Thus the FIFO can hold four requests
  311                          *      of just (s, t) data, or eight requests of just
  312                          *      s data (for direct addressed data lookups).
  313                          *
  314                          *      Note that there is one FIFO per QPU, and the
  315                          *      FIFO has no concept of threads - that is,
  316                          *      multi-threaded shaders must be careful to use
  317                          *      only 1/2 the FIFO depth before reading
  318                          *      back. Multi-threaded programs must also
  319                          *      therefore always thread switch on texture
  320                          *      fetch as the other thread may have data
  321                          *      waiting in the FIFO."
  322                          *
  323                          * If the texture coordinate fifo is full, block this
  324                          * on the last QOP_TEX_RESULT.
  325                          */
  326                         if (state.tfreq_count == (c->fs_threaded ? 4 : 8)) {
  327                                 block_until_tex_result(&state, n);
  328                         }
  329 
  330                         /* From the VC4 spec:
  331                          *
  332                          *     "Since the maximum number of texture requests
  333                          *      in the input (TFREQ) FIFO is four lots of (s,
  334                          *      t) data, the output (TFRCV) FIFO is sized to
   335                  *      hold four lots of max-size color data per
  336                          *      QPU. For non-float color, reads are packed
  337                          *      RGBA8888 data (one read per pixel). For 16-bit
  338                          *      float color, two reads are necessary per
  339                          *      pixel, with reads packed as RG1616 then
  340                          *      BA1616. So per QPU there are eight color slots
  341                          *      in the TFRCV FIFO."
  342                          *
  343                          * If the texture result fifo is full, block adding
  344                          * any more to it until the last QOP_TEX_RESULT.
  345                          */
  346                         if (inst->dst.file == QFILE_TEX_S ||
  347                             inst->dst.file == QFILE_TEX_S_DIRECT) {
  348                                 if (state.tfrcv_count ==
  349                                     (c->fs_threaded ? 2 : 4))
  350                                         block_until_tex_result(&state, n);
  351                                 state.tfrcv_count++;
  352                         }
  353 
  354                         state.tex_fifo[state.tex_fifo_pos].coords++;
  355                         state.tfreq_count++;
  356                         break;
  357 
  358                 default:
  359                         break;
  360                 }
  361 
  362                 switch (inst->op) {
  363                 case QOP_TEX_RESULT:
  364                         /* Results have to be fetched after the
  365                          * coordinate setup.  Note that we're assuming
  366                          * here that our input shader has the texture
  367                          * coord setup and result fetch in order,
   368                          * which is true of our input but not necessarily
   369                          * of our instruction stream after this pass.
  370                          */
  371                         add_dep(state.dir, state.last_tex_coord, n);
  372 
  373                         state.tex_fifo[state.tex_fifo_pos].node = n;
  374 
  375                         state.tex_fifo_pos++;
  376                         memset(&state.tex_fifo[state.tex_fifo_pos], 0,
  377                                sizeof(state.tex_fifo[0]));
  378                         break;
  379 
  380                 case QOP_UNIFORMS_RESET:
  381                         add_write_dep(state.dir, &state.last_uniforms_reset, n);
  382                         break;
  383 
  384                 default:
  385                         break;
  386                 }
  387         }
  388 }
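
       /* Worked example of the FIFO limits above, assuming a threaded fragment
        * shader doing (s, t) lookups: each fetch consumes two TFREQ slots and
        * one TFRCV slot, so after two outstanding fetches tfreq_count == 4 and
        * tfrcv_count == 2, and the next coordinate write gets blocked on the
        * oldest QOP_TEX_RESULT by block_until_tex_result().
        */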
  389 
  390 static void
  391 calculate_reverse_deps(struct vc4_compile *c, void *mem_ctx,
  392                        struct list_head *schedule_list)
  393 {
  394         struct schedule_setup_state state;
  395 
  396         memset(&state, 0, sizeof(state));
  397         state.dir = R;
  398         state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
  399                                               c->num_temps);
  400 
  401         list_for_each_entry_rev(struct schedule_node, n, schedule_list, link) {
  402                 calculate_deps(&state, n);
  403         }
  404 }
  405 
  406 static int
  407 get_register_pressure_cost(struct schedule_state *state, struct qinst *inst)
  408 {
  409         int cost = 0;
  410 
  411         if (inst->dst.file == QFILE_TEMP &&
  412             state->temp_writes[inst->dst.index] == 1)
  413                 cost--;
  414 
  415         for (int i = 0; i < qir_get_nsrc(inst); i++) {
  416                 if (inst->src[i].file != QFILE_TEMP ||
  417                     BITSET_TEST(state->temp_live, inst->src[i].index)) {
  418                         continue;
  419                 }
  420 
  421                 bool already_counted = false;
  422                 for (int j = 0; j < i; j++) {
  423                         if (inst->src[i].file == inst->src[j].file &&
  424                             inst->src[i].index == inst->src[j].index) {
  425                                 already_counted = true;
  426                         }
  427                 }
  428                 if (!already_counted)
  429                         cost++;
  430         }
  431 
  432         return cost;
  433 }
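
       /* For example, a hypothetical "fadd t2, t0, t1" where t2 has no other
        * writes and neither t0 nor t1 is read by anything scheduled below it
        * costs -1 + 2 = +1: its destination dies here but both sources become
        * newly live.  choose_instruction() prefers lower (ideally negative)
        * cost, i.e. instructions that shrink the live set.
        */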
  434 
  435 static bool
  436 locks_scoreboard(struct qinst *inst)
  437 {
  438         if (inst->op == QOP_TLB_COLOR_READ)
  439                 return true;
  440 
  441         switch (inst->dst.file) {
  442         case QFILE_TLB_Z_WRITE:
  443         case QFILE_TLB_COLOR_WRITE:
  444         case QFILE_TLB_COLOR_WRITE_MS:
  445                 return true;
  446         default:
  447                 return false;
  448         }
  449 }
  450 
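       /**
        * Picks the next DAG head to schedule (bottom-up).  Roughly, the
        * tie-breaks below are, in order: QOP_BRANCH first (so it ends up last
        * in the block), then scoreboard-locking instructions, then whichever
        * candidate would stall less, then lower register pressure cost, then
        * the deepest delay chain.
        */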
  451 static struct schedule_node *
  452 choose_instruction(struct schedule_state *state)
  453 {
  454         struct schedule_node *chosen = NULL;
  455 
  456         list_for_each_entry(struct schedule_node, n, &state->dag->heads,
  457                             dag.link) {
  458                 /* The branches aren't being tracked as dependencies.  Make
  459                  * sure that they stay scheduled as the last instruction of
  460                  * the block, which is to say the first one we choose to
  461                  * schedule.
  462                  */
  463                 if (n->inst->op == QOP_BRANCH)
  464                         return n;
  465 
  466                 if (!chosen) {
  467                         chosen = n;
  468                         continue;
  469                 }
  470 
  471                 /* Prefer scheduling things that lock the scoreboard, so that
  472                  * they appear late in the program and we get more parallelism
  473                  * between shaders on multiple QPUs hitting the same fragment.
  474                  */
  475                 if (locks_scoreboard(n->inst) &&
  476                     !locks_scoreboard(chosen->inst)) {
  477                         chosen = n;
  478                         continue;
  479                 } else if (!locks_scoreboard(n->inst) &&
  480                            locks_scoreboard(chosen->inst)) {
  481                         continue;
  482                 }
  483 
  484                 /* If we would block on the previously chosen node, but would
  485                  * block less on this one, then prefer it.
  486                  */
  487                 if (chosen->unblocked_time > state->time &&
  488                     n->unblocked_time < chosen->unblocked_time) {
  489                         chosen = n;
  490                         continue;
  491                 } else if (n->unblocked_time > state->time &&
  492                            n->unblocked_time > chosen->unblocked_time) {
  493                         continue;
  494                 }
  495 
  496                 /* If we can definitely reduce register pressure, do so
  497                  * immediately.
  498                  */
  499                 int register_pressure_cost =
  500                         get_register_pressure_cost(state, n->inst);
  501                 int chosen_register_pressure_cost =
  502                         get_register_pressure_cost(state, chosen->inst);
  503 
  504                 if (register_pressure_cost < chosen_register_pressure_cost) {
  505                         chosen = n;
  506                         continue;
  507                 } else if (register_pressure_cost >
  508                            chosen_register_pressure_cost) {
  509                         continue;
  510                 }
  511 
  512                 /* Otherwise, prefer instructions with the deepest chain to
  513                  * the end of the program.  This avoids the problem of
  514                  * "everything generates a temp, nothing finishes freeing one,
  515                  * guess I'll just keep emitting varying mul/adds".
  516                  */
  517                 if (n->delay > chosen->delay) {
  518                         chosen = n;
  519                         continue;
  520                 } else if (n->delay < chosen->delay) {
  521                         continue;
  522                 }
  523         }
  524 
  525         return chosen;
  526 }
  527 
  528 static void
  529 dump_state(struct vc4_compile *c, struct schedule_state *state)
  530 {
  531         uint32_t i = 0;
  532         list_for_each_entry(struct schedule_node, n, &state->dag->heads,
  533                             dag.link) {
  534                 fprintf(stderr, "%3d: ", i++);
  535                 qir_dump_inst(c, n->inst);
  536                 fprintf(stderr, " (%d cost)\n",
  537                         get_register_pressure_cost(state, n->inst));
  538 
  539                 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
  540                         struct schedule_node *child =
  541                                 (struct schedule_node *)edge->child;
  542                         fprintf(stderr, "   - ");
  543                         qir_dump_inst(c, child->inst);
  544                         fprintf(stderr, " (%d parents)\n",
  545                                 child->dag.parent_count);
  546                 }
  547         }
  548 }
  549 
  550 /* Estimate of how many instructions we should schedule between operations.
  551  *
  552  * These aren't in real cycle counts, because we're just estimating cycle
  553  * times anyway.  QIR instructions will get paired up when turned into QPU
  554  * instructions, or extra NOP delays will have to be added due to register
  555  * allocation choices.
  556  */
  557 static uint32_t
  558 latency_between(struct schedule_node *before, struct schedule_node *after)
  559 {
  560         if ((before->inst->dst.file == QFILE_TEX_S ||
  561              before->inst->dst.file == QFILE_TEX_S_DIRECT) &&
  562             after->inst->op == QOP_TEX_RESULT)
  563                 return 100;
  564 
  565         switch (before->inst->op) {
  566         case QOP_RCP:
  567         case QOP_RSQ:
  568         case QOP_EXP2:
  569         case QOP_LOG2:
  570                 for (int i = 0; i < qir_get_nsrc(after->inst); i++) {
  571                         if (after->inst->src[i].file ==
  572                             before->inst->dst.file &&
  573                             after->inst->src[i].index ==
  574                             before->inst->dst.index) {
  575                                 /* There are two QPU delay slots before we can
  576                                  * read a math result, which could be up to 4
  577                                  * QIR instructions if they packed well.
  578                                  */
  579                                 return 4;
  580                         }
  581                 }
  582                 break;
  583         default:
  584                 break;
  585         }
  586 
  587         return 1;
  588 }
  589 
  590 /** Recursive computation of the delay member of a node. */
  591 static void
  592 compute_delay(struct dag_node *node, void *state)
  593 {
  594         struct schedule_node *n = (struct schedule_node *)node;
  595 
  596         /* The color read needs to be scheduled late, to avoid locking the
  597          * scoreboard early.  This is our best tool for encouraging that.  The
  598          * other scoreboard locking ops will have this happen by default,
  599          * since they are generally the DAG heads or close to them.
  600          */
  601         if (n->inst->op == QOP_TLB_COLOR_READ)
  602                 n->delay = 1000;
  603         else
  604                 n->delay = 1;
  605 
  606         util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
  607                 struct schedule_node *child =
  608                         (struct schedule_node *)edge->child;
  609 
  610                 n->delay = MAX2(n->delay, (child->delay +
  611                                            latency_between(child, n)));
  612         }
  613 }
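
       /* Rough example: a QOP_TEX_RESULT whose QFILE_TEX_S setup node has
        * delay 1 ends up with delay >= 1 + 100 = 101, which is what
        * encourages the scheduler to keep many instructions between texture
        * setup and its result fetch.
        */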
  614 
  615 static void
  616 schedule_instructions(struct vc4_compile *c,
  617                       struct qblock *block, struct schedule_state *state)
  618 {
  619         if (debug) {
  620                 fprintf(stderr, "initial deps:\n");
  621                 dump_state(c, state);
  622         }
  623 
  624         state->time = 0;
  625         while (!list_is_empty(&state->dag->heads)) {
  626                 struct schedule_node *chosen = choose_instruction(state);
  627                 struct qinst *inst = chosen->inst;
  628 
  629                 if (debug) {
  630                         fprintf(stderr, "current list:\n");
  631                         dump_state(c, state);
  632                         fprintf(stderr, "chose: ");
  633                         qir_dump_inst(c, inst);
  634                         fprintf(stderr, " (%d cost)\n",
  635                                 get_register_pressure_cost(state, inst));
  636                 }
  637 
  638                 state->time = MAX2(state->time, chosen->unblocked_time);
  639 
   640                 /* Schedule this instruction back onto the QIR list (prepended, since we pick bottom-up). */
  641                 list_add(&inst->link, &block->instructions);
  642 
  643                 /* Now that we've scheduled a new instruction, some of its
  644                  * children can be promoted to the list of instructions ready to
  645                  * be scheduled.  Update the children's unblocked time for this
  646                  * DAG edge as we do so.
  647                  */
  648                 util_dynarray_foreach(&chosen->dag.edges,
  649                                       struct dag_edge, edge) {
  650                         struct schedule_node *child =
  651                                 (struct schedule_node *)edge->child;
  652 
  653                         child->unblocked_time = MAX2(child->unblocked_time,
  654                                                      state->time +
  655                                                      latency_between(child,
  656                                                                      chosen));
  657                 }
  658                 dag_prune_head(state->dag, &chosen->dag);
  659 
  660                 /* Update our tracking of register pressure. */
  661                 for (int i = 0; i < qir_get_nsrc(inst); i++) {
  662                         if (inst->src[i].file == QFILE_TEMP)
  663                                 BITSET_SET(state->temp_live, inst->src[i].index);
  664                 }
  665                 if (inst->dst.file == QFILE_TEMP) {
  666                         state->temp_writes[inst->dst.index]--;
  667                         if (state->temp_writes[inst->dst.index] == 0)
  668                                 BITSET_CLEAR(state->temp_live, inst->dst.index);
  669                 }
  670 
  671                 state->time++;
  672         }
  673 }
  674 
  675 static void
  676 qir_schedule_instructions_block(struct vc4_compile *c,
  677                                 struct qblock *block)
  678 {
  679         struct schedule_state *state = rzalloc(NULL, struct schedule_state);
  680 
  681         state->temp_writes = rzalloc_array(state, uint32_t, c->num_temps);
  682         state->temp_live = rzalloc_array(state, BITSET_WORD,
  683                                          BITSET_WORDS(c->num_temps));
  684         state->dag = dag_create(state);
  685 
  686         struct list_head setup_list;
  687         list_inithead(&setup_list);
  688 
  689         /* Wrap each instruction in a scheduler structure. */
  690         qir_for_each_inst_safe(inst, block) {
  691                 struct schedule_node *n = rzalloc(state, struct schedule_node);
  692 
  693                 n->inst = inst;
  694                 list_del(&inst->link);
  695                 list_addtail(&n->link, &setup_list);
  696                 dag_init_node(state->dag, &n->dag);
  697 
  698                 if (inst->dst.file == QFILE_TEMP)
  699                         state->temp_writes[inst->dst.index]++;
  700         }
  701 
  702         /* Dependencies tracked top-to-bottom. */
  703         calculate_forward_deps(c, state, &setup_list);
  704         /* Dependencies tracked bottom-to-top. */
  705         calculate_reverse_deps(c, state, &setup_list);
  706 
  707         dag_traverse_bottom_up(state->dag, compute_delay, NULL);
  708 
  709         schedule_instructions(c, block, state);
  710 
  711         ralloc_free(state);
  712 }
  713 
  714 void
  715 qir_schedule_instructions(struct vc4_compile *c)
  716 {
  717 
  718         if (debug) {
  719                 fprintf(stderr, "Pre-schedule instructions\n");
  720                 qir_dump(c);
  721         }
  722 
  723         qir_for_each_block(block, c)
  724                 qir_schedule_instructions_block(c, block);
  725 
  726         if (debug) {
  727                 fprintf(stderr, "Post-schedule instructions\n");
  728                 qir_dump(c);
  729         }
  730 }