"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/gallium/drivers/vc4/vc4_qpu_schedule.c" (16 Sep 2020, 40807 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "vc4_qpu_schedule.c" see the Fossies "Dox" file reference documentation.

/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file vc4_qpu_schedule.c
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
 * pick a DAG head, then put all the children that are now DAG heads into the
 * list of things to schedule.
 *
 * The goal of scheduling here is to pack pairs of operations together in a
 * single QPU instruction.
 */
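
/* Added illustration (a sketch, not from the upstream file): each 64-bit QPU
 * instruction can issue one add-pipeline op and one mul-pipeline op, so a
 * successful pairing turns two separate instructions such as
 *
 * fadd ra0, ra1, rb1
 * fmul rb2, ra3, rb3
 *
 * into roughly the single dual-issued instruction
 *
 * fadd ra0, ra1, rb1 ; fmul rb2, ra3, rb3
 */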

#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"
#include "util/dag.h"

static bool debug;

struct schedule_node_child;

struct schedule_node {
        struct dag_node dag;
        struct list_head link;
        struct queued_qpu_inst *inst;

        /* Latest time at which this node becomes unblocked: the longest
         * (cycles + instruction_latency()) over all of its parents.
         */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * Cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;

        /**
         * Which uniform from uniform_data[] this instruction read, or -1 if
         * not reading a uniform.
         */
        int uniform;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

struct schedule_state {
        struct dag *dag;
        struct schedule_node *last_r[6];
        struct schedule_node *last_ra[32];
        struct schedule_node *last_rb[32];
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_tmu_write;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        struct schedule_node *last_uniforms_reset;
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};

static void
add_dep(struct schedule_state *state,
        struct schedule_node *before,
        struct schedule_node *after,
        bool write)
{
        bool write_after_read = !write && state->dir == R;
        void *edge_data = (void *)(uintptr_t)write_after_read;
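
        /* Added note: the edge data flags write-after-read dependencies;
         * pre_remove_head() below uses it to drop WAR edges from a chosen
         * instruction early, since a paired instruction may safely write a
         * register that the chosen one reads.
         */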

        if (!before || !after)
                return;

        assert(before != after);

        if (state->dir == F)
                dag_add_edge(&before->dag, &after->dag, edge_data);
        else
                dag_add_edge(&after->dag, &before->dag, edge_data);
}

static void
add_read_dep(struct schedule_state *state,
              struct schedule_node *before,
              struct schedule_node *after)
{
        add_dep(state, before, after, false);
}

static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(state, *before, after, true);
        *before = after;
}

static bool
qpu_writes_r4(uint64_t inst)
{
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        switch (sig) {
        case QPU_SIG_COLOR_LOAD:
        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
        case QPU_SIG_ALPHA_MASK_LOAD:
                return true;
        default:
                return false;
        }
}

static void
process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t raddr, bool is_a)
{
        switch (raddr) {
        case QPU_R_VARY:
                add_write_dep(state, &state->last_r[5], n);
                break;

        case QPU_R_VPM:
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case QPU_R_UNIF:
                add_read_dep(state, state->last_uniforms_reset, n);
                break;

        case QPU_R_NOP:
        case QPU_R_ELEM_QPU:
        case QPU_R_XY_PIXEL_COORD:
        case QPU_R_MS_REV_FLAGS:
                break;

        default:
                if (raddr < 32) {
                        if (is_a)
                                add_read_dep(state, state->last_ra[raddr], n);
                        else
                                add_read_dep(state, state->last_rb[raddr], n);
                } else {
                        fprintf(stderr, "unknown raddr %d\n", raddr);
                        abort();
                }
                break;
        }
}

static bool
is_tmu_write(uint32_t waddr)
{
        switch (waddr) {
        case QPU_W_TMU0_S:
        case QPU_W_TMU0_T:
        case QPU_W_TMU0_R:
        case QPU_W_TMU0_B:
        case QPU_W_TMU1_S:
        case QPU_W_TMU1_T:
        case QPU_W_TMU1_R:
        case QPU_W_TMU1_B:
                return true;
        default:
                return false;
        }
}

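/* Added note: TMU register writes count as uniform reads here because each
 * texture fetch implicitly consumes a uniform for its parameters; this
 * matters below when deciding whether two instructions can be paired
 * without reordering the uniform stream.
 */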
static bool
reads_uniform(uint64_t inst)
{
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
                return false;

        return (QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_UNIF ||
                (QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF &&
                 QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM) ||
                is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_ADD)) ||
                is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
}

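/* Added note: mux values other than QPU_MUX_A and QPU_MUX_B select the
 * accumulators r0-r5, so only those need an accumulator read dependency;
 * regfile A/B reads are tracked per-raddr in process_raddr_deps().
 */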
static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
                 uint32_t mux)
{
        if (mux != QPU_MUX_A && mux != QPU_MUX_B)
                add_read_dep(state, state->last_r[mux], n);
}

static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t waddr, bool is_add)
{
        uint64_t inst = n->inst->inst;
        bool is_a = is_add ^ ((inst & QPU_WS) != 0);

        if (waddr < 32) {
                if (is_a) {
                        add_write_dep(state, &state->last_ra[waddr], n);
                } else {
                        add_write_dep(state, &state->last_rb[waddr], n);
                }
        } else if (is_tmu_write(waddr)) {
                add_write_dep(state, &state->last_tmu_write, n);
                add_read_dep(state, state->last_uniforms_reset, n);
        } else if (qpu_waddr_is_tlb(waddr) ||
                   waddr == QPU_W_MS_FLAGS) {
                add_write_dep(state, &state->last_tlb, n);
        } else {
                switch (waddr) {
                case QPU_W_ACC0:
                case QPU_W_ACC1:
                case QPU_W_ACC2:
                case QPU_W_ACC3:
                case QPU_W_ACC5:
                        add_write_dep(state, &state->last_r[waddr - QPU_W_ACC0],
                                      n);
                        break;

                case QPU_W_VPM:
                        add_write_dep(state, &state->last_vpm, n);
                        break;

                case QPU_W_VPMVCD_SETUP:
                        if (is_a)
                                add_write_dep(state, &state->last_vpm_read, n);
                        else
                                add_write_dep(state, &state->last_vpm, n);
                        break;

                case QPU_W_SFU_RECIP:
                case QPU_W_SFU_RECIPSQRT:
                case QPU_W_SFU_EXP:
                case QPU_W_SFU_LOG:
                        add_write_dep(state, &state->last_r[4], n);
                        break;

                case QPU_W_TLB_STENCIL_SETUP:
                        /* This isn't a TLB operation that does things like
                         * implicitly lock the scoreboard, but it does have to
                         * appear before TLB_Z, and the TLB_STENCILs have to
                         * be scheduled in the same order relative to each
                         * other.
                         */
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case QPU_W_MS_FLAGS:
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case QPU_W_UNIFORMS_ADDRESS:
                        add_write_dep(state, &state->last_uniforms_reset, n);
                        break;

                case QPU_W_NOP:
                        break;

                default:
                        fprintf(stderr, "Unknown waddr %d\n", waddr);
                        abort();
                }
        }
}

static void
process_cond_deps(struct schedule_state *state, struct schedule_node *n,
                  uint32_t cond)
{
        switch (cond) {
        case QPU_COND_NEVER:
        case QPU_COND_ALWAYS:
                break;
        default:
                add_read_dep(state, state->last_sf, n);
                break;
        }
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        uint64_t inst = n->inst->inst;
        uint32_t add_op = QPU_GET_FIELD(inst, QPU_OP_ADD);
        uint32_t mul_op = QPU_GET_FIELD(inst, QPU_OP_MUL);
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
        uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
        uint32_t mul_a = QPU_GET_FIELD(inst, QPU_MUL_A);
        uint32_t mul_b = QPU_GET_FIELD(inst, QPU_MUL_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        if (sig != QPU_SIG_LOAD_IMM) {
                process_raddr_deps(state, n, raddr_a, true);
                if (sig != QPU_SIG_SMALL_IMM &&
                    sig != QPU_SIG_BRANCH)
                        process_raddr_deps(state, n, raddr_b, false);
        }

        if (add_op != QPU_A_NOP) {
                process_mux_deps(state, n, add_a);
                process_mux_deps(state, n, add_b);
        }
        if (mul_op != QPU_M_NOP) {
                process_mux_deps(state, n, mul_a);
                process_mux_deps(state, n, mul_b);
        }

        process_waddr_deps(state, n, waddr_add, true);
        process_waddr_deps(state, n, waddr_mul, false);
        if (qpu_writes_r4(inst))
                add_write_dep(state, &state->last_r[4], n);

        switch (sig) {
        case QPU_SIG_SW_BREAKPOINT:
        case QPU_SIG_NONE:
        case QPU_SIG_SMALL_IMM:
        case QPU_SIG_LOAD_IMM:
                break;

        case QPU_SIG_THREAD_SWITCH:
        case QPU_SIG_LAST_THREAD_SWITCH:
                /* All accumulator contents and flags are undefined after the
                 * switch.
                 */
                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
                        add_write_dep(state, &state->last_r[i], n);
                add_write_dep(state, &state->last_sf, n);

                /* Scoreboard-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(state, &state->last_tlb, n);

                add_write_dep(state, &state->last_tmu_write, n);
                break;

        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_write, n);
                break;

        case QPU_SIG_COLOR_LOAD:
                add_read_dep(state, state->last_tlb, n);
                break;

        case QPU_SIG_BRANCH:
                add_read_dep(state, state->last_sf, n);
                break;

        case QPU_SIG_PROG_END:
        case QPU_SIG_WAIT_FOR_SCOREBOARD:
        case QPU_SIG_SCOREBOARD_UNLOCK:
        case QPU_SIG_COVERAGE_LOAD:
        case QPU_SIG_COLOR_LOAD_END:
        case QPU_SIG_ALPHA_MASK_LOAD:
                fprintf(stderr, "Unhandled signal bits %d\n", sig);
                abort();
        }

        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD));
        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL));
        if ((inst & QPU_SF) && sig != QPU_SIG_BRANCH)
                add_write_dep(state, &state->last_sf, n);
}

static void
calculate_forward_deps(struct vc4_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.dir = F;

        list_for_each_entry(struct schedule_node, node, schedule_list, link)
                calculate_deps(&state, node);
}

static void
calculate_reverse_deps(struct vc4_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.dir = R;

        list_for_each_entry_rev(struct schedule_node, node, schedule_list,
                                link) {
                calculate_deps(&state, (struct schedule_node *)node);
        }
}

struct choose_scoreboard {
        struct dag *dag;
        int tick;
        int last_sfu_write_tick;
        int last_uniforms_reset_tick;
        uint32_t last_waddr_a, last_waddr_b;
        bool tlb_locked;
};

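/* Added commentary: enforces the VC4 rule that an instruction must not read
 * a physical regfile A/B location written by the immediately previous
 * instruction, plus the two-instruction delays before an SFU result is
 * readable in r4 and before a new uniforms address takes effect.  An
 * illustrative sequence this rejects:
 *
 * mov ra0, rb1
 * mov rb2, ra0   <- reads ra0 one instruction after the write
 */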
static bool
reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst)
{
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        /* Full immediate loads don't read any registers. */
        if (sig == QPU_SIG_LOAD_IMM)
                return false;

        uint32_t src_muxes[] = {
                QPU_GET_FIELD(inst, QPU_ADD_A),
                QPU_GET_FIELD(inst, QPU_ADD_B),
                QPU_GET_FIELD(inst, QPU_MUL_A),
                QPU_GET_FIELD(inst, QPU_MUL_B),
        };
        for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                if ((src_muxes[i] == QPU_MUX_A &&
                     raddr_a < 32 &&
                     scoreboard->last_waddr_a == raddr_a) ||
                    (src_muxes[i] == QPU_MUX_B &&
                     sig != QPU_SIG_SMALL_IMM &&
                     raddr_b < 32 &&
                     scoreboard->last_waddr_b == raddr_b)) {
                        return true;
                }

                if (src_muxes[i] == QPU_MUX_R4) {
                        if (scoreboard->tick -
                            scoreboard->last_sfu_write_tick <= 2) {
                                return true;
                        }
                }
        }

        if (sig == QPU_SIG_SMALL_IMM &&
            QPU_GET_FIELD(inst, QPU_SMALL_IMM) >= QPU_SMALL_IMM_MUL_ROT) {
                uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
                uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);

                if (scoreboard->last_waddr_a == mux_a + QPU_W_ACC0 ||
                    scoreboard->last_waddr_a == mux_b + QPU_W_ACC0 ||
                    scoreboard->last_waddr_b == mux_a + QPU_W_ACC0 ||
                    scoreboard->last_waddr_b == mux_b + QPU_W_ACC0) {
                        return true;
                }
        }

        if (reads_uniform(inst) &&
            scoreboard->tick - scoreboard->last_uniforms_reset_tick <= 2) {
                return true;
        }

        return false;
}

static bool
pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, uint64_t inst)
{
        return (scoreboard->tick < 2 && qpu_inst_is_tlb(inst));
}

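/* Added note: returns a coarse priority class; higher scores are picked
 * earlier.  The resulting order is TMU setup writes first (to hide texture
 * fetch latency), then ordinary instructions, then TMU result loads, and
 * TLB operations last (to get more parallelism between shaders).
 */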
static int
get_instruction_priority(uint64_t inst)
{
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
        uint32_t baseline_score;
        uint32_t next_score = 0;

        /* Schedule TLB operations as late as possible, to get more
         * parallelism between shaders.
         */
        if (qpu_inst_is_tlb(inst))
                return next_score;
        next_score++;

        /* Schedule texture read results collection late to hide latency. */
        if (sig == QPU_SIG_LOAD_TMU0 || sig == QPU_SIG_LOAD_TMU1)
                return next_score;
        next_score++;

        /* Default score for things that aren't otherwise special. */
        baseline_score = next_score;
        next_score++;

        /* Schedule texture read setup early to hide their latency better. */
        if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul))
                return next_score;
        next_score++;

        return baseline_score;
}

static struct schedule_node *
choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
                               struct list_head *schedule_list,
                               struct schedule_node *prev_inst)
{
        struct schedule_node *chosen = NULL;
        int chosen_prio = 0;

        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
         * will handle pairing it along with filling the delay slots.
         */
        if (prev_inst) {
                uint32_t prev_sig = QPU_GET_FIELD(prev_inst->inst->inst,
                                                  QPU_SIG);
                if (prev_sig == QPU_SIG_THREAD_SWITCH ||
                    prev_sig == QPU_SIG_LAST_THREAD_SWITCH) {
                        return NULL;
                }
        }

        list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
                            dag.link) {
                uint64_t inst = n->inst->inst;
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                /* Don't choose the branch instruction until it's the last one
                 * left.  XXX: We could potentially choose it before it's the
                 * last one, if the remaining instructions fit in the delay
                 * slots.
                 */
                if (sig == QPU_SIG_BRANCH &&
                    !list_is_singular(&scoreboard->dag->heads)) {
                        continue;
                }

                /* "An instruction must not read from a location in physical
                 *  regfile A or B that was written to by the previous
                 *  instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, inst))
                        continue;

                /* "A scoreboard wait must not occur in the first two
                 *  instructions of a fragment shader. This is either the
                 *  explicit Wait for Scoreboard signal or an implicit wait
                 *  with the first tile-buffer read or write instruction."
                 */
                if (pixel_scoreboard_too_soon(scoreboard, inst))
                        continue;

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst) {
                        /* Don't pair up a thread switch signal -- we'll
                         * handle pairing it when we pick it on its own.
                         */
                        if (sig == QPU_SIG_THREAD_SWITCH ||
                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
                                continue;
                        }

                        if (prev_inst->uniform != -1 && n->uniform != -1)
                                continue;

                        /* Don't merge in something that will lock the TLB.
                         * Hopefully what we have in inst will release some
                         * other instructions, allowing us to delay the
                         * TLB-locking instruction until later.
                         */
                        if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
                                continue;

                        inst = qpu_merge_inst(prev_inst->inst->inst, inst);
                        if (!inst)
                                continue;
                }

                int prio = get_instruction_priority(inst);

                /* Found a valid instruction.  If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

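                /* Added note: priorities are equal at this point, so break
                 * the tie by critical path: prefer the instruction with the
                 * longer dependency chain remaining to the end of the
                 * program.
                 */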
                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        return chosen;
}

static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                             uint64_t inst)
{
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

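        /* Added note: the QPU_WS bit swaps the write targets, sending the
         * add result to regfile B and the mul result to regfile A.
         */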
        if (!(inst & QPU_WS)) {
                scoreboard->last_waddr_a = waddr_add;
                scoreboard->last_waddr_b = waddr_mul;
        } else {
                scoreboard->last_waddr_b = waddr_add;
                scoreboard->last_waddr_a = waddr_mul;
        }

        if ((waddr_add >= QPU_W_SFU_RECIP && waddr_add <= QPU_W_SFU_LOG) ||
            (waddr_mul >= QPU_W_SFU_RECIP && waddr_mul <= QPU_W_SFU_LOG)) {
                scoreboard->last_sfu_write_tick = scoreboard->tick;
        }

        if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
            waddr_mul == QPU_W_UNIFORMS_ADDRESS) {
                scoreboard->last_uniforms_reset_tick = scoreboard->tick;
        }

        if (qpu_inst_is_tlb(inst))
                scoreboard->tlb_locked = true;
}

static void
dump_state(struct dag *dag)
{
        list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
                fprintf(stderr, "         t=%4d: ", n->unblocked_time);
                vc4_qpu_disasm(&n->inst->inst, 1);
                fprintf(stderr, "\n");

                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                        struct schedule_node *child =
                                (struct schedule_node *)edge->child;
                        if (!child)
                                continue;

                        fprintf(stderr, "                 - ");
                        vc4_qpu_disasm(&child->inst->inst, 1);
                        fprintf(stderr, " (%d parents, %c)\n",
                                child->dag.parent_count,
                                edge->data ? 'w' : 'r');
                }
        }
}

static uint32_t waddr_latency(uint32_t waddr, uint64_t after)
{
        if (waddr < 32)
                return 2;

        /* Apply some huge latency between texture fetch requests and getting
         * their results back.
         *
         * FIXME: This is actually pretty bogus.  If we do:
         *
         * mov tmu0_s, a
         * <a bit of math>
         * mov tmu0_s, b
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * we count that as worse than
         *
         * mov tmu0_s, a
         * mov tmu0_s, b
         * <lots of math>
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * because we associate the first load_tmu0 with the *second* tmu0_s.
         */
        if (waddr == QPU_W_TMU0_S) {
                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0)
                        return 100;
        }
        if (waddr == QPU_W_TMU1_S) {
                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU1)
                        return 100;
        }

        switch (waddr) {
        case QPU_W_SFU_RECIP:
        case QPU_W_SFU_RECIPSQRT:
        case QPU_W_SFU_EXP:
        case QPU_W_SFU_LOG:
                return 3;
        default:
                return 1;
        }
}

static uint32_t
instruction_latency(struct schedule_node *before, struct schedule_node *after)
{
        uint64_t before_inst = before->inst->inst;
        uint64_t after_inst = after->inst->inst;

        return MAX2(waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_ADD),
                                  after_inst),
                    waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_MUL),
                                  after_inst));
}

/** Computes the delay member of a node.  Called bottom-up via
 * dag_traverse_bottom_up(), so each child's delay is final before its
 * parents are visited.
 */
static void
compute_delay(struct dag_node *node, void *state)
{
        struct schedule_node *n = (struct schedule_node *)node;

        n->delay = 1;

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;
                n->delay = MAX2(n->delay, (child->delay +
                                           instruction_latency(n, child)));
        }
}

/* Removes a DAG head, pruning only its WAR edges.  (dag_prune_head() should
 * be called on it later to finish pruning the other edges.)
 */
static void
pre_remove_head(struct dag *dag, struct schedule_node *n)
{
        list_delinit(&n->dag.link);

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                if (edge->data)
                        dag_remove_edge(dag, edge);
        }
}

static void
mark_instruction_scheduled(struct dag *dag,
                           uint32_t time,
                           struct schedule_node *node)
{
        if (!node)
                return;

        util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;

                if (!child)
                        continue;

                uint32_t latency = instruction_latency(node, child);

                child->unblocked_time = MAX2(child->unblocked_time,
                                             time + latency);
        }
        dag_prune_head(dag, &node->dag);
}

/**
 * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair
 * with another instruction.
 */
static void
emit_thrsw(struct vc4_compile *c,
           struct choose_scoreboard *scoreboard,
           uint64_t inst)
{
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        /* There should be nothing in a thrsw inst being scheduled other than
         * the signal bits.
         */
        assert(QPU_GET_FIELD(inst, QPU_OP_ADD) == QPU_A_NOP);
        assert(QPU_GET_FIELD(inst, QPU_OP_MUL) == QPU_M_NOP);

        /* Try to find an earlier scheduled instruction that we can merge the
         * thrsw into.
         */
        int thrsw_ip = c->qpu_inst_count;
        for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) {
                uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i];
                uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG);

                if (prev_sig == QPU_SIG_NONE)
                        thrsw_ip = c->qpu_inst_count - i;
        }

        if (thrsw_ip != c->qpu_inst_count) {
                /* Merge the thrsw into the existing instruction. */
                c->qpu_insts[thrsw_ip] =
                        QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG);
        } else {
                qpu_serialize_one_inst(c, inst);
                update_scoreboard_for_chosen(scoreboard, inst);
        }

        /* Fill the delay slots. */
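        /* Added note: a thread switch takes effect two instructions after
         * the signal, so NOPs are padded in until both delay slots after
         * thrsw_ip are occupied.
         */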
        while (c->qpu_inst_count < thrsw_ip + 3) {
                update_scoreboard_for_chosen(scoreboard, qpu_NOP());
                qpu_serialize_one_inst(c, qpu_NOP());
        }
}

static uint32_t
schedule_instructions(struct vc4_compile *c,
                      struct choose_scoreboard *scoreboard,
                      struct qblock *block,
                      struct list_head *schedule_list,
                      enum quniform_contents *orig_uniform_contents,
                      uint32_t *orig_uniform_data,
                      uint32_t *next_uniform)
{
        uint32_t time = 0;

        while (!list_is_empty(&scoreboard->dag->heads)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(scoreboard,
                                                       schedule_list,
                                                       NULL);
                struct schedule_node *merge = NULL;

                /* If there are no valid instructions to schedule, drop a NOP
                 * in.
                 */
                uint64_t inst = chosen ? chosen->inst->inst : qpu_NOP();

                if (debug) {
                        fprintf(stderr, "t=%4d: current list:\n",
                                time);
                        dump_state(scoreboard->dag);
                        fprintf(stderr, "t=%4d: chose: ", time);
                        vc4_qpu_disasm(&inst, 1);
                        fprintf(stderr, "\n");
                }

                /* Schedule this instruction onto the QPU list. Also try to
                 * find an instruction to pair with it.
                 */
                if (chosen) {
                        time = MAX2(chosen->unblocked_time, time);
                        pre_remove_head(scoreboard->dag, chosen);
                        if (chosen->uniform != -1) {
                                c->uniform_data[*next_uniform] =
                                        orig_uniform_data[chosen->uniform];
                                c->uniform_contents[*next_uniform] =
                                        orig_uniform_contents[chosen->uniform];
                                (*next_uniform)++;
                        }

                        merge = choose_instruction_to_schedule(scoreboard,
                                                               schedule_list,
                                                               chosen);
                        if (merge) {
                                time = MAX2(merge->unblocked_time, time);
                                inst = qpu_merge_inst(inst, merge->inst->inst);
                                assert(inst != 0);
                                if (merge->uniform != -1) {
                                        c->uniform_data[*next_uniform] =
                                                orig_uniform_data[merge->uniform];
                                        c->uniform_contents[*next_uniform] =
                                                orig_uniform_contents[merge->uniform];
                                        (*next_uniform)++;
                                }

                                if (debug) {
                                        fprintf(stderr, "t=%4d: merging: ",
                                                time);
                                        vc4_qpu_disasm(&merge->inst->inst, 1);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, "            resulting in: ");
                                        vc4_qpu_disasm(&inst, 1);
                                        fprintf(stderr, "\n");
                                }
                        }
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready to
                 * be scheduled.  Update the children's unblocked time for this
                 * DAG edge as we do so.
                 */
                mark_instruction_scheduled(scoreboard->dag, time, chosen);
                mark_instruction_scheduled(scoreboard->dag, time, merge);

                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH ||
                    QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) {
                        emit_thrsw(c, scoreboard, inst);
                } else {
                        qpu_serialize_one_inst(c, inst);
                        update_scoreboard_for_chosen(scoreboard, inst);
                }

                scoreboard->tick++;
                time++;

                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) {
                        block->branch_qpu_ip = c->qpu_inst_count - 1;
                        /* Fill the delay slots.
                         *
                         * We should fill these with actual instructions,
                         * instead, but that will probably need to be done
                         * after this, once we know what the leading
                         * instructions of the successors are (so we can
                         * handle A/B register file write latency)
                         */
                        inst = qpu_NOP();
                        update_scoreboard_for_chosen(scoreboard, inst);
                        qpu_serialize_one_inst(c, inst);
                        qpu_serialize_one_inst(c, inst);
                        qpu_serialize_one_inst(c, inst);
                }
        }

        return time;
}

static uint32_t
qpu_schedule_instructions_block(struct vc4_compile *c,
                                struct choose_scoreboard *scoreboard,
                                struct qblock *block,
                                enum quniform_contents *orig_uniform_contents,
                                uint32_t *orig_uniform_data,
                                uint32_t *next_uniform)
{
        scoreboard->dag = dag_create(NULL);
        struct list_head setup_list;

        list_inithead(&setup_list);

        /* Wrap each instruction in a scheduler structure. */
        uint32_t next_sched_uniform = *next_uniform;
        while (!list_is_empty(&block->qpu_inst_list)) {
                struct queued_qpu_inst *inst =
                        (struct queued_qpu_inst *)block->qpu_inst_list.next;
                struct schedule_node *n = rzalloc(scoreboard->dag,
                                                  struct schedule_node);

                dag_init_node(scoreboard->dag, &n->dag);
                n->inst = inst;

                if (reads_uniform(inst->inst)) {
                        n->uniform = next_sched_uniform++;
                } else {
                        n->uniform = -1;
                }
                list_del(&inst->link);
                list_addtail(&n->link, &setup_list);
        }

        calculate_forward_deps(c, scoreboard->dag, &setup_list);
        calculate_reverse_deps(c, scoreboard->dag, &setup_list);

        dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL);

        uint32_t cycles = schedule_instructions(c, scoreboard, block,
                                                &setup_list,
                                                orig_uniform_contents,
                                                orig_uniform_data,
                                                next_uniform);

        ralloc_free(scoreboard->dag);
        scoreboard->dag = NULL;

        return cycles;
}

static void
qpu_set_branch_targets(struct vc4_compile *c)
{
        qir_for_each_block(block, c) {
                /* The end block of the program has no branch. */
                if (!block->successors[0])
                        continue;

                /* If there was no branch instruction, then the successor
                 * block must follow immediately after this one.
                 */
                if (block->branch_qpu_ip == ~0) {
                        assert(block->end_qpu_ip + 1 ==
                               block->successors[0]->start_qpu_ip);
                        continue;
                }

                /* Set the branch target for the block that doesn't follow
                 * immediately after ours.
                 */
                uint64_t *branch_inst = &c->qpu_insts[block->branch_qpu_ip];
                assert(QPU_GET_FIELD(*branch_inst, QPU_SIG) == QPU_SIG_BRANCH);
                assert(QPU_GET_FIELD(*branch_inst, QPU_BRANCH_TARGET) == 0);

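                /* Added note: the branch target is encoded as a byte offset
                 * relative to the instruction that follows the branch's
                 * three delay slots (branch_qpu_ip + 4).
                 */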
                uint32_t branch_target =
                        (block->successors[0]->start_qpu_ip -
                         (block->branch_qpu_ip + 4)) * sizeof(uint64_t);
                *branch_inst = (*branch_inst |
                                QPU_SET_FIELD(branch_target, QPU_BRANCH_TARGET));

                /* Make sure that the if-we-don't-jump successor was scheduled
                 * just after the delay slots.
                 */
                if (block->successors[1]) {
                        assert(block->successors[1]->start_qpu_ip ==
                               block->branch_qpu_ip + 4);
                }
        }
}

uint32_t
qpu_schedule_instructions(struct vc4_compile *c)
{
        /* We reorder the uniforms as we schedule instructions, so save the
         * old data off and replace it.
         */
        uint32_t *uniform_data = c->uniform_data;
        enum quniform_contents *uniform_contents = c->uniform_contents;
        c->uniform_contents = ralloc_array(c, enum quniform_contents,
                                           c->num_uniforms);
        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
        c->uniform_array_size = c->num_uniforms;
        uint32_t next_uniform = 0;

        struct choose_scoreboard scoreboard;
        memset(&scoreboard, 0, sizeof(scoreboard));
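        /* Added note: ~0 and -10 act as "long ago" sentinels so the first
         * few real instructions never trip the too-soon-after-write checks.
         */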
        scoreboard.last_waddr_a = ~0;
        scoreboard.last_waddr_b = ~0;
        scoreboard.last_sfu_write_tick = -10;
        scoreboard.last_uniforms_reset_tick = -10;

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                qir_for_each_block(block, c) {
                        fprintf(stderr, "BLOCK %d\n", block->index);
                        list_for_each_entry(struct queued_qpu_inst, q,
                                            &block->qpu_inst_list, link) {
                                vc4_qpu_disasm(&q->inst, 1);
                                fprintf(stderr, "\n");
                        }
                }
                fprintf(stderr, "\n");
        }

        uint32_t cycles = 0;
        qir_for_each_block(block, c) {
                block->start_qpu_ip = c->qpu_inst_count;
                block->branch_qpu_ip = ~0;

                cycles += qpu_schedule_instructions_block(c,
                                                          &scoreboard,
                                                          block,
                                                          uniform_contents,
                                                          uniform_data,
                                                          &next_uniform);

                block->end_qpu_ip = c->qpu_inst_count - 1;
        }

        qpu_set_branch_targets(c);

        assert(next_uniform == c->num_uniforms);

        if (debug) {
                fprintf(stderr, "Post-schedule instructions\n");
                vc4_qpu_disasm(c->qpu_insts, c->qpu_inst_count);
                fprintf(stderr, "\n");
        }

        return cycles;
}