"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/broadcom/compiler/qpu_schedule.c" (16 Sep 2020, 57431 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "qpu_schedule.c" see the Fossies "Dox" file reference documentation.

    1 /*
    2  * Copyright © 2010 Intel Corporation
    3  * Copyright © 2014-2017 Broadcom
    4  *
    5  * Permission is hereby granted, free of charge, to any person obtaining a
    6  * copy of this software and associated documentation files (the "Software"),
    7  * to deal in the Software without restriction, including without limitation
    8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
    9  * and/or sell copies of the Software, and to permit persons to whom the
   10  * Software is furnished to do so, subject to the following conditions:
   11  *
   12  * The above copyright notice and this permission notice (including the next
   13  * paragraph) shall be included in all copies or substantial portions of the
   14  * Software.
   15  *
   16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
   19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
   21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
   22  * IN THE SOFTWARE.
   23  */
   24 
   25 /**
   26  * @file
   27  *
   28  * The basic model of the list scheduler is to take a basic block, compute a
   29  * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
   30  * pick a DAG head, then put all the children that are now DAG heads into the
   31  * list of things to schedule.
   32  *
   33  * The goal of scheduling here is to pack pairs of operations together in a
   34  * single QPU instruction.
   35  */
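
      /* For example (illustrative, syntax approximated from the
       * disassembler), a VIR sequence like
       *
       *     add  rf10, rf1, rf2
       *     fmul rf11, rf1, rf2
       *
       * can dual-issue as a single QPU instruction, because one op uses the
       * add ALU and the other the mul ALU:
       *
       *     add rf10, rf1, rf2 ; fmul rf11, rf1, rf2
       */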
   36 
   37 #include "qpu/qpu_disasm.h"
   38 #include "v3d_compiler.h"
   39 #include "util/ralloc.h"
   40 #include "util/dag.h"
   41 
   42 static bool debug;
   43 
   44 struct schedule_node_child;
   45 
   46 struct schedule_node {
   47         struct dag_node dag;
   48         struct list_head link;
   49         struct qinst *inst;
   50 
    51         /* Longest (cycles + instruction_latency()) of any parent of this node. */
   52         uint32_t unblocked_time;
   53 
   54         /**
   55          * Minimum number of cycles from scheduling this instruction until the
   56          * end of the program, based on the slowest dependency chain through
   57          * the children.
   58          */
   59         uint32_t delay;
   60 
   61         /**
    62          * Cycles between this instruction being scheduled and when its result
   63          * can be consumed.
   64          */
   65         uint32_t latency;
   66 };
   67 
   68 /* When walking the instructions in reverse, we need to swap before/after in
   69  * add_dep().
   70  */
   71 enum direction { F, R };
   72 
   73 struct schedule_state {
   74         const struct v3d_device_info *devinfo;
   75         struct dag *dag;
   76         struct schedule_node *last_r[6];
   77         struct schedule_node *last_rf[64];
   78         struct schedule_node *last_sf;
   79         struct schedule_node *last_vpm_read;
   80         struct schedule_node *last_tmu_write;
   81         struct schedule_node *last_tmu_config;
   82         struct schedule_node *last_tlb;
   83         struct schedule_node *last_vpm;
   84         struct schedule_node *last_unif;
   85         struct schedule_node *last_rtop;
   86         enum direction dir;
   87         /* Estimated cycle when the current instruction would start. */
   88         uint32_t time;
   89 };
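
      /* Each last_* field above tracks the most recent schedule_node to
       * touch the corresponding piece of architectural state (accumulators
       * r0-r5, physical regfile entries, the flags, and the various
       * peripherals); calculate_deps() reads and updates these pointers to
       * build the dependency edges.
       */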
   90 
   91 static void
   92 add_dep(struct schedule_state *state,
   93         struct schedule_node *before,
   94         struct schedule_node *after,
   95         bool write)
   96 {
   97         bool write_after_read = !write && state->dir == R;
   98         void *edge_data = (void *)(uintptr_t)write_after_read;
   99 
  100         if (!before || !after)
  101                 return;
  102 
  103         assert(before != after);
  104 
  105         if (state->dir == F)
  106                 dag_add_edge(&before->dag, &after->dag, edge_data);
  107         else
  108                 dag_add_edge(&after->dag, &before->dag, edge_data);
  109 }
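
      /* Illustration: on the forward pass (dir == F), a write of rf3
       * followed later by a read of rf3 produces a write -> read (RAW)
       * edge.  On the reverse pass (dir == R), the block is walked
       * backwards, so the same bookkeeping discovers the read -> write
       * (WAR) edges, which is why before/after are swapped when the edge is
       * added.  WAR edges are flagged through edge_data so that
       * pre_remove_head() can prune them early.
       */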
  110 
  111 static void
  112 add_read_dep(struct schedule_state *state,
  113               struct schedule_node *before,
  114               struct schedule_node *after)
  115 {
  116         add_dep(state, before, after, false);
  117 }
  118 
  119 static void
  120 add_write_dep(struct schedule_state *state,
  121               struct schedule_node **before,
  122               struct schedule_node *after)
  123 {
  124         add_dep(state, *before, after, true);
  125         *before = after;
  126 }
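
      /* Note the chaining: add_write_dep() points *before at the new
       * writer, so successive writers of the same state serialize against
       * each other (WAW), and each reader orders against whichever writer
       * was current when the reader was processed.
       */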
  127 
  128 static bool
  129 qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
  130 {
  131         if (inst->sig.ldtlb || inst->sig.ldtlbu)
  132                 return true;
  133 
  134         if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
  135                 return false;
  136 
  137         if (inst->alu.add.magic_write &&
  138             (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
  139              inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
  140                 return true;
  141 
  142         if (inst->alu.mul.magic_write &&
  143             (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
  144              inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
  145                 return true;
  146 
  147         return false;
  148 }
  149 
  150 static void
  151 process_mux_deps(struct schedule_state *state, struct schedule_node *n,
  152                  enum v3d_qpu_mux mux)
  153 {
  154         switch (mux) {
  155         case V3D_QPU_MUX_A:
  156                 add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
  157                 break;
  158         case V3D_QPU_MUX_B:
  159                 if (!n->inst->qpu.sig.small_imm) {
  160                         add_read_dep(state,
  161                                      state->last_rf[n->inst->qpu.raddr_b], n);
  162                 }
  163                 break;
  164         default:
  165                 add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
  166                 break;
  167         }
  168 }
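
      /* When the small_imm signal is set, raddr_b encodes an immediate
       * value rather than a regfile address, so in the MUX_B case above
       * there is no register read to order against.
       */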
  169 
  170 
  171 static void
  172 process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
  173                    uint32_t waddr, bool magic)
  174 {
  175         if (!magic) {
  176                 add_write_dep(state, &state->last_rf[waddr], n);
  177         } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
  178                 /* XXX perf: For V3D 4.x, we could reorder TMU writes other
  179                  * than the TMUS/TMUD/TMUA to improve scheduling flexibility.
  180                  */
  181                 add_write_dep(state, &state->last_tmu_write, n);
  182                 switch (waddr) {
  183                 case V3D_QPU_WADDR_TMUS:
  184                 case V3D_QPU_WADDR_TMUSCM:
  185                 case V3D_QPU_WADDR_TMUSF:
  186                 case V3D_QPU_WADDR_TMUSLOD:
  187                         add_write_dep(state, &state->last_tmu_config, n);
  188                         break;
  189                 default:
  190                         break;
  191                 }
  192         } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
  193                 /* Handled by v3d_qpu_writes_r4() check. */
  194         } else {
  195                 switch (waddr) {
  196                 case V3D_QPU_WADDR_R0:
  197                 case V3D_QPU_WADDR_R1:
  198                 case V3D_QPU_WADDR_R2:
  199                         add_write_dep(state,
  200                                       &state->last_r[waddr - V3D_QPU_WADDR_R0],
  201                                       n);
  202                         break;
  203                 case V3D_QPU_WADDR_R3:
  204                 case V3D_QPU_WADDR_R4:
  205                 case V3D_QPU_WADDR_R5:
  206                         /* Handled by v3d_qpu_writes_r*() checks below. */
  207                         break;
  208 
  209                 case V3D_QPU_WADDR_VPM:
  210                 case V3D_QPU_WADDR_VPMU:
  211                         add_write_dep(state, &state->last_vpm, n);
  212                         break;
  213 
  214                 case V3D_QPU_WADDR_TLB:
  215                 case V3D_QPU_WADDR_TLBU:
  216                         add_write_dep(state, &state->last_tlb, n);
  217                         break;
  218 
  219                 case V3D_QPU_WADDR_SYNC:
  220                 case V3D_QPU_WADDR_SYNCB:
  221                 case V3D_QPU_WADDR_SYNCU:
  222                         /* For CS barrier(): Sync against any other memory
  223                          * accesses.  There doesn't appear to be any need for
  224                          * barriers to affect ALU operations.
  225                          */
  226                         add_write_dep(state, &state->last_tmu_write, n);
  227                         break;
  228 
  229                 case V3D_QPU_WADDR_NOP:
  230                         break;
  231 
  232                 default:
  233                         fprintf(stderr, "Unknown waddr %d\n", waddr);
  234                         abort();
  235                 }
  236         }
  237 }
  238 
  239 /**
  240  * Common code for dependencies that need to be tracked both forward and
  241  * backward.
  242  *
  243  * This is for things like "all reads of r4 have to happen between the r4
  244  * writes that surround them".
  245  */
  246 static void
  247 calculate_deps(struct schedule_state *state, struct schedule_node *n)
  248 {
  249         const struct v3d_device_info *devinfo = state->devinfo;
  250         struct qinst *qinst = n->inst;
  251         struct v3d_qpu_instr *inst = &qinst->qpu;
  252         /* If the input and output segments are shared, then all VPM reads to
  253          * a location need to happen before all writes.  We handle this by
  254          * serializing all VPM operations for now.
  255          */
  256         bool separate_vpm_segment = false;
  257 
  258         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
  259                 if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
  260                         add_read_dep(state, state->last_sf, n);
  261 
  262                 /* XXX: BDI */
  263                 /* XXX: BDU */
  264                 /* XXX: ub */
  265                 /* XXX: raddr_a */
  266 
  267                 add_write_dep(state, &state->last_unif, n);
  268                 return;
  269         }
  270 
  271         assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
  272 
  273         /* XXX: LOAD_IMM */
  274 
  275         if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
  276                 process_mux_deps(state, n, inst->alu.add.a);
  277         if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
  278                 process_mux_deps(state, n, inst->alu.add.b);
  279 
  280         if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
  281                 process_mux_deps(state, n, inst->alu.mul.a);
  282         if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
  283                 process_mux_deps(state, n, inst->alu.mul.b);
  284 
  285         switch (inst->alu.add.op) {
  286         case V3D_QPU_A_VPMSETUP:
  287                 /* Could distinguish read/write by unpacking the uniform. */
  288                 add_write_dep(state, &state->last_vpm, n);
  289                 add_write_dep(state, &state->last_vpm_read, n);
  290                 break;
  291 
  292         case V3D_QPU_A_STVPMV:
  293         case V3D_QPU_A_STVPMD:
  294         case V3D_QPU_A_STVPMP:
  295                 add_write_dep(state, &state->last_vpm, n);
  296                 break;
  297 
  298         case V3D_QPU_A_LDVPMV_IN:
  299         case V3D_QPU_A_LDVPMD_IN:
  300         case V3D_QPU_A_LDVPMG_IN:
  301         case V3D_QPU_A_LDVPMP:
  302                 if (!separate_vpm_segment)
  303                         add_write_dep(state, &state->last_vpm, n);
  304                 break;
  305 
  306         case V3D_QPU_A_VPMWT:
  307                 add_read_dep(state, state->last_vpm, n);
  308                 break;
  309 
  310         case V3D_QPU_A_MSF:
  311                 add_read_dep(state, state->last_tlb, n);
  312                 break;
  313 
  314         case V3D_QPU_A_SETMSF:
  315         case V3D_QPU_A_SETREVF:
  316                 add_write_dep(state, &state->last_tlb, n);
  317                 break;
  318 
  319         default:
  320                 break;
  321         }
  322 
  323         switch (inst->alu.mul.op) {
  324         case V3D_QPU_M_MULTOP:
  325         case V3D_QPU_M_UMUL24:
  326                 /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
  327                  * resets it to 0.  We could possibly reorder umul24s relative
  328                  * to each other, but for now just keep all the MUL parts in
  329                  * order.
  330                  */
  331                 add_write_dep(state, &state->last_rtop, n);
  332                 break;
  333         default:
  334                 break;
  335         }
  336 
  337         if (inst->alu.add.op != V3D_QPU_A_NOP) {
  338                 process_waddr_deps(state, n, inst->alu.add.waddr,
  339                                    inst->alu.add.magic_write);
  340         }
  341         if (inst->alu.mul.op != V3D_QPU_M_NOP) {
  342                 process_waddr_deps(state, n, inst->alu.mul.waddr,
  343                                    inst->alu.mul.magic_write);
  344         }
  345         if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
  346                 process_waddr_deps(state, n, inst->sig_addr,
  347                                    inst->sig_magic);
  348         }
  349 
  350         if (v3d_qpu_writes_r3(devinfo, inst))
  351                 add_write_dep(state, &state->last_r[3], n);
  352         if (v3d_qpu_writes_r4(devinfo, inst))
  353                 add_write_dep(state, &state->last_r[4], n);
  354         if (v3d_qpu_writes_r5(devinfo, inst))
  355                 add_write_dep(state, &state->last_r[5], n);
  356 
  357         if (inst->sig.thrsw) {
  358                 /* All accumulator contents and flags are undefined after the
  359                  * switch.
  360                  */
  361                 for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
  362                         add_write_dep(state, &state->last_r[i], n);
  363                 add_write_dep(state, &state->last_sf, n);
  364                 add_write_dep(state, &state->last_rtop, n);
  365 
  366                 /* Scoreboard-locking operations have to stay after the last
  367                  * thread switch.
  368                  */
  369                 add_write_dep(state, &state->last_tlb, n);
  370 
  371                 add_write_dep(state, &state->last_tmu_write, n);
  372                 add_write_dep(state, &state->last_tmu_config, n);
  373         }
  374 
  375         if (v3d_qpu_waits_on_tmu(inst)) {
  376                 /* TMU loads are coming from a FIFO, so ordering is important.
  377                  */
  378                 add_write_dep(state, &state->last_tmu_write, n);
  379         }
  380 
  381         if (inst->sig.wrtmuc)
  382                 add_write_dep(state, &state->last_tmu_config, n);
  383 
   384         if (inst->sig.ldtlb || inst->sig.ldtlbu)
  385                 add_write_dep(state, &state->last_tlb, n);
  386 
  387         if (inst->sig.ldvpm) {
  388                 add_write_dep(state, &state->last_vpm_read, n);
  389 
  390                 /* At least for now, we're doing shared I/O segments, so queue
  391                  * all writes after all reads.
  392                  */
  393                 if (!separate_vpm_segment)
  394                         add_write_dep(state, &state->last_vpm, n);
  395         }
  396 
  397         /* inst->sig.ldunif or sideband uniform read */
  398         if (vir_has_uniform(qinst))
  399                 add_write_dep(state, &state->last_unif, n);
  400 
  401         if (v3d_qpu_reads_flags(inst))
  402                 add_read_dep(state, state->last_sf, n);
  403         if (v3d_qpu_writes_flags(inst))
  404                 add_write_dep(state, &state->last_sf, n);
  405 }
  406 
  407 static void
  408 calculate_forward_deps(struct v3d_compile *c, struct dag *dag,
  409                        struct list_head *schedule_list)
  410 {
  411         struct schedule_state state;
  412 
  413         memset(&state, 0, sizeof(state));
  414         state.dag = dag;
  415         state.devinfo = c->devinfo;
  416         state.dir = F;
  417 
  418         list_for_each_entry(struct schedule_node, node, schedule_list, link)
  419                 calculate_deps(&state, node);
  420 }
  421 
  422 static void
  423 calculate_reverse_deps(struct v3d_compile *c, struct dag *dag,
  424                        struct list_head *schedule_list)
  425 {
  426         struct schedule_state state;
  427 
  428         memset(&state, 0, sizeof(state));
  429         state.dag = dag;
  430         state.devinfo = c->devinfo;
  431         state.dir = R;
  432 
  433         list_for_each_entry_rev(struct schedule_node, node, schedule_list,
  434                                 link) {
  435                 calculate_deps(&state, (struct schedule_node *)node);
  436         }
  437 }
  438 
  439 struct choose_scoreboard {
  440         struct dag *dag;
  441         int tick;
  442         int last_magic_sfu_write_tick;
  443         int last_stallable_sfu_reg;
  444         int last_stallable_sfu_tick;
  445         int last_ldvary_tick;
  446         int last_uniforms_reset_tick;
  447         int last_thrsw_tick;
  448         bool tlb_locked;
  449 };
  450 
  451 static bool
  452 mux_reads_too_soon(struct choose_scoreboard *scoreboard,
  453                    const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
  454 {
  455         switch (mux) {
  456         case V3D_QPU_MUX_R4:
  457                 if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2)
  458                         return true;
  459                 break;
  460 
  461         case V3D_QPU_MUX_R5:
  462                 if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
  463                         return true;
  464                 break;
  465         default:
  466                 break;
  467         }
  468 
  469         return false;
  470 }
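
      /* Example timing (illustrative): after a magic SFU write at tick T,
       * reads of r4 at T+1 or T+2 are too soon; T+3 is the first safe tick.
       * Similarly, r5 must not be read on the tick immediately following an
       * ldvary.
       */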
  471 
  472 static bool
  473 reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
  474                            struct qinst *qinst)
  475 {
  476         const struct v3d_qpu_instr *inst = &qinst->qpu;
  477 
  478         /* XXX: Branching off of raddr. */
  479         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
  480                 return false;
  481 
  482         assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
  483 
  484         if (inst->alu.add.op != V3D_QPU_A_NOP) {
  485                 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
  486                     mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
  487                         return true;
  488                 }
  489                 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
  490                     mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
  491                         return true;
  492                 }
  493         }
  494 
  495         if (inst->alu.mul.op != V3D_QPU_M_NOP) {
  496                 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
  497                     mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
  498                         return true;
  499                 }
  500                 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
  501                     mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
  502                         return true;
  503                 }
  504         }
  505 
  506         /* XXX: imm */
  507 
  508         return false;
  509 }
  510 
  511 static bool
  512 writes_too_soon_after_write(const struct v3d_device_info *devinfo,
  513                             struct choose_scoreboard *scoreboard,
  514                             struct qinst *qinst)
  515 {
  516         const struct v3d_qpu_instr *inst = &qinst->qpu;
  517 
  518         /* Don't schedule any other r4 write too soon after an SFU write.
  519          * This would normally be prevented by dependency tracking, but might
  520          * occur if a dead SFU computation makes it to scheduling.
  521          */
  522         if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 &&
  523             v3d_qpu_writes_r4(devinfo, inst))
  524                 return true;
  525 
  526         return false;
  527 }
  528 
  529 static bool
  530 pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard,
  531                           const struct v3d_qpu_instr *inst)
  532 {
  533         return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst));
  534 }
  535 
  536 static bool
  537 qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
   538                         uint32_t waddr)
   539 {
   540         if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
   541                 return false;
   542 
   543         if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
   544             inst->raddr_a == waddr)
   545                 return true;
   546 
   547         if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
   548             !inst->sig.small_imm && inst->raddr_b == waddr)
   549                 return true;
  550 
  551         return false;
  552 }
  553 
  554 static bool
  555 mux_read_stalls(struct choose_scoreboard *scoreboard,
  556                 const struct v3d_qpu_instr *inst)
  557 {
  558         return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
  559                 qpu_instruction_uses_rf(inst,
  560                                         scoreboard->last_stallable_sfu_reg);
  561 }
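
      /* In other words, an instruction that reads the physical register an
       * SFU op wrote on the immediately preceding tick will stall; the
       * chooser below either deprioritizes such instructions or declines to
       * merge them.
       */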
  562 
   563 /* We define a maximum schedule priority, so that subtracting it from an
   564  * instruction that stalls yields a negative priority; instructions that
   565  * stall therefore always sort below regular instructions. */
  566 #define MAX_SCHEDULE_PRIORITY 16
  567 
  568 static int
  569 get_instruction_priority(const struct v3d_qpu_instr *inst)
  570 {
  571         uint32_t baseline_score;
  572         uint32_t next_score = 0;
  573 
  574         /* Schedule TLB operations as late as possible, to get more
  575          * parallelism between shaders.
  576          */
  577         if (qpu_inst_is_tlb(inst))
  578                 return next_score;
  579         next_score++;
  580 
  581         /* Schedule texture read results collection late to hide latency. */
  582         if (v3d_qpu_waits_on_tmu(inst))
  583                 return next_score;
  584         next_score++;
  585 
  586         /* Default score for things that aren't otherwise special. */
  587         baseline_score = next_score;
  588         next_score++;
  589 
  590         /* Schedule texture read setup early to hide their latency better. */
  591         if (v3d_qpu_writes_tmu(inst))
  592                 return next_score;
  593         next_score++;
  594 
   595         /* If this assertion fires, MAX_SCHEDULE_PRIORITY needs to be raised. */
  596         assert(next_score < MAX_SCHEDULE_PRIORITY);
  597 
  598         return baseline_score;
  599 }
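
      /* The resulting ranking, from lowest to highest priority: TLB ops
       * (0), TMU result reads (1), everything else (2), TMU setup writes
       * (3).  Higher-priority instructions get scheduled first, so TMU
       * setup lands early and TLB writes land late.
       */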
  600 
  601 static bool
  602 qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr)
  603 {
  604         return (v3d_qpu_magic_waddr_is_tmu(waddr) ||
  605                 v3d_qpu_magic_waddr_is_sfu(waddr) ||
  606                 v3d_qpu_magic_waddr_is_tlb(waddr) ||
  607                 v3d_qpu_magic_waddr_is_vpm(waddr) ||
  608                 v3d_qpu_magic_waddr_is_tsy(waddr));
  609 }
  610 
  611 static bool
  612 qpu_accesses_peripheral(const struct v3d_qpu_instr *inst)
  613 {
  614         if (v3d_qpu_uses_vpm(inst))
  615                 return true;
  616         if (v3d_qpu_uses_sfu(inst))
  617                 return true;
  618 
  619         if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
  620                 if (inst->alu.add.op != V3D_QPU_A_NOP &&
  621                     inst->alu.add.magic_write &&
  622                     qpu_magic_waddr_is_periph(inst->alu.add.waddr)) {
  623                         return true;
  624                 }
  625 
  626                 if (inst->alu.add.op == V3D_QPU_A_TMUWT)
  627                         return true;
  628 
  629                 if (inst->alu.mul.op != V3D_QPU_M_NOP &&
  630                     inst->alu.mul.magic_write &&
  631                     qpu_magic_waddr_is_periph(inst->alu.mul.waddr)) {
  632                         return true;
  633                 }
  634         }
  635 
  636         return (inst->sig.ldvpm ||
  637                 inst->sig.ldtmu ||
  638                 inst->sig.ldtlb ||
  639                 inst->sig.ldtlbu ||
  640                 inst->sig.wrtmuc);
  641 }
  642 
  643 static bool
  644 qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
  645                                  const struct v3d_qpu_instr *a,
  646                                  const struct v3d_qpu_instr *b)
  647 {
  648         const bool a_uses_peripheral = qpu_accesses_peripheral(a);
  649         const bool b_uses_peripheral = qpu_accesses_peripheral(b);
  650 
  651         /* We can always do one peripheral access per instruction. */
  652         if (!a_uses_peripheral || !b_uses_peripheral)
  653                 return true;
  654 
  655         if (devinfo->ver < 41)
  656                 return false;
  657 
   658         /* V3D 4.1 and later allow a TMU read along with a VPM read or write,
   659          * and a WRTMUC with a TMU magic register write (other than tmuc).
   660          */
  661         if ((a->sig.ldtmu && v3d_qpu_uses_vpm(b)) ||
  662             (b->sig.ldtmu && v3d_qpu_uses_vpm(a))) {
  663                 return true;
  664         }
  665 
  666         if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(b)) ||
  667             (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(a))) {
  668                 return true;
  669         }
  670 
  671         return false;
  672 }
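
      /* E.g. (illustrative): on V3D 4.1+, an ldtmu can pair with a VPM
       * access, but two unrelated TMU register writes cannot be merged into
       * the same instruction.
       */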
  673 
  674 static bool
  675 qpu_merge_inst(const struct v3d_device_info *devinfo,
  676                struct v3d_qpu_instr *result,
  677                const struct v3d_qpu_instr *a,
  678                const struct v3d_qpu_instr *b)
  679 {
  680         if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
  681             b->type != V3D_QPU_INSTR_TYPE_ALU) {
  682                 return false;
  683         }
  684 
  685         if (!qpu_compatible_peripheral_access(devinfo, a, b))
  686                 return false;
  687 
  688         struct v3d_qpu_instr merge = *a;
  689 
  690         if (b->alu.add.op != V3D_QPU_A_NOP) {
  691                 if (a->alu.add.op != V3D_QPU_A_NOP)
  692                         return false;
  693                 merge.alu.add = b->alu.add;
  694 
  695                 merge.flags.ac = b->flags.ac;
  696                 merge.flags.apf = b->flags.apf;
  697                 merge.flags.auf = b->flags.auf;
  698         }
  699 
  700         if (b->alu.mul.op != V3D_QPU_M_NOP) {
  701                 if (a->alu.mul.op != V3D_QPU_M_NOP)
  702                         return false;
  703                 merge.alu.mul = b->alu.mul;
  704 
  705                 merge.flags.mc = b->flags.mc;
  706                 merge.flags.mpf = b->flags.mpf;
  707                 merge.flags.muf = b->flags.muf;
  708         }
  709 
  710         if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) {
  711                 if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A) &&
  712                     a->raddr_a != b->raddr_a) {
  713                         return false;
  714                 }
  715                 merge.raddr_a = b->raddr_a;
  716         }
  717 
  718         if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) {
  719                 if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) &&
  720                     (a->raddr_b != b->raddr_b ||
  721                      a->sig.small_imm != b->sig.small_imm)) {
  722                         return false;
  723                 }
  724                 merge.raddr_b = b->raddr_b;
  725         }
  726 
  727         merge.sig.thrsw |= b->sig.thrsw;
  728         merge.sig.ldunif |= b->sig.ldunif;
  729         merge.sig.ldunifrf |= b->sig.ldunifrf;
  730         merge.sig.ldunifa |= b->sig.ldunifa;
  731         merge.sig.ldunifarf |= b->sig.ldunifarf;
  732         merge.sig.ldtmu |= b->sig.ldtmu;
  733         merge.sig.ldvary |= b->sig.ldvary;
  734         merge.sig.ldvpm |= b->sig.ldvpm;
  735         merge.sig.small_imm |= b->sig.small_imm;
  736         merge.sig.ldtlb |= b->sig.ldtlb;
  737         merge.sig.ldtlbu |= b->sig.ldtlbu;
  738         merge.sig.ucb |= b->sig.ucb;
  739         merge.sig.rotate |= b->sig.rotate;
  740         merge.sig.wrtmuc |= b->sig.wrtmuc;
  741 
  742         if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
  743             v3d_qpu_sig_writes_address(devinfo, &b->sig))
  744                 return false;
  745         merge.sig_addr |= b->sig_addr;
  746         merge.sig_magic |= b->sig_magic;
  747 
  748         uint64_t packed;
  749         bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);
  750 
  751         *result = merge;
  752         /* No modifying the real instructions on failure. */
  753         assert(ok || (a != result && b != result));
  754 
  755         return ok;
  756 }
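
      /* A merge that would succeed (illustrative): a = "add rf10, rf1, rf2"
       * (mul NOP) and b = "fmul rf11, rf1, rf2" (add NOP).  The ops use
       * different ALUs and agree on raddr_a/raddr_b, so the pair packs into
       * one instruction.  It would fail if, say, both needed raddr_a but
       * with different registers.
       */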
  757 
  758 static struct schedule_node *
  759 choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
  760                                struct choose_scoreboard *scoreboard,
  761                                struct schedule_node *prev_inst)
  762 {
  763         struct schedule_node *chosen = NULL;
  764         int chosen_prio = 0;
  765 
  766         /* Don't pair up anything with a thread switch signal -- emit_thrsw()
  767          * will handle pairing it along with filling the delay slots.
  768          */
  769         if (prev_inst) {
  770                 if (prev_inst->inst->qpu.sig.thrsw)
  771                         return NULL;
  772         }
  773 
  774         list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
  775                             dag.link) {
  776                 const struct v3d_qpu_instr *inst = &n->inst->qpu;
  777 
  778                 /* Don't choose the branch instruction until it's the last one
  779                  * left.  We'll move it up to fit its delay slots after we
  780                  * choose it.
  781                  */
  782                 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
  783                     !list_is_singular(&scoreboard->dag->heads)) {
  784                         continue;
  785                 }
  786 
  787                 /* "An instruction must not read from a location in physical
  788                  *  regfile A or B that was written to by the previous
  789                  *  instruction."
  790                  */
  791                 if (reads_too_soon_after_write(scoreboard, n->inst))
  792                         continue;
  793 
  794                 if (writes_too_soon_after_write(devinfo, scoreboard, n->inst))
  795                         continue;
  796 
  797                 /* "A scoreboard wait must not occur in the first two
  798                  *  instructions of a fragment shader. This is either the
  799                  *  explicit Wait for Scoreboard signal or an implicit wait
  800                  *  with the first tile-buffer read or write instruction."
  801                  */
  802                 if (pixel_scoreboard_too_soon(scoreboard, inst))
  803                         continue;
  804 
   805                 /* ldunif and ldvary both write r5, but ldunif does so a tick
   806                  * sooner.  If the ldvary's r5 result went unused, an ldunif
   807                  * could otherwise be scheduled so that the two try to update
   808                  * r5 in the same tick.
  809                  *
  810                  * XXX perf: To get good pipelining of a sequence of varying
  811                  * loads, we need to figure out how to pair the ldvary signal
  812                  * up to the instruction before the last r5 user in the
  813                  * previous ldvary sequence.  Currently, it usually pairs with
  814                  * the last r5 user.
  815                  */
  816                 if ((inst->sig.ldunif || inst->sig.ldunifa) &&
  817                     scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
  818                         continue;
  819                 }
  820 
  821                 /* If we're trying to pair with another instruction, check
  822                  * that they're compatible.
  823                  */
  824                 if (prev_inst) {
  825                         /* Don't pair up a thread switch signal -- we'll
  826                          * handle pairing it when we pick it on its own.
  827                          */
  828                         if (inst->sig.thrsw)
  829                                 continue;
  830 
  831                         if (prev_inst->inst->uniform != -1 &&
  832                             n->inst->uniform != -1)
  833                                 continue;
  834 
  835                         /* Don't merge in something that will lock the TLB.
   836                          * Hopefully what we have in inst will release some
  837                          * other instructions, allowing us to delay the
  838                          * TLB-locking instruction until later.
  839                          */
  840                         if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
  841                                 continue;
  842 
  843                         struct v3d_qpu_instr merged_inst;
  844                         if (!qpu_merge_inst(devinfo, &merged_inst,
  845                                             &prev_inst->inst->qpu, inst)) {
  846                                 continue;
  847                         }
  848                 }
  849 
  850                 int prio = get_instruction_priority(inst);
  851 
  852                 if (mux_read_stalls(scoreboard, inst)) {
  853                         /* Don't merge an instruction that stalls */
  854                         if (prev_inst)
  855                                 continue;
  856                         else {
   857                                 /* Any instruction that doesn't stall will
   858                                  * have higher scheduling priority. */
  859                                 prio -= MAX_SCHEDULE_PRIORITY;
  860                                 assert(prio < 0);
  861                         }
  862                 }
  863 
  864                 /* Found a valid instruction.  If nothing better comes along,
  865                  * this one works.
  866                  */
  867                 if (!chosen) {
  868                         chosen = n;
  869                         chosen_prio = prio;
  870                         continue;
  871                 }
  872 
  873                 if (prio > chosen_prio) {
  874                         chosen = n;
  875                         chosen_prio = prio;
  876                 } else if (prio < chosen_prio) {
  877                         continue;
  878                 }
  879 
  880                 if (n->delay > chosen->delay) {
  881                         chosen = n;
  882                         chosen_prio = prio;
  883                 } else if (n->delay < chosen->delay) {
  884                         continue;
  885                 }
  886         }
  887 
  888         return chosen;
  889 }
  890 
  891 static void
  892 update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
  893                                   enum v3d_qpu_waddr waddr)
  894 {
  895         if (v3d_qpu_magic_waddr_is_sfu(waddr))
  896                 scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
  897 }
  898 
  899 static void
  900 update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
  901                                       const struct v3d_qpu_instr *inst)
  902 {
  903         if (v3d_qpu_instr_is_sfu(inst)) {
  904                 scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr;
  905                 scoreboard->last_stallable_sfu_tick = scoreboard->tick;
  906         }
  907 }
  908 
  909 static void
  910 update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
  911                              const struct v3d_qpu_instr *inst)
  912 {
  913         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
  914                 return;
  915 
  916         assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
  917 
  918         if (inst->alu.add.op != V3D_QPU_A_NOP)  {
  919                 if (inst->alu.add.magic_write) {
  920                         update_scoreboard_for_magic_waddr(scoreboard,
  921                                                           inst->alu.add.waddr);
  922                 } else {
  923                         update_scoreboard_for_sfu_stall_waddr(scoreboard,
  924                                                               inst);
  925                 }
  926         }
  927 
  928         if (inst->alu.mul.op != V3D_QPU_M_NOP) {
  929                 if (inst->alu.mul.magic_write) {
  930                         update_scoreboard_for_magic_waddr(scoreboard,
  931                                                           inst->alu.mul.waddr);
  932                 }
  933         }
  934 
  935         if (inst->sig.ldvary)
  936                 scoreboard->last_ldvary_tick = scoreboard->tick;
  937 
  938         if (qpu_inst_is_tlb(inst))
  939                 scoreboard->tlb_locked = true;
  940 }
  941 
  942 static void
  943 dump_state(const struct v3d_device_info *devinfo, struct dag *dag)
  944 {
  945         list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
  946                 fprintf(stderr, "         t=%4d: ", n->unblocked_time);
  947                 v3d_qpu_dump(devinfo, &n->inst->qpu);
  948                 fprintf(stderr, "\n");
  949 
  950                 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
  951                         struct schedule_node *child =
  952                                 (struct schedule_node *)edge->child;
  953                         if (!child)
  954                                 continue;
  955 
  956                         fprintf(stderr, "                 - ");
  957                         v3d_qpu_dump(devinfo, &child->inst->qpu);
  958                         fprintf(stderr, " (%d parents, %c)\n",
  959                                 child->dag.parent_count,
  960                                 edge->data ? 'w' : 'r');
  961                 }
  962         }
  963 }
  964 
  965 static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr,
  966                                     const struct v3d_qpu_instr *after)
  967 {
  968         /* Apply some huge latency between texture fetch requests and getting
  969          * their results back.
  970          *
  971          * FIXME: This is actually pretty bogus.  If we do:
  972          *
  973          * mov tmu0_s, a
  974          * <a bit of math>
  975          * mov tmu0_s, b
  976          * load_tmu0
  977          * <more math>
  978          * load_tmu0
  979          *
  980          * we count that as worse than
  981          *
  982          * mov tmu0_s, a
  983          * mov tmu0_s, b
  984          * <lots of math>
  985          * load_tmu0
  986          * <more math>
  987          * load_tmu0
  988          *
  989          * because we associate the first load_tmu0 with the *second* tmu0_s.
  990          */
  991         if (v3d_qpu_magic_waddr_is_tmu(waddr) && v3d_qpu_waits_on_tmu(after))
  992                 return 100;
  993 
  994         /* Assume that anything depending on us is consuming the SFU result. */
  995         if (v3d_qpu_magic_waddr_is_sfu(waddr))
  996                 return 3;
  997 
  998         return 1;
  999 }
 1000 
 1001 static uint32_t
 1002 instruction_latency(struct schedule_node *before, struct schedule_node *after)
 1003 {
 1004         const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
 1005         const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
 1006         uint32_t latency = 1;
 1007 
 1008         if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
 1009             after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
 1010                 return latency;
 1011 
 1012         if (before_inst->alu.add.magic_write) {
 1013                 latency = MAX2(latency,
 1014                                magic_waddr_latency(before_inst->alu.add.waddr,
 1015                                                    after_inst));
 1016         }
 1017 
 1018         if (before_inst->alu.mul.magic_write) {
 1019                 latency = MAX2(latency,
 1020                                magic_waddr_latency(before_inst->alu.mul.waddr,
 1021                                                    after_inst));
 1022         }
 1023 
 1024         if (v3d_qpu_instr_is_sfu(before_inst))
 1025                 return 2;
 1026 
 1027         return latency;
 1028 }
 1029 
 1030 /** Recursive computation of the delay member of a node. */
 1031 static void
 1032 compute_delay(struct dag_node *node, void *state)
 1033 {
 1034         struct schedule_node *n = (struct schedule_node *)node;
 1035 
 1036         n->delay = 1;
 1037 
 1038         util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
 1039                 struct schedule_node *child =
 1040                         (struct schedule_node *)edge->child;
 1041 
 1042                 n->delay = MAX2(n->delay, (child->delay +
 1043                                            instruction_latency(n, child)));
 1044         }
 1045 }
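
      /* compute_delay() is presumably run bottom-up over the DAG (children
       * before parents, e.g. via dag_traverse_bottom_up()), so child->delay
       * is final by the time a parent is visited; each node's delay then
       * measures its slowest dependency chain to the end of the block.
       */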
 1046 
  1047 /* Removes a DAG head, removing only its WAR edges.  (dag_prune_head()
  1048  * should be called on it later to finish pruning the remaining edges.)
  1049  */
 1050 static void
 1051 pre_remove_head(struct dag *dag, struct schedule_node *n)
 1052 {
 1053         list_delinit(&n->dag.link);
 1054 
 1055         util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
 1056                 if (edge->data)
 1057                         dag_remove_edge(dag, edge);
 1058         }
 1059 }
 1060 
 1061 static void
 1062 mark_instruction_scheduled(struct dag *dag,
 1063                            uint32_t time,
 1064                            struct schedule_node *node)
 1065 {
 1066         if (!node)
 1067                 return;
 1068 
 1069         util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
 1070                 struct schedule_node *child =
 1071                         (struct schedule_node *)edge->child;
 1072 
 1073                 if (!child)
 1074                         continue;
 1075 
 1076                 uint32_t latency = instruction_latency(node, child);
 1077 
 1078                 child->unblocked_time = MAX2(child->unblocked_time,
 1079                                              time + latency);
 1080         }
 1081         dag_prune_head(dag, &node->dag);
 1082 }
 1083 
 1084 static void
 1085 insert_scheduled_instruction(struct v3d_compile *c,
 1086                              struct qblock *block,
 1087                              struct choose_scoreboard *scoreboard,
 1088                              struct qinst *inst)
 1089 {
 1090         list_addtail(&inst->link, &block->instructions);
 1091 
 1092         update_scoreboard_for_chosen(scoreboard, &inst->qpu);
 1093         c->qpu_inst_count++;
 1094         scoreboard->tick++;
 1095 }
 1096 
 1097 static struct qinst *
  1098 vir_nop(void)
 1099 {
 1100         struct qreg undef = vir_nop_reg();
 1101         struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
 1102 
 1103         return qinst;
 1104 }
 1105 
 1106 static void
 1107 emit_nop(struct v3d_compile *c, struct qblock *block,
 1108          struct choose_scoreboard *scoreboard)
 1109 {
 1110         insert_scheduled_instruction(c, block, scoreboard, vir_nop());
 1111 }
 1112 
 1113 static bool
 1114 qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c,
 1115                                      const struct qinst *qinst, int slot)
 1116 {
 1117         const struct v3d_qpu_instr *inst = &qinst->qpu;
 1118 
 1119         /* Only TLB Z writes are prohibited in the last slot, but we don't
  1120          * have those flagged, so prohibit all TLB ops for now.
 1121          */
 1122         if (slot == 2 && qpu_inst_is_tlb(inst))
 1123                 return false;
 1124 
 1125         if (slot > 0 && qinst->uniform != ~0)
 1126                 return false;
 1127 
 1128         if (v3d_qpu_uses_vpm(inst))
 1129                 return false;
 1130 
 1131         if (inst->sig.ldvary)
 1132                 return false;
 1133 
 1134         if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
 1135                 /* GFXH-1625: TMUWT not allowed in the final instruction. */
 1136                 if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
 1137                         return false;
 1138 
 1139                 /* No writing physical registers at the end. */
 1140                 if (!inst->alu.add.magic_write ||
 1141                     !inst->alu.mul.magic_write) {
 1142                         return false;
 1143                 }
 1144 
 1145                 if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
 1146                         return false;
 1147 
 1148                 /* RF0-2 might be overwritten during the delay slots by
 1149                  * fragment shader setup.
 1150                  */
 1151                 if (inst->raddr_a < 3 &&
 1152                     (inst->alu.add.a == V3D_QPU_MUX_A ||
 1153                      inst->alu.add.b == V3D_QPU_MUX_A ||
 1154                      inst->alu.mul.a == V3D_QPU_MUX_A ||
 1155                      inst->alu.mul.b == V3D_QPU_MUX_A)) {
 1156                         return false;
 1157                 }
 1158 
 1159                 if (inst->raddr_b < 3 &&
 1160                     !inst->sig.small_imm &&
 1161                     (inst->alu.add.a == V3D_QPU_MUX_B ||
 1162                      inst->alu.add.b == V3D_QPU_MUX_B ||
 1163                      inst->alu.mul.a == V3D_QPU_MUX_B ||
 1164                      inst->alu.mul.b == V3D_QPU_MUX_B)) {
 1165                         return false;
 1166                 }
 1167         }
 1168 
 1169         return true;
 1170 }
 1171 
 1172 static bool
 1173 valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
 1174                      struct qinst *qinst, int instructions_in_sequence,
 1175                      bool is_thrend)
 1176 {
 1177         /* No emitting our thrsw while the previous thrsw hasn't happened yet. */
 1178         if (scoreboard->last_thrsw_tick + 3 >
 1179             scoreboard->tick - instructions_in_sequence) {
 1180                 return false;
 1181         }
 1182 
 1183         for (int slot = 0; slot < instructions_in_sequence; slot++) {
 1184                 /* No scheduling SFU when the result would land in the other
 1185                  * thread.  The simulator complains for safety, though it
 1186                  * would only occur for dead code in our case.
 1187                  */
 1188                 if (slot > 0 &&
 1189                     qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
 1190                     (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
 1191                      v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
 1192                         return false;
 1193                 }
 1194 
 1195                 if (slot > 0 && qinst->qpu.sig.ldvary)
 1196                         return false;
 1197 
 1198                 if (is_thrend &&
 1199                     !qpu_instruction_valid_in_thrend_slot(c, qinst, slot)) {
 1200                         return false;
 1201                 }
 1202 
 1203                 /* Note that the list is circular, so we can only do this up
 1204                  * to instructions_in_sequence.
 1205                  */
 1206                 qinst = (struct qinst *)qinst->link.next;
 1207         }
 1208 
 1209         return true;
 1210 }
 1211 
 1212 /**
 1213  * Emits a THRSW signal in the stream, trying to move it up to pair with
 1214  * another instruction.
 1215  */
 1216 static int
 1217 emit_thrsw(struct v3d_compile *c,
 1218            struct qblock *block,
 1219            struct choose_scoreboard *scoreboard,
 1220            struct qinst *inst,
 1221            bool is_thrend)
 1222 {
 1223         int time = 0;
 1224 
 1225         /* There should be nothing in a thrsw inst being scheduled other than
 1226          * the signal bits.
 1227          */
 1228         assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
 1229         assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
 1230         assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
 1231 
 1232         /* Find how far back into previous instructions we can put the THRSW. */
 1233         int slots_filled = 0;
 1234         struct qinst *merge_inst = NULL;
 1235         vir_for_each_inst_rev(prev_inst, block) {
 1236                 struct v3d_qpu_sig sig = prev_inst->qpu.sig;
 1237                 sig.thrsw = true;
 1238                 uint32_t packed_sig;
 1239 
 1240                 if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
 1241                         break;
 1242 
 1243                 if (!valid_thrsw_sequence(c, scoreboard,
 1244                                           prev_inst, slots_filled + 1,
 1245                                           is_thrend)) {
 1246                         break;
 1247                 }
 1248 
 1249                 merge_inst = prev_inst;
 1250                 if (++slots_filled == 3)
 1251                         break;
 1252         }
 1253 
 1254         bool needs_free = false;
 1255         if (merge_inst) {
 1256                 merge_inst->qpu.sig.thrsw = true;
 1257                 needs_free = true;
 1258                 scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
 1259         } else {
 1260                 scoreboard->last_thrsw_tick = scoreboard->tick;
 1261                 insert_scheduled_instruction(c, block, scoreboard, inst);
 1262                 time++;
 1263                 slots_filled++;
 1264                 merge_inst = inst;
 1265         }
 1266 
 1267         /* Insert any extra delay slot NOPs we need. */
 1268         for (int i = 0; i < 3 - slots_filled; i++) {
 1269                 emit_nop(c, block, scoreboard);
 1270                 time++;
 1271         }
 1272 
 1273         /* If we're emitting the last THRSW (other than program end), then
 1274          * signal that to the HW by emitting two THRSWs in a row.
 1275          */
 1276         if (inst->is_last_thrsw) {
 1277                 struct qinst *second_inst =
 1278                         (struct qinst *)merge_inst->link.next;
 1279                 second_inst->qpu.sig.thrsw = true;
 1280         }
 1281 
 1282         /* If we put our THRSW into another instruction, free up the
 1283          * instruction that didn't end up scheduled into the list.
 1284          */
 1285         if (needs_free)
 1286                 free(inst);
 1287 
 1288         return time;
 1289 }
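
      /* E.g. (illustrative): with "...; I1; I2; I3" already emitted and a
       * thrsw to schedule, the signal can be folded back onto I1 so that I2
       * and I3 fill delay slots that would otherwise have to be NOPs.
       */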
 1290 
 1291 static uint32_t
 1292 schedule_instructions(struct v3d_compile *c,
 1293                       struct choose_scoreboard *scoreboard,
 1294                       struct qblock *block,
 1295                       enum quniform_contents *orig_uniform_contents,
 1296                       uint32_t *orig_uniform_data,
 1297                       uint32_t *next_uniform)
 1298 {
 1299         const struct v3d_device_info *devinfo = c->devinfo;
 1300         uint32_t time = 0;
 1301 
 1302         while (!list_is_empty(&scoreboard->dag->heads)) {
 1303                 struct schedule_node *chosen =
 1304                         choose_instruction_to_schedule(devinfo,
 1305                                                        scoreboard,
 1306                                                        NULL);
 1307                 struct schedule_node *merge = NULL;
 1308 
 1309                 /* If there are no valid instructions to schedule, drop a NOP
 1310                  * in.
 1311                  */
 1312                 struct qinst *qinst = chosen ? chosen->inst : vir_nop();
 1313                 struct v3d_qpu_instr *inst = &qinst->qpu;
 1314 
 1315                 if (debug) {
 1316                         fprintf(stderr, "t=%4d: current list:\n",
 1317                                 time);
 1318                         dump_state(devinfo, scoreboard->dag);
 1319                         fprintf(stderr, "t=%4d: chose:   ", time);
 1320                         v3d_qpu_dump(devinfo, inst);
 1321                         fprintf(stderr, "\n");
 1322                 }
 1323 
 1324                 /* We can't mark_instruction_scheduled() the chosen inst until
 1325                  * we're done identifying instructions to merge, so put the
 1326                  * merged instructions on a list for a moment.
 1327                  */
 1328                 struct list_head merged_list;
 1329                 list_inithead(&merged_list);
 1330 
 1331                 /* Schedule this instruction onto the QPU list. Also try to
 1332                  * find an instruction to pair with it.
 1333                  */
 1334                 if (chosen) {
 1335                         time = MAX2(chosen->unblocked_time, time);
 1336                         pre_remove_head(scoreboard->dag, chosen);
 1337 
 1338                         while ((merge =
 1339                                 choose_instruction_to_schedule(devinfo,
 1340                                                                scoreboard,
 1341                                                                chosen))) {
 1342                                 time = MAX2(merge->unblocked_time, time);
  1343                                 pre_remove_head(scoreboard->dag, merge);
 1344                                 list_addtail(&merge->link, &merged_list);
 1345                                 (void)qpu_merge_inst(devinfo, inst,
 1346                                                      inst, &merge->inst->qpu);
 1347                                 if (merge->inst->uniform != -1) {
 1348                                         chosen->inst->uniform =
 1349                                                 merge->inst->uniform;
 1350                                 }
 1351 
 1352                                 if (debug) {
 1353                                         fprintf(stderr, "t=%4d: merging: ",
 1354                                                 time);
 1355                                         v3d_qpu_dump(devinfo, &merge->inst->qpu);
 1356                                         fprintf(stderr, "\n");
 1357                                         fprintf(stderr, "         result: ");
 1358                                         v3d_qpu_dump(devinfo, inst);
 1359                                         fprintf(stderr, "\n");
 1360                                 }
 1361                         }
 1362                         if (mux_read_stalls(scoreboard, inst))
 1363                                 c->qpu_inst_stalled_count++;
 1364                 }
 1365 
 1366                 /* Update the uniform index for the rewritten location --
 1367                  * branch target updating will still need to change
 1368                  * c->uniform_data[] using this index.
 1369                  */
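                      /* For example (slot numbers made up): if the first
                       * instruction scheduled originally read uniform slot 5,
                       * slot 5's data and contents are copied to slot 0 of
                       * the new arrays and qinst->uniform is rewritten to 0,
                       * leaving the uniform stream in scheduled order.
                       */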
 1370                 if (qinst->uniform != -1) {
 1371                         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
 1372                                 block->branch_uniform = *next_uniform;
 1373 
 1374                         c->uniform_data[*next_uniform] =
 1375                                 orig_uniform_data[qinst->uniform];
 1376                         c->uniform_contents[*next_uniform] =
 1377                                 orig_uniform_contents[qinst->uniform];
 1378                         qinst->uniform = *next_uniform;
 1379                         (*next_uniform)++;
 1380                 }
 1381 
 1382                 if (debug) {
 1383                         fprintf(stderr, "\n");
 1384                 }
 1385 
 1386                 /* Now that we've scheduled a new instruction, some of its
 1387                  * children can be promoted to the list of instructions ready to
 1388                  * be scheduled.  Update the children's unblocked time for this
 1389                  * DAG edge as we do so.
 1390                  */
 1391                 mark_instruction_scheduled(scoreboard->dag, time, chosen);
 1392                 list_for_each_entry(struct schedule_node, merge, &merged_list,
 1393                                     link) {
 1394                         mark_instruction_scheduled(scoreboard->dag, time, merge);
 1395 
 1396                         /* The merged VIR instruction doesn't get re-added to the
 1397                          * block, so free it now.
 1398                          */
 1399                         free(merge->inst);
 1400                 }
 1401 
 1402                 if (inst->sig.thrsw) {
 1403                         time += emit_thrsw(c, block, scoreboard, qinst, false);
 1404                 } else {
 1405                         insert_scheduled_instruction(c, block,
 1406                                                      scoreboard, qinst);
 1407 
 1408                         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
 1409                                 block->branch_qpu_ip = c->qpu_inst_count - 1;
 1410                                 /* Fill the delay slots.
 1411                                  *
 1412                                  * We should fill these with actual instructions
 1413                                  * instead, but that will probably need to be done
 1414                                  * after this pass, once we know what the leading
 1415                                  * instructions of the successors are (so we can
 1416                                  * handle A/B register file write latency).
 1417                                  */
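                                      /* Assumed timeline for the three slots
                                       * below: with the branch at QPU ip N,
                                       * ips N+1..N+3 still execute (as NOPs
                                       * here) before control reaches the
                                       * target at ip N+4.
                                       */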
 1418                                 for (int i = 0; i < 3; i++)
 1419                                         emit_nop(c, block, scoreboard);
 1420                         }
 1421                 }
 1422         }
 1423 
 1424         return time;
 1425 }
 1426 
 1427 static uint32_t
 1428 qpu_schedule_instructions_block(struct v3d_compile *c,
 1429                                 struct choose_scoreboard *scoreboard,
 1430                                 struct qblock *block,
 1431                                 enum quniform_contents *orig_uniform_contents,
 1432                                 uint32_t *orig_uniform_data,
 1433                                 uint32_t *next_uniform)
 1434 {
 1435         void *mem_ctx = ralloc_context(NULL);
 1436         scoreboard->dag = dag_create(mem_ctx);
 1437         struct list_head setup_list;
 1438 
 1439         list_inithead(&setup_list);
 1440 
 1441         /* Wrap each instruction in a scheduler structure. */
 1442         while (!list_is_empty(&block->instructions)) {
 1443                 struct qinst *qinst = (struct qinst *)block->instructions.next;
 1444                 struct schedule_node *n =
 1445                         rzalloc(mem_ctx, struct schedule_node);
 1446 
 1447                 dag_init_node(scoreboard->dag, &n->dag);
 1448                 n->inst = qinst;
 1449 
 1450                 list_del(&qinst->link);
 1451                 list_addtail(&n->link, &setup_list);
 1452         }
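              /* Note that this drains block->instructions; the scheduler
               * appends the instructions back to the block in their final
               * order (via insert_scheduled_instruction() and emit_nop())
               * as it goes.
               */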
 1453 
 1454         calculate_forward_deps(c, scoreboard->dag, &setup_list);
 1455         calculate_reverse_deps(c, scoreboard->dag, &setup_list);
 1456 
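              /* Sketch of what the bottom-up traversal computes (chain made
               * up): for a dependency chain a -> b -> c, c gets the smallest
               * delay and a the largest, since each node's delay tracks the
               * slowest dependency chain from it to the end of the program;
               * the chooser can then prefer critical-path instructions.
               */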
 1457         dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL);
 1458 
 1459         uint32_t cycles = schedule_instructions(c, scoreboard, block,
 1460                                                 orig_uniform_contents,
 1461                                                 orig_uniform_data,
 1462                                                 next_uniform);
 1463 
 1464         ralloc_free(mem_ctx);
 1465         scoreboard->dag = NULL;
 1466 
 1467         return cycles;
 1468 }
 1469 
 1470 static void
 1471 qpu_set_branch_targets(struct v3d_compile *c)
 1472 {
 1473         vir_for_each_block(block, c) {
 1474                 /* The end block of the program has no branch. */
 1475                 if (!block->successors[0])
 1476                         continue;
 1477 
 1478                 /* If there was no branch instruction, then the successor
 1479                  * block must follow immediately after this one.
 1480                  */
 1481                 if (block->branch_qpu_ip == ~0) {
 1482                         assert(block->end_qpu_ip + 1 ==
 1483                                block->successors[0]->start_qpu_ip);
 1484                         continue;
 1485                 }
 1486 
 1487                 /* Walk back through the delay slots to find the branch
 1488                  * instr.
 1489                  */
 1490                 struct list_head *entry = block->instructions.prev;
 1491                 for (int i = 0; i < 3; i++)
 1492                         entry = entry->prev;
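                      /* E.g. if the block ends ..., branch, nop, nop, nop,
                       * then starting from the tail (the last nop) and
                       * stepping back three links lands entry on the branch.
                       */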
 1493                 struct qinst *branch = container_of(entry, branch, link);
 1494                 assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
 1495 
 1496                 /* Make sure that the if-we-don't-jump
 1497                  * successor was scheduled just after the
 1498                  * delay slots.
 1499                  */
 1500                 assert(!block->successors[1] ||
 1501                        block->successors[1]->start_qpu_ip ==
 1502                        block->branch_qpu_ip + 4);
 1503 
 1504                 branch->qpu.branch.offset =
 1505                         ((block->successors[0]->start_qpu_ip -
 1506                           (block->branch_qpu_ip + 4)) *
 1507                          sizeof(uint64_t));
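                      /* Worked example (ips made up): a branch at QPU ip 90
                       * jumping to a block that starts at ip 100 gets offset
                       * (100 - (90 + 4)) * 8 = 48 bytes, i.e. relative to
                       * the first instruction after the delay slots.
                       */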
 1508 
 1509                 /* Set up the relative offset to jump in the
 1510                  * uniform stream.
 1511                  *
 1512                  * Use a temporary here, because
 1513                  * uniform_data[inst->uniform] may be shared
 1514                  * between multiple instructions.
 1515                  */
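                      /* Worked example (slots made up): if the target
                       * block's uniforms start at slot 12 and the branch
                       * itself reads slot 7, the uniform stream pointer must
                       * advance (12 - (7 + 1)) * 4 = 16 bytes past the
                       * branch's own uniform.
                       */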
 1516                 assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
 1517                 c->uniform_data[branch->uniform] =
 1518                         (block->successors[0]->start_uniform -
 1519                          (block->branch_uniform + 1)) * 4;
 1520         }
 1521 }
 1522 
 1523 uint32_t
 1524 v3d_qpu_schedule_instructions(struct v3d_compile *c)
 1525 {
 1526         const struct v3d_device_info *devinfo = c->devinfo;
 1527         struct qblock *end_block = list_last_entry(&c->blocks,
 1528                                                    struct qblock, link);
 1529 
 1530         /* We reorder the uniforms as we schedule instructions, so save the
 1531          * old data off and replace it.
 1532          */
 1533         uint32_t *uniform_data = c->uniform_data;
 1534         enum quniform_contents *uniform_contents = c->uniform_contents;
 1535         c->uniform_contents = ralloc_array(c, enum quniform_contents,
 1536                                            c->num_uniforms);
 1537         c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
 1538         c->uniform_array_size = c->num_uniforms;
 1539         uint32_t next_uniform = 0;
 1540 
 1541         struct choose_scoreboard scoreboard;
 1542         memset(&scoreboard, 0, sizeof(scoreboard));
 1543         scoreboard.last_ldvary_tick = -10;
 1544         scoreboard.last_magic_sfu_write_tick = -10;
 1545         scoreboard.last_uniforms_reset_tick = -10;
 1546         scoreboard.last_thrsw_tick = -10;
 1547         scoreboard.last_stallable_sfu_tick = -10;
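              /* The -10 initializers are presumably just "far enough in the
               * past": each hazard window tracked here spans only a few
               * ticks, so none of the last-*-tick checks can fire at the
               * start of the program.
               */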
 1548 
 1549         if (debug) {
 1550                 fprintf(stderr, "Pre-schedule instructions\n");
 1551                 vir_for_each_block(block, c) {
 1552                         fprintf(stderr, "BLOCK %d\n", block->index);
 1553                         list_for_each_entry(struct qinst, qinst,
 1554                                             &block->instructions, link) {
 1555                                 v3d_qpu_dump(devinfo, &qinst->qpu);
 1556                                 fprintf(stderr, "\n");
 1557                         }
 1558                 }
 1559                 fprintf(stderr, "\n");
 1560         }
 1561 
 1562         uint32_t cycles = 0;
 1563         vir_for_each_block(block, c) {
 1564                 block->start_qpu_ip = c->qpu_inst_count;
 1565                 block->branch_qpu_ip = ~0;
 1566                 block->start_uniform = next_uniform;
 1567 
 1568                 cycles += qpu_schedule_instructions_block(c,
 1569                                                           &scoreboard,
 1570                                                           block,
 1571                                                           uniform_contents,
 1572                                                           uniform_data,
 1573                                                           &next_uniform);
 1574 
 1575                 block->end_qpu_ip = c->qpu_inst_count - 1;
 1576         }
 1577 
 1578         /* Emit the program-end THRSW instruction. */
 1579         struct qinst *thrsw = vir_nop();
 1580         thrsw->qpu.sig.thrsw = true;
 1581         emit_thrsw(c, end_block, &scoreboard, thrsw, true);
 1582 
 1583         qpu_set_branch_targets(c);
 1584 
 1585         assert(next_uniform == c->num_uniforms);
 1586 
 1587         return cycles;
 1588 }