"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/amd/compiler/aco_spill.cpp" (16 Sep 2020, 79094 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "aco_spill.cpp" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 20.2.0-rc3_vs_20.2.0-rc4.

    1 /*
    2  * Copyright © 2018 Valve Corporation
    3  * Copyright © 2018 Google
    4  *
    5  * Permission is hereby granted, free of charge, to any person obtaining a
    6  * copy of this software and associated documentation files (the "Software"),
    7  * to deal in the Software without restriction, including without limitation
    8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
    9  * and/or sell copies of the Software, and to permit persons to whom the
   10  * Software is furnished to do so, subject to the following conditions:
   11  *
   12  * The above copyright notice and this permission notice (including the next
   13  * paragraph) shall be included in all copies or substantial portions of the
   14  * Software.
   15  *
   16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
   19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
   21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
   22  * IN THE SOFTWARE.
   23  *
   24  */
   25 
   26 #include "aco_ir.h"
   27 #include "aco_builder.h"
   28 #include "sid.h"
   29 
   30 #include <map>
   31 #include <set>
   32 #include <stack>
   33 
   34 /*
   35  * Implements the spilling algorithm on SSA-form from
   36  * "Register Spilling and Live-Range Splitting for SSA-Form Programs"
   37  * by Matthias Braun and Sebastian Hack.
   38  */
   39 
   40 namespace aco {
   41 
   42 namespace {
   43 
   44 struct remat_info {
   45    Instruction *instr;
   46 };
   47 
   48 struct spill_ctx {
   49    RegisterDemand target_pressure;
   50    Program* program;
   51    std::vector<std::vector<RegisterDemand>> register_demand;
   52    std::vector<std::map<Temp, Temp>> renames;
   53    std::vector<std::map<Temp, uint32_t>> spills_entry;
   54    std::vector<std::map<Temp, uint32_t>> spills_exit;
   55    std::vector<bool> processed;
   56    std::stack<Block*> loop_header;
   57    std::vector<std::map<Temp, std::pair<uint32_t, uint32_t>>> next_use_distances_start;
   58    std::vector<std::map<Temp, std::pair<uint32_t, uint32_t>>> next_use_distances_end;
   59    std::vector<std::pair<RegClass, std::set<uint32_t>>> interferences;
   60    std::vector<std::vector<uint32_t>> affinities;
   61    std::vector<bool> is_reloaded;
   62    std::map<Temp, remat_info> remat;
   63    std::map<Instruction *, bool> remat_used;
   64    unsigned wave_size;
   65 
   66    spill_ctx(const RegisterDemand target_pressure, Program* program,
   67              std::vector<std::vector<RegisterDemand>> register_demand)
   68       : target_pressure(target_pressure), program(program),
   69         register_demand(std::move(register_demand)), renames(program->blocks.size()),
   70         spills_entry(program->blocks.size()), spills_exit(program->blocks.size()),
   71         processed(program->blocks.size(), false), wave_size(program->wave_size) {}
   72 
   73    void add_affinity(uint32_t first, uint32_t second)
   74    {
   75       unsigned found_first = affinities.size();
   76       unsigned found_second = affinities.size();
   77       for (unsigned i = 0; i < affinities.size(); i++) {
   78          std::vector<uint32_t>& vec = affinities[i];
   79          for (uint32_t entry : vec) {
   80             if (entry == first)
   81                found_first = i;
   82             else if (entry == second)
   83                found_second = i;
   84          }
   85       }
   86       if (found_first == affinities.size() && found_second == affinities.size()) {
   87          affinities.emplace_back(std::vector<uint32_t>({first, second}));
   88       } else if (found_first < affinities.size() && found_second == affinities.size()) {
   89          affinities[found_first].push_back(second);
   90       } else if (found_second < affinities.size() && found_first == affinities.size()) {
   91          affinities[found_second].push_back(first);
   92       } else if (found_first != found_second) {
   93          /* merge second into first */
   94          affinities[found_first].insert(affinities[found_first].end(), affinities[found_second].begin(), affinities[found_second].end());
   95          affinities.erase(std::next(affinities.begin(), found_second));
   96       } else {
   97          assert(found_first == found_second);
   98       }
   99    }
  100 
  101    uint32_t allocate_spill_id(RegClass rc)
  102    {
  103       interferences.emplace_back(rc, std::set<uint32_t>());
  104       is_reloaded.push_back(false);
  105       return next_spill_id++;
  106    }
  107 
  108    uint32_t next_spill_id = 0;
  109 };
  110 
  111 int32_t get_dominator(int idx_a, int idx_b, Program* program, bool is_linear)
  112 {
  113 
  114    if (idx_a == -1)
  115       return idx_b;
  116    if (idx_b == -1)
  117       return idx_a;
  118    if (is_linear) {
  119       while (idx_a != idx_b) {
  120          if (idx_a > idx_b)
  121             idx_a = program->blocks[idx_a].linear_idom;
  122          else
  123             idx_b = program->blocks[idx_b].linear_idom;
  124       }
  125    } else {
  126       while (idx_a != idx_b) {
  127          if (idx_a > idx_b)
  128             idx_a = program->blocks[idx_a].logical_idom;
  129          else
  130             idx_b = program->blocks[idx_b].logical_idom;
  131       }
  132    }
  133    assert(idx_a != -1);
  134    return idx_a;
  135 }
  136 
  137 void next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set<uint32_t>& worklist)
  138 {
  139    Block* block = &ctx.program->blocks[block_idx];
  140    std::map<Temp, std::pair<uint32_t, uint32_t>> next_uses = ctx.next_use_distances_end[block_idx];
  141 
  142    /* to compute the next use distance at the beginning of the block, we have to add the block's size */
  143    for (std::map<Temp, std::pair<uint32_t, uint32_t>>::iterator it = next_uses.begin(); it != next_uses.end(); ++it)
  144       it->second.second = it->second.second + block->instructions.size();
  145 
  146    int idx = block->instructions.size() - 1;
  147    while (idx >= 0) {
  148       aco_ptr<Instruction>& instr = block->instructions[idx];
  149 
  150       if (instr->opcode == aco_opcode::p_linear_phi ||
  151           instr->opcode == aco_opcode::p_phi)
  152          break;
  153 
  154       for (const Definition& def : instr->definitions) {
  155          if (def.isTemp())
  156             next_uses.erase(def.getTemp());
  157       }
  158 
  159       for (const Operand& op : instr->operands) {
  160          /* omit exec mask */
  161          if (op.isFixed() && op.physReg() == exec)
  162             continue;
  163          if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear())
  164             continue;
  165          if (op.isTemp())
  166             next_uses[op.getTemp()] = {block_idx, idx};
  167       }
  168       idx--;
  169    }
  170 
  171    assert(block_idx != 0 || next_uses.empty());
  172    ctx.next_use_distances_start[block_idx] = next_uses;
  173    while (idx >= 0) {
  174       aco_ptr<Instruction>& instr = block->instructions[idx];
  175       assert(instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi);
  176 
  177       for (unsigned i = 0; i < instr->operands.size(); i++) {
  178          unsigned pred_idx = instr->opcode == aco_opcode::p_phi ?
  179                              block->logical_preds[i] :
  180                              block->linear_preds[i];
  181          if (instr->operands[i].isTemp()) {
  182             if (instr->operands[i].getTemp() == ctx.program->blocks[pred_idx].live_out_exec)
  183                continue;
  184             if (ctx.next_use_distances_end[pred_idx].find(instr->operands[i].getTemp()) == ctx.next_use_distances_end[pred_idx].end() ||
  185                 ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] != std::pair<uint32_t, uint32_t>{block_idx, 0})
  186                worklist.insert(pred_idx);
  187             ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] = {block_idx, 0};
  188          }
  189       }
  190       next_uses.erase(instr->definitions[0].getTemp());
  191       idx--;
  192    }
  193 
  194    /* all remaining live vars must be live-out at the predecessors */
  195    for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : next_uses) {
  196       Temp temp = pair.first;
  197       uint32_t distance = pair.second.second;
  198       uint32_t dom = pair.second.first;
  199       std::vector<unsigned>& preds = temp.is_linear() ? block->linear_preds : block->logical_preds;
  200       for (unsigned pred_idx : preds) {
  201          if (temp == ctx.program->blocks[pred_idx].live_out_exec)
  202             continue;
  203          if (ctx.program->blocks[pred_idx].loop_nest_depth > block->loop_nest_depth)
  204             distance += 0xFFFF;
  205          if (ctx.next_use_distances_end[pred_idx].find(temp) != ctx.next_use_distances_end[pred_idx].end()) {
  206             dom = get_dominator(dom, ctx.next_use_distances_end[pred_idx][temp].first, ctx.program, temp.is_linear());
  207             distance = std::min(ctx.next_use_distances_end[pred_idx][temp].second, distance);
  208          }
  209          if (ctx.next_use_distances_end[pred_idx][temp] != std::pair<uint32_t, uint32_t>{dom, distance})
  210             worklist.insert(pred_idx);
  211          ctx.next_use_distances_end[pred_idx][temp] = {dom, distance};
  212       }
  213    }
  214 
  215 }
  216 
  217 void compute_global_next_uses(spill_ctx& ctx)
  218 {
  219    ctx.next_use_distances_start.resize(ctx.program->blocks.size());
  220    ctx.next_use_distances_end.resize(ctx.program->blocks.size());
  221    std::set<uint32_t> worklist;
  222    for (Block& block : ctx.program->blocks)
  223       worklist.insert(block.index);
  224 
  225    while (!worklist.empty()) {
  226       std::set<unsigned>::reverse_iterator b_it = worklist.rbegin();
  227       unsigned block_idx = *b_it;
  228       worklist.erase(block_idx);
  229       next_uses_per_block(ctx, block_idx, worklist);
  230    }
  231 }
  232 
  233 bool should_rematerialize(aco_ptr<Instruction>& instr)
  234 {
  235    /* TODO: rematerialization is only supported for VOP1, SOP1 and PSEUDO */
  236    if (instr->format != Format::VOP1 && instr->format != Format::SOP1 && instr->format != Format::PSEUDO && instr->format != Format::SOPK)
  237       return false;
  238    /* TODO: pseudo-instruction rematerialization is only supported for p_create_vector */
  239    if (instr->format == Format::PSEUDO && instr->opcode != aco_opcode::p_create_vector)
  240       return false;
  241    if (instr->format == Format::SOPK && instr->opcode != aco_opcode::s_movk_i32)
  242       return false;
  243 
  244    for (const Operand& op : instr->operands) {
  245       /* TODO: rematerialization using temporaries isn't yet supported */
  246       if (op.isTemp())
  247          return false;
  248    }
  249 
  250    /* TODO: rematerialization with multiple definitions isn't yet supported */
  251    if (instr->definitions.size() > 1)
  252       return false;
  253 
  254    return true;
  255 }
  256 
  257 aco_ptr<Instruction> do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id)
  258 {
  259    std::map<Temp, remat_info>::iterator remat = ctx.remat.find(tmp);
  260    if (remat != ctx.remat.end()) {
  261       Instruction *instr = remat->second.instr;
  262       assert((instr->format == Format::VOP1 || instr->format == Format::SOP1 || instr->format == Format::PSEUDO || instr->format == Format::SOPK) && "unsupported");
  263       assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector) && "unsupported");
  264       assert(instr->definitions.size() == 1 && "unsupported");
  265 
  266       aco_ptr<Instruction> res;
  267       if (instr->format == Format::VOP1) {
  268          res.reset(create_instruction<VOP1_instruction>(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));
  269       } else if (instr->format == Format::SOP1) {
  270          res.reset(create_instruction<SOP1_instruction>(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));
  271       } else if (instr->format == Format::PSEUDO) {
  272          res.reset(create_instruction<Pseudo_instruction>(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));
  273       } else if (instr->format == Format::SOPK) {
  274          res.reset(create_instruction<SOPK_instruction>(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));
  275          static_cast<SOPK_instruction*>(res.get())->imm = static_cast<SOPK_instruction*>(instr)->imm;
  276       }
  277       for (unsigned i = 0; i < instr->operands.size(); i++) {
  278          res->operands[i] = instr->operands[i];
  279          if (instr->operands[i].isTemp()) {
  280             assert(false && "unsupported");
  281             if (ctx.remat.count(instr->operands[i].getTemp()))
  282                ctx.remat_used[ctx.remat[instr->operands[i].getTemp()].instr] = true;
  283          }
  284       }
  285       res->definitions[0] = Definition(new_name);
  286       return res;
  287    } else {
  288       aco_ptr<Pseudo_instruction> reload{create_instruction<Pseudo_instruction>(aco_opcode::p_reload, Format::PSEUDO, 1, 1)};
  289       reload->operands[0] = Operand(spill_id);
  290       reload->definitions[0] = Definition(new_name);
  291       ctx.is_reloaded[spill_id] = true;
  292       return reload;
  293    }
  294 }
  295 
  296 void get_rematerialize_info(spill_ctx& ctx)
  297 {
  298    for (Block& block : ctx.program->blocks) {
  299       bool logical = false;
  300       for (aco_ptr<Instruction>& instr : block.instructions) {
  301          if (instr->opcode == aco_opcode::p_logical_start)
  302             logical = true;
  303          else if (instr->opcode == aco_opcode::p_logical_end)
  304             logical = false;
  305          if (logical && should_rematerialize(instr)) {
  306             for (const Definition& def : instr->definitions) {
  307                if (def.isTemp()) {
  308                   ctx.remat[def.getTemp()] = (remat_info){instr.get()};
  309                   ctx.remat_used[instr.get()] = false;
  310                }
  311             }
  312          }
  313       }
  314    }
  315 }
  316 
  317 std::vector<std::map<Temp, uint32_t>> local_next_uses(spill_ctx& ctx, Block* block)
  318 {
  319    std::vector<std::map<Temp, uint32_t>> local_next_uses(block->instructions.size());
  320 
  321    std::map<Temp, uint32_t> next_uses;
  322    for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_end[block->index])
  323       next_uses[pair.first] = pair.second.second + block->instructions.size();
  324 
  325    for (int idx = block->instructions.size() - 1; idx >= 0; idx--) {
  326       aco_ptr<Instruction>& instr = block->instructions[idx];
  327       if (!instr)
  328          break;
  329       if (instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi)
  330          break;
  331 
  332       for (const Operand& op : instr->operands) {
  333          if (op.isFixed() && op.physReg() == exec)
  334             continue;
  335          if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear())
  336             continue;
  337          if (op.isTemp())
  338             next_uses[op.getTemp()] = idx;
  339       }
  340       for (const Definition& def : instr->definitions) {
  341          if (def.isTemp())
  342             next_uses.erase(def.getTemp());
  343       }
  344       local_next_uses[idx] = next_uses;
  345    }
  346    return local_next_uses;
  347 }
  348 
  349 
  350 RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx)
  351 {
  352    RegisterDemand spilled_registers;
  353 
  354    /* first block, nothing was spilled before */
  355    if (block_idx == 0)
  356       return {0, 0};
  357 
  358    /* loop header block */
  359    if (block->loop_nest_depth > ctx.program->blocks[block_idx - 1].loop_nest_depth) {
  360       assert(block->linear_preds[0] == block_idx - 1);
  361       assert(block->logical_preds[0] == block_idx - 1);
  362 
  363       /* create new loop_info */
  364       ctx.loop_header.emplace(block);
  365 
  366       /* check how many live-through variables should be spilled */
  367       RegisterDemand new_demand;
  368       unsigned i = block_idx;
  369       while (ctx.program->blocks[i].loop_nest_depth >= block->loop_nest_depth) {
  370          assert(ctx.program->blocks.size() > i);
  371          new_demand.update(ctx.program->blocks[i].register_demand);
  372          i++;
  373       }
  374       unsigned loop_end = i;
  375 
  376       /* select live-through vgpr variables */
  377       while (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr) {
  378          unsigned distance = 0;
  379          Temp to_spill;
  380          for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_end[block_idx - 1]) {
  381             if (pair.first.type() == RegType::vgpr &&
  382                 pair.second.first >= loop_end &&
  383                 pair.second.second > distance &&
  384                 ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) {
  385                to_spill = pair.first;
  386                distance = pair.second.second;
  387             }
  388          }
  389          if (distance == 0)
  390             break;
  391 
  392          uint32_t spill_id;
  393          if (ctx.spills_exit[block_idx - 1].find(to_spill) == ctx.spills_exit[block_idx - 1].end()) {
  394             spill_id = ctx.allocate_spill_id(to_spill.regClass());
  395          } else {
  396             spill_id = ctx.spills_exit[block_idx - 1][to_spill];
  397          }
  398 
  399          ctx.spills_entry[block_idx][to_spill] = spill_id;
  400          spilled_registers.vgpr += to_spill.size();
  401       }
  402 
  403       /* select live-through sgpr variables */
  404       while (new_demand.sgpr - spilled_registers.sgpr > ctx.target_pressure.sgpr) {
  405          unsigned distance = 0;
  406          Temp to_spill;
  407          for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_end[block_idx - 1]) {
  408             if (pair.first.type() == RegType::sgpr &&
  409                 pair.second.first >= loop_end &&
  410                 pair.second.second > distance &&
  411                 ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) {
  412                to_spill = pair.first;
  413                distance = pair.second.second;
  414             }
  415          }
  416          if (distance == 0)
  417             break;
  418 
  419          uint32_t spill_id;
  420          if (ctx.spills_exit[block_idx - 1].find(to_spill) == ctx.spills_exit[block_idx - 1].end()) {
  421             spill_id = ctx.allocate_spill_id(to_spill.regClass());
  422          } else {
  423             spill_id = ctx.spills_exit[block_idx - 1][to_spill];
  424          }
  425 
  426          ctx.spills_entry[block_idx][to_spill] = spill_id;
  427          spilled_registers.sgpr += to_spill.size();
  428       }
  429 
  430 
  431 
  432       /* shortcut */
  433       if (!RegisterDemand(new_demand - spilled_registers).exceeds(ctx.target_pressure))
  434          return spilled_registers;
  435 
  436       /* if reg pressure is too high at beginning of loop, add variables with furthest use */
  437       unsigned idx = 0;
  438       while (block->instructions[idx]->opcode == aco_opcode::p_phi || block->instructions[idx]->opcode == aco_opcode::p_linear_phi)
  439          idx++;
  440 
  441       assert(idx != 0 && "loop without phis: TODO");
  442       idx--;
  443       RegisterDemand reg_pressure = ctx.register_demand[block_idx][idx] - spilled_registers;
  444       while (reg_pressure.sgpr > ctx.target_pressure.sgpr) {
  445          unsigned distance = 0;
  446          Temp to_spill;
  447          for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_start[block_idx]) {
  448             if (pair.first.type() == RegType::sgpr &&
  449                 pair.second.second > distance &&
  450                 ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) {
  451                to_spill = pair.first;
  452                distance = pair.second.second;
  453             }
  454          }
  455          assert(distance != 0);
  456 
  457          ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass());
  458          spilled_registers.sgpr += to_spill.size();
  459          reg_pressure.sgpr -= to_spill.size();
  460       }
  461       while (reg_pressure.vgpr > ctx.target_pressure.vgpr) {
  462          unsigned distance = 0;
  463          Temp to_spill;
  464          for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_start[block_idx]) {
  465             if (pair.first.type() == RegType::vgpr &&
  466                 pair.second.second > distance &&
  467                 ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) {
  468                to_spill = pair.first;
  469                distance = pair.second.second;
  470             }
  471          }
  472          assert(distance != 0);
  473          ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass());
  474          spilled_registers.vgpr += to_spill.size();
  475          reg_pressure.vgpr -= to_spill.size();
  476       }
  477 
  478       return spilled_registers;
  479    }
  480 
  481    /* branch block */
  482    if (block->linear_preds.size() == 1 && !(block->kind & block_kind_loop_exit)) {
  483       /* keep variables spilled if they are alive and not used in the current block */
  484       unsigned pred_idx = block->linear_preds[0];
  485       for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
  486          if (pair.first.type() == RegType::sgpr &&
  487              ctx.next_use_distances_start[block_idx].find(pair.first) != ctx.next_use_distances_start[block_idx].end() &&
  488              ctx.next_use_distances_start[block_idx][pair.first].second > block_idx) {
  489             ctx.spills_entry[block_idx].insert(pair);
  490             spilled_registers.sgpr += pair.first.size();
  491          }
  492       }
  493       if (block->logical_preds.size() == 1) {
  494          pred_idx = block->logical_preds[0];
  495          for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
  496             if (pair.first.type() == RegType::vgpr &&
  497                 ctx.next_use_distances_start[block_idx].find(pair.first) != ctx.next_use_distances_start[block_idx].end() &&
  498                 ctx.next_use_distances_start[block_idx][pair.first].second > block_idx) {
  499                ctx.spills_entry[block_idx].insert(pair);
  500                spilled_registers.vgpr += pair.first.size();
  501             }
  502          }
  503       }
  504 
  505       /* if register demand is still too high, we just keep all spilled live vars and process the block */
  506       if (block->register_demand.sgpr - spilled_registers.sgpr > ctx.target_pressure.sgpr) {
  507          pred_idx = block->linear_preds[0];
  508          for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
  509             if (pair.first.type() == RegType::sgpr &&
  510                 ctx.next_use_distances_start[block_idx].find(pair.first) != ctx.next_use_distances_start[block_idx].end() &&
  511                 ctx.spills_entry[block_idx].insert(pair).second) {
  512                spilled_registers.sgpr += pair.first.size();
  513             }
  514          }
  515       }
  516       if (block->register_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr && block->logical_preds.size() == 1) {
  517          pred_idx = block->logical_preds[0];
  518          for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
  519             if (pair.first.type() == RegType::vgpr &&
  520                 ctx.next_use_distances_start[block_idx].find(pair.first) != ctx.next_use_distances_start[block_idx].end() &&
  521                 ctx.spills_entry[block_idx].insert(pair).second) {
  522                spilled_registers.vgpr += pair.first.size();
  523             }
  524          }
  525       }
  526 
  527       return spilled_registers;
  528    }
  529 
  530    /* else: merge block */
  531    std::set<Temp> partial_spills;
  532 
  533    /* keep variables spilled on all incoming paths */
  534    for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_start[block_idx]) {
  535       std::vector<unsigned>& preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds;
  536       /* If it can be rematerialized, keep the variable spilled if all predecessors do not reload it.
  537        * Otherwise, if any predecessor reloads it, ensure it's reloaded on all other predecessors.
  538        * The idea is that it's better in practice to rematerialize redundantly than to create lots of phis. */
  539       /* TODO: test this idea with more than Dawn of War III shaders (the current pipeline-db doesn't seem to exercise this path much) */
  540       bool remat = ctx.remat.count(pair.first);
  541       bool spill = !remat;
  542       uint32_t spill_id = 0;
  543       for (unsigned pred_idx : preds) {
  544          /* variable is not even live at the predecessor: probably from a phi */
  545          if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end()) {
  546             spill = false;
  547             break;
  548          }
  549          if (ctx.spills_exit[pred_idx].find(pair.first) == ctx.spills_exit[pred_idx].end()) {
  550             if (!remat)
  551                spill = false;
  552          } else {
  553             partial_spills.insert(pair.first);
  554             /* it might be that on one incoming path, the variable has a different spill_id, but add_couple_code() will take care of that. */
  555             spill_id = ctx.spills_exit[pred_idx][pair.first];
  556             if (remat)
  557                spill = true;
  558          }
  559       }
  560       if (spill) {
  561          ctx.spills_entry[block_idx][pair.first] = spill_id;
  562          partial_spills.erase(pair.first);
  563          spilled_registers += pair.first;
  564       }
  565    }
  566 
  567    /* same for phis */
  568    unsigned idx = 0;
  569    while (block->instructions[idx]->opcode == aco_opcode::p_linear_phi ||
  570           block->instructions[idx]->opcode == aco_opcode::p_phi) {
  571       aco_ptr<Instruction>& phi = block->instructions[idx];
  572       std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
  573       bool spill = true;
  574 
  575       for (unsigned i = 0; i < phi->operands.size(); i++) {
  576          if (phi->operands[i].isUndefined())
  577             continue;
  578          assert(phi->operands[i].isTemp());
  579          if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) == ctx.spills_exit[preds[i]].end())
  580             spill = false;
  581          else
  582             partial_spills.insert(phi->definitions[0].getTemp());
  583       }
  584       if (spill) {
  585          ctx.spills_entry[block_idx][phi->definitions[0].getTemp()] = ctx.allocate_spill_id(phi->definitions[0].regClass());
  586          partial_spills.erase(phi->definitions[0].getTemp());
  587          spilled_registers += phi->definitions[0].getTemp();
  588       }
  589 
  590       idx++;
  591    }
  592 
  593    /* if reg pressure at first instruction is still too high, add partially spilled variables */
  594    RegisterDemand reg_pressure;
  595    if (idx == 0) {
  596       for (const Definition& def : block->instructions[idx]->definitions) {
  597          if (def.isTemp()) {
  598             reg_pressure -= def.getTemp();
  599          }
  600       }
  601       for (const Operand& op : block->instructions[idx]->operands) {
  602          if (op.isTemp() && op.isFirstKill()) {
  603             reg_pressure += op.getTemp();
  604          }
  605       }
  606    } else {
  607       idx--;
  608    }
  609    reg_pressure += ctx.register_demand[block_idx][idx] - spilled_registers;
  610 
  611    while (reg_pressure.sgpr > ctx.target_pressure.sgpr) {
  612       assert(!partial_spills.empty());
  613 
  614       std::set<Temp>::iterator it = partial_spills.begin();
  615       Temp to_spill = *it;
  616       unsigned distance = ctx.next_use_distances_start[block_idx][*it].second;
  617       while (it != partial_spills.end()) {
  618          assert(ctx.spills_entry[block_idx].find(*it) == ctx.spills_entry[block_idx].end());
  619 
  620          if (it->type() == RegType::sgpr && ctx.next_use_distances_start[block_idx][*it].second > distance) {
  621             distance = ctx.next_use_distances_start[block_idx][*it].second;
  622             to_spill = *it;
  623          }
  624          ++it;
  625       }
  626       assert(distance != 0);
  627 
  628       ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass());
  629       partial_spills.erase(to_spill);
  630       spilled_registers.sgpr += to_spill.size();
  631       reg_pressure.sgpr -= to_spill.size();
  632    }
  633 
  634    while (reg_pressure.vgpr > ctx.target_pressure.vgpr) {
  635       assert(!partial_spills.empty());
  636 
  637       std::set<Temp>::iterator it = partial_spills.begin();
  638       Temp to_spill = *it;
  639       unsigned distance = ctx.next_use_distances_start[block_idx][*it].second;
  640       while (it != partial_spills.end()) {
  641          assert(ctx.spills_entry[block_idx].find(*it) == ctx.spills_entry[block_idx].end());
  642 
  643          if (it->type() == RegType::vgpr && ctx.next_use_distances_start[block_idx][*it].second > distance) {
  644             distance = ctx.next_use_distances_start[block_idx][*it].second;
  645             to_spill = *it;
  646          }
  647          ++it;
  648       }
  649       assert(distance != 0);
  650 
  651       ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass());
  652       partial_spills.erase(to_spill);
  653       spilled_registers.vgpr += to_spill.size();
  654       reg_pressure.vgpr -= to_spill.size();
  655    }
  656 
  657    return spilled_registers;
  658 }
  659 
  660 
  661 RegisterDemand get_demand_before(spill_ctx& ctx, unsigned block_idx, unsigned idx)
  662 {
  663    if (idx == 0) {
  664       RegisterDemand demand = ctx.register_demand[block_idx][idx];
  665       aco_ptr<Instruction>& instr = ctx.program->blocks[block_idx].instructions[idx];
  666       aco_ptr<Instruction> instr_before(nullptr);
  667       return get_demand_before(demand, instr, instr_before);
  668    } else {
  669       return ctx.register_demand[block_idx][idx - 1];
  670    }
  671 }
  672 
  673 void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
  674 {
  675    /* no coupling code necessary */
  676    if (block->linear_preds.size() == 0)
  677       return;
  678 
  679    std::vector<aco_ptr<Instruction>> instructions;
  680    /* branch block: TODO take other branch into consideration */
  681    if (block->linear_preds.size() == 1 && !(block->kind & (block_kind_loop_exit | block_kind_loop_header))) {
  682       assert(ctx.processed[block->linear_preds[0]]);
  683       assert(ctx.register_demand[block_idx].size() == block->instructions.size());
  684       std::vector<RegisterDemand> reg_demand;
  685       unsigned insert_idx = 0;
  686       unsigned pred_idx = block->linear_preds[0];
  687       RegisterDemand demand_before = get_demand_before(ctx, block_idx, 0);
  688 
  689       for (std::pair<Temp, std::pair<uint32_t, uint32_t>> live : ctx.next_use_distances_start[block_idx]) {
  690          if (!live.first.is_linear())
  691             continue;
  692          /* still spilled */
  693          if (ctx.spills_entry[block_idx].find(live.first) != ctx.spills_entry[block_idx].end())
  694             continue;
  695 
  696          /* in register at end of predecessor */
  697          if (ctx.spills_exit[pred_idx].find(live.first) == ctx.spills_exit[pred_idx].end()) {
  698             std::map<Temp, Temp>::iterator it = ctx.renames[pred_idx].find(live.first);
  699             if (it != ctx.renames[pred_idx].end())
  700                ctx.renames[block_idx].insert(*it);
  701             continue;
  702          }
  703 
  704          /* variable is spilled at predecessor and live at current block: create reload instruction */
  705          Temp new_name = {ctx.program->allocateId(), live.first.regClass()};
  706          aco_ptr<Instruction> reload = do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]);
  707          instructions.emplace_back(std::move(reload));
  708          reg_demand.push_back(demand_before);
  709          ctx.renames[block_idx][live.first] = new_name;
  710       }
  711 
  712       if (block->logical_preds.size() == 1) {
  713          do {
  714             assert(insert_idx < block->instructions.size());
  715             instructions.emplace_back(std::move(block->instructions[insert_idx]));
  716             reg_demand.push_back(ctx.register_demand[block_idx][insert_idx]);
  717             insert_idx++;
  718          } while (instructions.back()->opcode != aco_opcode::p_logical_start);
  719 
  720          unsigned pred_idx = block->logical_preds[0];
  721          for (std::pair<Temp, std::pair<uint32_t, uint32_t>> live : ctx.next_use_distances_start[block_idx]) {
  722             if (live.first.is_linear())
  723                continue;
  724             /* still spilled */
  725             if (ctx.spills_entry[block_idx].find(live.first) != ctx.spills_entry[block_idx].end())
  726                continue;
  727 
  728             /* in register at end of predecessor */
  729             if (ctx.spills_exit[pred_idx].find(live.first) == ctx.spills_exit[pred_idx].end()) {
  730                std::map<Temp, Temp>::iterator it = ctx.renames[pred_idx].find(live.first);
  731                if (it != ctx.renames[pred_idx].end())
  732                   ctx.renames[block_idx].insert(*it);
  733                continue;
  734             }
  735 
  736             /* variable is spilled at predecessor and live at current block: create reload instruction */
  737             Temp new_name = {ctx.program->allocateId(), live.first.regClass()};
  738             aco_ptr<Instruction> reload = do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]);
  739             instructions.emplace_back(std::move(reload));
  740             reg_demand.emplace_back(reg_demand.back());
  741             ctx.renames[block_idx][live.first] = new_name;
  742          }
  743       }
  744 
  745       /* combine new reload instructions with original block */
  746       if (!instructions.empty()) {
  747          reg_demand.insert(reg_demand.end(), std::next(ctx.register_demand[block->index].begin(), insert_idx),
  748                            ctx.register_demand[block->index].end());
  749          ctx.register_demand[block_idx] = std::move(reg_demand);
  750          instructions.insert(instructions.end(),
  751                              std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(std::next(block->instructions.begin(), insert_idx)),
  752                              std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(block->instructions.end()));
  753          block->instructions = std::move(instructions);
  754       }
  755       return;
  756    }
  757 
  758    /* loop header and merge blocks: check if all (linear) predecessors have been processed */
  759    for (ASSERTED unsigned pred : block->linear_preds)
  760       assert(ctx.processed[pred]);
  761 
  762    /* iterate the phi nodes for which operands to spill at the predecessor */
  763    for (aco_ptr<Instruction>& phi : block->instructions) {
  764       if (phi->opcode != aco_opcode::p_phi &&
  765           phi->opcode != aco_opcode::p_linear_phi)
  766          break;
  767 
  768       /* if the phi is not spilled, add to instructions */
  769       if (ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) == ctx.spills_entry[block_idx].end()) {
  770          instructions.emplace_back(std::move(phi));
  771          continue;
  772       }
  773 
  774       std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
  775       uint32_t def_spill_id = ctx.spills_entry[block_idx][phi->definitions[0].getTemp()];
  776 
  777       for (unsigned i = 0; i < phi->operands.size(); i++) {
  778          if (phi->operands[i].isUndefined())
  779             continue;
  780 
  781          unsigned pred_idx = preds[i];
  782          assert(phi->operands[i].isTemp() && phi->operands[i].isKill());
  783          Temp var = phi->operands[i].getTemp();
  784 
  785          /* build interferences between the phi def and all spilled variables at the predecessor blocks */
  786          for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
  787             if (var == pair.first)
  788                continue;
  789             ctx.interferences[def_spill_id].second.emplace(pair.second);
  790             ctx.interferences[pair.second].second.emplace(def_spill_id);
  791          }
  792 
  793          /* check if variable is already spilled at predecessor */
  794          std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(var);
  795          if (spilled != ctx.spills_exit[pred_idx].end()) {
  796             if (spilled->second != def_spill_id)
  797                ctx.add_affinity(def_spill_id, spilled->second);
  798             continue;
  799          }
  800 
  801          /* rename if necessary */
  802          std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
  803          if (rename_it != ctx.renames[pred_idx].end()) {
  804             var = rename_it->second;
  805             ctx.renames[pred_idx].erase(rename_it);
  806          }
  807 
  808          uint32_t spill_id = ctx.allocate_spill_id(phi->definitions[0].regClass());
  809          ctx.add_affinity(def_spill_id, spill_id);
  810          aco_ptr<Pseudo_instruction> spill{create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
  811          spill->operands[0] = Operand(var);
  812          spill->operands[1] = Operand(spill_id);
  813          Block& pred = ctx.program->blocks[pred_idx];
  814          unsigned idx = pred.instructions.size();
  815          do {
  816             assert(idx != 0);
  817             idx--;
  818          } while (phi->opcode == aco_opcode::p_phi && pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
  819          std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
  820          pred.instructions.insert(it, std::move(spill));
  821          ctx.spills_exit[pred_idx][phi->operands[i].getTemp()] = spill_id;
  822       }
  823 
  824       /* remove phi from instructions */
  825       phi.reset();
  826    }
  827 
  828    /* iterate all (other) spilled variables for which to spill at the predecessor */
  829    // TODO: would be better to have them sorted: first vgprs and first with longest distance
  830    for (std::pair<Temp, uint32_t> pair : ctx.spills_entry[block_idx]) {
  831       std::vector<unsigned> preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds;
  832 
  833       for (unsigned pred_idx : preds) {
  834          /* variable is already spilled at predecessor */
  835          std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(pair.first);
  836          if (spilled != ctx.spills_exit[pred_idx].end()) {
  837             if (spilled->second != pair.second)
  838                ctx.add_affinity(pair.second, spilled->second);
  839             continue;
  840          }
  841 
  842          /* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */
  843          if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end())
  844             continue;
  845 
  846          /* add interferences between spilled variable and predecessors exit spills */
  847          for (std::pair<Temp, uint32_t> exit_spill : ctx.spills_exit[pred_idx]) {
  848             if (exit_spill.first == pair.first)
  849                continue;
  850             ctx.interferences[exit_spill.second].second.emplace(pair.second);
  851             ctx.interferences[pair.second].second.emplace(exit_spill.second);
  852          }
  853 
  854          /* variable is in register at predecessor and has to be spilled */
  855          /* rename if necessary */
  856          Temp var = pair.first;
  857          std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
  858          if (rename_it != ctx.renames[pred_idx].end()) {
  859             var = rename_it->second;
  860             ctx.renames[pred_idx].erase(rename_it);
  861          }
  862 
  863          aco_ptr<Pseudo_instruction> spill{create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
  864          spill->operands[0] = Operand(var);
  865          spill->operands[1] = Operand(pair.second);
  866          Block& pred = ctx.program->blocks[pred_idx];
  867          unsigned idx = pred.instructions.size();
  868          do {
  869             assert(idx != 0);
  870             idx--;
  871          } while (pair.first.type() == RegType::vgpr && pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
  872          std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
  873          pred.instructions.insert(it, std::move(spill));
  874          ctx.spills_exit[pred.index][pair.first] = pair.second;
  875       }
  876    }
  877 
  878    /* iterate phis for which operands to reload */
  879    for (aco_ptr<Instruction>& phi : instructions) {
  880       assert(phi->opcode == aco_opcode::p_phi || phi->opcode == aco_opcode::p_linear_phi);
  881       assert(ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) == ctx.spills_entry[block_idx].end());
  882 
  883       std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
  884       for (unsigned i = 0; i < phi->operands.size(); i++) {
  885          if (!phi->operands[i].isTemp())
  886             continue;
  887          unsigned pred_idx = preds[i];
  888 
  889          /* rename operand */
  890          if (ctx.spills_exit[pred_idx].find(phi->operands[i].getTemp()) == ctx.spills_exit[pred_idx].end()) {
  891             std::map<Temp, Temp>::iterator it = ctx.renames[pred_idx].find(phi->operands[i].getTemp());
  892             if (it != ctx.renames[pred_idx].end())
  893                phi->operands[i].setTemp(it->second);
  894             continue;
  895          }
  896 
  897          Temp tmp = phi->operands[i].getTemp();
  898 
  899          /* reload phi operand at end of predecessor block */
  900          Temp new_name = {ctx.program->allocateId(), tmp.regClass()};
  901          Block& pred = ctx.program->blocks[pred_idx];
  902          unsigned idx = pred.instructions.size();
  903          do {
  904             assert(idx != 0);
  905             idx--;
  906          } while (phi->opcode == aco_opcode::p_phi && pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
  907          std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
  908 
  909          aco_ptr<Instruction> reload = do_reload(ctx, tmp, new_name, ctx.spills_exit[pred_idx][tmp]);
  910          pred.instructions.insert(it, std::move(reload));
  911 
  912          ctx.spills_exit[pred_idx].erase(tmp);
  913          ctx.renames[pred_idx][tmp] = new_name;
  914          phi->operands[i].setTemp(new_name);
  915       }
  916    }
  917 
  918    /* iterate live variables for which to reload */
  919    // TODO: reload at current block if variable is spilled on all predecessors
  920    for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_start[block_idx]) {
  921       /* skip spilled variables */
  922       if (ctx.spills_entry[block_idx].find(pair.first) != ctx.spills_entry[block_idx].end())
  923          continue;
  924       std::vector<unsigned> preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds;
  925 
  926       /* variable is dead at predecessor, it must be from a phi */
  927       bool is_dead = false;
  928       for (unsigned pred_idx : preds) {
  929          if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end())
  930             is_dead = true;
  931       }
  932       if (is_dead)
  933          continue;
  934       for (unsigned pred_idx : preds) {
  935          /* the variable is not spilled at the predecessor */
  936          if (ctx.spills_exit[pred_idx].find(pair.first) == ctx.spills_exit[pred_idx].end())
  937             continue;
  938 
  939          /* variable is spilled at predecessor and has to be reloaded */
  940          Temp new_name = {ctx.program->allocateId(), pair.first.regClass()};
  941          Block& pred = ctx.program->blocks[pred_idx];
  942          unsigned idx = pred.instructions.size();
  943          do {
  944             assert(idx != 0);
  945             idx--;
  946          } while (pair.first.type() == RegType::vgpr && pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
  947          std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
  948 
  949          aco_ptr<Instruction> reload = do_reload(ctx, pair.first, new_name, ctx.spills_exit[pred.index][pair.first]);
  950          pred.instructions.insert(it, std::move(reload));
  951 
  952          ctx.spills_exit[pred.index].erase(pair.first);
  953          ctx.renames[pred.index][pair.first] = new_name;
  954       }
  955 
  956       /* check if we have to create a new phi for this variable */
  957       Temp rename = Temp();
  958       bool is_same = true;
  959       for (unsigned pred_idx : preds) {
  960          if (ctx.renames[pred_idx].find(pair.first) == ctx.renames[pred_idx].end()) {
  961             if (rename == Temp())
  962                rename = pair.first;
  963             else
  964                is_same = rename == pair.first;
  965          } else {
  966             if (rename == Temp())
  967                rename = ctx.renames[pred_idx][pair.first];
  968             else
  969                is_same = rename == ctx.renames[pred_idx][pair.first];
  970          }
  971 
  972          if (!is_same)
  973             break;
  974       }
  975 
  976       if (!is_same) {
  977          /* the variable was renamed differently in the predecessors: we have to create a phi */
  978          aco_opcode opcode = pair.first.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
  979          aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
  980          rename = {ctx.program->allocateId(), pair.first.regClass()};
  981          for (unsigned i = 0; i < phi->operands.size(); i++) {
  982             Temp tmp;
  983             if (ctx.renames[preds[i]].find(pair.first) != ctx.renames[preds[i]].end())
  984                tmp = ctx.renames[preds[i]][pair.first];
  985             else if (preds[i] >= block_idx)
  986                tmp = rename;
  987             else
  988                tmp = pair.first;
  989             phi->operands[i] = Operand(tmp);
  990          }
  991          phi->definitions[0] = Definition(rename);
  992          instructions.emplace_back(std::move(phi));
  993       }
  994 
  995       /* the variable was renamed: add new name to renames */
  996       if (!(rename == Temp() || rename == pair.first))
  997          ctx.renames[block_idx][pair.first] = rename;
  998    }
  999 
 1000    /* combine phis with instructions */
 1001    unsigned idx = 0;
 1002    while (!block->instructions[idx]) {
 1003       idx++;
 1004    }
 1005 
 1006    if (!ctx.processed[block_idx]) {
 1007       assert(!(block->kind & block_kind_loop_header));
 1008       RegisterDemand demand_before = get_demand_before(ctx, block_idx, idx);
 1009       ctx.register_demand[block->index].erase(ctx.register_demand[block->index].begin(), ctx.register_demand[block->index].begin() + idx);
 1010       ctx.register_demand[block->index].insert(ctx.register_demand[block->index].begin(), instructions.size(), demand_before);
 1011    }
 1012 
 1013    std::vector<aco_ptr<Instruction>>::iterator start = std::next(block->instructions.begin(), idx);
 1014    instructions.insert(instructions.end(), std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(start),
 1015                std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(block->instructions.end()));
 1016    block->instructions = std::move(instructions);
 1017 }
 1018 
 1019 void process_block(spill_ctx& ctx, unsigned block_idx, Block* block,
 1020                    std::map<Temp, uint32_t> &current_spills, RegisterDemand spilled_registers)
 1021 {
 1022    assert(!ctx.processed[block_idx]);
 1023 
 1024    std::vector<std::map<Temp, uint32_t>> local_next_use_distance;
 1025    std::vector<aco_ptr<Instruction>> instructions;
 1026    unsigned idx = 0;
 1027 
 1028    /* phis are handled separetely */
 1029    while (block->instructions[idx]->opcode == aco_opcode::p_phi ||
 1030           block->instructions[idx]->opcode == aco_opcode::p_linear_phi) {
 1031       aco_ptr<Instruction>& instr = block->instructions[idx];
 1032       for (const Operand& op : instr->operands) {
 1033          /* prevent it's definining instruction from being DCE'd if it could be rematerialized */
 1034          if (op.isTemp() && ctx.remat.count(op.getTemp()))
 1035             ctx.remat_used[ctx.remat[op.getTemp()].instr] = true;
 1036       }
 1037       instructions.emplace_back(std::move(instr));
 1038       idx++;
 1039    }
 1040 
 1041    if (block->register_demand.exceeds(ctx.target_pressure))
 1042       local_next_use_distance = local_next_uses(ctx, block);
 1043 
 1044    while (idx < block->instructions.size()) {
 1045       aco_ptr<Instruction>& instr = block->instructions[idx];
 1046 
 1047       std::map<Temp, std::pair<Temp, uint32_t>> reloads;
 1048       std::map<Temp, uint32_t> spills;
 1049       /* rename and reload operands */
 1050       for (Operand& op : instr->operands) {
 1051          if (!op.isTemp())
 1052             continue;
 1053          if (current_spills.find(op.getTemp()) == current_spills.end()) {
 1054             /* the Operand is in register: check if it was renamed */
 1055             if (ctx.renames[block_idx].find(op.getTemp()) != ctx.renames[block_idx].end())
 1056                op.setTemp(ctx.renames[block_idx][op.getTemp()]);
 1057             /* prevent it's definining instruction from being DCE'd if it could be rematerialized */
 1058             if (ctx.remat.count(op.getTemp()))
 1059                ctx.remat_used[ctx.remat[op.getTemp()].instr] = true;
 1060             continue;
 1061          }
 1062          /* the Operand is spilled: add it to reloads */
 1063          Temp new_tmp = {ctx.program->allocateId(), op.regClass()};
 1064          ctx.renames[block_idx][op.getTemp()] = new_tmp;
 1065          reloads[new_tmp] = std::make_pair(op.getTemp(), current_spills[op.getTemp()]);
 1066          current_spills.erase(op.getTemp());
 1067          op.setTemp(new_tmp);
 1068          spilled_registers -= new_tmp;
 1069       }
 1070 
 1071       /* check if register demand is low enough before and after the current instruction */
 1072       if (block->register_demand.exceeds(ctx.target_pressure)) {
 1073 
 1074          RegisterDemand new_demand = ctx.register_demand[block_idx][idx];
 1075          new_demand.update(get_demand_before(ctx, block_idx, idx));
 1076 
 1077          assert(!local_next_use_distance.empty());
 1078 
 1079          /* if reg pressure is too high, spill variable with furthest next use */
 1080          while (RegisterDemand(new_demand - spilled_registers).exceeds(ctx.target_pressure)) {
 1081             unsigned distance = 0;
 1082             Temp to_spill;
 1083             bool do_rematerialize = false;
 1084             if (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr) {
 1085                for (std::pair<Temp, uint32_t> pair : local_next_use_distance[idx]) {
 1086                   bool can_rematerialize = ctx.remat.count(pair.first);
 1087                   if (pair.first.type() == RegType::vgpr &&
 1088                       ((pair.second > distance && can_rematerialize == do_rematerialize) ||
 1089                        (can_rematerialize && !do_rematerialize && pair.second > idx)) &&
 1090                       current_spills.find(pair.first) == current_spills.end() &&
 1091                       ctx.spills_exit[block_idx].find(pair.first) == ctx.spills_exit[block_idx].end()) {
 1092                      to_spill = pair.first;
 1093                      distance = pair.second;
 1094                      do_rematerialize = can_rematerialize;
 1095                   }
 1096                }
 1097             } else {
 1098                for (std::pair<Temp, uint32_t> pair : local_next_use_distance[idx]) {
 1099                   bool can_rematerialize = ctx.remat.count(pair.first);
 1100                   if (pair.first.type() == RegType::sgpr &&
 1101                       ((pair.second > distance && can_rematerialize == do_rematerialize) ||
 1102                        (can_rematerialize && !do_rematerialize && pair.second > idx)) &&
 1103                       current_spills.find(pair.first) == current_spills.end() &&
 1104                       ctx.spills_exit[block_idx].find(pair.first) == ctx.spills_exit[block_idx].end()) {
 1105                      to_spill = pair.first;
 1106                      distance = pair.second;
 1107                      do_rematerialize = can_rematerialize;
 1108                   }
 1109                }
 1110             }
 1111 
 1112             assert(distance != 0 && distance > idx);
 1113             uint32_t spill_id = ctx.allocate_spill_id(to_spill.regClass());
 1114 
 1115             /* add interferences with currently spilled variables */
 1116             for (std::pair<Temp, uint32_t> pair : current_spills) {
 1117                ctx.interferences[spill_id].second.emplace(pair.second);
 1118                ctx.interferences[pair.second].second.emplace(spill_id);
 1119             }
 1120             for (std::pair<Temp, std::pair<Temp, uint32_t>> pair : reloads) {
 1121                ctx.interferences[spill_id].second.emplace(pair.second.second);
 1122                ctx.interferences[pair.second.second].second.emplace(spill_id);
 1123             }
 1124 
 1125             current_spills[to_spill] = spill_id;
 1126             spilled_registers += to_spill;
 1127 
 1128             /* rename if necessary */
 1129             if (ctx.renames[block_idx].find(to_spill) != ctx.renames[block_idx].end()) {
 1130                to_spill = ctx.renames[block_idx][to_spill];
 1131             }
 1132 
 1133             /* add spill to new instructions */
 1134             aco_ptr<Pseudo_instruction> spill{create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
 1135             spill->operands[0] = Operand(to_spill);
 1136             spill->operands[1] = Operand(spill_id);
 1137             instructions.emplace_back(std::move(spill));
 1138          }
 1139       }
 1140 
 1141       /* add reloads and instruction to new instructions */
 1142       for (std::pair<Temp, std::pair<Temp, uint32_t>> pair : reloads) {
 1143          aco_ptr<Instruction> reload = do_reload(ctx, pair.second.first, pair.first, pair.second.second);
 1144          instructions.emplace_back(std::move(reload));
 1145       }
 1146       instructions.emplace_back(std::move(instr));
 1147       idx++;
 1148    }
 1149 
 1150    block->instructions = std::move(instructions);
 1151    ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end());
 1152 }
 1153 
 1154 void spill_block(spill_ctx& ctx, unsigned block_idx)
 1155 {
 1156    Block* block = &ctx.program->blocks[block_idx];
 1157 
 1158    /* determine set of variables which are spilled at the beginning of the block */
 1159    RegisterDemand spilled_registers = init_live_in_vars(ctx, block, block_idx);
 1160 
 1161    /* add interferences for spilled variables */
 1162    for (std::pair<Temp, uint32_t> x : ctx.spills_entry[block_idx]) {
 1163       for (std::pair<Temp, uint32_t> y : ctx.spills_entry[block_idx])
 1164          if (x.second != y.second)
 1165             ctx.interferences[x.second].second.emplace(y.second);
 1166    }
 1167 
 1168    bool is_loop_header = block->loop_nest_depth && ctx.loop_header.top()->index == block_idx;
 1169    if (!is_loop_header) {
 1170       /* add spill/reload code on incoming control flow edges */
 1171       add_coupling_code(ctx, block, block_idx);
 1172    }
 1173 
 1174    std::map<Temp, uint32_t> current_spills = ctx.spills_entry[block_idx];
 1175 
 1176    /* check conditions to process this block */
 1177    bool process = RegisterDemand(block->register_demand - spilled_registers).exceeds(ctx.target_pressure) ||
 1178                   !ctx.renames[block_idx].empty() ||
 1179                   ctx.remat_used.size();
 1180 
 1181    std::map<Temp, uint32_t>::iterator it = current_spills.begin();
 1182    while (!process && it != current_spills.end()) {
 1183       if (ctx.next_use_distances_start[block_idx][it->first].first == block_idx)
 1184          process = true;
 1185       ++it;
 1186    }
 1187 
 1188    if (process)
 1189       process_block(ctx, block_idx, block, current_spills, spilled_registers);
 1190    else
 1191       ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end());
 1192 
 1193    ctx.processed[block_idx] = true;
 1194 
 1195    /* check if the next block leaves the current loop */
 1196    if (block->loop_nest_depth == 0 || ctx.program->blocks[block_idx + 1].loop_nest_depth >= block->loop_nest_depth)
 1197       return;
 1198 
 1199    Block* loop_header = ctx.loop_header.top();
 1200 
 1201    /* preserve original renames at end of loop header block */
 1202    std::map<Temp, Temp> renames = std::move(ctx.renames[loop_header->index]);
 1203 
 1204    /* add coupling code to all loop header predecessors */
 1205    add_coupling_code(ctx, loop_header, loop_header->index);
 1206 
 1207    /* update remat_used for phis added in add_coupling_code() */
 1208    for (aco_ptr<Instruction>& instr : loop_header->instructions) {
 1209       if (!is_phi(instr))
 1210          break;
 1211       for (const Operand& op : instr->operands) {
 1212          if (op.isTemp() && ctx.remat.count(op.getTemp()))
 1213             ctx.remat_used[ctx.remat[op.getTemp()].instr] = true;
 1214       }
 1215    }
 1216 
 1217    /* propagate new renames through loop: i.e. repair the SSA */
 1218    renames.swap(ctx.renames[loop_header->index]);
 1219    for (std::pair<Temp, Temp> rename : renames) {
 1220       for (unsigned idx = loop_header->index; idx <= block_idx; idx++) {
 1221          Block& current = ctx.program->blocks[idx];
 1222          std::vector<aco_ptr<Instruction>>::iterator instr_it = current.instructions.begin();
 1223 
 1224          /* first rename phis */
 1225          while (instr_it != current.instructions.end()) {
 1226             aco_ptr<Instruction>& phi = *instr_it;
 1227             if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
 1228                break;
 1229             /* no need to rename the loop header phis once again. this happened in add_coupling_code() */
 1230             if (idx == loop_header->index) {
 1231                instr_it++;
 1232                continue;
 1233             }
 1234 
 1235             for (Operand& op : phi->operands) {
 1236                if (!op.isTemp())
 1237                   continue;
 1238                if (op.getTemp() == rename.first)
 1239                   op.setTemp(rename.second);
 1240             }
 1241             instr_it++;
 1242          }
 1243 
 1244          std::map<Temp, std::pair<uint32_t, uint32_t>>::iterator it = ctx.next_use_distances_start[idx].find(rename.first);
 1245 
 1246          /* variable is not live at beginning of this block */
 1247          if (it == ctx.next_use_distances_start[idx].end())
 1248             continue;
 1249 
 1250          /* if the variable is live at the block's exit, add rename */
 1251          if (ctx.next_use_distances_end[idx].find(rename.first) != ctx.next_use_distances_end[idx].end())
 1252             ctx.renames[idx].insert(rename);
 1253 
 1254          /* rename all uses in this block */
 1255          bool renamed = false;
 1256          while (!renamed && instr_it != current.instructions.end()) {
 1257             aco_ptr<Instruction>& instr = *instr_it;
 1258             for (Operand& op : instr->operands) {
 1259                if (!op.isTemp())
 1260                   continue;
 1261                if (op.getTemp() == rename.first) {
 1262                   op.setTemp(rename.second);
 1263                   /* we can stop with this block as soon as the variable is spilled */
 1264                   if (instr->opcode == aco_opcode::p_spill)
 1265                     renamed = true;
 1266                }
 1267             }
 1268             instr_it++;
 1269          }
 1270       }
 1271    }
 1272 
 1273    /* remove loop header info from stack */
 1274    ctx.loop_header.pop();
 1275 }
 1276 
 1277 Temp load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset,
 1278                            std::vector<aco_ptr<Instruction>>& instructions,
 1279                            unsigned offset, bool is_top_level)
 1280 {
 1281    Builder bld(ctx.program);
 1282    if (is_top_level) {
 1283       bld.reset(&instructions);
 1284    } else {
 1285       /* find p_logical_end */
 1286       unsigned idx = instructions.size() - 1;
 1287       while (instructions[idx]->opcode != aco_opcode::p_logical_end)
 1288          idx--;
 1289       bld.reset(&instructions, std::next(instructions.begin(), idx));
 1290    }
 1291 
 1292    Temp private_segment_buffer = ctx.program->private_segment_buffer;
 1293    if (ctx.program->stage != compute_cs)
 1294       private_segment_buffer = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand(0u));
 1295 
 1296    if (offset)
 1297       scratch_offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), scratch_offset, Operand(offset));
 1298 
 1299    uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
 1300                         S_008F0C_INDEX_STRIDE(ctx.program->wave_size == 64 ? 3 : 2);
 1301 
 1302    if (ctx.program->chip_class >= GFX10) {
 1303       rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
 1304                    S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
 1305                    S_008F0C_RESOURCE_LEVEL(1);
 1306    } else if (ctx.program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
 1307       rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
 1308                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
 1309    }
 1310    /* older generations need element size = 4 bytes. element size removed in GFX9 */
 1311    if (ctx.program->chip_class <= GFX8)
 1312       rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
 1313 
 1314    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
 1315                      private_segment_buffer, Operand(-1u),
 1316                      Operand(rsrc_conf));
 1317 }
 1318 
 1319 void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
 1320    std::map<uint32_t, uint32_t> sgpr_slot;
 1321    std::map<uint32_t, uint32_t> vgpr_slot;
 1322    std::vector<bool> is_assigned(ctx.interferences.size());
 1323 
 1324    /* first, handle affinities: just merge all interferences into both spill ids */
 1325    for (std::vector<uint32_t>& vec : ctx.affinities) {
 1326       for (unsigned i = 0; i < vec.size(); i++) {
 1327          for (unsigned j = i + 1; j < vec.size(); j++) {
 1328             assert(vec[i] != vec[j]);
 1329             for (uint32_t id : ctx.interferences[vec[i]].second)
 1330                ctx.interferences[id].second.insert(vec[j]);
 1331             for (uint32_t id : ctx.interferences[vec[j]].second)
 1332                ctx.interferences[id].second.insert(vec[i]);
 1333             ctx.interferences[vec[i]].second.insert(ctx.interferences[vec[j]].second.begin(), ctx.interferences[vec[j]].second.end());
 1334             ctx.interferences[vec[j]].second.insert(ctx.interferences[vec[i]].second.begin(), ctx.interferences[vec[i]].second.end());
 1335 
 1336             bool reloaded = ctx.is_reloaded[vec[i]] || ctx.is_reloaded[vec[j]];
 1337             ctx.is_reloaded[vec[i]] = reloaded;
 1338             ctx.is_reloaded[vec[j]] = reloaded;
 1339          }
 1340       }
 1341    }
 1342    for (ASSERTED uint32_t i = 0; i < ctx.interferences.size(); i++)
 1343       for (ASSERTED uint32_t id : ctx.interferences[i].second)
 1344          assert(i != id);
 1345 
 1346    /* for each spill slot, assign as many spill ids as possible */
 1347    std::vector<std::set<uint32_t>> spill_slot_interferences;
 1348    unsigned slot_idx = 0;
 1349    bool done = false;
 1350 
 1351    /* assign sgpr spill slots */
 1352    while (!done) {
 1353       done = true;
 1354       for (unsigned id = 0; id < ctx.interferences.size(); id++) {
 1355          if (is_assigned[id] || !ctx.is_reloaded[id])
 1356             continue;
 1357          if (ctx.interferences[id].first.type() != RegType::sgpr)
 1358             continue;
 1359 
 1360          /* check interferences */
 1361          bool interferes = false;
 1362          for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++) {
 1363             if (i == spill_slot_interferences.size())
 1364                spill_slot_interferences.emplace_back(std::set<uint32_t>());
 1365             if (spill_slot_interferences[i].find(id) != spill_slot_interferences[i].end() || i / ctx.wave_size != slot_idx / ctx.wave_size) {
 1366                interferes = true;
 1367                break;
 1368             }
 1369          }
 1370          if (interferes) {
 1371             done = false;
 1372             continue;
 1373          }
 1374 
 1375          /* we found a spill id which can be assigned to current spill slot */
 1376          sgpr_slot[id] = slot_idx;
 1377          is_assigned[id] = true;
 1378          for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++)
 1379             spill_slot_interferences[i].insert(ctx.interferences[id].second.begin(), ctx.interferences[id].second.end());
 1380 
 1381          /* add all affinities: there are no additional interferences */
 1382          for (std::vector<uint32_t>& vec : ctx.affinities) {
 1383             bool found_affinity = false;
 1384             for (uint32_t entry : vec) {
 1385                if (entry == id) {
 1386                   found_affinity = true;
 1387                   break;
 1388                }
 1389             }
 1390             if (!found_affinity)
 1391                continue;
 1392             for (uint32_t entry : vec) {
 1393                sgpr_slot[entry] = slot_idx;
 1394                is_assigned[entry] = true;
 1395             }
 1396          }
 1397       }
 1398       slot_idx++;
 1399    }
 1400 
 1401    unsigned sgpr_spill_slots = spill_slot_interferences.size();
 1402    spill_slot_interferences.clear();
 1403    slot_idx = 0;
 1404    done = false;
 1405 
 1406    /* assign vgpr spill slots */
 1407    while (!done) {
 1408       done = true;
 1409       for (unsigned id = 0; id < ctx.interferences.size(); id++) {
 1410          if (is_assigned[id] || !ctx.is_reloaded[id])
 1411             continue;
 1412          if (ctx.interferences[id].first.type() != RegType::vgpr)
 1413             continue;
 1414 
 1415          /* check interferences */
 1416          bool interferes = false;
 1417          for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++) {
 1418             if (i == spill_slot_interferences.size())
 1419                spill_slot_interferences.emplace_back(std::set<uint32_t>());
 1420             /* check for interference and ensure that vector regs are stored next to each other */
 1421             if (spill_slot_interferences[i].find(id) != spill_slot_interferences[i].end()) {
 1422                interferes = true;
 1423                break;
 1424             }
 1425          }
 1426          if (interferes) {
 1427             done = false;
 1428             continue;
 1429          }
 1430 
 1431          /* we found a spill id which can be assigned to current spill slot */
 1432          vgpr_slot[id] = slot_idx;
 1433          is_assigned[id] = true;
 1434          for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++)
 1435             spill_slot_interferences[i].insert(ctx.interferences[id].second.begin(), ctx.interferences[id].second.end());
 1436 
 1437          /* add all affinities: there are no additional interferences */
 1438          for (std::vector<uint32_t>& vec : ctx.affinities) {
 1439             bool found_affinity = false;
 1440             for (uint32_t entry : vec) {
 1441                if (entry == id) {
 1442                   found_affinity = true;
 1443                   break;
 1444                }
 1445             }
 1446             if (!found_affinity)
 1447                continue;
 1448             for (uint32_t entry : vec) {
 1449                vgpr_slot[entry] = slot_idx;
 1450                is_assigned[entry] = true;
 1451             }
 1452          }
 1453       }
 1454       slot_idx++;
 1455    }
 1456 
 1457    unsigned vgpr_spill_slots = spill_slot_interferences.size();
 1458 
 1459    for (unsigned id = 0; id < is_assigned.size(); id++)
 1460       assert(is_assigned[id] || !ctx.is_reloaded[id]);
 1461 
 1462    for (std::vector<uint32_t>& vec : ctx.affinities) {
 1463       for (unsigned i = 0; i < vec.size(); i++) {
 1464          for (unsigned j = i + 1; j < vec.size(); j++) {
 1465             assert(is_assigned[vec[i]] == is_assigned[vec[j]]);
 1466             if (!is_assigned[vec[i]])
 1467                continue;
 1468             assert(ctx.is_reloaded[vec[i]] == ctx.is_reloaded[vec[j]]);
 1469             assert(ctx.interferences[vec[i]].first.type() == ctx.interferences[vec[j]].first.type());
 1470             if (ctx.interferences[vec[i]].first.type() == RegType::sgpr)
 1471                assert(sgpr_slot[vec[i]] == sgpr_slot[vec[j]]);
 1472             else
 1473                assert(vgpr_slot[vec[i]] == vgpr_slot[vec[j]]);
 1474          }
 1475       }
 1476    }
 1477 
 1478    /* hope, we didn't mess up */
 1479    std::vector<Temp> vgpr_spill_temps((sgpr_spill_slots + ctx.wave_size - 1) / ctx.wave_size);
 1480    assert(vgpr_spill_temps.size() <= spills_to_vgpr);
 1481 
 1482    /* replace pseudo instructions with actual hardware instructions */
 1483    Temp scratch_offset = ctx.program->scratch_offset, scratch_rsrc = Temp();
 1484    unsigned last_top_level_block_idx = 0;
 1485    std::vector<bool> reload_in_loop(vgpr_spill_temps.size());
 1486    for (Block& block : ctx.program->blocks) {
 1487 
 1488       /* after loops, we insert a user if there was a reload inside the loop */
 1489       if (block.loop_nest_depth == 0) {
 1490          int end_vgprs = 0;
 1491          for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {
 1492             if (reload_in_loop[i])
 1493                end_vgprs++;
 1494          }
 1495 
 1496          if (end_vgprs > 0) {
 1497             aco_ptr<Instruction> destr{create_instruction<Pseudo_instruction>(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, end_vgprs, 0)};
 1498             int k = 0;
 1499             for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {
 1500                if (reload_in_loop[i])
 1501                   destr->operands[k++] = Operand(vgpr_spill_temps[i]);
 1502                reload_in_loop[i] = false;
 1503             }
 1504             /* find insertion point */
 1505             std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
 1506             while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi)
 1507                ++it;
 1508             block.instructions.insert(it, std::move(destr));
 1509          }
 1510       }
 1511 
 1512       if (block.kind & block_kind_top_level && !block.linear_preds.empty()) {
 1513          last_top_level_block_idx = block.index;
 1514 
 1515          /* check if any spilled variables use a created linear vgpr, otherwise destroy them */
 1516          for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {
 1517             if (vgpr_spill_temps[i] == Temp())
 1518                continue;
 1519 
 1520             bool can_destroy = true;
 1521             for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[block.linear_preds[0]]) {
 1522 
 1523                if (sgpr_slot.find(pair.second) != sgpr_slot.end() &&
 1524                    sgpr_slot[pair.second] / ctx.wave_size == i) {
 1525                   can_destroy = false;
 1526                   break;
 1527                }
 1528             }
 1529             if (can_destroy)
 1530                vgpr_spill_temps[i] = Temp();
 1531          }
 1532       }
 1533 
 1534       std::vector<aco_ptr<Instruction>>::iterator it;
 1535       std::vector<aco_ptr<Instruction>> instructions;
 1536       instructions.reserve(block.instructions.size());
 1537       Builder bld(ctx.program, &instructions);
 1538       for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
 1539 
 1540          if ((*it)->opcode == aco_opcode::p_spill) {
 1541             uint32_t spill_id = (*it)->operands[1].constantValue();
 1542 
 1543             if (!ctx.is_reloaded[spill_id]) {
 1544                /* never reloaded, so don't spill */
 1545             } else if (vgpr_slot.find(spill_id) != vgpr_slot.end()) {
 1546                /* spill vgpr */
 1547                ctx.program->config->spilled_vgprs += (*it)->operands[0].size();
 1548                uint32_t spill_slot = vgpr_slot[spill_id];
 1549                bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + vgpr_spill_slots * 4 > 4096;
 1550                unsigned base_offset = add_offset_to_sgpr ? 0 : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;
 1551 
 1552                /* check if the scratch resource descriptor already exists */
 1553                if (scratch_rsrc == Temp()) {
 1554                   unsigned offset = add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0;
 1555                   scratch_rsrc = load_scratch_resource(ctx, scratch_offset,
 1556                                                        last_top_level_block_idx == block.index ?
 1557                                                        instructions : ctx.program->blocks[last_top_level_block_idx].instructions,
 1558                                                        offset,
 1559                                                        last_top_level_block_idx == block.index);
 1560                }
 1561 
 1562                unsigned offset = base_offset + spill_slot * 4;
 1563                aco_opcode opcode = aco_opcode::buffer_store_dword;
 1564                assert((*it)->operands[0].isTemp());
 1565                Temp temp = (*it)->operands[0].getTemp();
 1566                assert(temp.type() == RegType::vgpr && !temp.is_linear());
 1567                if (temp.size() > 1) {
 1568                   Instruction* split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())};
 1569                   split->operands[0] = Operand(temp);
 1570                   for (unsigned i = 0; i < temp.size(); i++)
 1571                      split->definitions[i] = bld.def(v1);
 1572                   bld.insert(split);
 1573                   for (unsigned i = 0; i < temp.size(); i++)
 1574                      bld.mubuf(opcode, scratch_rsrc, Operand(), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false);
 1575                } else {
 1576                   bld.mubuf(opcode, scratch_rsrc, Operand(), scratch_offset, temp, offset, false);
 1577                }
 1578             } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) {
 1579                ctx.program->config->spilled_sgprs += (*it)->operands[0].size();
 1580 
 1581                uint32_t spill_slot = sgpr_slot[spill_id];
 1582 
 1583                /* check if the linear vgpr already exists */
 1584                if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) {
 1585                   Temp linear_vgpr = {ctx.program->allocateId(), v1.as_linear()};
 1586                   vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr;
 1587                   aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
 1588                   create->definitions[0] = Definition(linear_vgpr);
 1589                   /* find the right place to insert this definition */
 1590                   if (last_top_level_block_idx == block.index) {
 1591                      /* insert right before the current instruction */
 1592                      instructions.emplace_back(std::move(create));
 1593                   } else {
 1594                      assert(last_top_level_block_idx < block.index);
 1595                      /* insert before the branch at last top level block */
 1596                      std::vector<aco_ptr<Instruction>>& instructions = ctx.program->blocks[last_top_level_block_idx].instructions;
 1597                      instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create));
 1598                   }
 1599                }
 1600 
 1601                /* spill sgpr: just add the vgpr temp to operands */
 1602                Pseudo_instruction* spill = create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 3, 0);
 1603                spill->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
 1604                spill->operands[1] = Operand(spill_slot % ctx.wave_size);
 1605                spill->operands[2] = (*it)->operands[0];
 1606                instructions.emplace_back(aco_ptr<Instruction>(spill));
 1607             } else {
 1608                unreachable("No spill slot assigned for spill id");
 1609             }
 1610 
 1611          } else if ((*it)->opcode == aco_opcode::p_reload) {
 1612             uint32_t spill_id = (*it)->operands[0].constantValue();
 1613             assert(ctx.is_reloaded[spill_id]);
 1614 
 1615             if (vgpr_slot.find(spill_id) != vgpr_slot.end()) {
 1616                /* reload vgpr */
 1617                uint32_t spill_slot = vgpr_slot[spill_id];
 1618                bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + vgpr_spill_slots * 4 > 4096;
 1619                unsigned base_offset = add_offset_to_sgpr ? 0 : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;
 1620 
 1621                /* check if the scratch resource descriptor already exists */
 1622                if (scratch_rsrc == Temp()) {
 1623                   unsigned offset = add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0;
 1624                   scratch_rsrc = load_scratch_resource(ctx, scratch_offset,
 1625                                                        last_top_level_block_idx == block.index ?
 1626                                                        instructions : ctx.program->blocks[last_top_level_block_idx].instructions,
 1627                                                        offset,
 1628                                                        last_top_level_block_idx == block.index);
 1629                }
 1630 
 1631                unsigned offset = base_offset + spill_slot * 4;
 1632                aco_opcode opcode = aco_opcode::buffer_load_dword;
 1633                Definition def = (*it)->definitions[0];
 1634                if (def.size() > 1) {
 1635                   Instruction* vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, def.size(), 1)};
 1636                   vec->definitions[0] = def;
 1637                   for (unsigned i = 0; i < def.size(); i++) {
 1638                      Temp tmp = bld.tmp(v1);
 1639                      vec->operands[i] = Operand(tmp);
 1640                      bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(), scratch_offset, offset + i * 4, false);
 1641                   }
 1642                   bld.insert(vec);
 1643                } else {
 1644                   bld.mubuf(opcode, def, scratch_rsrc, Operand(), scratch_offset, offset, false);
 1645                }
 1646             } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) {
 1647                uint32_t spill_slot = sgpr_slot[spill_id];
 1648                reload_in_loop[spill_slot / ctx.wave_size] = block.loop_nest_depth > 0;
 1649 
 1650                /* check if the linear vgpr already exists */
 1651                if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) {
 1652                   Temp linear_vgpr = {ctx.program->allocateId(), v1.as_linear()};
 1653                   vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr;
 1654                   aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
 1655                   create->definitions[0] = Definition(linear_vgpr);
 1656                   /* find the right place to insert this definition */
 1657                   if (last_top_level_block_idx == block.index) {
 1658                      /* insert right before the current instruction */
 1659                      instructions.emplace_back(std::move(create));
 1660                   } else {
 1661                      assert(last_top_level_block_idx < block.index);
 1662                      /* insert before the branch at last top level block */
 1663                      std::vector<aco_ptr<Instruction>>& instructions = ctx.program->blocks[last_top_level_block_idx].instructions;
 1664                      instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create));
 1665                   }
 1666                }
 1667 
 1668                /* reload sgpr: just add the vgpr temp to operands */
 1669                Pseudo_instruction* reload = create_instruction<Pseudo_instruction>(aco_opcode::p_reload, Format::PSEUDO, 2, 1);
 1670                reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
 1671                reload->operands[1] = Operand(spill_slot % ctx.wave_size);
 1672                reload->definitions[0] = (*it)->definitions[0];
 1673                instructions.emplace_back(aco_ptr<Instruction>(reload));
 1674             } else {
 1675                unreachable("No spill slot assigned for spill id");
 1676             }
 1677          } else if (!ctx.remat_used.count(it->get()) || ctx.remat_used[it->get()]) {
 1678             instructions.emplace_back(std::move(*it));
 1679          }
 1680 
 1681       }
 1682       block.instructions = std::move(instructions);
 1683    }
 1684 
 1685    /* update required scratch memory */
 1686    ctx.program->config->scratch_bytes_per_wave += align(vgpr_spill_slots * 4 * ctx.program->wave_size, 1024);
 1687 
 1688    /* SSA elimination inserts copies for logical phis right before p_logical_end
 1689     * So if a linear vgpr is used between that p_logical_end and the branch,
 1690     * we need to ensure logical phis don't choose a definition which aliases
 1691     * the linear vgpr.
 1692     * TODO: Moving the spills and reloads to before p_logical_end might produce
 1693     *       slightly better code. */
 1694    for (Block& block : ctx.program->blocks) {
 1695       /* loops exits are already handled */
 1696       if (block.logical_preds.size() <= 1)
 1697          continue;
 1698 
 1699       bool has_logical_phis = false;
 1700       for (aco_ptr<Instruction>& instr : block.instructions) {
 1701          if (instr->opcode == aco_opcode::p_phi) {
 1702             has_logical_phis = true;
 1703             break;
 1704          } else if (instr->opcode != aco_opcode::p_linear_phi) {
 1705             break;
 1706          }
 1707       }
 1708       if (!has_logical_phis)
 1709          continue;
 1710 
 1711       std::set<Temp> vgprs;
 1712       for (unsigned pred_idx : block.logical_preds) {
 1713          Block& pred = ctx.program->blocks[pred_idx];
 1714          for (int i = pred.instructions.size() - 1; i >= 0; i--) {
 1715             aco_ptr<Instruction>& pred_instr = pred.instructions[i];
 1716             if (pred_instr->opcode == aco_opcode::p_logical_end) {
 1717                break;
 1718             } else if (pred_instr->opcode == aco_opcode::p_spill ||
 1719                        pred_instr->opcode == aco_opcode::p_reload) {
 1720                vgprs.insert(pred_instr->operands[0].getTemp());
 1721             }
 1722          }
 1723       }
 1724       if (!vgprs.size())
 1725          continue;
 1726 
 1727       aco_ptr<Instruction> destr{create_instruction<Pseudo_instruction>(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vgprs.size(), 0)};
 1728       int k = 0;
 1729       for (Temp tmp : vgprs) {
 1730          destr->operands[k++] = Operand(tmp);
 1731       }
 1732       /* find insertion point */
 1733       std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
 1734       while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi)
 1735          ++it;
 1736       block.instructions.insert(it, std::move(destr));
 1737    }
 1738 }
 1739 
 1740 } /* end namespace */
 1741 
 1742 
 1743 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options)
 1744 {
 1745    program->config->spilled_vgprs = 0;
 1746    program->config->spilled_sgprs = 0;
 1747 
 1748    /* no spilling when register pressure is low enough */
 1749    if (program->num_waves > 0)
 1750       return;
 1751 
 1752    /* lower to CSSA before spilling to ensure correctness w.r.t. phis */
 1753    lower_to_cssa(program, live_vars, options);
 1754 
 1755    /* calculate target register demand */
 1756    RegisterDemand register_target = program->max_reg_demand;
 1757    if (register_target.sgpr > program->sgpr_limit)
 1758       register_target.vgpr += (register_target.sgpr - program->sgpr_limit + program->wave_size - 1 + 32) / program->wave_size;
 1759    register_target.sgpr = program->sgpr_limit;
 1760 
 1761    if (register_target.vgpr > program->vgpr_limit)
 1762       register_target.sgpr = program->sgpr_limit - 5;
 1763    int spills_to_vgpr = (program->max_reg_demand.sgpr - register_target.sgpr + program->wave_size - 1 + 32) / program->wave_size;
 1764    register_target.vgpr = program->vgpr_limit - spills_to_vgpr;
 1765 
 1766    /* initialize ctx */
 1767    spill_ctx ctx(register_target, program, live_vars.register_demand);
 1768    compute_global_next_uses(ctx);
 1769    get_rematerialize_info(ctx);
 1770 
 1771    /* create spills and reloads */
 1772    for (unsigned i = 0; i < program->blocks.size(); i++)
 1773       spill_block(ctx, i);
 1774 
 1775    /* assign spill slots and DCE rematerialized code */
 1776    assign_spill_slots(ctx, spills_to_vgpr);
 1777 
 1778    /* update live variable information */
 1779    live_vars = live_var_analysis(program, options);
 1780 
 1781    assert(program->num_waves > 0);
 1782 }
 1783 
 1784 }
 1785