"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/amd/compiler/aco_ir.h" (16 Sep 2020, 48526 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "aco_ir.h" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 20.2.0-rc1_vs_20.2.0-rc2.

    1 /*
    2  * Copyright © 2018 Valve Corporation
    3  *
    4  * Permission is hereby granted, free of charge, to any person obtaining a
    5  * copy of this software and associated documentation files (the "Software"),
    6  * to deal in the Software without restriction, including without limitation
    7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
    8  * and/or sell copies of the Software, and to permit persons to whom the
    9  * Software is furnished to do so, subject to the following conditions:
   10  *
   11  * The above copyright notice and this permission notice (including the next
   12  * paragraph) shall be included in all copies or substantial portions of the
   13  * Software.
   14  *
   15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
   18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
   20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
   21  * IN THE SOFTWARE.
   22  *
   23  */
   24 
   25 #ifndef ACO_IR_H
   26 #define ACO_IR_H
   27 
   28 #include <vector>
   29 #include <set>
   30 #include <unordered_set>
   31 #include <bitset>
   32 #include <memory>
   33 
   34 #include "nir.h"
   35 #include "ac_binary.h"
   36 #include "amd_family.h"
   37 #include "aco_opcodes.h"
   38 #include "aco_util.h"
   39 
   40 struct radv_nir_compiler_options;
   41 struct radv_shader_args;
   42 struct radv_shader_info;
   43 
   44 namespace aco {
   45 
   46 extern uint64_t debug_flags;
   47 
   48 enum {
   49    DEBUG_VALIDATE = 0x1,
   50    DEBUG_VALIDATE_RA = 0x2,
   51    DEBUG_PERFWARN = 0x4,
   52 };
   53 
   54 /**
   55  * Representation of the instruction's microcode encoding format
   56  * Note: Some Vector ALU Formats can be combined, such that:
   57  * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
   58  * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
   59  * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
   60  *
   61  * (*) The same is applicable for VOP1 and VOPC instructions.
   62  */
   63 enum class Format : std::uint16_t {
   64    /* Pseudo Instruction Format */
   65    PSEUDO = 0,
   66    /* Scalar ALU & Control Formats */
   67    SOP1 = 1,
   68    SOP2 = 2,
   69    SOPK = 3,
   70    SOPP = 4,
   71    SOPC = 5,
   72    /* Scalar Memory Format */
   73    SMEM = 6,
   74    /* LDS/GDS Format */
   75    DS = 8,
   76    /* Vector Memory Buffer Formats */
   77    MTBUF = 9,
   78    MUBUF = 10,
   79    /* Vector Memory Image Format */
   80    MIMG = 11,
   81    /* Export Format */
   82    EXP = 12,
   83    /* Flat Formats */
   84    FLAT = 13,
   85    GLOBAL = 14,
   86    SCRATCH = 15,
   87 
   88    PSEUDO_BRANCH = 16,
   89    PSEUDO_BARRIER = 17,
   90    PSEUDO_REDUCTION = 18,
   91 
   92    /* Vector ALU Formats */
   93    VOP3P = 19,
   94    VOP1 = 1 << 8,
   95    VOP2 = 1 << 9,
   96    VOPC = 1 << 10,
   97    VOP3 = 1 << 11,
   98    VOP3A = 1 << 11,
   99    VOP3B = 1 << 11,
  100    /* Vector Parameter Interpolation Format */
  101    VINTRP = 1 << 12,
  102    DPP = 1 << 13,
  103    SDWA = 1 << 14,
  104 };
  105 
  106 enum barrier_interaction : uint8_t {
  107    barrier_none = 0,
  108    barrier_buffer = 0x1,
  109    barrier_image = 0x2,
  110    barrier_atomic = 0x4,
  111    barrier_shared = 0x8,
  112    /* used for geometry shaders to ensure vertex data writes are before the
  113     * GS_DONE s_sendmsg. */
  114    barrier_gs_data = 0x10,
  115    /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
  116    barrier_gs_sendmsg = 0x20,
  117    /* used by barriers. created by s_barrier */
  118    barrier_barrier = 0x40,
  119    barrier_count = 7,
  120 };
  121 
  122 enum fp_round {
  123    fp_round_ne = 0,
  124    fp_round_pi = 1,
  125    fp_round_ni = 2,
  126    fp_round_tz = 3,
  127 };
  128 
  129 enum fp_denorm {
  130    /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
  131     * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
  132    fp_denorm_flush = 0x0,
  133    fp_denorm_keep = 0x3,
  134 };
  135 
  136 struct float_mode {
  137    /* matches encoding of the MODE register */
  138    union {
  139       struct {
  140           fp_round round32:2;
  141           fp_round round16_64:2;
  142           unsigned denorm32:2;
  143           unsigned denorm16_64:2;
  144       };
  145       uint8_t val = 0;
  146    };
  147    /* if false, optimizations which may remove infs/nan/-0.0 can be done */
  148    bool preserve_signed_zero_inf_nan32:1;
  149    bool preserve_signed_zero_inf_nan16_64:1;
  150    /* if false, optimizations which may remove denormal flushing can be done */
  151    bool must_flush_denorms32:1;
  152    bool must_flush_denorms16_64:1;
  153    bool care_about_round32:1;
  154    bool care_about_round16_64:1;
  155 
  156    /* Returns true if instructions using the mode "other" can safely use the
  157     * current one instead. */
  158    bool canReplace(float_mode other) const noexcept {
  159       return val == other.val &&
  160              (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
  161              (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
  162              (must_flush_denorms32  || !other.must_flush_denorms32) &&
  163              (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
  164              (care_about_round32 || !other.care_about_round32) &&
  165              (care_about_round16_64 || !other.care_about_round16_64);
  166    }
  167 };
  168 
  169 constexpr Format asVOP3(Format format) {
  170    return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
  171 };
  172 
  173 constexpr Format asSDWA(Format format) {
  174    assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
  175    return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
  176 }
  177 
  178 enum class RegType {
  179    none = 0,
  180    sgpr,
  181    vgpr,
  182    linear_vgpr,
  183 };
  184 
  185 struct RegClass {
  186 
  187    enum RC : uint8_t {
  188       s1 = 1,
  189       s2 = 2,
  190       s3 = 3,
  191       s4 = 4,
  192       s6 = 6,
  193       s8 = 8,
  194       s16 = 16,
  195       v1 = s1 | (1 << 5),
  196       v2 = s2 | (1 << 5),
  197       v3 = s3 | (1 << 5),
  198       v4 = s4 | (1 << 5),
  199       v5 = 5  | (1 << 5),
  200       v6 = 6  | (1 << 5),
  201       v7 = 7  | (1 << 5),
  202       v8 = 8  | (1 << 5),
  203       /* byte-sized register class */
  204       v1b = v1 | (1 << 7),
  205       v2b = v2 | (1 << 7),
  206       v3b = v3 | (1 << 7),
  207       v4b = v4 | (1 << 7),
  208       v6b = v6 | (1 << 7),
  209       v8b = v8 | (1 << 7),
  210       /* these are used for WWM and spills to vgpr */
  211       v1_linear = v1 | (1 << 6),
  212       v2_linear = v2 | (1 << 6),
  213    };
  214 
  215    RegClass() = default;
  216    constexpr RegClass(RC rc)
  217       : rc(rc) {}
  218    constexpr RegClass(RegType type, unsigned size)
  219       : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
  220 
  221    constexpr operator RC() const { return rc; }
  222    explicit operator bool() = delete;
  223 
  224    constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
  225    constexpr bool is_subdword() const { return rc & (1 << 7); }
  226    constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
  227    //TODO: use size() less in favor of bytes()
  228    constexpr unsigned size() const { return (bytes() + 3) >> 2; }
  229    constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
  230    constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
  231    constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }
  232 
  233    static constexpr RegClass get(RegType type, unsigned bytes) {
  234       if (type == RegType::sgpr) {
  235          return RegClass(type, DIV_ROUND_UP(bytes, 4u));
  236       } else {
  237          return bytes % 4u ? RegClass(type, bytes).as_subdword() :
  238                              RegClass(type, bytes / 4u);
  239       }
  240    }
  241 
  242 private:
  243    RC rc;
  244 };
  245 
  246 /* transitional helper expressions */
  247 static constexpr RegClass s1{RegClass::s1};
  248 static constexpr RegClass s2{RegClass::s2};
  249 static constexpr RegClass s3{RegClass::s3};
  250 static constexpr RegClass s4{RegClass::s4};
  251 static constexpr RegClass s8{RegClass::s8};
  252 static constexpr RegClass s16{RegClass::s16};
  253 static constexpr RegClass v1{RegClass::v1};
  254 static constexpr RegClass v2{RegClass::v2};
  255 static constexpr RegClass v3{RegClass::v3};
  256 static constexpr RegClass v4{RegClass::v4};
  257 static constexpr RegClass v5{RegClass::v5};
  258 static constexpr RegClass v6{RegClass::v6};
  259 static constexpr RegClass v7{RegClass::v7};
  260 static constexpr RegClass v8{RegClass::v8};
  261 static constexpr RegClass v1b{RegClass::v1b};
  262 static constexpr RegClass v2b{RegClass::v2b};
  263 static constexpr RegClass v3b{RegClass::v3b};
  264 static constexpr RegClass v4b{RegClass::v4b};
  265 static constexpr RegClass v6b{RegClass::v6b};
  266 static constexpr RegClass v8b{RegClass::v8b};
  267 
  268 /**
  269  * Temp Class
  270  * Each temporary virtual register has a
  271  * register class (i.e. size and type)
  272  * and SSA id.
  273  */
  274 struct Temp {
  275    Temp() noexcept : id_(0), reg_class(0) {}
  276    constexpr Temp(uint32_t id, RegClass cls) noexcept
  277       : id_(id), reg_class(uint8_t(cls)) {}
  278 
  279    constexpr uint32_t id() const noexcept { return id_; }
  280    constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
  281 
  282    constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
  283    constexpr unsigned size() const noexcept { return regClass().size(); }
  284    constexpr RegType type() const noexcept { return regClass().type(); }
  285    constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
  286 
  287    constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
  288    constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
  289    constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
  290 
  291 private:
  292    uint32_t id_: 24;
  293    uint32_t reg_class : 8;
  294 };
  295 
  296 /**
  297  * PhysReg
  298  * Represents the physical register for each
  299  * Operand and Definition.
  300  */
  301 struct PhysReg {
  302    constexpr PhysReg() = default;
  303    explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
  304    constexpr unsigned reg() const { return reg_b >> 2; }
  305    constexpr unsigned byte() const { return reg_b & 0x3; }
  306    constexpr operator unsigned() const { return reg(); }
  307    constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
  308    constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
  309    constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
  310 
  311    uint16_t reg_b = 0;
  312 };
  313 
  314 /* helper expressions for special registers */
  315 static constexpr PhysReg m0{124};
  316 static constexpr PhysReg vcc{106};
  317 static constexpr PhysReg vcc_hi{107};
  318 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
  319 static constexpr PhysReg exec{126};
  320 static constexpr PhysReg exec_lo{126};
  321 static constexpr PhysReg exec_hi{127};
  322 static constexpr PhysReg vccz{251};
  323 static constexpr PhysReg execz{252};
  324 static constexpr PhysReg scc{253};
  325 
  326 /**
  327  * Operand Class
  328  * Initially, each Operand refers to either
  329  * a temporary virtual register
  330  * or to a constant value
  331  * Temporary registers get mapped to physical register during RA
  332  * Constant values are inlined into the instruction sequence.
  333  */
  334 class Operand final
  335 {
  336 public:
  337    constexpr Operand()
  338       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
  339         isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
  340         isLateKill_(false) {}
  341 
  342    explicit Operand(Temp r) noexcept
  343    {
  344       data_.temp = r;
  345       if (r.id()) {
  346          isTemp_ = true;
  347       } else {
  348          isUndef_ = true;
  349          setFixed(PhysReg{128});
  350       }
  351    };
  352    explicit Operand(uint32_t v, bool is64bit = false) noexcept
  353    {
  354       data_.i = v;
  355       isConstant_ = true;
  356       is64BitConst_ = is64bit;
  357       if (v <= 64)
  358          setFixed(PhysReg{128 + v});
  359       else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
  360          setFixed(PhysReg{192 - v});
  361       else if (v == 0x3f000000) /* 0.5 */
  362          setFixed(PhysReg{240});
  363       else if (v == 0xbf000000) /* -0.5 */
  364          setFixed(PhysReg{241});
  365       else if (v == 0x3f800000) /* 1.0 */
  366          setFixed(PhysReg{242});
  367       else if (v == 0xbf800000) /* -1.0 */
  368          setFixed(PhysReg{243});
  369       else if (v == 0x40000000) /* 2.0 */
  370          setFixed(PhysReg{244});
  371       else if (v == 0xc0000000) /* -2.0 */
  372          setFixed(PhysReg{245});
  373       else if (v == 0x40800000) /* 4.0 */
  374          setFixed(PhysReg{246});
  375       else if (v == 0xc0800000) /* -4.0 */
  376          setFixed(PhysReg{247});
  377       else { /* Literal Constant */
  378          assert(!is64bit && "attempt to create a 64-bit literal constant");
  379          setFixed(PhysReg{255});
  380       }
  381    };
  382    explicit Operand(uint64_t v) noexcept
  383    {
  384       isConstant_ = true;
  385       is64BitConst_ = true;
  386       if (v <= 64) {
  387          data_.i = (uint32_t) v;
  388          setFixed(PhysReg{128 + (uint32_t) v});
  389       } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
  390          data_.i = (uint32_t) v;
  391          setFixed(PhysReg{192 - (uint32_t) v});
  392       } else if (v == 0x3FE0000000000000) { /* 0.5 */
  393          data_.i = 0x3f000000;
  394          setFixed(PhysReg{240});
  395       } else if (v == 0xBFE0000000000000) { /* -0.5 */
  396          data_.i = 0xbf000000;
  397          setFixed(PhysReg{241});
  398       } else if (v == 0x3FF0000000000000) { /* 1.0 */
  399          data_.i = 0x3f800000;
  400          setFixed(PhysReg{242});
  401       } else if (v == 0xBFF0000000000000) { /* -1.0 */
  402          data_.i = 0xbf800000;
  403          setFixed(PhysReg{243});
  404       } else if (v == 0x4000000000000000) { /* 2.0 */
  405          data_.i = 0x40000000;
  406          setFixed(PhysReg{244});
  407       } else if (v == 0xC000000000000000) { /* -2.0 */
  408          data_.i = 0xc0000000;
  409          setFixed(PhysReg{245});
  410       } else if (v == 0x4010000000000000) { /* 4.0 */
  411          data_.i = 0x40800000;
  412          setFixed(PhysReg{246});
  413       } else if (v == 0xC010000000000000) { /* -4.0 */
  414          data_.i = 0xc0800000;
  415          setFixed(PhysReg{247});
  416       } else { /* Literal Constant: we don't know if it is a long or double.*/
  417          isConstant_ = 0;
  418          assert(false && "attempt to create a 64-bit literal constant");
  419       }
  420    };
  421    explicit Operand(RegClass type) noexcept
  422    {
  423       isUndef_ = true;
  424       data_.temp = Temp(0, type);
  425       setFixed(PhysReg{128});
  426    };
  427    explicit Operand(PhysReg reg, RegClass type) noexcept
  428    {
  429       data_.temp = Temp(0, type);
  430       setFixed(reg);
  431    }
  432 
  433    constexpr bool isTemp() const noexcept
  434    {
  435       return isTemp_;
  436    }
  437 
  438    constexpr void setTemp(Temp t) noexcept {
  439       assert(!isConstant_);
  440       isTemp_ = true;
  441       data_.temp = t;
  442    }
  443 
  444    constexpr Temp getTemp() const noexcept
  445    {
  446       return data_.temp;
  447    }
  448 
  449    constexpr uint32_t tempId() const noexcept
  450    {
  451       return data_.temp.id();
  452    }
  453 
  454    constexpr bool hasRegClass() const noexcept
  455    {
  456       return isTemp() || isUndefined();
  457    }
  458 
  459    constexpr RegClass regClass() const noexcept
  460    {
  461       return data_.temp.regClass();
  462    }
  463 
  464    constexpr unsigned bytes() const noexcept
  465    {
  466       if (isConstant())
  467          return is64BitConst_ ? 8 : 4; //TODO: sub-dword constants
  468       else
  469          return data_.temp.bytes();
  470    }
  471 
  472    constexpr unsigned size() const noexcept
  473    {
  474       if (isConstant())
  475          return is64BitConst_ ? 2 : 1;
  476       else
  477          return data_.temp.size();
  478    }
  479 
  480    constexpr bool isFixed() const noexcept
  481    {
  482       return isFixed_;
  483    }
  484 
  485    constexpr PhysReg physReg() const noexcept
  486    {
  487       return reg_;
  488    }
  489 
  490    constexpr void setFixed(PhysReg reg) noexcept
  491    {
  492       isFixed_ = reg != unsigned(-1);
  493       reg_ = reg;
  494    }
  495 
  496    constexpr bool isConstant() const noexcept
  497    {
  498       return isConstant_;
  499    }
  500 
  501    constexpr bool isLiteral() const noexcept
  502    {
  503       return isConstant() && reg_ == 255;
  504    }
  505 
  506    constexpr bool isUndefined() const noexcept
  507    {
  508       return isUndef_;
  509    }
  510 
  511    constexpr uint32_t constantValue() const noexcept
  512    {
  513       return data_.i;
  514    }
  515 
  516    constexpr bool constantEquals(uint32_t cmp) const noexcept
  517    {
  518       return isConstant() && constantValue() == cmp;
  519    }
  520 
  521    constexpr uint64_t constantValue64(bool signext=false) const noexcept
  522    {
  523       if (is64BitConst_) {
  524          if (reg_ <= 192)
  525             return reg_ - 128;
  526          else if (reg_ <= 208)
  527             return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
  528 
  529          switch (reg_) {
  530          case 240:
  531             return 0x3FE0000000000000;
  532          case 241:
  533             return 0xBFE0000000000000;
  534          case 242:
  535             return 0x3FF0000000000000;
  536          case 243:
  537             return 0xBFF0000000000000;
  538          case 244:
  539             return 0x4000000000000000;
  540          case 245:
  541             return 0xC000000000000000;
  542          case 246:
  543             return 0x4010000000000000;
  544          case 247:
  545             return 0xC010000000000000;
  546          }
  547       }
  548       return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
  549    }
  550 
  551    /* Indicates that the killed operand's live range intersects with the
  552     * instruction's definitions. Unlike isKill() and isFirstKill(), this is
  553     * not set by liveness analysis. */
  554    constexpr void setLateKill(bool flag) noexcept
  555    {
  556       isLateKill_ = flag;
  557    }
  558 
  559    constexpr bool isLateKill() const noexcept
  560    {
  561       return isLateKill_;
  562    }
  563 
  564    constexpr void setKill(bool flag) noexcept
  565    {
  566       isKill_ = flag;
  567       if (!flag)
  568          setFirstKill(false);
  569    }
  570 
  571    constexpr bool isKill() const noexcept
  572    {
  573       return isKill_ || isFirstKill();
  574    }
  575 
  576    constexpr void setFirstKill(bool flag) noexcept
  577    {
  578       isFirstKill_ = flag;
  579       if (flag)
  580          setKill(flag);
  581    }
  582 
  583    /* When there are multiple operands killing the same temporary,
  584     * isFirstKill() is only returns true for the first one. */
  585    constexpr bool isFirstKill() const noexcept
  586    {
  587       return isFirstKill_;
  588    }
  589 
  590    constexpr bool isKillBeforeDef() const noexcept
  591    {
  592       return isKill() && !isLateKill();
  593    }
  594 
  595    constexpr bool isFirstKillBeforeDef() const noexcept
  596    {
  597       return isFirstKill() && !isLateKill();
  598    }
  599 
  600    constexpr bool operator == (Operand other) const noexcept
  601    {
  602       if (other.size() != size())
  603          return false;
  604       if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
  605          return false;
  606       if (isFixed() && other.isFixed() && physReg() != other.physReg())
  607          return false;
  608       if (isLiteral())
  609          return other.isLiteral() && other.constantValue() == constantValue();
  610       else if (isConstant())
  611          return other.isConstant() && other.physReg() == physReg();
  612       else if (isUndefined())
  613          return other.isUndefined() && other.regClass() == regClass();
  614       else
  615          return other.isTemp() && other.getTemp() == getTemp();
  616    }
  617 private:
  618    union {
  619       uint32_t i;
  620       float f;
  621       Temp temp = Temp(0, s1);
  622    } data_;
  623    PhysReg reg_;
  624    union {
  625       struct {
  626          uint8_t isTemp_:1;
  627          uint8_t isFixed_:1;
  628          uint8_t isConstant_:1;
  629          uint8_t isKill_:1;
  630          uint8_t isUndef_:1;
  631          uint8_t isFirstKill_:1;
  632          uint8_t is64BitConst_:1;
  633          uint8_t isLateKill_:1;
  634       };
  635       /* can't initialize bit-fields in c++11, so work around using a union */
  636       uint8_t control_ = 0;
  637    };
  638 };
  639 
  640 /**
  641  * Definition Class
  642  * Definitions are the results of Instructions
  643  * and refer to temporary virtual registers
  644  * which are later mapped to physical registers
  645  */
  646 class Definition final
  647 {
  648 public:
  649    constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
  650    Definition(uint32_t index, RegClass type) noexcept
  651       : temp(index, type) {}
  652    explicit Definition(Temp tmp) noexcept
  653       : temp(tmp) {}
  654    Definition(PhysReg reg, RegClass type) noexcept
  655       : temp(Temp(0, type))
  656    {
  657       setFixed(reg);
  658    }
  659    Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
  660       : temp(Temp(tmpId, type))
  661    {
  662       setFixed(reg);
  663    }
  664 
  665    constexpr bool isTemp() const noexcept
  666    {
  667       return tempId() > 0;
  668    }
  669 
  670    constexpr Temp getTemp() const noexcept
  671    {
  672       return temp;
  673    }
  674 
  675    constexpr uint32_t tempId() const noexcept
  676    {
  677       return temp.id();
  678    }
  679 
  680    constexpr void setTemp(Temp t) noexcept {
  681       temp = t;
  682    }
  683 
  684    constexpr RegClass regClass() const noexcept
  685    {
  686       return temp.regClass();
  687    }
  688 
  689    constexpr unsigned bytes() const noexcept
  690    {
  691       return temp.bytes();
  692    }
  693 
  694    constexpr unsigned size() const noexcept
  695    {
  696       return temp.size();
  697    }
  698 
  699    constexpr bool isFixed() const noexcept
  700    {
  701       return isFixed_;
  702    }
  703 
  704    constexpr PhysReg physReg() const noexcept
  705    {
  706       return reg_;
  707    }
  708 
  709    constexpr void setFixed(PhysReg reg) noexcept
  710    {
  711       isFixed_ = 1;
  712       reg_ = reg;
  713    }
  714 
  715    constexpr void setHint(PhysReg reg) noexcept
  716    {
  717       hasHint_ = 1;
  718       reg_ = reg;
  719    }
  720 
  721    constexpr bool hasHint() const noexcept
  722    {
  723       return hasHint_;
  724    }
  725 
  726    constexpr void setKill(bool flag) noexcept
  727    {
  728       isKill_ = flag;
  729    }
  730 
  731    constexpr bool isKill() const noexcept
  732    {
  733       return isKill_;
  734    }
  735 
  736 private:
  737    Temp temp = Temp(0, s1);
  738    PhysReg reg_;
  739    union {
  740       struct {
  741          uint8_t isFixed_:1;
  742          uint8_t hasHint_:1;
  743          uint8_t isKill_:1;
  744       };
  745       /* can't initialize bit-fields in c++11, so work around using a union */
  746       uint8_t control_ = 0;
  747    };
  748 };
  749 
  750 class Block;
  751 
  752 struct Instruction {
  753    aco_opcode opcode;
  754    Format format;
  755    uint32_t pass_flags;
  756 
  757    aco::span<Operand> operands;
  758    aco::span<Definition> definitions;
  759 
  760    constexpr bool isVALU() const noexcept
  761    {
  762       return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
  763           || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
  764           || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
  765           || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
  766           || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
  767           || format == Format::VOP3P;
  768    }
  769 
  770    constexpr bool isSALU() const noexcept
  771    {
  772       return format == Format::SOP1 ||
  773              format == Format::SOP2 ||
  774              format == Format::SOPC ||
  775              format == Format::SOPK ||
  776              format == Format::SOPP;
  777    }
  778 
  779    constexpr bool isVMEM() const noexcept
  780    {
  781       return format == Format::MTBUF ||
  782              format == Format::MUBUF ||
  783              format == Format::MIMG;
  784    }
  785 
  786    constexpr bool isDPP() const noexcept
  787    {
  788       return (uint16_t) format & (uint16_t) Format::DPP;
  789    }
  790 
  791    constexpr bool isVOP3() const noexcept
  792    {
  793       return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
  794              ((uint16_t) format & (uint16_t) Format::VOP3B);
  795    }
  796 
  797    constexpr bool isSDWA() const noexcept
  798    {
  799       return (uint16_t) format & (uint16_t) Format::SDWA;
  800    }
  801 
  802    constexpr bool isFlatOrGlobal() const noexcept
  803    {
  804       return format == Format::FLAT || format == Format::GLOBAL;
  805    }
  806 
  807    constexpr bool usesModifiers() const noexcept;
  808 
  809    constexpr bool reads_exec() const noexcept
  810    {
  811       for (const Operand& op : operands) {
  812          if (op.isFixed() && op.physReg() == exec)
  813             return true;
  814       }
  815       return false;
  816    }
  817 };
  818 static_assert(sizeof(Instruction) == 16, "Unexpected padding");
  819 
  820 struct SOPK_instruction : public Instruction {
  821    uint16_t imm;
  822    uint16_t padding;
  823 };
  824 static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
  825 
  826 struct SOPP_instruction : public Instruction {
  827    uint32_t imm;
  828    int block;
  829 };
  830 static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
  831 
  832 struct SOPC_instruction : public Instruction {
  833 };
  834 static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
  835 
  836 struct SOP1_instruction : public Instruction {
  837 };
  838 static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
  839 
  840 struct SOP2_instruction : public Instruction {
  841 };
  842 static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
  843 
  844 /**
  845  * Scalar Memory Format:
  846  * For s_(buffer_)load_dword*:
  847  * Operand(0): SBASE - SGPR-pair which provides base address
  848  * Operand(1): Offset - immediate (un)signed offset or SGPR
  849  * Operand(2) / Definition(0): SDATA - SGPR for read / write result
  850  * Operand(n-1): SOffset - SGPR offset (Vega only)
  851  *
  852  * Having no operands is also valid for instructions such as s_dcache_inv.
  853  *
  854  */
  855 struct SMEM_instruction : public Instruction {
  856    barrier_interaction barrier;
  857    bool glc : 1; /* VI+: globally coherent */
  858    bool dlc : 1; /* NAVI: device level coherent */
  859    bool nv : 1; /* VEGA only: Non-volatile */
  860    bool can_reorder : 1;
  861    bool disable_wqm : 1;
  862    uint32_t padding: 19;
  863 };
  864 static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
  865 
  866 struct VOP1_instruction : public Instruction {
  867 };
  868 static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
  869 
  870 struct VOP2_instruction : public Instruction {
  871 };
  872 static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
  873 
  874 struct VOPC_instruction : public Instruction {
  875 };
  876 static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
  877 
  878 struct VOP3A_instruction : public Instruction {
  879    bool abs[3];
  880    bool neg[3];
  881    uint8_t opsel : 4;
  882    uint8_t omod : 2;
  883    bool clamp : 1;
  884    uint32_t padding : 9;
  885 };
  886 static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
  887 
  888 struct VOP3P_instruction : public Instruction {
  889    bool neg_lo[3];
  890    bool neg_hi[3];
  891    uint8_t opsel_lo : 3;
  892    uint8_t opsel_hi : 3;
  893    bool clamp : 1;
  894    uint32_t padding : 9;
  895 };
  896 static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
  897 
  898 /**
  899  * Data Parallel Primitives Format:
  900  * This format can be used for VOP1, VOP2 or VOPC instructions.
  901  * The swizzle applies to the src0 operand.
  902  *
  903  */
  904 struct DPP_instruction : public Instruction {
  905    bool abs[2];
  906    bool neg[2];
  907    uint16_t dpp_ctrl;
  908    uint8_t row_mask : 4;
  909    uint8_t bank_mask : 4;
  910    bool bound_ctrl : 1;
  911    uint32_t padding : 7;
  912 };
  913 static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
  914 
  915 enum sdwa_sel : uint8_t {
  916     /* masks */
  917     sdwa_wordnum = 0x1,
  918     sdwa_bytenum = 0x3,
  919     sdwa_asuint = 0x7 | 0x10,
  920     sdwa_rasize = 0x3,
  921 
  922     /* flags */
  923     sdwa_isword = 0x4,
  924     sdwa_sext = 0x8,
  925     sdwa_isra = 0x10,
  926 
  927     /* specific values */
  928     sdwa_ubyte0 = 0,
  929     sdwa_ubyte1 = 1,
  930     sdwa_ubyte2 = 2,
  931     sdwa_ubyte3 = 3,
  932     sdwa_uword0 = sdwa_isword | 0,
  933     sdwa_uword1 = sdwa_isword | 1,
  934     sdwa_udword = 6,
  935 
  936     sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
  937     sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
  938     sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
  939     sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
  940     sdwa_sword0 = sdwa_uword0 | sdwa_sext,
  941     sdwa_sword1 = sdwa_uword1 | sdwa_sext,
  942     sdwa_sdword = sdwa_udword | sdwa_sext,
  943 
  944     /* register-allocated */
  945     sdwa_ubyte = 1 | sdwa_isra,
  946     sdwa_uword = 2 | sdwa_isra,
  947     sdwa_sbyte = sdwa_ubyte | sdwa_sext,
  948     sdwa_sword = sdwa_uword | sdwa_sext,
  949 };
  950 
  951 /**
  952  * Sub-Dword Addressing Format:
  953  * This format can be used for VOP1, VOP2 or VOPC instructions.
  954  *
  955  * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
  956  * the definition doesn't have to be VCC on GFX9+.
  957  *
  958  */
  959 struct SDWA_instruction : public Instruction {
  960    /* these destination modifiers aren't available with VOPC except for
  961     * clamp on GFX8 */
  962    uint8_t sel[2];
  963    uint8_t dst_sel;
  964    bool neg[2];
  965    bool abs[2];
  966    bool dst_preserve : 1;
  967    bool clamp : 1;
  968    uint8_t omod : 2; /* GFX9+ */
  969    uint32_t padding : 4;
  970 };
  971 static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
  972 
  973 struct Interp_instruction : public Instruction {
  974    uint8_t attribute;
  975    uint8_t component;
  976    uint16_t padding;
  977 };
  978 static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
  979 
  980 /**
  981  * Local and Global Data Sharing instructions
  982  * Operand(0): ADDR - VGPR which supplies the address.
  983  * Operand(1): DATA0 - First data VGPR.
  984  * Operand(2): DATA1 - Second data VGPR.
  985  * Operand(n-1): M0 - LDS size.
  986  * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
  987  *
  988  */
  989 struct DS_instruction : public Instruction {
  990    int16_t offset0;
  991    int8_t offset1;
  992    bool gds;
  993 };
  994 static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
  995 
  996 /**
  997  * Vector Memory Untyped-buffer Instructions
  998  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
  999  * Operand(1): VADDR - Address source. Can carry an index and/or offset
 1000  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 1001  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 1002  *
 1003  */
 1004 struct MUBUF_instruction : public Instruction {
 1005    uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
 1006    bool offen : 1; /* Supply an offset from VGPR (VADDR) */
 1007    bool idxen : 1; /* Supply an index from VGPR (VADDR) */
 1008    bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
 1009    bool glc : 1; /* globally coherent */
 1010    bool dlc : 1; /* NAVI: device level coherent */
 1011    bool slc : 1; /* system level coherent */
 1012    bool tfe : 1; /* texture fail enable */
 1013    bool lds : 1; /* Return read-data to LDS instead of VGPRs */
 1014    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
 1015    bool can_reorder : 1;
 1016    uint8_t padding : 2;
 1017    barrier_interaction barrier;
 1018 };
 1019 static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
 1020 
 1021 /**
 1022  * Vector Memory Typed-buffer Instructions
 1023  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 1024  * Operand(1): VADDR - Address source. Can carry an index and/or offset
 1025  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 1026  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 1027  *
 1028  */
 1029 struct MTBUF_instruction : public Instruction {
 1030    uint16_t offset; /* Unsigned byte offset - 12 bit */
 1031    barrier_interaction barrier;
 1032    uint8_t dfmt : 4; /* Data Format of data in memory buffer */
 1033    uint8_t nfmt : 3; /* Numeric format of data in memory */
 1034    bool offen : 1; /* Supply an offset from VGPR (VADDR) */
 1035    bool idxen : 1; /* Supply an index from VGPR (VADDR) */
 1036    bool glc : 1; /* globally coherent */
 1037    bool dlc : 1; /* NAVI: device level coherent */
 1038    bool slc : 1; /* system level coherent */
 1039    bool tfe : 1; /* texture fail enable */
 1040    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
 1041    bool can_reorder : 1;
 1042    uint32_t padding : 25;
 1043 };
 1044 static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
 1045 
 1046 /**
 1047  * Vector Memory Image Instructions
 1048  * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 1049  * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 1050  *             or VDATA - Vector GPR for write data.
 1051  * Operand(2): VADDR - Address source. Can carry an offset or an index.
 1052  * Definition(0): VDATA - Vector GPR for read result.
 1053  *
 1054  */
 1055 struct MIMG_instruction : public Instruction {
 1056    uint8_t dmask; /* Data VGPR enable mask */
 1057    uint8_t dim : 3; /* NAVI: dimensionality */
 1058    bool unrm : 1; /* Force address to be un-normalized */
 1059    bool dlc : 1; /* NAVI: device level coherent */
 1060    bool glc : 1; /* globally coherent */
 1061    bool slc : 1; /* system level coherent */
 1062    bool tfe : 1; /* texture fail enable */
 1063    bool da : 1; /* declare an array */
 1064    bool lwe : 1; /* Force data to be un-normalized */
 1065    bool r128 : 1; /* NAVI: Texture resource size */
 1066    bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
 1067    bool d16 : 1; /* Convert 32-bit data to 16-bit data */
 1068    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
 1069    bool can_reorder : 1;
 1070    uint8_t padding : 1;
 1071    barrier_interaction barrier;
 1072 };
 1073 static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
 1074 
 1075 /**
 1076  * Flat/Scratch/Global Instructions
 1077  * Operand(0): ADDR
 1078  * Operand(1): SADDR
 1079  * Operand(2) / Definition(0): DATA/VDST
 1080  *
 1081  */
 1082 struct FLAT_instruction : public Instruction {
 1083    uint16_t offset; /* Vega/Navi only */
 1084    bool slc : 1; /* system level coherent */
 1085    bool glc : 1; /* globally coherent */
 1086    bool dlc : 1; /* NAVI: device level coherent */
 1087    bool lds : 1;
 1088    bool nv : 1;
 1089    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
 1090    bool can_reorder : 1;
 1091    uint8_t padding : 1;
 1092    barrier_interaction barrier;
 1093 };
 1094 static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
 1095 
 1096 struct Export_instruction : public Instruction {
 1097    uint8_t enabled_mask;
 1098    uint8_t dest;
 1099    bool compressed : 1;
 1100    bool done : 1;
 1101    bool valid_mask : 1;
 1102    uint32_t padding : 13;
 1103 };
 1104 static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
 1105 
 1106 struct Pseudo_instruction : public Instruction {
 1107    PhysReg scratch_sgpr; /* might not be valid if it's not needed */
 1108    bool tmp_in_scc;
 1109    uint8_t padding;
 1110 };
 1111 static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
 1112 
 1113 struct Pseudo_branch_instruction : public Instruction {
 1114    /* target[0] is the block index of the branch target.
 1115     * For conditional branches, target[1] contains the fall-through alternative.
 1116     * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
 1117     */
 1118    uint32_t target[2];
 1119 };
 1120 static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
 1121 
 1122 struct Pseudo_barrier_instruction : public Instruction {
 1123 };
 1124 static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
 1125 
 1126 enum ReduceOp : uint16_t {
 1127    iadd32, iadd64,
 1128    imul32, imul64,
 1129    fadd32, fadd64,
 1130    fmul32, fmul64,
 1131    imin32, imin64,
 1132    imax32, imax64,
 1133    umin32, umin64,
 1134    umax32, umax64,
 1135    fmin32, fmin64,
 1136    fmax32, fmax64,
 1137    iand32, iand64,
 1138    ior32, ior64,
 1139    ixor32, ixor64,
 1140    gfx10_wave64_bpermute
 1141 };
 1142 
 1143 /**
 1144  * Subgroup Reduction Instructions, everything except for the data to be
 1145  * reduced and the result as inserted by setup_reduce_temp().
 1146  * Operand(0): data to be reduced
 1147  * Operand(1): reduce temporary
 1148  * Operand(2): vector temporary
 1149  * Definition(0): result
 1150  * Definition(1): scalar temporary
 1151  * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 1152  * Definition(3): scc clobber
 1153  * Definition(4): vcc clobber
 1154  *
 1155  */
 1156 struct Pseudo_reduction_instruction : public Instruction {
 1157    ReduceOp reduce_op;
 1158    uint16_t cluster_size; // must be 0 for scans
 1159 };
 1160 static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
 1161 
 1162 struct instr_deleter_functor {
 1163    void operator()(void* p) {
 1164       free(p);
 1165    }
 1166 };
 1167 
 1168 template<typename T>
 1169 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
 1170 
 1171 template<typename T>
 1172 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
 1173 {
 1174    std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
 1175    char *data = (char*) calloc(1, size);
 1176    T* inst = (T*) data;
 1177 
 1178    inst->opcode = opcode;
 1179    inst->format = format;
 1180 
 1181    uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
 1182    inst->operands = aco::span<Operand>(operands_offset, num_operands);
 1183    uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
 1184    inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
 1185 
 1186    return inst;
 1187 }
 1188 
 1189 constexpr bool Instruction::usesModifiers() const noexcept
 1190 {
 1191    if (isDPP() || isSDWA())
 1192       return true;
 1193 
 1194    if (format == Format::VOP3P) {
 1195       const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
 1196       for (unsigned i = 0; i < operands.size(); i++) {
 1197          if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
 1198             return true;
 1199       }
 1200       return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
 1201    } else if (isVOP3()) {
 1202       const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
 1203       for (unsigned i = 0; i < operands.size(); i++) {
 1204          if (vop3->abs[i] || vop3->neg[i])
 1205             return true;
 1206       }
 1207       return vop3->opsel || vop3->clamp || vop3->omod;
 1208    }
 1209    return false;
 1210 }
 1211 
 1212 constexpr bool is_phi(Instruction* instr)
 1213 {
 1214    return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
 1215 }
 1216 
 1217 static inline bool is_phi(aco_ptr<Instruction>& instr)
 1218 {
 1219    return is_phi(instr.get());
 1220 }
 1221 
 1222 barrier_interaction get_barrier_interaction(const Instruction* instr);
 1223 
 1224 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
 1225 
 1226 enum block_kind {
 1227    /* uniform indicates that leaving this block,
 1228     * all actives lanes stay active */
 1229    block_kind_uniform = 1 << 0,
 1230    block_kind_top_level = 1 << 1,
 1231    block_kind_loop_preheader = 1 << 2,
 1232    block_kind_loop_header = 1 << 3,
 1233    block_kind_loop_exit = 1 << 4,
 1234    block_kind_continue = 1 << 5,
 1235    block_kind_break = 1 << 6,
 1236    block_kind_continue_or_break = 1 << 7,
 1237    block_kind_discard = 1 << 8,
 1238    block_kind_branch = 1 << 9,
 1239    block_kind_merge = 1 << 10,
 1240    block_kind_invert = 1 << 11,
 1241    block_kind_uses_discard_if = 1 << 12,
 1242    block_kind_needs_lowering = 1 << 13,
 1243    block_kind_uses_demote = 1 << 14,
 1244    block_kind_export_end = 1 << 15,
 1245 };
 1246 
 1247 
 1248 struct RegisterDemand {
 1249    constexpr RegisterDemand() = default;
 1250    constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
 1251       : vgpr{v}, sgpr{s} {}
 1252    int16_t vgpr = 0;
 1253    int16_t sgpr = 0;
 1254 
 1255    constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
 1256       return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
 1257    }
 1258 
 1259    constexpr bool exceeds(const RegisterDemand other) const noexcept {
 1260       return vgpr > other.vgpr || sgpr > other.sgpr;
 1261    }
 1262 
 1263    constexpr RegisterDemand operator+(const Temp t) const noexcept {
 1264       if (t.type() == RegType::sgpr)
 1265          return RegisterDemand( vgpr, sgpr + t.size() );
 1266       else
 1267          return RegisterDemand( vgpr + t.size(), sgpr );
 1268    }
 1269 
 1270    constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
 1271       return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
 1272    }
 1273 
 1274    constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
 1275       return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
 1276    }
 1277 
 1278    constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
 1279       vgpr += other.vgpr;
 1280       sgpr += other.sgpr;
 1281       return *this;
 1282    }
 1283 
 1284    constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
 1285       vgpr -= other.vgpr;
 1286       sgpr -= other.sgpr;
 1287       return *this;
 1288    }
 1289 
 1290    constexpr RegisterDemand& operator+=(const Temp t) noexcept {
 1291       if (t.type() == RegType::sgpr)
 1292          sgpr += t.size();
 1293       else
 1294          vgpr += t.size();
 1295       return *this;
 1296    }
 1297 
 1298    constexpr RegisterDemand& operator-=(const Temp t) noexcept {
 1299       if (t.type() == RegType::sgpr)
 1300          sgpr -= t.size();
 1301       else
 1302          vgpr -= t.size();
 1303       return *this;
 1304    }
 1305 
 1306    constexpr void update(const RegisterDemand other) noexcept {
 1307       vgpr = std::max(vgpr, other.vgpr);
 1308       sgpr = std::max(sgpr, other.sgpr);
 1309    }
 1310 
 1311 };
 1312 
 1313 /* CFG */
 1314 struct Block {
 1315    float_mode fp_mode;
 1316    unsigned index;
 1317    unsigned offset = 0;
 1318    std::vector<aco_ptr<Instruction>> instructions;
 1319    std::vector<unsigned> logical_preds;
 1320    std::vector<unsigned> linear_preds;
 1321    std::vector<unsigned> logical_succs;
 1322    std::vector<unsigned> linear_succs;
 1323    RegisterDemand register_demand = RegisterDemand();
 1324    uint16_t loop_nest_depth = 0;
 1325    uint16_t kind = 0;
 1326    int logical_idom = -1;
 1327    int linear_idom = -1;
 1328    Temp live_out_exec = Temp();
 1329 
 1330    /* this information is needed for predecessors to blocks with phis when
 1331     * moving out of ssa */
 1332    bool scc_live_out = false;
 1333    PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
 1334 
 1335    Block(unsigned idx) : index(idx) {}
 1336    Block() : index(0) {}
 1337 };
 1338 
 1339 using Stage = uint16_t;
 1340 
 1341 /* software stages */
 1342 static constexpr Stage sw_vs = 1 << 0;
 1343 static constexpr Stage sw_gs = 1 << 1;
 1344 static constexpr Stage sw_tcs = 1 << 2;
 1345 static constexpr Stage sw_tes = 1 << 3;
 1346 static constexpr Stage sw_fs = 1 << 4;
 1347 static constexpr Stage sw_cs = 1 << 5;
 1348 static constexpr Stage sw_gs_copy = 1 << 6;
 1349 static constexpr Stage sw_mask = 0x7f;
 1350 
 1351 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
 1352 static constexpr Stage hw_vs = 1 << 7;
 1353 static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
 1354 static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
 1355 static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
 1356 static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
 1357 static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
 1358 static constexpr Stage hw_fs = 1 << 13;
 1359 static constexpr Stage hw_cs = 1 << 14;
 1360 static constexpr Stage hw_mask = 0xff << 7;
 1361 
 1362 /* possible settings of Program::stage */
 1363 static constexpr Stage vertex_vs = sw_vs | hw_vs;
 1364 static constexpr Stage fragment_fs = sw_fs | hw_fs;
 1365 static constexpr Stage compute_cs = sw_cs | hw_cs;
 1366 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
 1367 static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
 1368 /* GFX10/NGG */
 1369 static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
 1370 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
 1371 static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
 1372 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
 1373 /* GFX9 (and GFX10 if NGG isn't used) */
 1374 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
 1375 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
 1376 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
 1377 /* pre-GFX9 */
 1378 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
 1379 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
 1380 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
 1381 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
 1382 static constexpr Stage geometry_gs = sw_gs | hw_gs;
 1383 
 1384 enum statistic {
 1385    statistic_hash,
 1386    statistic_instructions,
 1387    statistic_copies,
 1388    statistic_branches,
 1389    statistic_cycles,
 1390    statistic_vmem_clauses,
 1391    statistic_smem_clauses,
 1392    statistic_vmem_score,
 1393    statistic_smem_score,
 1394    statistic_sgpr_presched,
 1395    statistic_vgpr_presched,
 1396    num_statistics
 1397 };
 1398 
 1399 class Program final {
 1400 public:
 1401    float_mode next_fp_mode;
 1402    std::vector<Block> blocks;
 1403    RegisterDemand max_reg_demand = RegisterDemand();
 1404    uint16_t num_waves = 0;
 1405    uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
 1406    ac_shader_config* config;
 1407    struct radv_shader_info *info;
 1408    enum chip_class chip_class;
 1409    enum radeon_family family;
 1410    unsigned wave_size;
 1411    RegClass lane_mask;
 1412    Stage stage; /* Stage */
 1413    bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
 1414    bool needs_wqm = false; /* there exists a p_wqm instruction */
 1415    bool wb_smem_l1_on_end = false;
 1416 
 1417    std::vector<uint8_t> constant_data;
 1418    Temp private_segment_buffer;
 1419    Temp scratch_offset;
 1420 
 1421    uint16_t min_waves = 0;
 1422    uint16_t lds_alloc_granule;
 1423    uint32_t lds_limit; /* in bytes */
 1424    bool has_16bank_lds;
 1425    uint16_t vgpr_limit;
 1426    uint16_t sgpr_limit;
 1427    uint16_t physical_sgprs;
 1428    uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
 1429    uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
 1430    unsigned workgroup_size; /* if known; otherwise UINT_MAX */
 1431 
 1432    bool xnack_enabled = false;
 1433 
 1434    bool needs_vcc = false;
 1435    bool needs_flat_scr = false;
 1436 
 1437    bool collect_statistics = false;
 1438    uint32_t statistics[num_statistics];
 1439 
 1440    uint32_t allocateId()
 1441    {
 1442       assert(allocationID <= 16777215);
 1443       return allocationID++;
 1444    }
 1445 
 1446    uint32_t peekAllocationId()
 1447    {
 1448       return allocationID;
 1449    }
 1450 
 1451    void setAllocationId(uint32_t id)
 1452    {
 1453       allocationID = id;
 1454    }
 1455 
 1456    Block* create_and_insert_block() {
 1457       blocks.emplace_back(blocks.size());
 1458       blocks.back().fp_mode = next_fp_mode;
 1459       return &blocks.back();
 1460    }
 1461 
 1462    Block* insert_block(Block&& block) {
 1463       block.index = blocks.size();
 1464       block.fp_mode = next_fp_mode;
 1465       blocks.emplace_back(std::move(block));
 1466       return &blocks.back();
 1467    }
 1468 
 1469 private:
 1470    uint32_t allocationID = 1;
 1471 };
 1472 
 1473 struct TempHash {
 1474    std::size_t operator()(Temp t) const {
 1475       return t.id();
 1476    }
 1477 };
 1478 using TempSet = std::unordered_set<Temp, TempHash>;
 1479 
 1480 struct live {
 1481    /* live temps out per block */
 1482    std::vector<TempSet> live_out;
 1483    /* register demand (sgpr/vgpr) per instruction per block */
 1484    std::vector<std::vector<RegisterDemand>> register_demand;
 1485 };
 1486 
 1487 void select_program(Program *program,
 1488                     unsigned shader_count,
 1489                     struct nir_shader *const *shaders,
 1490                     ac_shader_config* config,
 1491                     struct radv_shader_args *args);
 1492 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
 1493                            ac_shader_config* config,
 1494                            struct radv_shader_args *args);
 1495 
 1496 void lower_wqm(Program* program, live& live_vars,
 1497                const struct radv_nir_compiler_options *options);
 1498 void lower_phis(Program* program);
 1499 void calc_min_waves(Program* program);
 1500 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
 1501 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
 1502 std::vector<uint16_t> dead_code_analysis(Program *program);
 1503 void dominator_tree(Program* program);
 1504 void insert_exec_mask(Program *program);
 1505 void value_numbering(Program* program);
 1506 void optimize(Program* program);
 1507 void setup_reduce_temp(Program* program);
 1508 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
 1509 void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
 1510 void ssa_elimination(Program* program);
 1511 void lower_to_hw_instr(Program* program);
 1512 void schedule_program(Program* program, live& live_vars);
 1513 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
 1514 void insert_wait_states(Program* program);
 1515 void insert_NOPs(Program* program);
 1516 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
 1517 void print_asm(Program *program, std::vector<uint32_t>& binary,
 1518                unsigned exec_size, std::ostream& out);
 1519 void validate(Program* program, FILE *output);
 1520 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
 1521 #ifndef NDEBUG
 1522 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
 1523 #else
 1524 #define perfwarn(program, cond, msg, ...) do {} while(0)
 1525 #endif
 1526 
 1527 void collect_presched_stats(Program *program);
 1528 void collect_preasm_stats(Program *program);
 1529 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
 1530 
 1531 void aco_print_instr(const Instruction *instr, FILE *output);
 1532 void aco_print_program(const Program *program, FILE *output);
 1533 
 1534 /* utilities for dealing with register demand */
 1535 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
 1536 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
 1537 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
 1538 
 1539 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
 1540 uint16_t get_extra_sgprs(Program *program);
 1541 
 1542 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
 1543 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
 1544 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
 1545 
 1546 /* return number of addressable sgprs/vgprs for max_waves */
 1547 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
 1548 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
 1549 
 1550 typedef struct {
 1551    const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
 1552    const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
 1553    const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
 1554    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
 1555    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
 1556    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
 1557    const char *name[static_cast<int>(aco_opcode::num_opcodes)];
 1558    const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
 1559 } Info;
 1560 
 1561 extern const Info instr_info;
 1562 
 1563 }
 1564 
 1565 #endif /* ACO_IR_H */
 1566