"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/compiler/nir/nir_opt_algebraic.py" (16 Sep 2020, 101299 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file. For more information about "nir_opt_algebraic.py" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 20.2.0-rc3_vs_20.2.0-rc4.

#
# Copyright (C) 2014 Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
# Authors:
#    Jason Ekstrand (jason@jlekstrand.net)

from __future__ import print_function

from collections import OrderedDict
import nir_algebraic
from nir_opcodes import type_sizes
import itertools
import struct
from math import pi

# Convenience variables
a = 'a'
b = 'b'
c = 'c'
d = 'd'
e = 'e'

# Written in the form (<search>, <replace>) where <search> is an expression
# and <replace> is either an expression or a value.  An expression is
# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
# where each source is either an expression or a value.  A value can be
# either a numeric constant or a string representing a variable name.
#
# If the opcode in a search expression is prefixed by a '~' character, this
# indicates that the operation is inexact.  Such operations will only get
# applied to SSA values that do not have the exact bit set.  This should be
# used by any optimizations that are not bit-for-bit exact.  It should not,
# however, be used for backend-requested lowering operations as those need to
# happen regardless of precision.
#
# Variable names are specified as "[#]name[@type][(cond)][.swiz]" where:
# "#" indicates that the given variable will only match constants,
# type indicates that the given variable will only match values from ALU
#    instructions with the given output type,
# (cond) specifies an additional condition function (see nir_search_helpers.h),
# swiz is a swizzle applied to the variable (only in the <replace> expression)
#
# For constants, you have to be careful to make sure that it is the right
# type because python is unaware of the source and destination types of the
# opcodes.
#
# All expression types can have a bit-size specified.  For opcodes, this
# looks like "op@32", for variables it is "a@32" or "a@uint32" to specify a
# type and size.  In the search half of the expression this indicates that it
# should only match that particular bit-size.  In the replace half of the
# expression this indicates that the constructed value should have that
# bit-size.
#
# If the opcode in a replacement expression is prefixed by a '!' character,
# this indicates that the new expression will be marked exact.
#
# A special condition "many-comm-expr" can be used with expressions to note
# that the expression and its subexpressions have more commutative expressions
# than nir_replace_instr can handle.  If this special condition is needed with
# another condition, the two can be separated by a comma (e.g.,
# "(many-comm-expr,is_used_once)").
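#
# For example, the first entry in the optimizations list below,
#
#    (('imul', a, '#b@32(is_pos_power_of_two)'),
#     ('ishl', a, ('find_lsb', b)),
#     '!options->lower_bitops'),
#
# rewrites a multiply by a constant 32-bit power of two into a left shift,
# guarded so it only applies when the backend has not asked for bitops to
# be lowered.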

# based on https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
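# A quadratic curve plus a correction term approximating sine; the phase
# constant c selects the function: the lower_sincos rules below pass 0.5
# for fsin and 0.75 for fcos.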
def lowered_sincos(c):
    x = ('fsub', ('fmul', 2.0, ('ffract', ('fadd', ('fmul', 0.5 / pi, a), c))), 1.0)
    x = ('fmul', ('fsub', x, ('fmul', x, ('fabs', x))), 4.0)
    return ('ffma', ('ffma', x, ('fabs', x), ('fneg', x)), 0.225, x)

def intBitsToFloat(i):
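    # e.g. intBitsToFloat(0x3f800000) == 1.0: reinterpret a 32-bit integer
    # bit pattern as an IEEE-754 float.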
    return struct.unpack('!f', struct.pack('!I', i))[0]

optimizations = [

   (('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
   (('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
   (('ishl', a, '#b@32'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'),

   (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
   (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
   (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
   (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'),
   (('udiv', a, 1), a),
   (('idiv', a, 1), a),
   (('umod', a, 1), 0),
   (('imod', a, 1), 0),
   (('udiv', a, '#b@32(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'),
   (('idiv', a, '#b@32(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), 'options->lower_idiv'),
   (('idiv', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), 'options->lower_idiv'),
   (('umod', a, '#b(is_pos_power_of_two)'),    ('iand', a, ('isub', b, 1))),

   (('~fneg', ('fneg', a)), a),
   (('ineg', ('ineg', a)), a),
   (('fabs', ('fneg', a)), ('fabs', a)),
   (('fabs', ('u2f', a)), ('u2f', a)),
   (('iabs', ('iabs', a)), ('iabs', a)),
   (('iabs', ('ineg', a)), ('iabs', a)),
   (('f2b', ('fneg', a)), ('f2b', a)),
   (('i2b', ('ineg', a)), ('i2b', a)),
   (('~fadd', a, 0.0), a),
   (('iadd', a, 0), a),
   (('usadd_4x8', a, 0), a),
   (('usadd_4x8', a, ~0), ~0),
   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
   (('~fadd', ('fneg', a), a), 0.0),
   (('iadd', ('ineg', a), a), 0),
   (('iadd', ('ineg', a), ('iadd', a, b)), b),
   (('iadd', a, ('iadd', ('ineg', a), b)), b),
   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
   (('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))),
   (('~fmul', a, 0.0), 0.0),
   (('imul', a, 0), 0),
   (('umul_unorm_4x8', a, 0), 0),
   (('umul_unorm_4x8', a, ~0), a),
   (('~fmul', a, 1.0), a),
   (('imul', a, 1), a),
   (('fmul', a, -1.0), ('fneg', a)),
   (('imul', a, -1), ('ineg', a)),
   # If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a
   # If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a
   # If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0
   (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),
   (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),
   (('~ffma', 0.0, a, b), b),
   (('~ffma', a, b, 0.0), ('fmul', a, b)),
   (('ffma', 1.0, a, b), ('fadd', a, b)),
   (('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)),
   (('~flrp', a, b, 0.0), a),
   (('~flrp', a, b, 1.0), b),
   (('~flrp', a, a, b), a),
   (('~flrp', 0.0, a, b), ('fmul', a, b)),

   # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
   (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),
   (('~flrp@32', a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp32'),
   (('~flrp@64', a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp64'),

   (('~flrp@32', ('fadd', a, b), ('fadd', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp32'),
   (('~flrp@64', ('fadd', a, b), ('fadd', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp64'),

   (('~flrp@32', a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp32'),
   (('~flrp@64', a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp64'),

   (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),

   (('~flrp', a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp32'),
   (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),
   (('ftrunc', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
   (('ffloor', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
   (('fadd', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
   (('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
   (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
   (('~fadd',    ('fmul', a,          ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f',  c))), ('bcsel', c, b, a), 'options->lower_flrp32'),
   (('~fadd@32', ('fmul', a,          ('fadd', 1.0, ('fneg',          c   ) )), ('fmul', b,          c )), ('flrp', a, b, c), '!options->lower_flrp32'),
   (('~fadd@64', ('fmul', a,          ('fadd', 1.0, ('fneg',          c   ) )), ('fmul', b,          c )), ('flrp', a, b, c), '!options->lower_flrp64'),
   # These are the same as the previous three rules, but it depends on
   # 1-fsat(x) <=> fsat(1-x).  See below.
   (('~fadd@32', ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg',          c   )))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp32'),
   (('~fadd@64', ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg',          c   )))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp64'),

   (('~fadd', a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp32'),
   (('~fadd@32', a, ('fmul',         c , ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp32'),
   (('~fadd@64', a, ('fmul',         c , ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp64'),
   (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
   (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'),

   (('~fmul', ('fadd', ('iand', ('ineg', ('b2i32', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
    ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),

   (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),

   (('fdot4', ('vec4', a, b,   c,   1.0), d), ('fdph',  ('vec3', a, b, c), d), '!options->lower_fdph'),
   (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot4', ('vec4', a, b,   0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
   (('fdot4', ('vec4', a, b,   c,   0.0), d), ('fdot3', ('vec3', a, b, c), d)),

   (('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot3', ('vec3', a, b,   0.0), c), ('fdot2', ('vec2', a, b), c)),

   (('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)),
   (('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')),

   # Lower fdot to fsum when it is available
   (('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'),
   (('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'),

   # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
   # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
   # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),

   # 1 - ((1 - a) * (1 - b))
   # 1 - (1 - a - b + a*b)
   # 1 - 1 + a + b - a*b
   # a + b - a*b
   # a + b*(1 - a)
   # b*(1 - a) + 1*a
   # flrp(b, 1, a)
   (('~fadd@32', 1.0, ('fneg', ('fmul', ('fadd', 1.0, ('fneg', a)), ('fadd', 1.0, ('fneg', b))))),
    ('flrp', b, 1.0, a), '!options->lower_flrp32'),

   # (a * #b + #c) << #d
   # ((a * #b) << #d) + (#c << #d)
   # (a * (#b << #d)) + (#c << #d)
   (('ishl', ('iadd', ('imul', a, '#b'), '#c'), '#d'),
    ('iadd', ('imul', a, ('ishl', b, d)), ('ishl', c, d))),

   # (a * #b) << #c
   # a * (#b << #c)
   (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
]

# Care must be taken here.  Shifts in NIR use only the lower log2(bitsize)
# bits of the second source.  These replacements must correctly handle the
# case where (b % bitsize) + (c % bitsize) >= bitsize.
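# For example, with s == 32, b == 20 and c == 20, the combined shift count
# of 40 would be masked down to 8; the bcsel below instead produces the
# correct result of 0 for ishl/ushr, while ishr clamps the count to s - 1.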
for s in [8, 16, 32, 64]:
   mask = (1 << s) - 1

   ishl = "ishl@{}".format(s)
   ishr = "ishr@{}".format(s)
   ushr = "ushr@{}".format(s)

   in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)

   optimizations.extend([
       ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
       ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),

       # To get -1 for large shifts of negative values, ishr must instead
       # clamp the shift count to the maximum value.
       ((ishr, (ishr, a, '#b'), '#c'),
        (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
   ])

# Optimize a pattern of address calculation created by DXVK where the offset is
# divided by 4 and then multiplied by 4. This can be turned into an iand and the
# additions before can be reassociated to CSE the iand instruction.
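# For example, with log2 == 2 (v == 4), '(a >> 2) << 2' becomes
# 'a & 0xfffffffc'.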
for log2 in range(1, 7): # powers of two from 2 to 64
   v = 1 << log2
   mask = 0xffffffff & ~(v - 1)
   b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v)

   optimizations.extend([
       # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'
       (('ishl@32', ('ushr@32', a, log2), log2), ('iand', a, mask)),

       # Reassociate for improved CSE
       (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
   ])

# To save space in the state tables, reduce to the set that is known to help.
# Previously, this was range(1, 32).  In addition, a couple rules inside the
# loop are commented out.  Revisit someday, probably after mesa/#2635 has some
# resolution.
for i in [1, 2, 16, 24]:
    lo_mask = 0xffffffff >> i
    hi_mask = (0xffffffff << i) & 0xffffffff

    optimizations.extend([
        # This pattern seems to only help in the soft-fp64 code.
        (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)),
#        (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)),
#        (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)),

        (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)),
        (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)),
#        (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct
    ])

optimizations.extend([
   # This is common for address calculations.  Reassociating may enable the
   # 'a<<c' to be CSE'd.  It also helps architectures that have an ISHLADD
   # instruction or a constant offset field in load / store instructions.
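   # For example, '(a + 16) << 2' becomes '(a << 2) + 64', letting the 64
   # fold into such an offset field.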
   (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))),

   # Comparison simplifications
   (('~inot', ('flt', a, b)), ('fge', a, b)),
   (('~inot', ('fge', a, b)), ('flt', a, b)),
   (('inot', ('feq', a, b)), ('fne', a, b)),
   (('inot', ('fne', a, b)), ('feq', a, b)),
   (('inot', ('ilt', a, b)), ('ige', a, b)),
   (('inot', ('ult', a, b)), ('uge', a, b)),
   (('inot', ('ige', a, b)), ('ilt', a, b)),
   (('inot', ('uge', a, b)), ('ult', a, b)),
   (('inot', ('ieq', a, b)), ('ine', a, b)),
   (('inot', ('ine', a, b)), ('ieq', a, b)),

   (('iand', ('feq', a, b), ('fne', a, b)), False),
   (('iand', ('flt', a, b), ('flt', b, a)), False),
   (('iand', ('ieq', a, b), ('ine', a, b)), False),
   (('iand', ('ilt', a, b), ('ilt', b, a)), False),
   (('iand', ('ult', a, b), ('ult', b, a)), False),

   # This helps some shaders because, after some optimizations, they end up
   # with patterns like (-a < -b) || (b < a).  In an ideal world, this sort of
   # matching would be handled by CSE.
   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
   (('fne', ('fneg', a), ('fneg', b)), ('fne', b, a)),
   (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
   (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
   (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
   (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
   (('fne', ('fneg', a), -1.0), ('fne', 1.0, a)),
   (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),

   # flt(fsat(a), b > 0 && b < 1) is inexact if a is NaN (fsat(NaN) is 0)
   # because it returns True while flt(a, b) always returns False.
   (('~flt', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('flt', a, b)),
   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
   # fge(b > 0 && b < 1, fsat(a)) is inexact if a is NaN (fsat(NaN) is 0)
   # because it returns True while fge(b, a) always returns False.
   (('~fge', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('fge', b, a)),
   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
   (('fne', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fne', a, b)),

   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
   (('flt', ('fsat(is_used_once)', a), 1.0), ('flt', a, 1.0)),
   (('fge', 0.0, ('fsat(is_used_once)', a)), ('fge', 0.0, a)),
   (('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)),

   # 0.0 >= b2f(a)
   # b2f(a) <= 0.0
   # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
   # inot(a)
   (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),

   (('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)),

   (('fne', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
   (('fne', ('fmax', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
   (('fne', ('bcsel', a, 1.0, ('b2f', 'b@1'))   , 0.0), ('ior', a, b)),
   (('fne', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))),      ('ior', a, b)),
   (('fne', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
   (('fne', ('fmin', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
   (('fne', ('bcsel', a, ('b2f', 'b@1'), 0.0)   , 0.0), ('iand', a, b)),
   (('fne', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)),
   (('fne',          ('b2f', 'a@1') ,          ('b2f', 'b@1') ),      ('ixor', a, b)),
   (('fne', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))),      ('ixor', a, b)),
   (('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
   (('feq', ('fmax', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
   (('feq', ('bcsel', a, 1.0, ('b2f', 'b@1'))   , 0.0), ('inot', ('ior', a, b))),
   (('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))),      ('inot', ('ior', a, b))),
   (('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
   (('feq', ('fmin', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
   (('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0)   , 0.0), ('inot', ('iand', a, b))),
   (('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)),
   (('feq',          ('b2f', 'a@1') ,          ('b2f', 'b@1') ),      ('ieq', a, b)),
   (('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))),      ('ieq', a, b)),

   # -(b2f(a) + b2f(b)) < 0
   # 0 < b2f(a) + b2f(b)
   # 0 != b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # a || b
   (('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)),
   (('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)),

   # -(b2f(a) + b2f(b)) >= 0
   # 0 >= b2f(a) + b2f(b)
   # 0 == b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # !(a || b)
   (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
   (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),

   (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
   (('fge', a, ('fneg', a)), ('fge', a, 0.0)),

   # Some optimizations (below) convert things like (a < b || c < b) into
   # (min(a, c) < b).  However, this interferes with the previous optimizations
   # that try to remove comparisons with negated sums of b2f.  This just
   # breaks that apart.
   (('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0),
    ('ior', ('flt', c, 0.0), ('ior', a, b))),

   (('~flt', ('fadd', a, b), a), ('flt', b, 0.0)),
   (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
   (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),
   (('~fne', ('fadd', a, b), a), ('fne', b, 0.0)),
   (('~flt',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('flt', a, ('fadd', c, ('fneg', b)))),
   (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)),
   (('~fge',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('fge', a, ('fadd', c, ('fneg', b)))),
   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)),
   (('~feq',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('feq', a, ('fadd', c, ('fneg', b)))),
   (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)),
   (('~fne',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('fne', a, ('fadd', c, ('fneg', b)))),
   (('~fne', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fne', ('fneg', ('fadd', c, b)), a)),

   # Cannot remove the addition from ilt or ige due to overflow.
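   # E.g. with wrapping 32-bit arithmetic, a = INT32_MAX and b = 1 gives
   # a + b == INT32_MIN, so (a + b < a) holds even though (b < 0) is false.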
   (('ieq', ('iadd', a, b), a), ('ieq', b, 0)),
   (('ine', ('iadd', a, b), a), ('ine', b, 0)),

   # fmin(-b2f(a), b) >= 0.0
   # -b2f(a) >= 0.0 && b >= 0.0
   # -b2f(a) == 0.0 && b >= 0.0    -b2f can only be 0 or -1, never >0
   # b2f(a) == 0.0 && b >= 0.0
   # a == False && b >= 0.0
   # !a && b >= 0.0
   #
   # The fge in the second replacement is not a typo.  I leave the proof that
   # "fmin(-b2f(a), b) >= 0 <=> fmin(-b2f(a), b) == 0" as an exercise for the
   # reader.
   (('fge', ('fmin', ('fneg', ('b2f', 'a@1')), 'b@1'), 0.0), ('iand', ('inot', a), ('fge', b, 0.0))),
   (('feq', ('fmin', ('fneg', ('b2f', 'a@1')), 'b@1'), 0.0), ('iand', ('inot', a), ('fge', b, 0.0))),

   (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)),
   (('~fne', ('b2f', 'a@1'), 0.0), a),
   (('ieq', ('b2i', 'a@1'), 0),   ('inot', a)),
   (('ine', ('b2i', 'a@1'), 0),   a),

   (('fne', ('u2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('u2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('u2f', a), 0.0), True),
   (('fge', 0.0, ('u2f', a)), ('uge', 0, a)),    # ieq instead?
   (('flt', ('u2f', a), 0.0), False),
   (('flt', 0.0, ('u2f', a)), ('ult', 0, a)),    # ine instead?
   (('fne', ('i2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('i2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('i2f', a), 0.0), ('ige', a, 0)),
   (('fge', 0.0, ('i2f', a)), ('ige', 0, a)),
   (('flt', ('i2f', a), 0.0), ('ilt', a, 0)),
   (('flt', 0.0, ('i2f', a)), ('ilt', 0, a)),

   # 0.0 < fabs(a)
   # fabs(a) > 0.0
   # fabs(a) != 0.0 because fabs(a) must be >= 0
   # a != 0.0
   (('~flt', 0.0, ('fabs', a)), ('fne', a, 0.0)),

   # -fabs(a) < 0.0
   # fabs(a) > 0.0
   (('~flt', ('fneg', ('fabs', a)), 0.0), ('fne', a, 0.0)),

   # 0.0 >= fabs(a)
   # 0.0 == fabs(a)   because fabs(a) must be >= 0
   # 0.0 == a
   (('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)),

   # -fabs(a) >= 0.0
   # 0.0 >= fabs(a)
   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),

   # (a >= 0.0) && (a <= 1.0) -> fsat(a) == a
   (('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'),

   # (a < 0.0) || (a > 1.0)
   # !(!(a < 0.0) && !(a > 1.0))
   # !((a >= 0.0) && (a <= 1.0))
   # !(a == fsat(a))
   # a != fsat(a)
   (('ior', ('flt', a, 0.0), ('flt', 1.0, a)), ('fne', a, ('fsat', a)), '!options->lower_fsat'),

   (('fmax',                        ('b2f(is_used_once)', 'a@1'),           ('b2f', 'b@1')),           ('b2f', ('ior', a, b))),
   (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))),
   (('fmin',                        ('b2f(is_used_once)', 'a@1'),           ('b2f', 'b@1')),           ('b2f', ('iand', a, b))),
   (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))),

   # fmin(b2f(a), b)
   # bcsel(a, fmin(b2f(a), b), fmin(b2f(a), b))
   # bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b))
   # bcsel(a, fmin(1.0, b), fmin(0.0, b))
   #
   # Since b is a constant, constant folding will eliminate the fmin and the
   # fmax.  If b is > 1.0, the bcsel will be replaced with a b2f.
   (('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))),

   (('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)),

   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
   (('~bcsel', ('flt', b, a), b, a), ('fmin', a, b)),
   (('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
   (('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)),
   (('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)),
   (('bcsel', ('i2b', a), b, c), ('bcsel', ('ine', a, 0), b, c)),
   (('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)),
   (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
   (('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)),
   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
   (('bcsel', a, True, b), ('ior', a, b)),
   (('bcsel', a, a, b), ('ior', a, b)),
   (('bcsel', a, b, False), ('iand', a, b)),
   (('bcsel', a, b, a), ('iand', a, b)),
   (('~fmin', a, a), a),
   (('~fmax', a, a), a),
   (('imin', a, a), a),
   (('imax', a, a), a),
   (('umin', a, a), a),
   (('umax', a, a), a),
   (('fmax', ('fmax', a, b), b), ('fmax', a, b)),
   (('umax', ('umax', a, b), b), ('umax', a, b)),
   (('imax', ('imax', a, b), b), ('imax', a, b)),
   (('fmin', ('fmin', a, b), b), ('fmin', a, b)),
   (('umin', ('umin', a, b), b), ('umin', a, b)),
   (('imin', ('imin', a, b), b), ('imin', a, b)),
   (('iand@32', a, ('inot', ('ishr', a, 31))), ('imax', a, 0)),

   # Simplify logic to detect sign of an integer.
   (('ieq', ('iand', 'a@32', 0x80000000), 0x00000000), ('ige', a, 0)),
   (('ine', ('iand', 'a@32', 0x80000000), 0x80000000), ('ige', a, 0)),
   (('ine', ('iand', 'a@32', 0x80000000), 0x00000000), ('ilt', a, 0)),
   (('ieq', ('iand', 'a@32', 0x80000000), 0x80000000), ('ilt', a, 0)),
   (('ine', ('ushr', 'a@32', 31), 0), ('ilt', a, 0)),
   (('ieq', ('ushr', 'a@32', 31), 0), ('ige', a, 0)),
   (('ieq', ('ushr', 'a@32', 31), 1), ('ilt', a, 0)),
   (('ine', ('ushr', 'a@32', 31), 1), ('ige', a, 0)),
   (('ine', ('ishr', 'a@32', 31), 0), ('ilt', a, 0)),
   (('ieq', ('ishr', 'a@32', 31), 0), ('ige', a, 0)),
   (('ieq', ('ishr', 'a@32', 31), -1), ('ilt', a, 0)),
   (('ine', ('ishr', 'a@32', 31), -1), ('ige', a, 0)),

   (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
   (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
   (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
   (('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))),
   (('~fmin', a, ('fabs', a)), a),
   (('imin', a, ('iabs', a)), a),
   (('~fmax', a, ('fneg', ('fabs', a))), a),
   (('imax', a, ('ineg', ('iabs', a))), a),
   (('fmax', a, ('fabs', a)), ('fabs', a)),
   (('imax', a, ('iabs', a)), ('iabs', a)),
   (('fmax', a, ('fneg', a)), ('fabs', a)),
   (('imax', a, ('ineg', a)), ('iabs', a)),
   (('~fmax', ('fabs', a), 0.0), ('fabs', a)),
   (('~fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
   (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
   (('~fmin', ('fmax', a, -1.0),  0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
   (('~fmax', ('fmin', a,  0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
   (('fsat', ('fsign', a)), ('b2f', ('flt', 0.0, a))),
   (('fsat', ('b2f', a)), ('b2f', a)),
   (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
   (('fsat', ('fsat', a)), ('fsat', a)),
   (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'),
   (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'),
   (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
   (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
   (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
   (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
   (('fmax', ('fsat', a), '#b@32(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
   (('fmin', ('fsat', a), '#b@32(is_zero_to_one)'), ('fsat', ('fmin', a, b))),

   # If a in [0,b] then b-a is also in [0,b].  Since b in [0,1], max(b-a, 0) =
   # fsat(b-a).
   #
   # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0
   #
   # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0.
   (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b@32(is_zero_to_one)'), 0.0),
    ('fsat', ('fadd', ('fneg',  a), b)), '!options->lower_fsat'),

   (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),
   (('~ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))),
   (('~ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)),
   (('~ior', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))),
   (('~ior', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmax', a, b), c)),
   (('~ior', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('fmax', b, c))),
   (('~ior', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('fmin', a, b), c)),
   (('~ior', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('fmin', b, c))),
   (('~ior', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('fmax', a, b), c)),
   (('~iand', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmin', b, c))),
   (('~iand', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmax', a, b), c)),
   (('~iand', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmax', b, c))),
   (('~iand', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmin', a, b), c)),
   (('~iand', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('fmin', b, c))),
   (('~iand', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('fmax', a, b), c)),
   (('~iand', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('fmax', b, c))),
   (('~iand', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('fmin', a, b), c)),

   (('ior', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imax', b, c))),
   (('ior', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imin', a, b), c)),
   (('ior', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imin', b, c))),
   (('ior', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imax', a, b), c)),
   (('ior', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umax', b, c))),
   (('ior', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umin', a, b), c)),
   (('ior', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umin', b, c))),
   (('ior', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umax', a, b), c)),
   (('iand', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imin', b, c))),
   (('iand', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imax', a, b), c)),
   (('iand', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imax', b, c))),
   (('iand', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imin', a, b), c)),
   (('iand', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umin', b, c))),
   (('iand', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umax', a, b), c)),
   (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))),
   (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)),

   # These derive from the previous patterns with the application of b < 0 <=>
   # 0 < -b.  The transformation should be applied if either comparison is
   # used once as this ensures that the number of comparisons will not
   # increase.  The sources to the ior and iand are not symmetric, so the
   # rules have to be duplicated to get this behavior.
   (('~ior', ('flt(is_used_once)', 0.0, 'a@32'), ('flt', 'b@32', 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
   (('~ior', ('flt', 0.0, 'a@32'), ('flt(is_used_once)', 'b@32', 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
   (('~ior', ('fge(is_used_once)', 0.0, 'a@32'), ('fge', 'b@32', 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
   (('~ior', ('fge', 0.0, 'a@32'), ('fge(is_used_once)', 'b@32', 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
   (('~iand', ('flt(is_used_once)', 0.0, 'a@32'), ('flt', 'b@32', 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
   (('~iand', ('flt', 0.0, 'a@32'), ('flt(is_used_once)', 'b@32', 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
   (('~iand', ('fge(is_used_once)', 0.0, 'a@32'), ('fge', 'b@32', 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
   (('~iand', ('fge', 0.0, 'a@32'), ('fge(is_used_once)', 'b@32', 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),

   # Common pattern like 'if (i == 0 || i == 1 || ...)'
   (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)),
   (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),
   (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)),

   # The (i2f32, ...) part is an open-coded fsign.  When that is combined with
   # the bcsel, it's basically copysign(1.0, a).  There is no copysign in NIR,
   # so emit an open-coded version of that.
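   # Here 0x3f800000 is the bit pattern of 1.0f, so the replacement ORs the
   # sign bit of a into a constant 1.0.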
   (('bcsel@32', ('feq', a, 0.0), 1.0, ('i2f32', ('iadd', ('b2i32', ('flt', 0.0, 'a@32')), ('ineg', ('b2i32', ('flt', 'a@32', 0.0)))))),
    ('ior', 0x3f800000, ('iand', a, 0x80000000))),

   (('ior', a, ('ieq', a, False)), True),
   (('ior', a, ('inot', a)), -1),

   (('ine', ('ineg', ('b2i32', 'a@1')), ('ineg', ('b2i32', 'b@1'))), ('ine', a, b)),
   (('b2i32', ('ine', 'a@1', 'b@1')), ('b2i32', ('ixor', a, b))),

   (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', a, b), 0), '!options->lower_bitops'),
   (('ior',  ('ine', 'a@32', 0), ('ine', 'b@32', 0)), ('ine', ('ior', a, b), 0), '!options->lower_bitops'),

   # This pattern occurs courtesy of __flt64_nonnan in the soft-fp64 code.
   # The first part of the iand comes from the !__feq64_nonnan.
   #
   # The second pattern is a reformulation of the first based on the relation
   # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation
   # happens to be y == 0.
   (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0),  b), c)), ('ilt', a, 0)),
    ('iand', ('inot', ('iand',                         b , c)), ('ilt', a, 0))),
   (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)),
    ('iand', ('inot', ('iand', ('ieq',             b , 0), c)), ('ilt', a, 0))),

   # These patterns can result when (a < b || a < c) => (a < min(b, c))
   # transformations occur before constant propagation and loop-unrolling.
   (('~flt', a, ('fmax', b, a)), ('flt', a, b)),
   (('~flt', ('fmin', a, b), a), ('flt', b, a)),
   (('~fge', a, ('fmin', b, a)), True),
   (('~fge', ('fmax', a, b), a), True),
   (('~flt', a, ('fmin', b, a)), False),
   (('~flt', ('fmax', a, b), a), False),
   (('~fge', a, ('fmax', b, a)), ('fge', a, b)),
   (('~fge', ('fmin', a, b), a), ('fge', b, a)),

   (('ilt', a, ('imax', b, a)), ('ilt', a, b)),
   (('ilt', ('imin', a, b), a), ('ilt', b, a)),
   (('ige', a, ('imin', b, a)), True),
   (('ige', ('imax', a, b), a), True),
   (('ult', a, ('umax', b, a)), ('ult', a, b)),
   (('ult', ('umin', a, b), a), ('ult', b, a)),
   (('uge', a, ('umin', b, a)), True),
   (('uge', ('umax', a, b), a), True),
   (('ilt', a, ('imin', b, a)), False),
   (('ilt', ('imax', a, b), a), False),
   (('ige', a, ('imax', b, a)), ('ige', a, b)),
   (('ige', ('imin', a, b), a), ('ige', b, a)),
   (('ult', a, ('umin', b, a)), False),
   (('ult', ('umax', a, b), a), False),
   (('uge', a, ('umax', b, a)), ('uge', a, b)),
   (('uge', ('umin', a, b), a), ('uge', b, a)),
   (('ult', a, ('iand', b, a)), False),
   (('ult', ('ior', a, b), a), False),
   (('uge', a, ('iand', b, a)), True),
   (('uge', ('ior', a, b), a), True),

   (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))),
   (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))),
   (('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))),
   (('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))),
   (('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))),
   (('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))),
   (('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))),
   (('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))),
   (('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))),
   (('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))),
   (('ige', '#a', ('imax', '#b', c)), ('iand', ('ige', a, b), ('ige', a, c))),
   (('ige', ('imin', '#a', b), '#c'), ('iand', ('ige', a, c), ('ige', b, c))),
   (('ult', '#a', ('umin', '#b', c)), ('iand', ('ult', a, b), ('ult', a, c))),
   (('ult', ('umax', '#a', b), '#c'), ('iand', ('ult', a, c), ('ult', b, c))),
   (('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))),
   (('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))),

   # Thanks to sign extension, the ishr(a, b) is negative if and only if a is
   # negative.
   (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)),
    ('iabs', ('ishr', a, b))),
   (('iabs', ('ishr', ('iabs', a), b)), ('ishr', ('iabs', a), b)),

   (('fabs', ('slt', a, b)), ('slt', a, b)),
   (('fabs', ('sge', a, b)), ('sge', a, b)),
   (('fabs', ('seq', a, b)), ('seq', a, b)),
   (('fabs', ('sne', a, b)), ('sne', a, b)),
   (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'),
   (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),
   (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),
   (('sne', a, b), ('b2f', ('fne', a, b)), 'options->lower_scmp'),
   (('seq', ('seq', a, b), 1.0), ('seq', a, b)),
   (('seq', ('sne', a, b), 1.0), ('sne', a, b)),
   (('seq', ('slt', a, b), 1.0), ('slt', a, b)),
   (('seq', ('sge', a, b), 1.0), ('sge', a, b)),
   (('sne', ('seq', a, b), 0.0), ('seq', a, b)),
   (('sne', ('sne', a, b), 0.0), ('sne', a, b)),
   (('sne', ('slt', a, b), 0.0), ('slt', a, b)),
   (('sne', ('sge', a, b), 0.0), ('sge', a, b)),
   (('seq', ('seq', a, b), 0.0), ('sne', a, b)),
   (('seq', ('sne', a, b), 0.0), ('seq', a, b)),
   (('seq', ('slt', a, b), 0.0), ('sge', a, b)),
   (('seq', ('sge', a, b), 0.0), ('slt', a, b)),
   (('sne', ('seq', a, b), 1.0), ('sne', a, b)),
   (('sne', ('sne', a, b), 1.0), ('seq', a, b)),
   (('sne', ('slt', a, b), 1.0), ('sge', a, b)),
   (('sne', ('sge', a, b), 1.0), ('slt', a, b)),
   (('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
   (('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'),
   (('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'),
   (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
   (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
   (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
   (('fne', ('fneg', a), a), ('fne', a, 0.0)),
   (('feq', ('fneg', a), a), ('feq', a, 0.0)),
   # Emulating booleans
   (('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
   (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
   (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))),
   (('iand', 'a@bool32', 1.0), ('b2f', a)),
   # True/False are ~0 and 0 in NIR.  b2i of True is 1, and -1 is ~0 (True).
   (('ineg', ('b2i32', 'a@32')), a),
   (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF.
   # Comparison with the same args.  Note that these are not done for
   # the float versions because NaN always returns false on float
   # inequalities.
   (('ilt', a, a), False),
   (('ige', a, a), True),
   (('ieq', a, a), True),
   (('ine', a, a), False),
   (('ult', a, a), False),
   (('uge', a, a), True),
   # Logical and bit operations
   (('iand', a, a), a),
   (('iand', a, ~0), a),
   (('iand', a, 0), 0),
   (('ior', a, a), a),
   (('ior', a, 0), a),
   (('ior', a, True), True),
   (('ixor', a, a), 0),
   (('ixor', a, 0), a),
   (('inot', ('inot', a)), a),
   (('ior', ('iand', a, b), b), b),
   (('ior', ('ior', a, b), b), ('ior', a, b)),
   (('iand', ('ior', a, b), b), b),
   (('iand', ('iand', a, b), b), ('iand', a, b)),
   # DeMorgan's Laws
   (('iand', ('inot', a), ('inot', b)), ('inot', ('ior',  a, b))),
   (('ior',  ('inot', a), ('inot', b)), ('inot', ('iand', a, b))),
   # Shift optimizations
   (('ishl', 0, a), 0),
   (('ishl', a, 0), a),
   (('ishr', 0, a), 0),
   (('ishr', a, 0), a),
   (('ushr', 0, a), 0),
   (('ushr', a, 0), a),
   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), '!options->lower_rotate'),
   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), '!options->lower_rotate'),
   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'),
   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), '!options->lower_rotate'),
   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'),
   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), '!options->lower_rotate'),
   (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), 'options->lower_rotate'),
   (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), 'options->lower_rotate'),
   (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), 'options->lower_rotate'),
   (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), 'options->lower_rotate'),
   # Exponential/logarithmic identities
   (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a
   (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a
   (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b)
   (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
   (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
    ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) * d) = a^b * c^d
   (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)),
   (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)),
   (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))),
   (('~fpow', a, 1.0), a),
   (('~fpow', a, 2.0), ('fmul', a, a)),
   (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
   (('~fpow', 2.0, a), ('fexp2', a)),
   (('~fpow', ('fpow', a, 2.2), 0.454545), a),
   (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
   (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
   (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
   (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
   (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),
   (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))),
   (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
   (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
   (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))),
   (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))),
   (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)),
   # Division and reciprocal
   (('~fdiv', 1.0, a), ('frcp', a)),
   (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
   (('~frcp', ('frcp', a)), a),
   (('~frcp', ('fsqrt', a)), ('frsq', a)),
   (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
   (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
   # Trig
   (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'),
   (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'),
   # Boolean simplifications
   (('i2b32(is_used_by_if)', a), ('ine32', a, 0)),
   (('i2b1(is_used_by_if)', a), ('ine', a, 0)),
   (('ieq', a, True), a),
   (('ine(is_not_used_by_if)', a, True), ('inot', a)),
   (('ine', a, False), a),
   (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')),
   (('bcsel', a, True, False), a),
   (('bcsel', a, False, True), ('inot', a)),
   (('bcsel@32', a, 1.0, 0.0), ('b2f', a)),
   (('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))),
   (('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))),
   (('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
   (('bcsel', True, b, c), b),
   (('bcsel', False, b, c), c),
   (('bcsel', a, ('b2f(is_used_once)', 'b@32'), ('b2f', 'c@32')), ('b2f', ('bcsel', a, b, c))),

   (('bcsel', a, b, b), b),
   (('~fcsel', a, b, b), b),

   # D3D Boolean emulation
   (('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))),
   (('bcsel', a, 0, -1), ('ineg', ('b2i', ('inot', a)))),
   (('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
    ('ineg', ('b2i', ('iand', a, b)))),
   (('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))),
    ('ineg', ('b2i', ('ior', a, b)))),
   (('ieq', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)),
   (('ieq', ('ineg', ('b2i', 'a@1')), -1), a),
   (('ine', ('ineg', ('b2i', 'a@1')), 0), a),
   (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)),
   (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)),
   (('iand', ('ineg', ('b2i', a)), 1),   ('b2i', a)),

   # SM5 32-bit shifts are defined to use the 5 least significant bits
   (('ishl', 'a@32', ('iand', 31, b)), ('ishl', a, b)),
   (('ishr', 'a@32', ('iand', 31, b)), ('ishr', a, b)),
   (('ushr', 'a@32', ('iand', 31, b)), ('ushr', a, b)),

   # Conversions
   (('i2b32', ('b2i', 'a@32')), a),
   (('f2i', ('ftrunc', a)), ('f2i', a)),
   (('f2u', ('ftrunc', a)), ('f2u', a)),
   (('i2b', ('ineg', a)), ('i2b', a)),
   (('i2b', ('iabs', a)), ('i2b', a)),
   (('inot', ('f2b1', a)), ('feq', a, 0.0)),

   # The C spec says, "If the value of the integral part cannot be represented
   # by the integer type, the behavior is undefined."  "Undefined" can mean
   # "the conversion doesn't happen at all."
   (('~i2f32', ('f2i32', 'a@32')), ('ftrunc', a)),

   # Ironically, mark these as imprecise because removing the conversions may
   # preserve more precision than doing the conversions (e.g.,
   # uint(float(0x81818181u)) == 0x81818200).
   (('~f2i32', ('i2f', 'a@32')), a),
   (('~f2i32', ('u2f', 'a@32')), a),
   (('~f2u32', ('i2f', 'a@32')), a),
   (('~f2u32', ('u2f', 'a@32')), a),

   # Conversions from float16 to float32 and back can always be removed
   (('f2f16', ('f2f32', 'a@16')), a),
   (('f2fmp', ('f2f32', 'a@16')), a),
   # Conversions to float16 would be lossy so they should only be removed if
   # the instruction was generated by the precision lowering pass.
   (('f2f32', ('f2fmp', 'a@32')), a),

   (('ffloor', 'a(is_integral)'), a),
   (('fceil', 'a(is_integral)'), a),
   (('ftrunc', 'a(is_integral)'), a),
   # fract(x) = x - floor(x), so fract(NaN) = NaN
   (('~ffract', 'a(is_integral)'), 0.0),
   (('fabs', 'a(is_not_negative)'), a),
   (('iabs', 'a(is_not_negative)'), a),
   (('fsat', 'a(is_not_positive)'), 0.0),

   # Section 5.4.1 (Conversion and Scalar Constructors) of the GLSL 4.60 spec
   # says:
   #
   #    It is undefined to convert a negative floating-point value to an
   #    uint.
   #
   # Assuming that (uint)some_float behaves like (uint)(int)some_float allows
   # some optimizations in the i965 backend to proceed.
   (('ige', ('f2u', a), b), ('ige', ('f2i', a), b)),
   (('ige', b, ('f2u', a)), ('ige', b, ('f2i', a))),
   (('ilt', ('f2u', a), b), ('ilt', ('f2i', a), b)),
   (('ilt', b, ('f2u', a)), ('ilt', b, ('f2i', a))),

   (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'),

   # The result of the multiply must be in [-1, 0], so the result of the ffma
   # must be in [0, 1].
   (('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False),
   (('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False),
   (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)),
   (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)),

   (('fne', 'a(is_not_zero)', 0.0), True),
   (('feq', 'a(is_not_zero)', 0.0), False),

   # In this chart, + means value > 0 and - means value < 0.
   #
   # + >= + -> unknown  0 >= + -> false    - >= + -> false
   # + >= 0 -> true     0 >= 0 -> true     - >= 0 -> false
   # + >= - -> true     0 >= - -> true     - >= - -> unknown
   #
   # Using grouping conceptually similar to a Karnaugh map...
   #
   # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true
   # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false
   # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false
   #
   # The flt / ilt cases just invert the expected result.
   #
   # The results expecting true must be marked imprecise.  The results
   # expecting false are fine because NaN compared >= or < anything is false.

   (('~fge', 'a(is_not_negative)', 'b(is_not_positive)'), True),
   (('fge',  'a(is_not_positive)', 'b(is_gt_zero)'),      False),
   (('fge',  'a(is_lt_zero)',      'b(is_not_negative)'), False),

   (('flt',  'a(is_not_negative)', 'b(is_not_positive)'), False),
   (('~flt', 'a(is_not_positive)', 'b(is_gt_zero)'),      True),
   (('~flt', 'a(is_lt_zero)',      'b(is_not_negative)'), True),

   (('ine', 'a(is_not_zero)', 0), True),
   (('ieq', 'a(is_not_zero)', 0), False),

   (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True),
   (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'),      False),
   (('ige', 'a(is_lt_zero)',      'b(is_not_negative)'), False),

   (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False),
   (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'),      True),
   (('ilt', 'a(is_lt_zero)',      'b(is_not_negative)'), True),

   (('ult', 0, 'a(is_gt_zero)'), True),
   (('ult', a, 0), False),

   # Packing and then unpacking does nothing
   (('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
   (('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b),
   (('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
                           ('unpack_64_2x32_split_y', a)), a),

   # Comparing two halves of an unpack separately.  While this optimization
   # should be correct for non-constant values, it's less obvious that it's
   # useful in that case.  For constant values, the pack will fold and we're
   # guaranteed to reduce the whole tree to one instruction.
   (('iand', ('ieq', ('unpack_32_2x16_split_x', a), '#b'),
             ('ieq', ('unpack_32_2x16_split_y', a), '#c')),
    ('ieq', a, ('pack_32_2x16_split', b, c))),

   # Byte extraction
   (('ushr', 'a@16',  8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
   (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
   (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'),
   (('ishr', 'a@16',  8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
   (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),
   (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'),
   (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),

   # Useless masking before unpacking
   (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)),
   (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)),
   (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)),
   (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)),
   (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)),
   (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)),

   # Optimize half packing
   (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))),
   (('ushr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))),

   (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
    ('pack_half_2x16', ('vec2', a, b))),
 1009    (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
 1010     ('pack_half_2x16', ('vec2', a, b))),
 1011 ])
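# A minimal, hypothetical check (not part of the generated pass) of the
# precision example in the f2i/f2u comment above: rounding 0x81818181 to
# float32 and truncating back yields 0x81818200, so dropping the conversions
# can actually preserve *more* precision than performing them.
if __debug__:
   _f32 = struct.unpack('f', struct.pack('f', float(0x81818181)))[0]
   assert int(_f32) == 0x81818200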
 1012 
 1013 # After the ('extract_u8', a, 0) pattern, above, triggers, there will be
 1014 # patterns like those below.
 1015 for op in ('ushr', 'ishr'):
 1016    optimizations.extend([(('extract_u8', (op, 'a@16',  8),     0), ('extract_u8', a, 1))])
 1017    optimizations.extend([(('extract_u8', (op, 'a@32',  8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)])
 1018    optimizations.extend([(('extract_u8', (op, 'a@64',  8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)])
 1019 
 1020 optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))])
 1021 
 1022 # After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be
 1023 # patterns like those below.
 1024 for op in ('extract_u8', 'extract_i8'):
 1025    optimizations.extend([((op, ('ishl', 'a@16',      8),     1), (op, a, 0))])
 1026    optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)])
 1027    optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)])
 1028 
 1029 optimizations.extend([
 1030     # Word extraction
 1031    (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
 1032    (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
 1033    (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
 1034    (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
 1035    (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),
 1036 
 1037    # Subtracts
 1038    (('ussub_4x8', a, 0), a),
 1039    (('ussub_4x8', a, ~0), 0),
 1040    # Lower all Subtractions first - they can get recombined later
 1041    (('fsub', a, b), ('fadd', a, ('fneg', b))),
 1042    (('isub', a, b), ('iadd', a, ('ineg', b))),
 1043    (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
 1044    # This is correct.  We don't need isub_sat because the result type is unsigned, so it cannot overflow.
 1045    (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
 1046 
 1047    # Propagate negation up multiplication chains
 1048    (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))),
 1049    (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
 1050 
 1051    # Propagate constants up multiplication chains
 1052    (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)),
 1053    (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)),
 1054    (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)),
 1055    (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)),
 1056 
 1057    # Reassociate constants in add/mul chains so they can be folded together.
 1058    # For now, we mostly only handle cases where the constants are separated by
 1059    # a single non-constant.  We could do better eventually.
 1060    (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)),
 1061    (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)),
 1062    (('~fadd', '#a',          ('fadd', 'b(is_not_const)', '#c')),  ('fadd', ('fadd', a,          c),           b)),
 1063    (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
 1064    (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)),
 1065    (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)),
 1066    (('ior',  '#a', ('ior',  'b(is_not_const)', '#c')), ('ior',  ('ior',  a, c), b)),
 1067    (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)),
 1068 
 1069    # Drop mul-div by the same value when there's no wrapping.
 1070    (('idiv', ('imul(no_signed_wrap)', a, b), b), a),
 1071 
 1072    # By definition...
 1073    (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)),
 1074    (('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
 1075    (('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1), ('ufind_msb', a)),
 1076 
 1077    (('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)),
 1078    (('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
 1079    (('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)),
 1080 
 1081    (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
 1082 
 1083    (('fmin3@64', a, b, c), ('fmin@64', a, ('fmin@64', b, c))),
 1084    (('fmax3@64', a, b, c), ('fmax@64', a, ('fmax@64', b, c))),
 1085    (('fmed3@64', a, b, c), ('fmax@64', ('fmin@64', ('fmax@64', a, b), c), ('fmin@64', a, b))),
 1086 
 1087    # Misc. lowering
 1088    (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
 1089    (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
 1090    (('uadd_carry@32', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
 1091    (('usub_borrow@32', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
 1092 
 1093    (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
 1094     ('bcsel', ('ult', 31, 'bits'), 'insert',
 1095               ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')),
 1096     'options->lower_bitfield_insert'),
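   # Halving adds computed without overflowing the intermediate sum:
   # (a & b) contributes the bit positions that carry, and ((a ^ b) >> 1)
   # contributes half of the positions that do not, so the result is
   # (a + b) >> 1.  The rounded (rhadd) variants use (a | b) minus the same
   # shifted term, which rounds the average up instead of down.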
 1097    (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
 1098    (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
 1099    (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
 1100    (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
 1101    (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
 1102    (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
 1103    (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
 1104    (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
 1105 
 1106    (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat || (options->lower_int64_options & nir_lower_iadd64) != 0'),
 1107    (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat'),
 1108    (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_add_sat'),
 1109    (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
 1110 
 1111    # int64_t sum = a + b;
 1112    #
  1113    # if (a < 0 && b < 0 && a < sum) {
  1114    #    sum = INT64_MIN;
  1115    # } else if (a >= 0 && b >= 0 && sum < a) {
  1116    #    sum = INT64_MAX;
  1117    # }
 1118    #
 1119    # A couple optimizations are applied.
 1120    #
 1121    # 1. a < sum => sum >= 0.  This replacement works because it is known that
 1122    #    a < 0 and b < 0, so sum should also be < 0 unless there was
 1123    #    underflow.
 1124    #
 1125    # 2. sum < a => sum < 0.  This replacement works because it is known that
 1126    #    a >= 0 and b >= 0, so sum should also be >= 0 unless there was
 1127    #    overflow.
 1128    #
 1129    # 3. Invert the second if-condition and swap the order of parameters for
 1130    #    the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >=
 1131    #    0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0)
 1132    #
 1133    # On Intel Gen11, this saves ~11 instructions.
 1134    (('iadd_sat@64', a, b), ('bcsel',
 1135                             ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
 1136                             0x8000000000000000,
 1137                             ('bcsel',
 1138                              ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
 1139                              ('iadd', a, b),
 1140                              0x7fffffffffffffff)),
 1141     '(options->lower_int64_options & nir_lower_iadd64) != 0'),
 1142 
 1143    # int64_t sum = a - b;
 1144    #
  1145    # if (a < 0 && b >= 0 && a < sum) {
  1146    #    sum = INT64_MIN;
  1147    # } else if (a >= 0 && b < 0 && a >= sum) {
  1148    #    sum = INT64_MAX;
  1149    # }
 1150    #
 1151    # Optimizations similar to the iadd_sat case are applied here.
 1152    (('isub_sat@64', a, b), ('bcsel',
 1153                             ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
 1154                             0x8000000000000000,
 1155                             ('bcsel',
 1156                              ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
 1157                              ('isub', a, b),
 1158                              0x7fffffffffffffff)),
 1159     '(options->lower_int64_options & nir_lower_iadd64) != 0'),
 1160 
 1161    # These are done here instead of in the backend because the int64 lowering
 1162    # pass will make a mess of the patterns.  The first patterns are
 1163    # conditioned on nir_lower_minmax64 because it was not clear that it was
 1164    # always an improvement on platforms that have real int64 support.  No
 1165    # shaders in shader-db hit this, so it was hard to say one way or the
 1166    # other.
 1167    (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
 1168    (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
 1169    (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
 1170    (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
 1171    (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
 1172    (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
 1173 
 1174    (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
 1175    (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
 1176    # 0u < uint(a) <=> uint(a) != 0u
 1177    (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
 1178 
 1179    # Alternative lowering that doesn't rely on bfi.
 1180    (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
 1181     ('bcsel', ('ult', 31, 'bits'),
 1182      'insert',
 1183     (('ior',
 1184      ('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))),
 1185      ('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))))),
 1186     'options->lower_bitfield_insert_to_shifts'),
 1187 
 1188    # Alternative lowering that uses bitfield_select.
 1189    (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
 1190     ('bcsel', ('ult', 31, 'bits'), 'insert',
 1191               ('bitfield_select', ('bfm', 'bits', 'offset'), ('ishl', 'insert', 'offset'), 'base')),
 1192     'options->lower_bitfield_insert_to_bitfield_select'),
 1193 
 1194    (('ibitfield_extract', 'value', 'offset', 'bits'),
 1195     ('bcsel', ('ult', 31, 'bits'), 'value',
 1196               ('ibfe', 'value', 'offset', 'bits')),
 1197     'options->lower_bitfield_extract'),
 1198 
 1199    (('ubitfield_extract', 'value', 'offset', 'bits'),
 1200     ('bcsel', ('ult', 31, 'bits'), 'value',
 1201               ('ubfe', 'value', 'offset', 'bits')),
 1202     'options->lower_bitfield_extract'),
 1203 
 1204    # Note that these opcodes are defined to only use the five least significant bits of 'offset' and 'bits'
 1205    (('ubfe', 'value', 'offset', ('iand', 31, 'bits')), ('ubfe', 'value', 'offset', 'bits')),
 1206    (('ubfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ubfe', 'value', 'offset', 'bits')),
 1207    (('ibfe', 'value', 'offset', ('iand', 31, 'bits')), ('ibfe', 'value', 'offset', 'bits')),
 1208    (('ibfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ibfe', 'value', 'offset', 'bits')),
 1209    (('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')),
 1210    (('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')),
 1211 
 1212    (('ibitfield_extract', 'value', 'offset', 'bits'),
 1213     ('bcsel', ('ieq', 0, 'bits'),
 1214      0,
 1215      ('ishr',
 1216        ('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')),
 1217        ('isub', 32, 'bits'))),
 1218     'options->lower_bitfield_extract_to_shifts'),
 1219 
 1220    (('ubitfield_extract', 'value', 'offset', 'bits'),
 1221     ('iand',
 1222      ('ushr', 'value', 'offset'),
 1223      ('bcsel', ('ieq', 'bits', 32),
 1224       0xffffffff,
 1225       ('isub', ('ishl', 1, 'bits'), 1))),
 1226     'options->lower_bitfield_extract_to_shifts'),
 1227 
 1228    (('ifind_msb', 'value'),
 1229     ('ufind_msb', ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')),
 1230     'options->lower_ifind_msb'),
 1231 
 1232    (('find_lsb', 'value'),
 1233     ('ufind_msb', ('iand', 'value', ('ineg', 'value'))),
 1234     'options->lower_find_lsb'),
 1235 
 1236    (('extract_i8', a, 'b@32'),
 1237     ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24),
 1238     'options->lower_extract_byte'),
 1239 
 1240    (('extract_u8', a, 'b@32'),
 1241     ('iand', ('ushr', a, ('imul', b, 8)), 0xff),
 1242     'options->lower_extract_byte'),
 1243 
 1244    (('extract_i16', a, 'b@32'),
 1245     ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16),
 1246     'options->lower_extract_word'),
 1247 
 1248    (('extract_u16', a, 'b@32'),
 1249     ('iand', ('ushr', a, ('imul', b, 16)), 0xffff),
 1250     'options->lower_extract_word'),
 1251 
 1252     (('pack_unorm_2x16', 'v'),
 1253      ('pack_uvec2_to_uint',
 1254         ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))),
 1255      'options->lower_pack_unorm_2x16'),
 1256 
 1257     (('pack_unorm_4x8', 'v'),
 1258      ('pack_uvec4_to_uint',
 1259         ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))),
 1260      'options->lower_pack_unorm_4x8'),
 1261 
 1262     (('pack_snorm_2x16', 'v'),
 1263      ('pack_uvec2_to_uint',
 1264         ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))),
 1265      'options->lower_pack_snorm_2x16'),
 1266 
 1267     (('pack_snorm_4x8', 'v'),
 1268      ('pack_uvec4_to_uint',
 1269         ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))),
 1270      'options->lower_pack_snorm_4x8'),
 1271 
 1272     (('unpack_unorm_2x16', 'v'),
 1273      ('fdiv', ('u2f32', ('vec2', ('extract_u16', 'v', 0),
 1274                                   ('extract_u16', 'v', 1))),
 1275               65535.0),
 1276      'options->lower_unpack_unorm_2x16'),
 1277 
 1278     (('unpack_unorm_4x8', 'v'),
 1279      ('fdiv', ('u2f32', ('vec4', ('extract_u8', 'v', 0),
 1280                                   ('extract_u8', 'v', 1),
 1281                                   ('extract_u8', 'v', 2),
 1282                                   ('extract_u8', 'v', 3))),
 1283               255.0),
 1284      'options->lower_unpack_unorm_4x8'),
 1285 
 1286     (('unpack_snorm_2x16', 'v'),
 1287      ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0),
 1288                                                             ('extract_i16', 'v', 1))),
 1289                                            32767.0))),
 1290      'options->lower_unpack_snorm_2x16'),
 1291 
 1292     (('unpack_snorm_4x8', 'v'),
 1293      ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0),
 1294                                                             ('extract_i8', 'v', 1),
 1295                                                             ('extract_i8', 'v', 2),
 1296                                                             ('extract_i8', 'v', 3))),
 1297                                            127.0))),
 1298      'options->lower_unpack_snorm_4x8'),
 1299 
 1300    (('pack_half_2x16_split', 'a@32', 'b@32'),
 1301     ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))),
 1302     'options->lower_pack_split'),
 1303 
 1304    (('unpack_half_2x16_split_x', 'a@32'),
 1305     ('f2f32', ('u2u16', a)),
 1306     'options->lower_pack_split'),
 1307 
 1308    (('unpack_half_2x16_split_y', 'a@32'),
 1309     ('f2f32', ('u2u16', ('ushr', a, 16))),
 1310     'options->lower_pack_split'),
 1311 
 1312    (('pack_32_2x16_split', 'a@16', 'b@16'),
 1313     ('ior', ('ishl', ('u2u32', b), 16), ('u2u32', a)),
 1314     'options->lower_pack_split'),
 1315 
 1316    (('unpack_32_2x16_split_x', 'a@32'),
 1317     ('u2u16', a),
 1318     'options->lower_pack_split'),
 1319 
 1320    (('unpack_32_2x16_split_y', 'a@32'),
 1321     ('u2u16', ('ushr', 'a', 16)),
 1322     'options->lower_pack_split'),
 1323 
 1324    (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
 1325    (('fsign', a), ('fsub', ('b2f', ('flt', 0.0, a)), ('b2f', ('flt', a, 0.0))), 'options->lower_fsign'),
 1326 
 1327    # Address/offset calculations:
 1328    # Drivers supporting imul24 should use the nir_lower_amul() pass, this
 1329    # rule converts everyone else to imul:
 1330    (('amul', a, b), ('imul', a, b), '!options->has_imul24'),
 1331 
 1332    (('umul24', a, b),
 1333     ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)),
 1334     '!options->has_umul24'),
 1335    (('umad24', a, b, c),
 1336     ('iadd', ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), c),
 1337     '!options->has_umad24'),
 1338 
 1339    (('imad24_ir3', a, b, 0), ('imul24', a, b)),
 1340    (('imad24_ir3', a, 0, c), (c)),
 1341    (('imad24_ir3', a, 1, c), ('iadd', a, c)),
 1342 
 1343    # if first two srcs are const, crack apart the imad so constant folding
 1344    # can clean up the imul:
 1345    # TODO ffma should probably get a similar rule:
 1346    (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)),
 1347 
 1348    # These will turn 24b address/offset calc back into 32b shifts, but
 1349    # it should be safe to get back some of the bits of precision that we
  1350    # already decided were not necessary:
 1351    (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
 1352    (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
 1353    (('imul24', a, 0), (0)),
 1354 ])
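# A hypothetical self-check (not emitted into the pass) of the iadd_sat@64
# bcsel formulation above: model the wrapping 64-bit add in Python and compare
# against a reference saturating add.  The helper name is illustrative only.
def _iadd_sat64_lowered(a, b):
   s = (a + b) & 0xffffffffffffffff      # wrapping 64-bit sum
   if s >= 1 << 63:
      s -= 1 << 64                       # reinterpret the sum as signed
   if a < 0 and b < 0 and s >= 0:
      return -(1 << 63)                  # negative overflow -> INT64_MIN
   if a < 0 or b < 0 or s >= 0:
      return s                           # no overflow possible
   return (1 << 63) - 1                  # positive overflow -> INT64_MAX

if __debug__:
   _vals = (-(1 << 63), -(1 << 62), -1, 0, 1, (1 << 62), (1 << 63) - 1)
   for _a in _vals:
      for _b in _vals:
         _ref = max(min(_a + _b, (1 << 63) - 1), -(1 << 63))
         assert _iadd_sat64_lowered(_a, _b) == _ref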
 1355 
 1356 # bit_size dependent lowerings
 1357 for bit_size in [8, 16, 32, 64]:
 1358    # convenience constants
 1359    intmax = (1 << (bit_size - 1)) - 1
 1360    intmin = 1 << (bit_size - 1)
 1361 
 1362    optimizations += [
 1363       (('iadd_sat@' + str(bit_size), a, b),
 1364        ('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)),
 1365                                 ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_add_sat'),
 1366       (('isub_sat@' + str(bit_size), a, b),
 1367        ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)),
 1368                                 ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_add_sat'),
 1369    ]
 1370 
 1371 invert = OrderedDict([('feq', 'fne'), ('fne', 'feq')])
 1372 
 1373 for left, right in itertools.combinations_with_replacement(invert.keys(), 2):
 1374    optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),
 1375                          ('iand', (invert[left], a, b), (invert[right], c, d))))
 1376    optimizations.append((('inot', ('iand(is_used_once)', (left, a, b), (right, c, d))),
 1377                          ('ior', (invert[left], a, b), (invert[right], c, d))))
 1378 
 1379 # Optimize x2bN(b2x(x)) -> x
 1380 for size in type_sizes('bool'):
 1381     aN = 'a@' + str(size)
 1382     f2bN = 'f2b' + str(size)
 1383     i2bN = 'i2b' + str(size)
 1384     optimizations.append(((f2bN, ('b2f', aN)), a))
 1385     optimizations.append(((i2bN, ('b2i', aN)), a))
 1386 
 1387 # Optimize x2yN(b2x(x)) -> b2y
 1388 for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']):
 1389    if x != 'f' and y != 'f' and x != y:
 1390       continue
 1391 
 1392    b2x = 'b2f' if x == 'f' else 'b2i'
 1393    b2y = 'b2f' if y == 'f' else 'b2i'
 1394    x2yN = '{}2{}'.format(x, y)
 1395    optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
 1396 
 1397 # Optimize away x2xN(a@N)
 1398 for t in ['int', 'uint', 'float', 'bool']:
 1399    for N in type_sizes(t):
 1400       x2xN = '{0}2{0}{1}'.format(t[0], N)
 1401       aN = 'a@{0}'.format(N)
 1402       optimizations.append(((x2xN, aN), a))
 1403 
 1404 # Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers
 1405 # In particular, we can optimize away everything except upcast of downcast and
 1406 # upcasts where the type differs from the other cast
 1407 for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):
 1408    if N < M:
 1409       # The outer cast is a down-cast.  It doesn't matter what the size of the
  1410       # argument of the inner cast is because we'll never be in the up-cast
  1411       # of down-cast case.  Regardless of types, we'll always end up with y2yN
 1412       # in the end.
 1413       for x, y in itertools.product(['i', 'u'], ['i', 'u']):
 1414          x2xN = '{0}2{0}{1}'.format(x, N)
 1415          y2yM = '{0}2{0}{1}'.format(y, M)
 1416          y2yN = '{0}2{0}{1}'.format(y, N)
 1417          optimizations.append(((x2xN, (y2yM, a)), (y2yN, a)))
 1418    elif N > M:
 1419       # If the outer cast is an up-cast, we have to be more careful about the
 1420       # size of the argument of the inner cast and with types.  In this case,
  1421       # the type is always the type of the up-cast, which is given by the
 1422       # outer cast.
 1423       for P in type_sizes('uint'):
 1424          # We can't optimize away up-cast of down-cast.
 1425          if M < P:
 1426             continue
 1427 
  1428          # Because we're doing up-cast of up-cast, the types always have
 1429          # to match between the two casts
 1430          for x in ['i', 'u']:
 1431             x2xN = '{0}2{0}{1}'.format(x, N)
 1432             x2xM = '{0}2{0}{1}'.format(x, M)
 1433             aP = 'a@{0}'.format(P)
 1434             optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a)))
 1435    else:
 1436       # The N == M case is handled by other optimizations
 1437       pass
 1438 
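# A small, hypothetical model (not part of the pass) of the down-cast rule
# above, using Python ints: an unsigned integer down-cast is a truncation, so
# when the outer cast is the narrower one the inner cast never matters.
def _u2u(v, bits):
   return v & ((1 << bits) - 1)          # unsigned down-cast == truncation

if __debug__:
   _v = 0x1234567890abcdef
   # u2u16(u2u32(v)) behaves exactly like u2u16(v)
   assert _u2u(_u2u(_v, 32), 16) == _u2u(_v, 16)
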
 1439 # Downcast operations should be able to see through pack
 1440 for t in ['i', 'u']:
 1441     for N in [8, 16, 32]:
 1442         x2xN = '{0}2{0}{1}'.format(t, N)
 1443         optimizations += [
 1444             ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)),
 1446         ]
 1447 
 1448 # Optimize comparisons with up-casts
 1449 for t in ['int', 'uint', 'float']:
 1450     for N, M in itertools.product(type_sizes(t), repeat=2):
 1451         if N == 1 or N >= M:
 1452             continue
 1453 
 1454         cond = 'true'
 1455         if N == 8:
 1456             cond = 'options->support_8bit_alu'
 1457         elif N == 16:
 1458             cond = 'options->support_16bit_alu'
 1459         x2xM = '{0}2{0}{1}'.format(t[0], M)
 1460         x2xN = '{0}2{0}{1}'.format(t[0], N)
 1461         aN = 'a@' + str(N)
 1462         bN = 'b@' + str(N)
 1463         xeq = 'feq' if t == 'float' else 'ieq'
 1464         xne = 'fne' if t == 'float' else 'ine'
 1465         xge = '{0}ge'.format(t[0])
 1466         xlt = '{0}lt'.format(t[0])
 1467 
 1468         # Up-casts are lossless so for correctly signed comparisons of
 1469         # up-casted values we can do the comparison at the largest of the two
 1470         # original sizes and drop one or both of the casts.  (We have
 1471         # optimizations to drop the no-op casts which this may generate.)
 1472         for P in type_sizes(t):
 1473             if P == 1 or P > N:
 1474                 continue
 1475 
 1476             bP = 'b@' + str(P)
 1477             optimizations += [
 1478                 ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b)), cond),
 1479                 ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b)), cond),
 1480                 ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b)), cond),
 1481                 ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b)), cond),
 1482                 ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a), cond),
 1483                 ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a), cond),
 1484             ]
 1485 
 1486         # The next bit doesn't work on floats because the range checks would
 1487         # get way too complicated.
 1488         if t in ['int', 'uint']:
 1489             if t == 'int':
 1490                 xN_min = -(1 << (N - 1))
 1491                 xN_max = (1 << (N - 1)) - 1
 1492             elif t == 'uint':
 1493                 xN_min = 0
 1494                 xN_max = (1 << N) - 1
 1495             else:
 1496                 assert False
 1497 
 1498             # If we're up-casting and comparing to a constant, we can unfold
 1499             # the comparison into a comparison with the shrunk down constant
 1500             # and a check that the constant fits in the smaller bit size.
 1501             optimizations += [
 1502                 ((xeq, (x2xM, aN), '#b'),
 1503                  ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b)), cond),
 1504                 ((xne, (x2xM, aN), '#b'),
 1505                  ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b)), cond),
 1506                 ((xlt, (x2xM, aN), '#b'),
 1507                  ('iand', (xlt, xN_min, b),
 1508                           ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b)))), cond),
 1509                 ((xlt, '#a', (x2xM, bN)),
 1510                  ('iand', (xlt, a, xN_max),
 1511                           ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b))), cond),
 1512                 ((xge, (x2xM, aN), '#b'),
 1513                  ('iand', (xge, xN_max, b),
 1514                           ('ior', (xge, xN_min, b), (xge, a, (x2xN, b)))), cond),
 1515                 ((xge, '#a', (x2xM, bN)),
 1516                  ('iand', (xge, a, xN_min),
 1517                           ('ior', (xge, a, xN_max), (xge, (x2xN, a), b))), cond),
 1518             ]
 1519 
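# Hypothetical illustration (not part of the pass) of the constant-unfolding
# rules above for t == 'uint', N == 8, M == 32: the wide equality holds
# exactly when the narrow equality holds and the constant survives the round
# trip through the smaller type.
if __debug__:
   for _a in (0, 1, 0x7f, 0xff):              # 8-bit values
      for _b in (0, 0xff, 0x100, 0x1234):     # 32-bit constants
         assert (_a == _b) == ((_a == (_b & 0xff)) and ((_b & 0xff) == _b))
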
 1520 def fexp2i(exp, bits):
 1521    # Generate an expression which constructs value 2.0^exp or 0.0.
 1522    #
 1523    # We assume that exp is already in a valid range:
 1524    #
 1525    #   * [-15, 15] for 16-bit float
 1526    #   * [-127, 127] for 32-bit float
  1527    #   * [-1023, 1023] for 64-bit float
 1528    #
 1529    # If exp is the lowest value in the valid range, a value of 0.0 is
 1530    # constructed.  Otherwise, the value 2.0^exp is constructed.
 1531    if bits == 16:
 1532       return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
 1533    elif bits == 32:
 1534       return ('ishl', ('iadd', exp, 127), 23)
 1535    elif bits == 64:
 1536       return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
 1537    else:
 1538       assert False
 1539 
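# Hypothetical spot-check (not part of the pass) of the 32-bit case above:
# placing the biased exponent in bits 30:23 really does construct 2.0^exp,
# and the lowest in-range exponent yields the all-zero pattern, i.e. 0.0.
if __debug__:
   for _e in (-126, -1, 0, 1, 127):
      assert struct.unpack('f', struct.pack('I', (_e + 127) << 23))[0] == 2.0 ** _e
   assert struct.unpack('f', struct.pack('I', 0))[0] == 0.0
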
 1540 def ldexp(f, exp, bits):
 1541    # The maximum possible range for a normal exponent is [-126, 127] and,
 1542    # throwing in denormals, you get a maximum range of [-149, 127].  This
 1543    # means that we can potentially have a swing of +-276.  If you start with
 1544    # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush
 1545    # all the way to zero.  The GLSL spec only requires that we handle a subset
 1546    # of this range.  From version 4.60 of the spec:
 1547    #
 1548    #    "If exp is greater than +128 (single-precision) or +1024
 1549    #    (double-precision), the value returned is undefined. If exp is less
 1550    #    than -126 (single-precision) or -1022 (double-precision), the value
 1551    #    returned may be flushed to zero. Additionally, splitting the value
 1552    #    into a significand and exponent using frexp() and then reconstructing
 1553    #    a floating-point value using ldexp() should yield the original input
 1554    #    for zero and all finite non-denormalized values."
 1555    #
 1556    # The SPIR-V spec has similar language.
 1557    #
 1558    # In order to handle the maximum value +128 using the fexp2i() helper
 1559    # above, we have to split the exponent in half and do two multiply
 1560    # operations.
 1561    #
 1562    # First, we clamp exp to a reasonable range.  Specifically, we clamp to
 1563    # twice the full range that is valid for the fexp2i() function above.  If
 1564    # exp/2 is the bottom value of that range, the fexp2i() expression will
 1565    # yield 0.0f which, when multiplied by f, will flush it to zero which is
 1566    # allowed by the GLSL and SPIR-V specs for low exponent values.  If the
 1567    # value is clamped from above, then it must have been above the supported
 1568    # range of the GLSL built-in and therefore any return value is acceptable.
 1569    if bits == 16:
 1570       exp = ('imin', ('imax', exp, -30), 30)
 1571    elif bits == 32:
 1572       exp = ('imin', ('imax', exp, -254), 254)
 1573    elif bits == 64:
 1574       exp = ('imin', ('imax', exp, -2046), 2046)
 1575    else:
 1576       assert False
 1577 
 1578    # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2.
 1579    # (We use ishr which isn't the same for -1, but the -1 case still works
 1580    # since we use exp-exp/2 as the second exponent.)  While the spec
 1581    # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't
 1582    # work with denormals and doesn't allow for the full swing in exponents
 1583    # that you can get with normalized values.  Instead, we create two powers
 1584    # of two and multiply by them each in turn.  That way the effective range
 1585    # of our exponent is doubled.
 1586    pow2_1 = fexp2i(('ishr', exp, 1), bits)
 1587    pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1)), bits)
 1588    return ('fmul', ('fmul', f, pow2_1), pow2_2)
 1589 
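# Hypothetical demonstration (not part of the pass) of the exponent split:
# Python's >> on negative ints is an arithmetic shift, matching ishr, and the
# two halves always recombine to the original exponent, so the product of the
# two powers of two equals 2.0^exp across the whole clamped 32-bit range.
if __debug__:
   for _e in range(-254, 255):
      _e1 = _e >> 1
      _e2 = _e - _e1
      assert _e1 + _e2 == _e
      assert (2.0 ** _e1) * (2.0 ** _e2) == 2.0 ** _e
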
 1590 optimizations += [
 1591    (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'),
 1592    (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'),
 1593    (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'),
 1594 ]
 1595 
  1596 # Unreal Engine 4 demo applications open-code bitfieldReverse()
 1597 def bitfield_reverse(u):
 1598     step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16))
 1599     step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8))
 1600     step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4))
 1601     step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2))
 1602     step5 = ('ior(many-comm-expr)', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1))
 1603 
 1604     return step5
 1605 
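# Hypothetical self-check (not part of the pass): the same swap-and-mask
# sequence evaluated on Python ints really is a 32-bit bit reversal.
def _bitrev32(u):
    u = ((u << 16) | (u >> 16)) & 0xffffffff
    u = (((u & 0x00ff00ff) << 8) | ((u & 0xff00ff00) >> 8)) & 0xffffffff
    u = (((u & 0x0f0f0f0f) << 4) | ((u & 0xf0f0f0f0) >> 4)) & 0xffffffff
    u = (((u & 0x33333333) << 2) | ((u & 0xcccccccc) >> 2)) & 0xffffffff
    return (((u & 0x55555555) << 1) | ((u & 0xaaaaaaaa) >> 1)) & 0xffffffff

if __debug__:
    for _x in (0x00000001, 0x80000000, 0xdeadbeef, 0x12345678):
        assert _bitrev32(_x) == int('{:032b}'.format(_x)[::-1], 2)
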
 1606 optimizations += [(bitfield_reverse('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]
 1607 
 1608 # For any float comparison operation, "cmp", if you have "a == a && a cmp b"
 1609 # then the "a == a" is redundant because it's equivalent to "a is not NaN"
  1610    # and, if a is a NaN, the second comparison will fail anyway.
 1611 for op in ['flt', 'fge', 'feq']:
 1612    optimizations += [
 1613       (('iand', ('feq', a, a), (op, a, b)), ('!' + op, a, b)),
 1614       (('iand', ('feq', a, a), (op, b, a)), ('!' + op, b, a)),
 1615    ]
 1616 
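# E.g. with a = NaN both operands of the iand above are false, so dropping
# the redundant (a == a) term cannot change the result (a quick check):
if __debug__:
   _nan = float('nan')
   assert not (_nan == _nan) and not (_nan < 1.0)
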
 1617 # Add optimizations to handle the case where the result of a ternary is
 1618 # compared to a constant.  This way we can take things like
 1619 #
 1620 # (a ? 0 : 1) > 0
 1621 #
 1622 # and turn it into
 1623 #
 1624 # a ? (0 > 0) : (1 > 0)
 1625 #
 1626 # which constant folding will eat for lunch.  The resulting ternary will
 1627 # further get cleaned up by the boolean reductions above and we will be
 1628 # left with just the original variable "a".
 1629 for op in ['flt', 'fge', 'feq', 'fne',
 1630            'ilt', 'ige', 'ieq', 'ine', 'ult', 'uge']:
 1631    optimizations += [
 1632       ((op, ('bcsel', 'a', '#b', '#c'), '#d'),
 1633        ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))),
 1634       ((op, '#d', ('bcsel', a, '#b', '#c')),
 1635        ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))),
 1636    ]
 1637 
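# E.g. (a ? 0 : 1) > 0 becomes (a ? False : True), which folds to !a
# (a quick, hypothetical check of the transform's arithmetic):
if __debug__:
   for _cond in (True, False):
      assert ((0 if _cond else 1) > 0) == (not _cond)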
 1638 
 1639 # For example, this converts things like
 1640 #
 1641 #    1 + mix(0, a - 1, condition)
 1642 #
 1643 # into
 1644 #
 1645 #    mix(1, (a-1)+1, condition)
 1646 #
 1647 # Other optimizations will rearrange the constants.
 1648 for op in ['fadd', 'fmul', 'iadd', 'imul']:
 1649    optimizations += [
 1650       ((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d)))
 1651    ]
 1652 
 1653 # For derivatives in compute shaders, GLSL_NV_compute_shader_derivatives
 1654 # states:
 1655 #
 1656 #     If neither layout qualifier is specified, derivatives in compute shaders
 1657 #     return zero, which is consistent with the handling of built-in texture
 1658 #     functions like texture() in GLSL 4.50 compute shaders.
 1659 for op in ['fddx', 'fddx_fine', 'fddx_coarse',
 1660            'fddy', 'fddy_fine', 'fddy_coarse']:
 1661    optimizations += [
 1662       ((op, 'a'), 0.0, 'info->stage == MESA_SHADER_COMPUTE && info->cs.derivative_group == DERIVATIVE_GROUP_NONE')
  1663    ]
 1664 
 1665 # Some optimizations for ir3-specific instructions.
 1666 optimizations += [
 1667    # 'al * bl': If either 'al' or 'bl' is zero, return zero.
 1668    (('umul_low', '#a(is_lower_half_zero)', 'b'), (0)),
 1669    # '(ah * bl) << 16 + c': If either 'ah' or 'bl' is zero, return 'c'.
 1670    (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')),
 1671    (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')),
 1672 ]
 1673 
 1674 # These kinds of sequences can occur after nir_opt_peephole_select.
 1675 #
 1676 # NOTE: fadd is not handled here because that gets in the way of ffma
 1677 # generation in the i965 driver.  Instead, fadd and ffma are handled in
 1678 # late_optimizations.
 1679 
 1680 for op in ['flrp']:
 1681     optimizations += [
 1682         (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
 1683         (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
 1684         (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
 1685         (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
 1686         (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)),
 1687         (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)),
 1688     ]
 1689 
 1690 for op in ['fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']:
 1691     optimizations += [
 1692         (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
 1693         (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))),
 1694         (('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
 1695         (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
 1696     ]
 1697 
 1698 for op in ['fpow']:
 1699     optimizations += [
 1700         (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
 1701         (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
 1702         (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)),
 1703         (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)),
 1704     ]
 1705 
 1706 for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos']:
 1707     optimizations += [
 1708         (('bcsel', a, (op + '(is_used_once)', b), (op, c)), (op, ('bcsel', a, b, c))),
 1709         (('bcsel', a, (op, b), (op + '(is_used_once)', c)), (op, ('bcsel', a, b, c))),
 1710     ]
 1711 
 1712 # This section contains "late" optimizations that should be run before
 1713 # creating ffmas and calling regular optimizations for the final time.
 1714 # Optimizations should go here if they help code generation and conflict
 1715 # with the regular optimizations.
 1716 before_ffma_optimizations = [
 1717    # Propagate constants down multiplication chains
 1718    (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fmul', ('fmul', a, c), b)),
 1719    (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('imul', ('imul', a, c), b)),
 1720    (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fadd', ('fadd', a, c), b)),
 1721    (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('iadd', ('iadd', a, c), b)),
 1722 
 1723    (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
 1724    (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
 1725    (('~fadd', ('fneg', a), a), 0.0),
 1726    (('iadd', ('ineg', a), a), 0),
 1727    (('iadd', ('ineg', a), ('iadd', a, b)), b),
 1728    (('iadd', a, ('iadd', ('ineg', a), b)), b),
 1729    (('~fadd', ('fneg', a), ('fadd', a, b)), b),
 1730    (('~fadd', a, ('fadd', ('fneg', a), b)), b),
 1731 
 1732    (('~flrp@32', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a,  1.0), d), ('fadd', ('flrp', -1.0,  1.0, d), a)),
 1733    (('~flrp@32', ('fadd(is_used_once)', a,  1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp',  1.0, -1.0, d), a)),
 1734    (('~flrp@32', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))),
 1735 ]
 1736 
 1737 # This section contains "late" optimizations that should be run after the
 1738 # regular optimizations have finished.  Optimizations should go here if
 1739 # they help code generation but do not necessarily produce code that is
 1740 # more easily optimizable.
 1741 late_optimizations = [
 1742    # Most of these optimizations aren't quite safe when you get infinity or
  1743    # NaN involved but the first one should be fine.
 1744    (('flt',          ('fadd', a, b),  0.0), ('flt',          a, ('fneg', b))),
 1745    (('flt', ('fneg', ('fadd', a, b)), 0.0), ('flt', ('fneg', a),         b)),
 1746    (('~fge',          ('fadd', a, b),  0.0), ('fge',          a, ('fneg', b))),
 1747    (('~fge', ('fneg', ('fadd', a, b)), 0.0), ('fge', ('fneg', a),         b)),
 1748    (('~feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))),
 1749    (('~fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))),
 1750 
 1751    # nir_lower_to_source_mods will collapse this, but its existence during the
 1752    # optimization loop can prevent other optimizations.
 1753    (('fneg', ('fneg', a)), a),
 1754 
 1755    # Subtractions get lowered during optimization, so we need to recombine them
 1756    (('fadd', 'a', ('fneg', 'b')), ('fsub', 'a', 'b'), '!options->lower_sub'),
 1757    (('iadd', 'a', ('ineg', 'b')), ('isub', 'a', 'b'), '!options->lower_sub'),
 1758    (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
 1759    (('ineg', a), ('isub', 0, a), 'options->lower_negate'),
 1760 
 1761    # These are duplicated from the main optimizations table.  The late
 1762    # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create
 1763    # new patterns like these.  The patterns that compare with zero are removed
  1764    # because they are unlikely to be created by anything in
 1765    # late_optimizations.
 1766    (('flt', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('flt', a, b)),
 1767    (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
 1768    (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
 1769    (('fge', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('fge', b, a)),
 1770    (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
 1771    (('fne', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fne', a, b)),
 1772 
 1773    (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
 1774    # flt(fsat(a), 1.0) is inexact because it returns True if a is NaN
  1775    # (fsat(NaN) is 0), while flt(a, 1.0) returns False when a is NaN.
 1776    (('~flt', ('fsat(is_used_once)', a), 1.0), ('flt', a, 1.0)),
 1777 
 1778    (('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))),
 1779 
 1780    (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
 1781    (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
 1782    (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
 1783    (('fne', ('fneg', a), ('fneg', b)), ('fne', b, a)),
 1784    (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
 1785    (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
 1786    (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
 1787    (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
 1788    (('fne', ('fneg', a), -1.0), ('fne', 1.0, a)),
 1789    (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),
 1790 
 1791    (('ior', a, a), a),
 1792    (('iand', a, a), a),
 1793 
 1794    (('iand', ('ine(is_used_once)', 'a@32', 0), ('ine', 'b@32', 0)), ('ine', ('umin', a, b), 0)),
 1795    (('ior',  ('ieq(is_used_once)', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('umin', a, b), 0)),
 1796 
 1797    (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
 1798 
 1799    (('fdot2', a, b), ('fdot_replicated2', a, b), 'options->fdot_replicates'),
 1800    (('fdot3', a, b), ('fdot_replicated3', a, b), 'options->fdot_replicates'),
 1801    (('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'),
 1802    (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),
 1803 
 1804    (('~flrp@32', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
 1805    (('~flrp@64', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
 1806 
 1807    (('~fadd@32', 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp32'),
 1808    (('~fadd@64', 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp64'),
 1809 
 1810    # A similar operation could apply to any ffma(#a, b, #(-a/2)), but this
 1811    # particular operation is common for expanding values stored in a texture
 1812    # from [0,1] to [-1,1].
 1813    (('~ffma@32', a,  2.0, -1.0), ('flrp', -1.0,  1.0,          a ), '!options->lower_flrp32'),
 1814    (('~ffma@32', a, -2.0, -1.0), ('flrp', -1.0,  1.0, ('fneg', a)), '!options->lower_flrp32'),
 1815    (('~ffma@32', a, -2.0,  1.0), ('flrp',  1.0, -1.0,          a ), '!options->lower_flrp32'),
 1816    (('~ffma@32', a,  2.0,  1.0), ('flrp',  1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),
 1817    (('~fadd@32', ('fmul(is_used_once)',  2.0, a), -1.0), ('flrp', -1.0,  1.0,          a ), '!options->lower_flrp32'),
 1818    (('~fadd@32', ('fmul(is_used_once)', -2.0, a), -1.0), ('flrp', -1.0,  1.0, ('fneg', a)), '!options->lower_flrp32'),
 1819    (('~fadd@32', ('fmul(is_used_once)', -2.0, a),  1.0), ('flrp',  1.0, -1.0,          a ), '!options->lower_flrp32'),
 1820    (('~fadd@32', ('fmul(is_used_once)',  2.0, a),  1.0), ('flrp',  1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),
 1821 
 1822     # flrp(a, b, a)
 1823     # a*(1-a) + b*a
 1824     # a + -a*a + a*b    (1)
 1825     # a + a*(b - a)
 1826     # Option 1: ffma(a, (b-a), a)
 1827     #
 1828     # Alternately, after (1):
 1829     # a*(1+b) + -a*a
 1830     # a*((1+b) + -a)
 1831     #
 1832     # Let b=1
 1833     #
 1834     # Option 2: ffma(a, 2, -(a*a))
 1835     # Option 3: ffma(a, 2, (-a)*a)
  1836     # Option 4: ffma(a, -a, (2*a))
 1837     # Option 5: a * (2 - a)
 1838     #
 1839     # There are a lot of other possible combinations.
 1840    (('~ffma@32', ('fadd', b, ('fneg', a)), a, a), ('flrp', a, b, a), '!options->lower_flrp32'),
 1841    (('~ffma@32', a, 2.0, ('fneg', ('fmul', a, a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
 1842    (('~ffma@32', a, 2.0, ('fmul', ('fneg', a), a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
 1843    (('~ffma@32', a, ('fneg', a), ('fmul', 2.0, a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
 1844    (('~fmul@32', a, ('fadd', 2.0, ('fneg', a))),    ('flrp', a, 1.0, a), '!options->lower_flrp32'),
 1845 
 1846    # we do these late so that we don't get in the way of creating ffmas
 1847    (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
 1848    (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),
 1849 
 1850    (('bcsel', a, 0, ('b2f32', ('inot', 'b@bool'))), ('b2f32', ('inot', ('ior', a, b)))),
 1851 
 1852    # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c),
 1853    # op(b, d)) => op(b, bcsel(a, c, d)) transformations.  I do not know why.
 1854    (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)),
 1855     ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))),
 1856 
 1857    # Things that look like DPH in the source shader may get expanded to
 1858    # something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets
 1859    # to NIR.  After FFMA is generated, this can look like:
 1860    #
 1861    #    fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w)
 1862    #
 1863    # Reassociate the last addition into the first multiplication.
 1864    #
 1865    # Some shaders do not use 'invariant' in vertex and (possibly) geometry
 1866    # shader stages on some outputs that are intended to be invariant.  For
 1867    # various reasons, this optimization may not be fully applied in all
 1868    # shaders used for different rendering passes of the same geometry.  This
 1869    # can result in Z-fighting artifacts (at best).  For now, disable this
 1870    # optimization in these stages.  See bugzilla #111490.  In tessellation
 1871    # stages applications seem to use 'precise' when necessary, so allow the
 1872    # optimization in those stages.
 1873    (('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
 1874     ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
 1875    (('~fadd', ('ffma(is_used_once)', a, b, ('fmul', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
 1876     ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
 1877 
 1878    # Convert f2fmp instructions to concrete f2f16 instructions. At this point
 1879    # any conversions that could have been removed will have been removed in
 1880    # nir_opt_algebraic so any remaining ones are required.
 1881    (('f2fmp', a), ('f2f16', a)),
 1882 ]
 1883 
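# Hypothetical numeric spot-check (not part of the pass) of the
# flrp(a, 1.0, a) identities above, using values where the algebra is exact
# in binary: flrp(a, 1.0, a) == a*(1-a) + 1*a, and option 5 is a*(2-a).
if __debug__:
   for _a in (0.0, 0.25, 0.5, 1.0):
      assert _a * (2.0 - _a) == _a * (1.0 - _a) + _a
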
 1884 for op in ['fadd']:
 1885     late_optimizations += [
 1886         (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
 1887         (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
 1888     ]
 1889 
 1890 for op in ['ffma']:
 1891     late_optimizations += [
 1892         (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
 1893         (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
 1894 
 1895         (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
 1896         (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
 1897     ]
 1898 
 1899 distribute_src_mods = [
 1900    # Try to remove some spurious negations rather than pushing them down.
 1901    (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)),
 1902    (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
 1903    (('fdot_replicated2', ('fneg', a), ('fneg', b)), ('fdot_replicated2', a, b)),
 1904    (('fdot_replicated3', ('fneg', a), ('fneg', b)), ('fdot_replicated3', a, b)),
 1905    (('fdot_replicated4', ('fneg', a), ('fneg', b)), ('fdot_replicated4', a, b)),
 1906    (('fneg', ('fneg', a)), a),
 1907 
 1908    (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)),
 1909    (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))),
 1910 
 1911    (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
 1912    (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
 1913    (('fneg', ('fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),
 1914 
 1915    # Note that fmin <-> fmax.  I don't think there is a way to distribute
 1916    # fabs() into fmin or fmax.
 1917    (('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))),
 1918    (('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))),
 1919 
 1920    (('fneg', ('fdot_replicated2(is_used_once)', a, b)), ('fdot_replicated2', ('fneg', a), b)),
 1921    (('fneg', ('fdot_replicated3(is_used_once)', a, b)), ('fdot_replicated3', ('fneg', a), b)),
 1922    (('fneg', ('fdot_replicated4(is_used_once)', a, b)), ('fdot_replicated4', ('fneg', a), b)),
 1923 
 1924    # fdph works mostly like fdot, but to get the correct result, the negation
 1925    # must be applied to the second source.
 1926    (('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))),
 1927 
 1928    (('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))),
 1929    (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))),
 1930 ]
 1931 
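# Hypothetical sanity check (not part of the pass) for the fmin <-> fmax swap
# above: negating a minimum gives the maximum of the negations.
if __debug__:
   for _a, _b in ((1.0, 2.0), (-3.5, 0.25), (-1.0, -2.0)):
      assert -min(_a, _b) == max(-_a, -_b)
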
 1932 print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
 1933 print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
 1934                                   before_ffma_optimizations).render())
 1935 print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
 1936                                   late_optimizations).render())
 1937 print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods",
 1938                                   distribute_src_mods).render())