"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/gallium/auxiliary/tgsi/tgsi_exec.c" (16 Sep 2020, 200632 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "tgsi_exec.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 20.1.7_vs_20.1.8.

    1 /**************************************************************************
    2  * 
    3  * Copyright 2007-2008 VMware, Inc.
    4  * All Rights Reserved.
    5  * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
    6  * 
    7  * Permission is hereby granted, free of charge, to any person obtaining a
    8  * copy of this software and associated documentation files (the
    9  * "Software"), to deal in the Software without restriction, including
   10  * without limitation the rights to use, copy, modify, merge, publish,
   11  * distribute, sub license, and/or sell copies of the Software, and to
   12  * permit persons to whom the Software is furnished to do so, subject to
   13  * the following conditions:
   14  * 
   15  * The above copyright notice and this permission notice (including the
   16  * next paragraph) shall be included in all copies or substantial portions
   17  * of the Software.
   18  * 
   19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
   20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
   22  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
   23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   26  * 
   27  **************************************************************************/
   28 
   29 /**
   30  * TGSI interpreter/executor.
   31  *
   32  * Flow control information:
   33  *
   34  * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
   35  * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
   36  * care since a condition may be true for some quad components but false
   37  * for other components.
   38  *
   39  * We basically execute all statements (even if they're in the part of
   40  * an IF/ELSE clause that's "not taken") and use a special mask to
   41  * control writing to destination registers.  This is the ExecMask.
   42  * See store_dest().
   43  *
   44  * The ExecMask is computed from three other masks (CondMask, LoopMask and
   45  * ContMask) which are controlled by the flow control instructions (namely:
   46  * IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
   47  *
   48  *
   49  * Authors:
   50  *   Michal Krol
   51  *   Brian Paul
   52  */
   53 
   54 #include "pipe/p_compiler.h"
   55 #include "pipe/p_state.h"
   56 #include "pipe/p_shader_tokens.h"
   57 #include "tgsi/tgsi_dump.h"
   58 #include "tgsi/tgsi_parse.h"
   59 #include "tgsi/tgsi_util.h"
   60 #include "tgsi_exec.h"
   61 #include "util/u_half.h"
   62 #include "util/u_memory.h"
   63 #include "util/u_math.h"
   64 #include "util/rounding.h"
   65 
   66 
   67 #define DEBUG_EXECUTION 0   /* NOTE(review): not referenced in the visible part of this file; presumably gates execution tracing - confirm */
   68 
   69 
   70 #define FAST_MATH 0   /* nonzero makes micro_exp2()/micro_lg2() use util_fast_exp2()/util_fast_log2() instead of powf()/logf() */
   71 
   72 #define TILE_TOP_LEFT     0   /* lane indices into a channel's per-lane arrays; */
   73 #define TILE_TOP_RIGHT    1   /* micro_ddx*() and micro_ddy*() use them to pick */
   74 #define TILE_BOTTOM_LEFT  2   /* the two lanes whose difference is taken */
   75 #define TILE_BOTTOM_RIGHT 3
   76 
   77 union tgsi_double_channel {   /* TGSI_QUAD_SIZE lanes; the members are alternative views of the same storage */
   78    double d[TGSI_QUAD_SIZE];   /* each lane as a double */
   79    unsigned u[TGSI_QUAD_SIZE][2];   /* each lane as two unsigned words; the comparison helpers store their mask in word 0 */
   80    uint64_t u64[TGSI_QUAD_SIZE];   /* each lane as an unsigned 64-bit integer */
   81    int64_t i64[TGSI_QUAD_SIZE];   /* each lane as a signed 64-bit integer */
   82 };
   83 
   84 struct tgsi_double_vector {   /* two double channels; NOTE(review): presumably the xy and zw halves of a 4-component 64-bit vector - confirm against users */
   85    union tgsi_double_channel xy;
   86    union tgsi_double_channel zw;
   87 };
   88 
   89 static void
   90 micro_abs(union tgsi_exec_channel *dst,
   91           const union tgsi_exec_channel *src)
   92 {
   93    dst->f[0] = fabsf(src->f[0]);
   94    dst->f[1] = fabsf(src->f[1]);
   95    dst->f[2] = fabsf(src->f[2]);
   96    dst->f[3] = fabsf(src->f[3]);
   97 }
   98 
   99 static void
  100 micro_arl(union tgsi_exec_channel *dst,
  101           const union tgsi_exec_channel *src)
  102 {
  103    dst->i[0] = (int)floorf(src->f[0]);
  104    dst->i[1] = (int)floorf(src->f[1]);
  105    dst->i[2] = (int)floorf(src->f[2]);
  106    dst->i[3] = (int)floorf(src->f[3]);
  107 }
  108 
  109 static void
  110 micro_arr(union tgsi_exec_channel *dst,
  111           const union tgsi_exec_channel *src)
  112 {
  113    dst->i[0] = (int)floorf(src->f[0] + 0.5f);
  114    dst->i[1] = (int)floorf(src->f[1] + 0.5f);
  115    dst->i[2] = (int)floorf(src->f[2] + 0.5f);
  116    dst->i[3] = (int)floorf(src->f[3] + 0.5f);
  117 }
  118 
  119 static void
  120 micro_ceil(union tgsi_exec_channel *dst,
  121            const union tgsi_exec_channel *src)
  122 {
  123    dst->f[0] = ceilf(src->f[0]);
  124    dst->f[1] = ceilf(src->f[1]);
  125    dst->f[2] = ceilf(src->f[2]);
  126    dst->f[3] = ceilf(src->f[3]);
  127 }
  128 
  129 static void
  130 micro_cmp(union tgsi_exec_channel *dst,
  131           const union tgsi_exec_channel *src0,
  132           const union tgsi_exec_channel *src1,
  133           const union tgsi_exec_channel *src2)
  134 {
  135    dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
  136    dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
  137    dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
  138    dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
  139 }
  140 
  141 static void
  142 micro_cos(union tgsi_exec_channel *dst,
  143           const union tgsi_exec_channel *src)
  144 {
  145    dst->f[0] = cosf(src->f[0]);
  146    dst->f[1] = cosf(src->f[1]);
  147    dst->f[2] = cosf(src->f[2]);
  148    dst->f[3] = cosf(src->f[3]);
  149 }
  150 
  151 static void
  152 micro_d2f(union tgsi_exec_channel *dst,
  153           const union tgsi_double_channel *src)
  154 {
  155    dst->f[0] = (float)src->d[0];
  156    dst->f[1] = (float)src->d[1];
  157    dst->f[2] = (float)src->d[2];
  158    dst->f[3] = (float)src->d[3];
  159 }
  160 
  161 static void
  162 micro_d2i(union tgsi_exec_channel *dst,
  163           const union tgsi_double_channel *src)
  164 {
  165    dst->i[0] = (int)src->d[0];
  166    dst->i[1] = (int)src->d[1];
  167    dst->i[2] = (int)src->d[2];
  168    dst->i[3] = (int)src->d[3];
  169 }
  170 
  171 static void
  172 micro_d2u(union tgsi_exec_channel *dst,
  173           const union tgsi_double_channel *src)
  174 {
  175    dst->u[0] = (unsigned)src->d[0];
  176    dst->u[1] = (unsigned)src->d[1];
  177    dst->u[2] = (unsigned)src->d[2];
  178    dst->u[3] = (unsigned)src->d[3];
  179 }
  180 static void
  181 micro_dabs(union tgsi_double_channel *dst,
  182            const union tgsi_double_channel *src)
  183 {
  184    dst->d[0] = src->d[0] >= 0.0 ? src->d[0] : -src->d[0];
  185    dst->d[1] = src->d[1] >= 0.0 ? src->d[1] : -src->d[1];
  186    dst->d[2] = src->d[2] >= 0.0 ? src->d[2] : -src->d[2];
  187    dst->d[3] = src->d[3] >= 0.0 ? src->d[3] : -src->d[3];
  188 }
  189 
  190 static void
  191 micro_dadd(union tgsi_double_channel *dst,
  192           const union tgsi_double_channel *src)
  193 {
  194    dst->d[0] = src[0].d[0] + src[1].d[0];
  195    dst->d[1] = src[0].d[1] + src[1].d[1];
  196    dst->d[2] = src[0].d[2] + src[1].d[2];
  197    dst->d[3] = src[0].d[3] + src[1].d[3];
  198 }
  199 
  200 static void
  201 micro_ddiv(union tgsi_double_channel *dst,
  202           const union tgsi_double_channel *src)
  203 {
  204    dst->d[0] = src[0].d[0] / src[1].d[0];
  205    dst->d[1] = src[0].d[1] / src[1].d[1];
  206    dst->d[2] = src[0].d[2] / src[1].d[2];
  207    dst->d[3] = src[0].d[3] / src[1].d[3];
  208 }
  209 
  210 static void
  211 micro_ddx(union tgsi_exec_channel *dst,
  212           const union tgsi_exec_channel *src)
  213 {
  214    dst->f[0] =
  215    dst->f[1] =
  216    dst->f[2] =
  217    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
  218 }
  219 
  220 static void
  221 micro_ddx_fine(union tgsi_exec_channel *dst,
  222           const union tgsi_exec_channel *src)
  223 {
  224    dst->f[0] =
  225    dst->f[1] = src->f[TILE_TOP_RIGHT] - src->f[TILE_TOP_LEFT];
  226    dst->f[2] =
  227    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
  228 }
  229 
  230 
  231 static void
  232 micro_ddy(union tgsi_exec_channel *dst,
  233           const union tgsi_exec_channel *src)
  234 {
  235    dst->f[0] =
  236    dst->f[1] =
  237    dst->f[2] =
  238    dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
  239 }
  240 
  241 static void
  242 micro_ddy_fine(union tgsi_exec_channel *dst,
  243           const union tgsi_exec_channel *src)
  244 {
  245    dst->f[0] =
  246    dst->f[2] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
  247    dst->f[1] =
  248    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_TOP_RIGHT];
  249 }
  250 
  251 static void
  252 micro_dmul(union tgsi_double_channel *dst,
  253            const union tgsi_double_channel *src)
  254 {
  255    dst->d[0] = src[0].d[0] * src[1].d[0];
  256    dst->d[1] = src[0].d[1] * src[1].d[1];
  257    dst->d[2] = src[0].d[2] * src[1].d[2];
  258    dst->d[3] = src[0].d[3] * src[1].d[3];
  259 }
  260 
  261 static void
  262 micro_dmax(union tgsi_double_channel *dst,
  263            const union tgsi_double_channel *src)
  264 {
  265    dst->d[0] = src[0].d[0] > src[1].d[0] ? src[0].d[0] : src[1].d[0];
  266    dst->d[1] = src[0].d[1] > src[1].d[1] ? src[0].d[1] : src[1].d[1];
  267    dst->d[2] = src[0].d[2] > src[1].d[2] ? src[0].d[2] : src[1].d[2];
  268    dst->d[3] = src[0].d[3] > src[1].d[3] ? src[0].d[3] : src[1].d[3];
  269 }
  270 
  271 static void
  272 micro_dmin(union tgsi_double_channel *dst,
  273            const union tgsi_double_channel *src)
  274 {
  275    dst->d[0] = src[0].d[0] < src[1].d[0] ? src[0].d[0] : src[1].d[0];
  276    dst->d[1] = src[0].d[1] < src[1].d[1] ? src[0].d[1] : src[1].d[1];
  277    dst->d[2] = src[0].d[2] < src[1].d[2] ? src[0].d[2] : src[1].d[2];
  278    dst->d[3] = src[0].d[3] < src[1].d[3] ? src[0].d[3] : src[1].d[3];
  279 }
  280 
  281 static void
  282 micro_dneg(union tgsi_double_channel *dst,
  283            const union tgsi_double_channel *src)
  284 {
  285    dst->d[0] = -src->d[0];
  286    dst->d[1] = -src->d[1];
  287    dst->d[2] = -src->d[2];
  288    dst->d[3] = -src->d[3];
  289 }
  290 
  291 static void
  292 micro_dslt(union tgsi_double_channel *dst,
  293            const union tgsi_double_channel *src)
  294 {
  295    dst->u[0][0] = src[0].d[0] < src[1].d[0] ? ~0U : 0U;
  296    dst->u[1][0] = src[0].d[1] < src[1].d[1] ? ~0U : 0U;
  297    dst->u[2][0] = src[0].d[2] < src[1].d[2] ? ~0U : 0U;
  298    dst->u[3][0] = src[0].d[3] < src[1].d[3] ? ~0U : 0U;
  299 }
  300 
  301 static void
  302 micro_dsne(union tgsi_double_channel *dst,
  303            const union tgsi_double_channel *src)
  304 {
  305    dst->u[0][0] = src[0].d[0] != src[1].d[0] ? ~0U : 0U;
  306    dst->u[1][0] = src[0].d[1] != src[1].d[1] ? ~0U : 0U;
  307    dst->u[2][0] = src[0].d[2] != src[1].d[2] ? ~0U : 0U;
  308    dst->u[3][0] = src[0].d[3] != src[1].d[3] ? ~0U : 0U;
  309 }
  310 
  311 static void
  312 micro_dsge(union tgsi_double_channel *dst,
  313            const union tgsi_double_channel *src)
  314 {
  315    dst->u[0][0] = src[0].d[0] >= src[1].d[0] ? ~0U : 0U;
  316    dst->u[1][0] = src[0].d[1] >= src[1].d[1] ? ~0U : 0U;
  317    dst->u[2][0] = src[0].d[2] >= src[1].d[2] ? ~0U : 0U;
  318    dst->u[3][0] = src[0].d[3] >= src[1].d[3] ? ~0U : 0U;
  319 }
  320 
  321 static void
  322 micro_dseq(union tgsi_double_channel *dst,
  323            const union tgsi_double_channel *src)
  324 {
  325    dst->u[0][0] = src[0].d[0] == src[1].d[0] ? ~0U : 0U;
  326    dst->u[1][0] = src[0].d[1] == src[1].d[1] ? ~0U : 0U;
  327    dst->u[2][0] = src[0].d[2] == src[1].d[2] ? ~0U : 0U;
  328    dst->u[3][0] = src[0].d[3] == src[1].d[3] ? ~0U : 0U;
  329 }
  330 
  331 static void
  332 micro_drcp(union tgsi_double_channel *dst,
  333            const union tgsi_double_channel *src)
  334 {
  335    dst->d[0] = 1.0 / src->d[0];
  336    dst->d[1] = 1.0 / src->d[1];
  337    dst->d[2] = 1.0 / src->d[2];
  338    dst->d[3] = 1.0 / src->d[3];
  339 }
  340 
  341 static void
  342 micro_dsqrt(union tgsi_double_channel *dst,
  343             const union tgsi_double_channel *src)
  344 {
  345    dst->d[0] = sqrt(src->d[0]);
  346    dst->d[1] = sqrt(src->d[1]);
  347    dst->d[2] = sqrt(src->d[2]);
  348    dst->d[3] = sqrt(src->d[3]);
  349 }
  350 
  351 static void
  352 micro_drsq(union tgsi_double_channel *dst,
  353           const union tgsi_double_channel *src)
  354 {
  355    dst->d[0] = 1.0 / sqrt(src->d[0]);
  356    dst->d[1] = 1.0 / sqrt(src->d[1]);
  357    dst->d[2] = 1.0 / sqrt(src->d[2]);
  358    dst->d[3] = 1.0 / sqrt(src->d[3]);
  359 }
  360 
  361 static void
  362 micro_dmad(union tgsi_double_channel *dst,
  363            const union tgsi_double_channel *src)
  364 {
  365    dst->d[0] = src[0].d[0] * src[1].d[0] + src[2].d[0];
  366    dst->d[1] = src[0].d[1] * src[1].d[1] + src[2].d[1];
  367    dst->d[2] = src[0].d[2] * src[1].d[2] + src[2].d[2];
  368    dst->d[3] = src[0].d[3] * src[1].d[3] + src[2].d[3];
  369 }
  370 
  371 static void
  372 micro_dfrac(union tgsi_double_channel *dst,
  373             const union tgsi_double_channel *src)
  374 {
  375    dst->d[0] = src->d[0] - floor(src->d[0]);
  376    dst->d[1] = src->d[1] - floor(src->d[1]);
  377    dst->d[2] = src->d[2] - floor(src->d[2]);
  378    dst->d[3] = src->d[3] - floor(src->d[3]);
  379 }
  380 
  381 static void
  382 micro_dldexp(union tgsi_double_channel *dst,
  383              const union tgsi_double_channel *src0,
  384              union tgsi_exec_channel *src1)
  385 {
  386    dst->d[0] = ldexp(src0->d[0], src1->i[0]);
  387    dst->d[1] = ldexp(src0->d[1], src1->i[1]);
  388    dst->d[2] = ldexp(src0->d[2], src1->i[2]);
  389    dst->d[3] = ldexp(src0->d[3], src1->i[3]);
  390 }
  391 
  392 static void
  393 micro_dfracexp(union tgsi_double_channel *dst,
  394                union tgsi_exec_channel *dst_exp,
  395                const union tgsi_double_channel *src)
  396 {
  397    dst->d[0] = frexp(src->d[0], &dst_exp->i[0]);
  398    dst->d[1] = frexp(src->d[1], &dst_exp->i[1]);
  399    dst->d[2] = frexp(src->d[2], &dst_exp->i[2]);
  400    dst->d[3] = frexp(src->d[3], &dst_exp->i[3]);
  401 }
  402 
  403 static void
  404 micro_exp2(union tgsi_exec_channel *dst,
  405            const union tgsi_exec_channel *src)
  406 {
  407 #if FAST_MATH
  408    dst->f[0] = util_fast_exp2(src->f[0]);
  409    dst->f[1] = util_fast_exp2(src->f[1]);
  410    dst->f[2] = util_fast_exp2(src->f[2]);
  411    dst->f[3] = util_fast_exp2(src->f[3]);
  412 #else
  413 #if DEBUG
  414    /* Inf is okay for this instruction, so clamp it to silence assertions. */
  415    uint i;
  416    union tgsi_exec_channel clamped;
  417 
  418    for (i = 0; i < 4; i++) {
  419       if (src->f[i] > 127.99999f) {
  420          clamped.f[i] = 127.99999f;
  421       } else if (src->f[i] < -126.99999f) {
  422          clamped.f[i] = -126.99999f;
  423       } else {
  424          clamped.f[i] = src->f[i];
  425       }
  426    }
  427    src = &clamped;
  428 #endif /* DEBUG */
  429 
  430    dst->f[0] = powf(2.0f, src->f[0]);
  431    dst->f[1] = powf(2.0f, src->f[1]);
  432    dst->f[2] = powf(2.0f, src->f[2]);
  433    dst->f[3] = powf(2.0f, src->f[3]);
  434 #endif /* FAST_MATH */
  435 }
  436 
  437 static void
  438 micro_f2d(union tgsi_double_channel *dst,
  439           const union tgsi_exec_channel *src)
  440 {
  441    dst->d[0] = (double)src->f[0];
  442    dst->d[1] = (double)src->f[1];
  443    dst->d[2] = (double)src->f[2];
  444    dst->d[3] = (double)src->f[3];
  445 }
  446 
  447 static void
  448 micro_flr(union tgsi_exec_channel *dst,
  449           const union tgsi_exec_channel *src)
  450 {
  451    dst->f[0] = floorf(src->f[0]);
  452    dst->f[1] = floorf(src->f[1]);
  453    dst->f[2] = floorf(src->f[2]);
  454    dst->f[3] = floorf(src->f[3]);
  455 }
  456 
  457 static void
  458 micro_frc(union tgsi_exec_channel *dst,
  459           const union tgsi_exec_channel *src)
  460 {
  461    dst->f[0] = src->f[0] - floorf(src->f[0]);
  462    dst->f[1] = src->f[1] - floorf(src->f[1]);
  463    dst->f[2] = src->f[2] - floorf(src->f[2]);
  464    dst->f[3] = src->f[3] - floorf(src->f[3]);
  465 }
  466 
  467 static void
  468 micro_i2d(union tgsi_double_channel *dst,
  469           const union tgsi_exec_channel *src)
  470 {
  471    dst->d[0] = (double)src->i[0];
  472    dst->d[1] = (double)src->i[1];
  473    dst->d[2] = (double)src->i[2];
  474    dst->d[3] = (double)src->i[3];
  475 }
  476 
  477 static void
  478 micro_iabs(union tgsi_exec_channel *dst,
  479            const union tgsi_exec_channel *src)
  480 {
  481    dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
  482    dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
  483    dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
  484    dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
  485 }
  486 
  487 static void
  488 micro_ineg(union tgsi_exec_channel *dst,
  489            const union tgsi_exec_channel *src)
  490 {
  491    dst->i[0] = -src->i[0];
  492    dst->i[1] = -src->i[1];
  493    dst->i[2] = -src->i[2];
  494    dst->i[3] = -src->i[3];
  495 }
  496 
  497 static void
  498 micro_lg2(union tgsi_exec_channel *dst,
  499           const union tgsi_exec_channel *src)
  500 {
  501 #if FAST_MATH
  502    dst->f[0] = util_fast_log2(src->f[0]);
  503    dst->f[1] = util_fast_log2(src->f[1]);
  504    dst->f[2] = util_fast_log2(src->f[2]);
  505    dst->f[3] = util_fast_log2(src->f[3]);
  506 #else
  507    dst->f[0] = logf(src->f[0]) * 1.442695f;
  508    dst->f[1] = logf(src->f[1]) * 1.442695f;
  509    dst->f[2] = logf(src->f[2]) * 1.442695f;
  510    dst->f[3] = logf(src->f[3]) * 1.442695f;
  511 #endif
  512 }
  513 
  514 static void
  515 micro_lrp(union tgsi_exec_channel *dst,
  516           const union tgsi_exec_channel *src0,
  517           const union tgsi_exec_channel *src1,
  518           const union tgsi_exec_channel *src2)
  519 {
  520    dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
  521    dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
  522    dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
  523    dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
  524 }
  525 
  526 static void
  527 micro_mad(union tgsi_exec_channel *dst,
  528           const union tgsi_exec_channel *src0,
  529           const union tgsi_exec_channel *src1,
  530           const union tgsi_exec_channel *src2)
  531 {
  532    dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
  533    dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
  534    dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
  535    dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
  536 }
  537 
  538 static void
  539 micro_mov(union tgsi_exec_channel *dst,
  540           const union tgsi_exec_channel *src)
  541 {
  542    dst->u[0] = src->u[0];
  543    dst->u[1] = src->u[1];
  544    dst->u[2] = src->u[2];
  545    dst->u[3] = src->u[3];
  546 }
  547 
  548 static void
  549 micro_rcp(union tgsi_exec_channel *dst,
  550           const union tgsi_exec_channel *src)
  551 {
  552 #if 0 /* for debugging */
  553    assert(src->f[0] != 0.0f);
  554    assert(src->f[1] != 0.0f);
  555    assert(src->f[2] != 0.0f);
  556    assert(src->f[3] != 0.0f);
  557 #endif
  558    dst->f[0] = 1.0f / src->f[0];
  559    dst->f[1] = 1.0f / src->f[1];
  560    dst->f[2] = 1.0f / src->f[2];
  561    dst->f[3] = 1.0f / src->f[3];
  562 }
  563 
  564 static void
  565 micro_rnd(union tgsi_exec_channel *dst,
  566           const union tgsi_exec_channel *src)
  567 {
  568    dst->f[0] = _mesa_roundevenf(src->f[0]);
  569    dst->f[1] = _mesa_roundevenf(src->f[1]);
  570    dst->f[2] = _mesa_roundevenf(src->f[2]);
  571    dst->f[3] = _mesa_roundevenf(src->f[3]);
  572 }
  573 
  574 static void
  575 micro_rsq(union tgsi_exec_channel *dst,
  576           const union tgsi_exec_channel *src)
  577 {
  578 #if 0 /* for debugging */
  579    assert(src->f[0] != 0.0f);
  580    assert(src->f[1] != 0.0f);
  581    assert(src->f[2] != 0.0f);
  582    assert(src->f[3] != 0.0f);
  583 #endif
  584    dst->f[0] = 1.0f / sqrtf(src->f[0]);
  585    dst->f[1] = 1.0f / sqrtf(src->f[1]);
  586    dst->f[2] = 1.0f / sqrtf(src->f[2]);
  587    dst->f[3] = 1.0f / sqrtf(src->f[3]);
  588 }
  589 
  590 static void
  591 micro_sqrt(union tgsi_exec_channel *dst,
  592            const union tgsi_exec_channel *src)
  593 {
  594    dst->f[0] = sqrtf(src->f[0]);
  595    dst->f[1] = sqrtf(src->f[1]);
  596    dst->f[2] = sqrtf(src->f[2]);
  597    dst->f[3] = sqrtf(src->f[3]);
  598 }
  599 
  600 static void
  601 micro_seq(union tgsi_exec_channel *dst,
  602           const union tgsi_exec_channel *src0,
  603           const union tgsi_exec_channel *src1)
  604 {
  605    dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
  606    dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
  607    dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
  608    dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
  609 }
  610 
  611 static void
  612 micro_sge(union tgsi_exec_channel *dst,
  613           const union tgsi_exec_channel *src0,
  614           const union tgsi_exec_channel *src1)
  615 {
  616    dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
  617    dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
  618    dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
  619    dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
  620 }
  621 
  622 static void
  623 micro_sgn(union tgsi_exec_channel *dst,
  624           const union tgsi_exec_channel *src)
  625 {
  626    dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
  627    dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
  628    dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
  629    dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
  630 }
  631 
  632 static void
  633 micro_isgn(union tgsi_exec_channel *dst,
  634           const union tgsi_exec_channel *src)
  635 {
  636    dst->i[0] = src->i[0] < 0 ? -1 : src->i[0] > 0 ? 1 : 0;
  637    dst->i[1] = src->i[1] < 0 ? -1 : src->i[1] > 0 ? 1 : 0;
  638    dst->i[2] = src->i[2] < 0 ? -1 : src->i[2] > 0 ? 1 : 0;
  639    dst->i[3] = src->i[3] < 0 ? -1 : src->i[3] > 0 ? 1 : 0;
  640 }
  641 
  642 static void
  643 micro_sgt(union tgsi_exec_channel *dst,
  644           const union tgsi_exec_channel *src0,
  645           const union tgsi_exec_channel *src1)
  646 {
  647    dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
  648    dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
  649    dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
  650    dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
  651 }
  652 
  653 static void
  654 micro_sin(union tgsi_exec_channel *dst,
  655           const union tgsi_exec_channel *src)
  656 {
  657    dst->f[0] = sinf(src->f[0]);
  658    dst->f[1] = sinf(src->f[1]);
  659    dst->f[2] = sinf(src->f[2]);
  660    dst->f[3] = sinf(src->f[3]);
  661 }
  662 
  663 static void
  664 micro_sle(union tgsi_exec_channel *dst,
  665           const union tgsi_exec_channel *src0,
  666           const union tgsi_exec_channel *src1)
  667 {
  668    dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
  669    dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
  670    dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
  671    dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
  672 }
  673 
  674 static void
  675 micro_slt(union tgsi_exec_channel *dst,
  676           const union tgsi_exec_channel *src0,
  677           const union tgsi_exec_channel *src1)
  678 {
  679    dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
  680    dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
  681    dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
  682    dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
  683 }
  684 
  685 static void
  686 micro_sne(union tgsi_exec_channel *dst,
  687           const union tgsi_exec_channel *src0,
  688           const union tgsi_exec_channel *src1)
  689 {
  690    dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
  691    dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
  692    dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
  693    dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
  694 }
  695 
  696 static void
  697 micro_trunc(union tgsi_exec_channel *dst,
  698             const union tgsi_exec_channel *src)
  699 {
  700    dst->f[0] = truncf(src->f[0]);
  701    dst->f[1] = truncf(src->f[1]);
  702    dst->f[2] = truncf(src->f[2]);
  703    dst->f[3] = truncf(src->f[3]);
  704 }
  705 
  706 static void
  707 micro_u2d(union tgsi_double_channel *dst,
  708           const union tgsi_exec_channel *src)
  709 {
  710    dst->d[0] = (double)src->u[0];
  711    dst->d[1] = (double)src->u[1];
  712    dst->d[2] = (double)src->u[2];
  713    dst->d[3] = (double)src->u[3];
  714 }
  715 
  716 static void
  717 micro_i64abs(union tgsi_double_channel *dst,
  718              const union tgsi_double_channel *src)
  719 {
  720    dst->i64[0] = src->i64[0] >= 0.0 ? src->i64[0] : -src->i64[0];
  721    dst->i64[1] = src->i64[1] >= 0.0 ? src->i64[1] : -src->i64[1];
  722    dst->i64[2] = src->i64[2] >= 0.0 ? src->i64[2] : -src->i64[2];
  723    dst->i64[3] = src->i64[3] >= 0.0 ? src->i64[3] : -src->i64[3];
  724 }
  725 
  726 static void
  727 micro_i64sgn(union tgsi_double_channel *dst,
  728              const union tgsi_double_channel *src)
  729 {
  730    dst->i64[0] = src->i64[0] < 0 ? -1 : src->i64[0] > 0 ? 1 : 0;
  731    dst->i64[1] = src->i64[1] < 0 ? -1 : src->i64[1] > 0 ? 1 : 0;
  732    dst->i64[2] = src->i64[2] < 0 ? -1 : src->i64[2] > 0 ? 1 : 0;
  733    dst->i64[3] = src->i64[3] < 0 ? -1 : src->i64[3] > 0 ? 1 : 0;
  734 }
  735 
  736 static void
  737 micro_i64neg(union tgsi_double_channel *dst,
  738              const union tgsi_double_channel *src)
  739 {
  740    dst->i64[0] = -src->i64[0];
  741    dst->i64[1] = -src->i64[1];
  742    dst->i64[2] = -src->i64[2];
  743    dst->i64[3] = -src->i64[3];
  744 }
  745 
  746 static void
  747 micro_u64seq(union tgsi_double_channel *dst,
  748            const union tgsi_double_channel *src)
  749 {
  750    dst->u[0][0] = src[0].u64[0] == src[1].u64[0] ? ~0U : 0U;
  751    dst->u[1][0] = src[0].u64[1] == src[1].u64[1] ? ~0U : 0U;
  752    dst->u[2][0] = src[0].u64[2] == src[1].u64[2] ? ~0U : 0U;
  753    dst->u[3][0] = src[0].u64[3] == src[1].u64[3] ? ~0U : 0U;
  754 }
  755 
  756 static void
  757 micro_u64sne(union tgsi_double_channel *dst,
  758              const union tgsi_double_channel *src)
  759 {
  760    dst->u[0][0] = src[0].u64[0] != src[1].u64[0] ? ~0U : 0U;
  761    dst->u[1][0] = src[0].u64[1] != src[1].u64[1] ? ~0U : 0U;
  762    dst->u[2][0] = src[0].u64[2] != src[1].u64[2] ? ~0U : 0U;
  763    dst->u[3][0] = src[0].u64[3] != src[1].u64[3] ? ~0U : 0U;
  764 }
  765 
  766 static void
  767 micro_i64slt(union tgsi_double_channel *dst,
  768              const union tgsi_double_channel *src)
  769 {
  770    dst->u[0][0] = src[0].i64[0] < src[1].i64[0] ? ~0U : 0U;
  771    dst->u[1][0] = src[0].i64[1] < src[1].i64[1] ? ~0U : 0U;
  772    dst->u[2][0] = src[0].i64[2] < src[1].i64[2] ? ~0U : 0U;
  773    dst->u[3][0] = src[0].i64[3] < src[1].i64[3] ? ~0U : 0U;
  774 }
  775 
  776 static void
  777 micro_u64slt(union tgsi_double_channel *dst,
  778              const union tgsi_double_channel *src)
  779 {
  780    dst->u[0][0] = src[0].u64[0] < src[1].u64[0] ? ~0U : 0U;
  781    dst->u[1][0] = src[0].u64[1] < src[1].u64[1] ? ~0U : 0U;
  782    dst->u[2][0] = src[0].u64[2] < src[1].u64[2] ? ~0U : 0U;
  783    dst->u[3][0] = src[0].u64[3] < src[1].u64[3] ? ~0U : 0U;
  784 }
  785 
  786 static void
  787 micro_i64sge(union tgsi_double_channel *dst,
  788            const union tgsi_double_channel *src)
  789 {
  790    dst->u[0][0] = src[0].i64[0] >= src[1].i64[0] ? ~0U : 0U;
  791    dst->u[1][0] = src[0].i64[1] >= src[1].i64[1] ? ~0U : 0U;
  792    dst->u[2][0] = src[0].i64[2] >= src[1].i64[2] ? ~0U : 0U;
  793    dst->u[3][0] = src[0].i64[3] >= src[1].i64[3] ? ~0U : 0U;
  794 }
  795 
  796 static void
  797 micro_u64sge(union tgsi_double_channel *dst,
  798              const union tgsi_double_channel *src)
  799 {
  800    dst->u[0][0] = src[0].u64[0] >= src[1].u64[0] ? ~0U : 0U;
  801    dst->u[1][0] = src[0].u64[1] >= src[1].u64[1] ? ~0U : 0U;
  802    dst->u[2][0] = src[0].u64[2] >= src[1].u64[2] ? ~0U : 0U;
  803    dst->u[3][0] = src[0].u64[3] >= src[1].u64[3] ? ~0U : 0U;
  804 }
  805 
  806 static void
  807 micro_u64max(union tgsi_double_channel *dst,
  808              const union tgsi_double_channel *src)
  809 {
  810    dst->u64[0] = src[0].u64[0] > src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
  811    dst->u64[1] = src[0].u64[1] > src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
  812    dst->u64[2] = src[0].u64[2] > src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
  813    dst->u64[3] = src[0].u64[3] > src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
  814 }
  815 
  816 static void
  817 micro_i64max(union tgsi_double_channel *dst,
  818              const union tgsi_double_channel *src)
  819 {
  820    dst->i64[0] = src[0].i64[0] > src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
  821    dst->i64[1] = src[0].i64[1] > src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
  822    dst->i64[2] = src[0].i64[2] > src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
  823    dst->i64[3] = src[0].i64[3] > src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
  824 }
  825 
  826 static void
  827 micro_u64min(union tgsi_double_channel *dst,
  828              const union tgsi_double_channel *src)
  829 {
  830    dst->u64[0] = src[0].u64[0] < src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
  831    dst->u64[1] = src[0].u64[1] < src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
  832    dst->u64[2] = src[0].u64[2] < src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
  833    dst->u64[3] = src[0].u64[3] < src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
  834 }
  835 
  836 static void
  837 micro_i64min(union tgsi_double_channel *dst,
  838              const union tgsi_double_channel *src)
  839 {
  840    dst->i64[0] = src[0].i64[0] < src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
  841    dst->i64[1] = src[0].i64[1] < src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
  842    dst->i64[2] = src[0].i64[2] < src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
  843    dst->i64[3] = src[0].i64[3] < src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
  844 }
  845 
  846 static void
  847 micro_u64add(union tgsi_double_channel *dst,
  848              const union tgsi_double_channel *src)
  849 {
  850    dst->u64[0] = src[0].u64[0] + src[1].u64[0];
  851    dst->u64[1] = src[0].u64[1] + src[1].u64[1];
  852    dst->u64[2] = src[0].u64[2] + src[1].u64[2];
  853    dst->u64[3] = src[0].u64[3] + src[1].u64[3];
  854 }
  855 
  856 static void
  857 micro_u64mul(union tgsi_double_channel *dst,
  858              const union tgsi_double_channel *src)
  859 {
  860    dst->u64[0] = src[0].u64[0] * src[1].u64[0];
  861    dst->u64[1] = src[0].u64[1] * src[1].u64[1];
  862    dst->u64[2] = src[0].u64[2] * src[1].u64[2];
  863    dst->u64[3] = src[0].u64[3] * src[1].u64[3];
  864 }
  865 
  866 static void
  867 micro_u64div(union tgsi_double_channel *dst,
  868              const union tgsi_double_channel *src)
  869 {
  870    dst->u64[0] = src[1].u64[0] ? src[0].u64[0] / src[1].u64[0] : ~0ull;
  871    dst->u64[1] = src[1].u64[1] ? src[0].u64[1] / src[1].u64[1] : ~0ull;
  872    dst->u64[2] = src[1].u64[2] ? src[0].u64[2] / src[1].u64[2] : ~0ull;
  873    dst->u64[3] = src[1].u64[3] ? src[0].u64[3] / src[1].u64[3] : ~0ull;
  874 }
  875 
  876 static void
  877 micro_i64div(union tgsi_double_channel *dst,
  878              const union tgsi_double_channel *src)
  879 {
  880    dst->i64[0] = src[1].i64[0] ? src[0].i64[0] / src[1].i64[0] : 0;
  881    dst->i64[1] = src[1].i64[1] ? src[0].i64[1] / src[1].i64[1] : 0;
  882    dst->i64[2] = src[1].i64[2] ? src[0].i64[2] / src[1].i64[2] : 0;
  883    dst->i64[3] = src[1].i64[3] ? src[0].i64[3] / src[1].i64[3] : 0;
  884 }
  885 
  886 static void
  887 micro_u64mod(union tgsi_double_channel *dst,
  888              const union tgsi_double_channel *src)
  889 {
  890    dst->u64[0] = src[1].u64[0] ? src[0].u64[0] % src[1].u64[0] : ~0ull;
  891    dst->u64[1] = src[1].u64[1] ? src[0].u64[1] % src[1].u64[1] : ~0ull;
  892    dst->u64[2] = src[1].u64[2] ? src[0].u64[2] % src[1].u64[2] : ~0ull;
  893    dst->u64[3] = src[1].u64[3] ? src[0].u64[3] % src[1].u64[3] : ~0ull;
  894 }
  895 
  896 static void
  897 micro_i64mod(union tgsi_double_channel *dst,
  898              const union tgsi_double_channel *src)
  899 {
  900    dst->i64[0] = src[1].i64[0] ? src[0].i64[0] % src[1].i64[0] : ~0ll;
  901    dst->i64[1] = src[1].i64[1] ? src[0].i64[1] % src[1].i64[1] : ~0ll;
  902    dst->i64[2] = src[1].i64[2] ? src[0].i64[2] % src[1].i64[2] : ~0ll;
  903    dst->i64[3] = src[1].i64[3] ? src[0].i64[3] % src[1].i64[3] : ~0ll;
  904 }
  905 
  906 static void
  907 micro_u64shl(union tgsi_double_channel *dst,
  908              const union tgsi_double_channel *src0,
  909              union tgsi_exec_channel *src1)
  910 {
  911    unsigned masked_count;
  912    masked_count = src1->u[0] & 0x3f;
  913    dst->u64[0] = src0->u64[0] << masked_count;
  914    masked_count = src1->u[1] & 0x3f;
  915    dst->u64[1] = src0->u64[1] << masked_count;
  916    masked_count = src1->u[2] & 0x3f;
  917    dst->u64[2] = src0->u64[2] << masked_count;
  918    masked_count = src1->u[3] & 0x3f;
  919    dst->u64[3] = src0->u64[3] << masked_count;
  920 }
  921 
  922 static void
  923 micro_i64shr(union tgsi_double_channel *dst,
  924              const union tgsi_double_channel *src0,
  925              union tgsi_exec_channel *src1)
  926 {
  927    unsigned masked_count;
  928    masked_count = src1->u[0] & 0x3f;
  929    dst->i64[0] = src0->i64[0] >> masked_count;
  930    masked_count = src1->u[1] & 0x3f;
  931    dst->i64[1] = src0->i64[1] >> masked_count;
  932    masked_count = src1->u[2] & 0x3f;
  933    dst->i64[2] = src0->i64[2] >> masked_count;
  934    masked_count = src1->u[3] & 0x3f;
  935    dst->i64[3] = src0->i64[3] >> masked_count;
  936 }
  937 
  938 static void
  939 micro_u64shr(union tgsi_double_channel *dst,
  940              const union tgsi_double_channel *src0,
  941              union tgsi_exec_channel *src1)
  942 {
  943    unsigned masked_count;
  944    masked_count = src1->u[0] & 0x3f;
  945    dst->u64[0] = src0->u64[0] >> masked_count;
  946    masked_count = src1->u[1] & 0x3f;
  947    dst->u64[1] = src0->u64[1] >> masked_count;
  948    masked_count = src1->u[2] & 0x3f;
  949    dst->u64[2] = src0->u64[2] >> masked_count;
  950    masked_count = src1->u[3] & 0x3f;
  951    dst->u64[3] = src0->u64[3] >> masked_count;
  952 }
  953 
  954 enum tgsi_exec_datatype {
  955    TGSI_EXEC_DATA_FLOAT,
  956    TGSI_EXEC_DATA_INT,
  957    TGSI_EXEC_DATA_UINT,
  958    TGSI_EXEC_DATA_DOUBLE,
  959    TGSI_EXEC_DATA_INT64,
  960    TGSI_EXEC_DATA_UINT64,
  961 };
  962 
  963 /*
  964  * Shorthand locations of various utility registers (_I = Index, _C = Channel)
  965  */
  966 #define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
  967 #define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
  968 #define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
  969 #define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
  970 #define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
  971 #define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
  972 #define TEMP_PRIMITIVE_S1_I   TGSI_EXEC_TEMP_PRIMITIVE_S1_I
  973 #define TEMP_PRIMITIVE_S1_C   TGSI_EXEC_TEMP_PRIMITIVE_S1_C
  974 #define TEMP_PRIMITIVE_S2_I   TGSI_EXEC_TEMP_PRIMITIVE_S2_I
  975 #define TEMP_PRIMITIVE_S2_C   TGSI_EXEC_TEMP_PRIMITIVE_S2_C
  976 #define TEMP_PRIMITIVE_S3_I   TGSI_EXEC_TEMP_PRIMITIVE_S3_I
  977 #define TEMP_PRIMITIVE_S3_C   TGSI_EXEC_TEMP_PRIMITIVE_S3_C
  978 
  979 static const struct {
  980    int idx;
  981    int chan;
  982 } temp_prim_idxs[] = {
  983    { TEMP_PRIMITIVE_I, TEMP_PRIMITIVE_C },
  984    { TEMP_PRIMITIVE_S1_I, TEMP_PRIMITIVE_S1_C },
  985    { TEMP_PRIMITIVE_S2_I, TEMP_PRIMITIVE_S2_C },
  986    { TEMP_PRIMITIVE_S3_I, TEMP_PRIMITIVE_S3_C },
  987 };
  988 
  989 /** The execution mask depends on the conditional mask and the loop mask */
  990 #define UPDATE_EXEC_MASK(MACH) \
  991       MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->Switch.mask & MACH->FuncMask
  992 
  993 
  994 static const union tgsi_exec_channel ZeroVec =
  995    { { 0.0, 0.0, 0.0, 0.0 } };
  996 
  997 static const union tgsi_exec_channel OneVec = {
  998    {1.0f, 1.0f, 1.0f, 1.0f}
  999 };
 1000 
 1001 static const union tgsi_exec_channel P128Vec = {
 1002    {128.0f, 128.0f, 128.0f, 128.0f}
 1003 };
 1004 
 1005 static const union tgsi_exec_channel M128Vec = {
 1006    {-128.0f, -128.0f, -128.0f, -128.0f}
 1007 };
 1008 
 1009 
 1010 /**
 1011  * Assert that none of the float values in 'chan' are infinite or NaN.
 1012  * NaN and Inf may occur normally during program execution and should
 1013  * not lead to crashes, etc.  But when debugging, it's helpful to catch
 1014  * them.
 1015  */
 1016 static inline void
 1017 check_inf_or_nan(const union tgsi_exec_channel *chan)
 1018 {
 1019    assert(!util_is_inf_or_nan((chan)->f[0]));
 1020    assert(!util_is_inf_or_nan((chan)->f[1]));
 1021    assert(!util_is_inf_or_nan((chan)->f[2]));
 1022    assert(!util_is_inf_or_nan((chan)->f[3]));
 1023 }
 1024 
 1025 
 1026 #ifdef DEBUG
 1027 static void
 1028 print_chan(const char *msg, const union tgsi_exec_channel *chan)
 1029 {
 1030    debug_printf("%s = {%f, %f, %f, %f}\n",
 1031                 msg, chan->f[0], chan->f[1], chan->f[2], chan->f[3]);
 1032 }
 1033 #endif
 1034 
 1035 
 1036 #ifdef DEBUG
 1037 static void
 1038 print_temp(const struct tgsi_exec_machine *mach, uint index)
 1039 {
 1040    const struct tgsi_exec_vector *tmp = &mach->Temps[index];
 1041    int i;
 1042    debug_printf("Temp[%u] =\n", index);
 1043    for (i = 0; i < 4; i++) {
 1044       debug_printf("  %c: { %f, %f, %f, %f }\n",
 1045                    "XYZW"[i],
 1046                    tmp->xyzw[i].f[0],
 1047                    tmp->xyzw[i].f[1],
 1048                    tmp->xyzw[i].f[2],
 1049                    tmp->xyzw[i].f[3]);
 1050    }
 1051 }
 1052 #endif
 1053 
 1054 
 1055 void
 1056 tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
 1057                                unsigned num_bufs,
 1058                                const void **bufs,
 1059                                const unsigned *buf_sizes)
 1060 {
 1061    unsigned i;
 1062 
 1063    for (i = 0; i < num_bufs; i++) {
 1064       mach->Consts[i] = bufs[i];
 1065       mach->ConstsSize[i] = buf_sizes[i];
 1066    }
 1067 }
 1068 
 1069 /**
 1070  * Initialize machine state by expanding tokens to full instructions,
 1071  * allocating temporary storage, setting up constants, etc.
 1072  * After this, we can call tgsi_exec_machine_run() many times.
 1073  */
 1074 void 
 1075 tgsi_exec_machine_bind_shader(
 1076    struct tgsi_exec_machine *mach,
 1077    const struct tgsi_token *tokens,
 1078    struct tgsi_sampler *sampler,
 1079    struct tgsi_image *image,
 1080    struct tgsi_buffer *buffer)
 1081 {
 1082    uint k;
 1083    struct tgsi_parse_context parse;
 1084    struct tgsi_full_instruction *instructions;
 1085    struct tgsi_full_declaration *declarations;
 1086    uint maxInstructions = 10, numInstructions = 0;
 1087    uint maxDeclarations = 10, numDeclarations = 0;
 1088 
 1089 #if 0
 1090    tgsi_dump(tokens, 0);
 1091 #endif
 1092 
 1093    util_init_math();
 1094 
 1095 
 1096    mach->Tokens = tokens;
 1097    mach->Sampler = sampler;
 1098    mach->Image = image;
 1099    mach->Buffer = buffer;
 1100 
 1101    if (!tokens) {
 1102       /* unbind and free all */
 1103       FREE(mach->Declarations);
 1104       mach->Declarations = NULL;
 1105       mach->NumDeclarations = 0;
 1106 
 1107       FREE(mach->Instructions);
 1108       mach->Instructions = NULL;
 1109       mach->NumInstructions = 0;
 1110 
 1111       return;
 1112    }
 1113 
 1114    k = tgsi_parse_init (&parse, mach->Tokens);
 1115    if (k != TGSI_PARSE_OK) {
 1116       debug_printf( "Problem parsing!\n" );
 1117       return;
 1118    }
 1119 
 1120    mach->ImmLimit = 0;
 1121    mach->NumOutputs = 0;
 1122 
 1123    for (k = 0; k < TGSI_SEMANTIC_COUNT; k++)
 1124       mach->SysSemanticToIndex[k] = -1;
 1125 
 1126    if (mach->ShaderType == PIPE_SHADER_GEOMETRY &&
 1127        !mach->UsedGeometryShader) {
 1128       struct tgsi_exec_vector *inputs;
 1129       struct tgsi_exec_vector *outputs;
 1130 
 1131       inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
 1132                             TGSI_MAX_PRIM_VERTICES * PIPE_MAX_SHADER_INPUTS,
 1133                             16);
 1134 
 1135       if (!inputs)
 1136          return;
 1137 
 1138       outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
 1139                              TGSI_MAX_TOTAL_VERTICES, 16);
 1140 
 1141       if (!outputs) {
 1142          align_free(inputs);
 1143          return;
 1144       }
 1145 
 1146       align_free(mach->Inputs);
 1147       align_free(mach->Outputs);
 1148 
 1149       mach->Inputs = inputs;
 1150       mach->Outputs = outputs;
 1151       mach->UsedGeometryShader = TRUE;
 1152    }
 1153 
 1154    declarations = (struct tgsi_full_declaration *)
 1155       MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
 1156 
 1157    if (!declarations) {
 1158       return;
 1159    }
 1160 
 1161    instructions = (struct tgsi_full_instruction *)
 1162       MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
 1163 
 1164    if (!instructions) {
 1165       FREE( declarations );
 1166       return;
 1167    }
 1168 
 1169    while( !tgsi_parse_end_of_tokens( &parse ) ) {
 1170       uint i;
 1171 
 1172       tgsi_parse_token( &parse );
 1173       switch( parse.FullToken.Token.Type ) {
 1174       case TGSI_TOKEN_TYPE_DECLARATION:
 1175          /* save expanded declaration */
 1176          if (numDeclarations == maxDeclarations) {
 1177             declarations = REALLOC(declarations,
 1178                                    maxDeclarations
 1179                                    * sizeof(struct tgsi_full_declaration),
 1180                                    (maxDeclarations + 10)
 1181                                    * sizeof(struct tgsi_full_declaration));
 1182             maxDeclarations += 10;
 1183          }
 1184          if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT)
 1185             mach->NumOutputs = MAX2(mach->NumOutputs, parse.FullToken.FullDeclaration.Range.Last + 1);
 1186          else if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
 1187             const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
 1188             mach->SysSemanticToIndex[decl->Semantic.Name] = decl->Range.First;
 1189          }
 1190 
 1191          memcpy(declarations + numDeclarations,
 1192                 &parse.FullToken.FullDeclaration,
 1193                 sizeof(declarations[0]));
 1194          numDeclarations++;
 1195          break;
 1196 
 1197       case TGSI_TOKEN_TYPE_IMMEDIATE:
 1198          {
 1199             uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
 1200             assert( size <= 4 );
 1201             if (mach->ImmLimit >= mach->ImmsReserved) {
 1202                unsigned newReserved = mach->ImmsReserved ? 2 * mach->ImmsReserved : 128;
 1203                float4 *imms = REALLOC(mach->Imms, mach->ImmsReserved, newReserved * sizeof(float4));
 1204                if (imms) {
 1205                   mach->ImmsReserved = newReserved;
 1206                   mach->Imms = imms;
 1207                } else {
 1208                   debug_printf("Unable to (re)allocate space for immidiate constants\n");
 1209                   break;
 1210                }
 1211             }
 1212 
 1213             for( i = 0; i < size; i++ ) {
 1214                mach->Imms[mach->ImmLimit][i] = 
 1215           parse.FullToken.FullImmediate.u[i].Float;
 1216             }
 1217             mach->ImmLimit += 1;
 1218          }
 1219          break;
 1220 
 1221       case TGSI_TOKEN_TYPE_INSTRUCTION:
 1222 
 1223          /* save expanded instruction */
 1224          if (numInstructions == maxInstructions) {
 1225             instructions = REALLOC(instructions,
 1226                                    maxInstructions
 1227                                    * sizeof(struct tgsi_full_instruction),
 1228                                    (maxInstructions + 10)
 1229                                    * sizeof(struct tgsi_full_instruction));
 1230             maxInstructions += 10;
 1231          }
 1232 
 1233          memcpy(instructions + numInstructions,
 1234                 &parse.FullToken.FullInstruction,
 1235                 sizeof(instructions[0]));
 1236 
 1237          numInstructions++;
 1238          break;
 1239 
 1240       case TGSI_TOKEN_TYPE_PROPERTY:
 1241          if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
 1242             if (parse.FullToken.FullProperty.Property.PropertyName == TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES) {
 1243                mach->MaxOutputVertices = parse.FullToken.FullProperty.u[0].Data;
 1244             }
 1245          }
 1246          break;
 1247 
 1248       default:
 1249          assert( 0 );
 1250       }
 1251    }
 1252    tgsi_parse_free (&parse);
 1253 
 1254    FREE(mach->Declarations);
 1255    mach->Declarations = declarations;
 1256    mach->NumDeclarations = numDeclarations;
 1257 
 1258    FREE(mach->Instructions);
 1259    mach->Instructions = instructions;
 1260    mach->NumInstructions = numInstructions;
 1261 }
 1262 
 1263 
 1264 struct tgsi_exec_machine *
 1265 tgsi_exec_machine_create(enum pipe_shader_type shader_type)
 1266 {
 1267    struct tgsi_exec_machine *mach;
 1268 
 1269    mach = align_malloc( sizeof *mach, 16 );
 1270    if (!mach)
 1271       goto fail;
 1272 
 1273    memset(mach, 0, sizeof(*mach));
 1274 
 1275    mach->ShaderType = shader_type;
 1276    mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
 1277    mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
 1278 
 1279    if (shader_type != PIPE_SHADER_COMPUTE) {
 1280       mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_INPUTS, 16);
 1281       mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_OUTPUTS, 16);
 1282       if (!mach->Inputs || !mach->Outputs)
 1283          goto fail;
 1284    }
 1285 
 1286    if (shader_type == PIPE_SHADER_FRAGMENT) {
 1287       mach->InputSampleOffsetApply = align_malloc(sizeof(apply_sample_offset_func) * PIPE_MAX_SHADER_INPUTS, 16);
 1288       if (!mach->InputSampleOffsetApply)
 1289          goto fail;
 1290    }
 1291 
 1292 #ifdef DEBUG
 1293    /* silence warnings */
 1294    (void) print_chan;
 1295    (void) print_temp;
 1296 #endif
 1297 
 1298    return mach;
 1299 
 1300 fail:
 1301    if (mach) {
 1302       align_free(mach->InputSampleOffsetApply);
 1303       align_free(mach->Inputs);
 1304       align_free(mach->Outputs);
 1305       align_free(mach);
 1306    }
 1307    return NULL;
 1308 }
 1309 
 1310 
 1311 void
 1312 tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
 1313 {
 1314    if (mach) {
 1315       FREE(mach->Instructions);
 1316       FREE(mach->Declarations);
 1317       FREE(mach->Imms);
 1318 
 1319       align_free(mach->InputSampleOffsetApply);
 1320       align_free(mach->Inputs);
 1321       align_free(mach->Outputs);
 1322 
 1323       align_free(mach);
 1324    }
 1325 }
 1326 
 1327 static void
 1328 micro_add(union tgsi_exec_channel *dst,
 1329           const union tgsi_exec_channel *src0,
 1330           const union tgsi_exec_channel *src1)
 1331 {
 1332    dst->f[0] = src0->f[0] + src1->f[0];
 1333    dst->f[1] = src0->f[1] + src1->f[1];
 1334    dst->f[2] = src0->f[2] + src1->f[2];
 1335    dst->f[3] = src0->f[3] + src1->f[3];
 1336 }
 1337 
 1338 static void
 1339 micro_div(
 1340    union tgsi_exec_channel *dst,
 1341    const union tgsi_exec_channel *src0,
 1342    const union tgsi_exec_channel *src1 )
 1343 {
 1344    if (src1->f[0] != 0) {
 1345       dst->f[0] = src0->f[0] / src1->f[0];
 1346    }
 1347    if (src1->f[1] != 0) {
 1348       dst->f[1] = src0->f[1] / src1->f[1];
 1349    }
 1350    if (src1->f[2] != 0) {
 1351       dst->f[2] = src0->f[2] / src1->f[2];
 1352    }
 1353    if (src1->f[3] != 0) {
 1354       dst->f[3] = src0->f[3] / src1->f[3];
 1355    }
 1356 }
 1357 
 1358 static void
 1359 micro_lt(
 1360    union tgsi_exec_channel *dst,
 1361    const union tgsi_exec_channel *src0,
 1362    const union tgsi_exec_channel *src1,
 1363    const union tgsi_exec_channel *src2,
 1364    const union tgsi_exec_channel *src3 )
 1365 {
 1366    dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
 1367    dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
 1368    dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
 1369    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
 1370 }
 1371 
 1372 static void
 1373 micro_max(union tgsi_exec_channel *dst,
 1374           const union tgsi_exec_channel *src0,
 1375           const union tgsi_exec_channel *src1)
 1376 {
 1377    dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
 1378    dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
 1379    dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
 1380    dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
 1381 }
 1382 
 1383 static void
 1384 micro_min(union tgsi_exec_channel *dst,
 1385           const union tgsi_exec_channel *src0,
 1386           const union tgsi_exec_channel *src1)
 1387 {
 1388    dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
 1389    dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
 1390    dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
 1391    dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
 1392 }
 1393 
 1394 static void
 1395 micro_mul(union tgsi_exec_channel *dst,
 1396           const union tgsi_exec_channel *src0,
 1397           const union tgsi_exec_channel *src1)
 1398 {
 1399    dst->f[0] = src0->f[0] * src1->f[0];
 1400    dst->f[1] = src0->f[1] * src1->f[1];
 1401    dst->f[2] = src0->f[2] * src1->f[2];
 1402    dst->f[3] = src0->f[3] * src1->f[3];
 1403 }
 1404 
 1405 static void
 1406 micro_neg(
 1407    union tgsi_exec_channel *dst,
 1408    const union tgsi_exec_channel *src )
 1409 {
 1410    dst->f[0] = -src->f[0];
 1411    dst->f[1] = -src->f[1];
 1412    dst->f[2] = -src->f[2];
 1413    dst->f[3] = -src->f[3];
 1414 }
 1415 
 1416 static void
 1417 micro_pow(
 1418    union tgsi_exec_channel *dst,
 1419    const union tgsi_exec_channel *src0,
 1420    const union tgsi_exec_channel *src1 )
 1421 {
 1422 #if FAST_MATH
 1423    dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
 1424    dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
 1425    dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
 1426    dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
 1427 #else
 1428    dst->f[0] = powf( src0->f[0], src1->f[0] );
 1429    dst->f[1] = powf( src0->f[1], src1->f[1] );
 1430    dst->f[2] = powf( src0->f[2], src1->f[2] );
 1431    dst->f[3] = powf( src0->f[3], src1->f[3] );
 1432 #endif
 1433 }
 1434 
 1435 static void
 1436 micro_ldexp(union tgsi_exec_channel *dst,
 1437             const union tgsi_exec_channel *src0,
 1438             const union tgsi_exec_channel *src1)
 1439 {
 1440    dst->f[0] = ldexpf(src0->f[0], src1->i[0]);
 1441    dst->f[1] = ldexpf(src0->f[1], src1->i[1]);
 1442    dst->f[2] = ldexpf(src0->f[2], src1->i[2]);
 1443    dst->f[3] = ldexpf(src0->f[3], src1->i[3]);
 1444 }
 1445 
 1446 static void
 1447 micro_sub(union tgsi_exec_channel *dst,
 1448           const union tgsi_exec_channel *src0,
 1449           const union tgsi_exec_channel *src1)
 1450 {
 1451    dst->f[0] = src0->f[0] - src1->f[0];
 1452    dst->f[1] = src0->f[1] - src1->f[1];
 1453    dst->f[2] = src0->f[2] - src1->f[2];
 1454    dst->f[3] = src0->f[3] - src1->f[3];
 1455 }
 1456 
 1457 static void
 1458 fetch_src_file_channel(const struct tgsi_exec_machine *mach,
 1459                        const uint file,
 1460                        const uint swizzle,
 1461                        const union tgsi_exec_channel *index,
 1462                        const union tgsi_exec_channel *index2D,
 1463                        union tgsi_exec_channel *chan)
 1464 {
 1465    uint i;
 1466 
 1467    assert(swizzle < 4);
 1468 
 1469    switch (file) {
 1470    case TGSI_FILE_CONSTANT:
 1471       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
 1472          assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
 1473          assert(mach->Consts[index2D->i[i]]);
 1474 
 1475          if (index->i[i] < 0) {
 1476             chan->u[i] = 0;
 1477          } else {
 1478             /* NOTE: copying the const value as a uint instead of float */
 1479             const uint constbuf = index2D->i[i];
 1480             const uint *buf = (const uint *)mach->Consts[constbuf];
 1481             const int pos = index->i[i] * 4 + swizzle;
 1482             /* const buffer bounds check */
 1483             if (pos < 0 || pos >= (int) mach->ConstsSize[constbuf]) {
 1484                if (0) {
 1485                   /* Debug: print warning */
 1486                   static int count = 0;
 1487                   if (count++ < 100)
 1488                      debug_printf("TGSI Exec: const buffer index %d"
 1489                                   " out of bounds\n", pos);
 1490                }
 1491                chan->u[i] = 0;
 1492             }
 1493             else
 1494                chan->u[i] = buf[pos];
 1495          }
 1496       }
 1497       break;
 1498 
 1499    case TGSI_FILE_INPUT:
 1500       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
 1501          /*
 1502          if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
 1503             debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
 1504                          index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
 1505                          index2D->i[i], index->i[i]);
 1506                          }*/
 1507          int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
 1508          assert(pos >= 0);
 1509          assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
 1510          chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
 1511       }
 1512       break;
 1513 
 1514    case TGSI_FILE_SYSTEM_VALUE:
 1515       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
 1516          chan->u[i] = mach->SystemValue[index->i[i]].xyzw[swizzle].u[i];
 1517       }
 1518       break;
 1519 
 1520    case TGSI_FILE_TEMPORARY:
 1521       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
 1522          assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
 1523          assert(index2D->i[i] == 0);
 1524 
 1525          chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
 1526       }
 1527       break;
 1528 
 1529    case TGSI_FILE_IMMEDIATE:
 1530       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
 1531          assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
 1532          assert(index2D->i[i] == 0);
 1533 
 1534          chan->f[i] = mach->Imms[index->i[i]][swizzle];
 1535       }
 1536       break;
 1537 
 1538    case TGSI_FILE_ADDRESS:
 1539       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
 1540          assert(index->i[i] >= 0);
 1541          assert(index2D->i[i] == 0);
 1542 
 1543          chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
 1544       }
 1545       break;
 1546 
 1547    case TGSI_FILE_OUTPUT:
 1548       /* vertex/fragment output vars can be read too */
 1549       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
 1550          assert(index->i[i] >= 0);
 1551          assert(index2D->i[i] == 0);
 1552 
 1553          chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
 1554       }
 1555       break;
 1556 
 1557    default:
 1558       assert(0);
 1559       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
 1560          chan->u[i] = 0;
 1561       }
 1562    }
 1563 }
 1564 
 1565 static void
 1566 get_index_registers(const struct tgsi_exec_machine *mach,
 1567                     const struct tgsi_full_src_register *reg,
 1568                     union tgsi_exec_channel *index,
 1569                     union tgsi_exec_channel *index2D)
 1570 {
 1571    uint swizzle;
 1572 
 1573    /* We start with a direct index into a register file.
 1574     *
 1575     *    file[1],
 1576     *    where:
 1577     *       file = Register.File
 1578     *       [1] = Register.Index
 1579     */
 1580    index->i[0] =
 1581    index->i[1] =
 1582    index->i[2] =
 1583    index->i[3] = reg->Register.Index;
 1584 
 1585    /* There is an extra source register that indirectly subscripts
 1586     * a register file. The direct index now becomes an offset
 1587     * that is being added to the indirect register.
 1588     *
 1589     *    file[ind[2].x+1],
 1590     *    where:
 1591     *       ind = Indirect.File
 1592     *       [2] = Indirect.Index
 1593     *       .x = Indirect.SwizzleX
 1594     */
 1595    if (reg->Register.Indirect) {
 1596       union tgsi_exec_channel index2;
 1597       union tgsi_exec_channel indir_index;
 1598       const uint execmask = mach->ExecMask;
 1599       uint i;
 1600 
 1601       /* which address register (always zero now) */
 1602       index2.i[0] =
 1603       index2.i[1] =
 1604       index2.i[2] =
 1605       index2.i[3] = reg->Indirect.Index;
 1606       /* get current value of address register[swizzle] */
 1607       swizzle = reg->Indirect.Swizzle;
 1608       fetch_src_file_channel(mach,
 1609                              reg->Indirect.File,
 1610                              swizzle,
 1611                              &index2,
 1612                              &ZeroVec,
 1613                              &indir_index);
 1614 
 1615       /* add value of address register to the offset */
 1616       index->i[0] += indir_index.i[0];
 1617       index->i[1] += indir_index.i[1];
 1618       index->i[2] += indir_index.i[2];
 1619       index->i[3] += indir_index.i[3];
 1620 
 1621       /* for disabled execution channels, zero-out the index to
 1622        * avoid using a potential garbage value.
 1623        */
 1624       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
 1625          if ((execmask & (1 << i)) == 0)
 1626             index->i[i] = 0;
 1627       }
 1628    }
 1629 
 1630    /* There is an extra source register that is a second
 1631     * subscript to a register file. Effectively it means that
 1632     * the register file is actually a 2D array of registers.
 1633     *
 1634     *    file[3][1],
 1635     *    where:
 1636     *       [3] = Dimension.Index
 1637     */
 1638    if (reg->Register.Dimension) {
 1639       index2D->i[0] =
 1640       index2D->i[1] =
 1641       index2D->i[2] =
 1642       index2D->i[3] = reg->Dimension.Index;
 1643 
 1644       /* Again, the second subscript index can be addressed indirectly
 1645        * identically to the first one.
 1646        * Nothing stops us from indirectly addressing the indirect register,
 1647        * but there is no need for that, so we won't exercise it.
 1648        *
 1649        *    file[ind[4].y+3][1],
 1650        *    where:
 1651        *       ind = DimIndirect.File
 1652        *       [4] = DimIndirect.Index
 1653        *       .y = DimIndirect.SwizzleX
 1654        */
 1655       if (reg->Dimension.Indirect) {
 1656          union tgsi_exec_channel index2;
 1657          union tgsi_exec_channel indir_index;
 1658          const uint execmask = mach->ExecMask;
 1659          uint i;
 1660 
 1661          index2.i[0] =
 1662          index2.i[1] =
 1663          index2.i[2] =
 1664          index2.i[3] = reg->DimIndirect.Index;
 1665 
 1666          swizzle = reg->DimIndirect.Swizzle;
 1667          fetch_src_file_channel(mach,
 1668                                 reg->DimIndirect.File,
 1669                                 swizzle,
 1670                                 &index2,
 1671                                 &ZeroVec,
 1672                                 &indir_index);
 1673 
 1674          index2D->i[0] += indir_index.i[0];
 1675          index2D->i[1] += indir_index.i[1];
 1676          index2D->i[2] += indir_index.i[2];
 1677          index2D->i[3] += indir_index.i[3];
 1678 
 1679          /* for disabled execution channels, zero-out the index to
 1680           * avoid using a potential garbage value.
 1681           */
 1682          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
 1683             if ((execmask & (1 << i)) == 0) {
 1684                index2D->i[i] = 0;
 1685             }
 1686          }
 1687       }
 1688 
 1689       /* If by any chance there was a need for a 3D array of register
 1690        * files, we would have to check whether Dimension is followed
 1691        * by a dimension register and continue the saga.
 1692        */
 1693    } else {
 1694       index2D->i[0] =
 1695       index2D->i[1] =
 1696       index2D->i[2] =
 1697       index2D->i[3] = 0;
 1698    }
 1699 }
 1700 
 1701 
 1702 static void
 1703 fetch_source_d(const struct tgsi_exec_machine *mach,
 1704                union tgsi_exec_channel *chan,
 1705                const struct tgsi_full_src_register *reg,
 1706            const uint chan_index)
 1707 {
 1708    union tgsi_exec_channel index;
 1709    union tgsi_exec_channel index2D;
 1710    uint swizzle;
 1711 
 1712    get_index_registers(mach, reg, &index, &index2D);
 1713 
 1714 
 1715    swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
 1716    fetch_src_file_channel(mach,
 1717                           reg->Register.File,
 1718                           swizzle,
 1719                           &index,
 1720                           &index2D,
 1721                           chan);
 1722 }
 1723 
 1724 static void
 1725 fetch_source(const struct tgsi_exec_machine *mach,
 1726              union tgsi_exec_channel *chan,
 1727              const struct tgsi_full_src_register *reg,
 1728              const uint chan_index,
 1729              enum tgsi_exec_datatype src_datatype)
 1730 {
 1731    fetch_source_d(mach, chan, reg, chan_index);
 1732 
 1733    if (reg->Register.Absolute) {
 1734       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
 1735          micro_abs(chan, chan);
 1736       } else {
 1737          micro_iabs(chan, chan);
 1738       }
 1739    }
 1740 
 1741    if (reg->Register.Negate) {
 1742       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
 1743          micro_neg(chan, chan);
 1744       } else {
 1745          micro_ineg(chan, chan);
 1746       }
 1747    }
 1748 }
 1749 
 1750 static union tgsi_exec_channel *
 1751 store_dest_dstret(struct tgsi_exec_machine *mach,
 1752                  const union tgsi_exec_channel *chan,
 1753                  const struct tgsi_full_dst_register *reg,
 1754                  uint chan_index,
 1755                  enum tgsi_exec_datatype dst_datatype)
 1756 {
 1757    static union tgsi_exec_channel null;
 1758    union tgsi_exec_channel *dst;
 1759    union tgsi_exec_channel index2D;
 1760    int offset = 0;  /* indirection offset */
 1761    int index;
 1762 
 1763    /* for debugging */
 1764    if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
 1765       check_inf_or_nan(chan);
 1766    }
 1767 
 1768    /* There is an extra source register that indirectly subscripts
 1769     * a register file. The direct index now becomes an offset
 1770     * that is being added to the indirect register.
 1771     *
 1772     *    file[ind[2].x+1],
 1773     *    where:
 1774     *       ind = Indirect.File
 1775     *       [2] = Indirect.Index
 1776     *       .x = Indirect.SwizzleX
 1777     */
 1778    if (reg->Register.Indirect) {
 1779       union tgsi_exec_channel index;
 1780       union tgsi_exec_channel indir_index;
 1781       uint swizzle;
 1782 
 1783       /* which address register (always zero for now) */
 1784       index.i[0] =
 1785       index.i[1] =
 1786       index.i[2] =
 1787       index.i[3] = reg->Indirect.Index;
 1788 
 1789       /* get current value of address register[swizzle] */
 1790       swizzle = reg->Indirect.Swizzle;
 1791 
 1792       /* fetch values from the address/indirection register */
 1793       fetch_src_file_channel(mach,
 1794                              reg->Indirect.File,
 1795                              swizzle,
 1796                              &index,
 1797                              &ZeroVec,
 1798                              &indir_index);
 1799 
 1800       /* save indirection offset */
 1801       offset = indir_index.i[0];
 1802    }
 1803 
 1804    /* There is an extra source register that is a second
 1805     * subscript to a register file. Effectively it means that
 1806     * the register file is actually a 2D array of registers.
 1807     *
 1808     *    file[3][1],
 1809     *    where:
 1810     *       [3] = Dimension.Index
 1811     */
 1812    if (reg->Register.Dimension) {
 1813       index2D.i[0] =
 1814       index2D.i[1] =
 1815       index2D.i[2] =
 1816       index2D.i[3] = reg->Dimension.Index;
 1817 
 1818       /* Again, the second subscript index can be addressed indirectly
 1819        * identically to the first one.
 1820        * Nothing stops us from indirectly addressing the indirect register,
 1821        * but there is no need for that, so we won't exercise it.
 1822        *
 1823        *    file[ind[4].y+3][1],
 1824        *    where:
 1825        *       ind = DimIndirect.File
 1826        *       [4] = DimIndirect.Index
 1827        *       .y = DimIndirect.SwizzleX
 1828        */
 1829       if (reg->Dimension.Indirect) {
 1830          union tgsi_exec_channel index2;
 1831          union tgsi_exec_channel indir_index;
 1832          const uint execmask = mach->ExecMask;
 1833          unsigned swizzle;
 1834          uint i;
 1835 
 1836          index2.i[0] =
 1837          index2.i[1] =
 1838          index2.i[2] =
 1839          index2.i[3] = reg->DimIndirect.Index;
 1840 
 1841          swizzle = reg->DimIndirect.Swizzle;
 1842          fetch_src_file_channel(mach,
 1843                                 reg->DimIndirect.File,
 1844                                 swizzle,
 1845                                 &index2,
 1846                                 &ZeroVec,
 1847                                 &indir_index);
 1848 
 1849          index2D.i[0] += indir_index.i[0];
 1850          index2D.i[1] += indir_index.i[1];
 1851          index2D.i[2] += indir_index.i[2];
 1852          index2D.i[3] += indir_index.i[3];
 1853 
 1854          /* for disabled execution channels, zero-out the index to
 1855           * avoid using a potential garbage value.
 1856           */
 1857          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
 1858             if ((execmask & (1 << i)) == 0) {
 1859                index2D.i[i] = 0;
 1860             }
 1861          }
 1862       }
 1863 
 1864       /* If by any chance there was a need for a 3D array of register
 1865        * files, we would have to check whether Dimension is followed
 1866        * by a dimension register and continue the saga.
 1867        */
 1868    } else {
 1869       index2D.i[0] =
 1870       index2D.i[1] =
 1871       index2D.i[2] =
 1872       index2D.i[3] = 0;
 1873    }
 1874 
 1875    switch (reg->Register.File) {
 1876    case TGSI_FILE_NULL:
 1877       dst = &null;
 1878       break;
 1879 
 1880    case TGSI_FILE_OUTPUT:
 1881       index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
 1882          + reg->Register.Index;
 1883       dst = &mach->Outputs[offset + index].xyzw[chan_index];
 1884 #if 0
 1885       debug_printf("NumOutputs = %d, TEMP_O_C/I = %d, redindex = %d\n",
 1886                    mach->NumOutputs, mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0],
 1887                    reg->Register.Index);
 1888       if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
 1889          debug_printf("STORING OUT[%d] mask(%d), = (", offset + index, execmask);
 1890          for (i = 0; i < TGSI_QUAD_SIZE; i++)
 1891             if (execmask & (1 << i))
 1892                debug_printf("%f, ", chan->f[i]);
 1893          debug_printf(")\n");
 1894       }
 1895 #endif
 1896       break;
 1897 
 1898    case TGSI_FILE_TEMPORARY:
 1899       index = reg->Register.Index;
 1900       assert( index < TGSI_EXEC_NUM_TEMPS );
 1901       dst = &mach->Temps[offset + index].xyzw[chan_index];
 1902       break;
 1903 
 1904    case TGSI_FILE_ADDRESS:
 1905       index = reg->Register.Index;
 1906       dst = &mach->Addrs[index].xyzw[chan_index];
 1907       break;
 1908 
 1909    default:
 1910       assert( 0 );
 1911       return NULL;
 1912    }
 1913 
 1914    return dst;
 1915 }
 1916 
 1917 static void
 1918 store_dest_double(struct tgsi_exec_machine *mach,
 1919                  const union tgsi_exec_channel *chan,
 1920                  const struct tgsi_full_dst_register *reg,
 1921                  uint chan_index,
 1922                  enum tgsi_exec_datatype dst_datatype)
 1923 {
 1924    union tgsi_exec_channel *dst;
 1925    const uint execmask = mach->ExecMask;
 1926    int i;
 1927 
 1928    dst = store_dest_dstret(mach, chan, reg, chan_index, dst_datatype);
 1929    if (!dst)
 1930       return;
 1931 
 1932    /* doubles path */
 1933    for (i = 0; i < TGSI_QUAD_SIZE; i++)
 1934       if (execmask & (1 << i))
 1935          dst->i[i] = chan->i[i];
 1936 }
 1937 
 1938 static void
 1939 store_dest(struct tgsi_exec_machine *mach,
 1940            const union tgsi_exec_channel *chan,
 1941            const struct tgsi_full_dst_register *reg,
 1942            const struct tgsi_full_instruction *inst,
 1943            uint chan_index,
 1944            enum tgsi_exec_datatype dst_datatype)
 1945 {
 1946    union tgsi_exec_channel *dst;
 1947    const uint execmask = mach->ExecMask;
 1948    int i;
 1949 
 1950    dst = store_dest_dstret(mach, chan, reg, chan_index, dst_datatype);
 1951    if (!dst)
 1952       return;
 1953 
 1954    if (!inst->Instruction.Saturate) {
 1955       for (i = 0; i < TGSI_QUAD_SIZE; i++)
 1956          if (execmask & (1 << i))
 1957             dst->i[i] = chan->i[i];
 1958    }
 1959    else {
 1960       for (i = 0; i < TGSI_QUAD_SIZE; i++)
 1961          if (execmask & (1 << i)) {
 1962             if (chan->f[i] < 0.0f)
 1963                dst->f[i] = 0.0f;
 1964             else if (chan->f[i] > 1.0f)
 1965                dst->f[i] = 1.0f;
 1966             else
 1967                dst->i[i] = chan->i[i];
 1968          }
 1969    }
 1970 }
 1971 
 1972 #define FETCH(VAL,INDEX,CHAN)\
 1973     fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)
 1974 
 1975 #define IFETCH(VAL,INDEX,CHAN)\
 1976     fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_INT)
 1977 
 1978 
 1979 /**
 1980  * Execute ARB-style KIL which is predicated by a src register.
 1981  * Kill fragment if any of the four values is less than zero.
 1982  */
 1983 static void
 1984 exec_kill_if(struct tgsi_exec_machine *mach,
 1985              const struct tgsi_full_instruction *inst)
 1986 {
 1987    uint uniquemask;
 1988    uint chan_index;
 1989    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
 1990    union tgsi_exec_channel r[1];
 1991 
 1992    /* This mask stores component bits that were already tested. */
 1993    uniquemask = 0;
 1994 
 1995    for (chan_index = 0; chan_index < 4; chan_index++)
 1996    {
 1997       uint swizzle;
 1998       uint i;
 1999 
 2000       /* unswizzle channel */
 2001       swizzle = tgsi_util_get_full_src_register_swizzle (
 2002                         &inst->Src[0],
 2003                         chan_index);
 2004 
 2005       /* check if the component has not been already tested */
 2006       if (uniquemask & (1 << swizzle))
 2007          continue;
 2008       uniquemask |= 1 << swizzle;
 2009 
 2010       FETCH(&r[0], 0, chan_index);
 2011       for (i = 0; i < 4; i++)
 2012          if (r[0].f[i] < 0.0f)
 2013             kilmask |= 1 << i;
 2014    }
 2015 
 2016    /* restrict to fragments currently executing */
 2017    kilmask &= mach->ExecMask;
 2018 
 2019    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
 2020 }
 2021 
 2022 /**
 2023  * Unconditional fragment kill/discard.
 2024  */
 2025 static void
 2026 exec_kill(struct tgsi_exec_machine *mach)
 2027 {
 2028    uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
 2029 
 2030    /* kill fragment for all fragments currently executing */
 2031    kilmask = mach->ExecMask;
 2032    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
 2033 }
 2034 
 2035 static void
 2036 emit_vertex(struct tgsi_exec_machine *mach,
 2037             const struct tgsi_full_instruction *inst)
 2038 {
 2039    union tgsi_exec_channel r[1];
 2040    unsigned stream_id;
 2041    unsigned *prim_count;
 2042    /* FIXME: check for exec mask correctly
 2043    unsigned i;
 2044    for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
 2045          if ((mach->ExecMask & (1 << i)))
 2046    */
 2047    IFETCH(&r[0], 0, TGSI_CHAN_X);
 2048    stream_id = r[0].u[0];
 2049    prim_count = &mach->Temps[temp_prim_idxs[stream_id].idx].xyzw[temp_prim_idxs[stream_id].chan].u[0];
 2050    if (mach->ExecMask) {
 2051       if (mach->Primitives[stream_id][*prim_count] >= mach->MaxOutputVertices)
 2052          return;
 2053 
 2054       if (mach->Primitives[stream_id][*prim_count] == 0)
 2055          mach->PrimitiveOffsets[stream_id][*prim_count] = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0];
 2056       mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
 2057       mach->Primitives[stream_id][*prim_count]++;
 2058    }
 2059 }
 2060 
 2061 static void
 2062 emit_primitive(struct tgsi_exec_machine *mach,
 2063                const struct tgsi_full_instruction *inst)
 2064 {
 2065    unsigned *prim_count;
 2066    union tgsi_exec_channel r[1];
 2067    unsigned stream_id = 0;
 2068    /* FIXME: check for exec mask correctly
 2069    unsigned i;
 2070    for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
 2071          if ((mach->ExecMask & (1 << i)))
 2072    */
 2073    if (inst) {
 2074       IFETCH(&r[0], 0, TGSI_CHAN_X);
 2075       stream_id = r[0].u[0];
 2076    }
 2077    prim_count = &mach->Temps[temp_prim_idxs[stream_id].idx].xyzw[temp_prim_idxs[stream_id].chan].u[0];
 2078    if (mach->ExecMask) {
 2079       ++(*prim_count);
 2080       debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
 2081       mach->Primitives[stream_id][*prim_count] = 0;
 2082    }
 2083 }
 2084 
 2085 static void
 2086 conditional_emit_primitive(struct tgsi_exec_machine *mach)
 2087 {
 2088    if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
 2089       int emitted_verts =
 2090          mach->Primitives[0][mach->Temps[temp_prim_idxs[0].idx].xyzw[temp_prim_idxs[0].chan].u[0]];
 2091       if (emitted_verts) {
 2092          emit_primitive(mach, NULL);
 2093       }
 2094    }
 2095 }
 2096 
 2097 
 2098 /*
 2099  * Fetch four texture samples using STR texture coordinates.
 2100  */
 2101 static void
 2102 fetch_texel( struct tgsi_sampler *sampler,
 2103              const unsigned sview_idx,
 2104              const unsigned sampler_idx,
 2105              const union tgsi_exec_channel *s,
 2106              const union tgsi_exec_channel *t,
 2107              const union tgsi_exec_channel *p,
 2108              const union tgsi_exec_channel *c0,
 2109              const union tgsi_exec_channel *c1,
 2110              float derivs[3][2][TGSI_QUAD_SIZE],
 2111              const int8_t offset[3],
 2112              enum tgsi_sampler_control control,
 2113              union tgsi_exec_channel *r,
 2114              union tgsi_exec_channel *g,
 2115              union tgsi_exec_channel *b,
 2116              union tgsi_exec_channel *a )
 2117 {
 2118    uint j;
 2119    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
 2120 
 2121    /* FIXME: handle explicit derivs, offsets */
 2122    sampler->get_samples(sampler, sview_idx, sampler_idx,
 2123                         s->f, t->f, p->f, c0->f, c1->f, derivs, offset, control, rgba);
 2124 
 2125    for (j = 0; j < 4; j++) {
 2126       r->f[j] = rgba[0][j];
 2127       g->f[j] = rgba[1][j];
 2128       b->f[j] = rgba[2][j];
 2129       a->f[j] = rgba[3][j];
 2130    }
 2131 }
 2132 
 2133 
 2134 #define TEX_MODIFIER_NONE           0
 2135 #define TEX_MODIFIER_PROJECTED      1
 2136 #define TEX_MODIFIER_LOD_BIAS       2
 2137 #define TEX_MODIFIER_EXPLICIT_LOD   3
 2138 #define TEX_MODIFIER_LEVEL_ZERO     4
 2139 #define TEX_MODIFIER_GATHER         5
 2140 
 2141 /*
 2142  * Fetch all 3 (for s,t,r coords) texel offsets, put them into int array.
 2143  */
 2144 static void
 2145 fetch_texel_offsets(struct tgsi_exec_machine *mach,
 2146                     const struct tgsi_full_instruction *inst,
 2147                     int8_t offsets[3])
 2148 {
 2149    if (inst->Texture.NumOffsets == 1) {
 2150       union tgsi_exec_channel index;
 2151       union tgsi_exec_channel offset[3];
 2152       index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
 2153       fetch_src_file_channel(mach, inst->TexOffsets[0].File,
 2154                              inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
 2155       fetch_src_file_channel(mach, inst->TexOffsets[0].File,
 2156                              inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
 2157       fetch_src_file_channel(mach, inst->TexOffsets[0].File,
 2158                              inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
 2159      offsets[0] = offset[0].i[0];
 2160      offsets[1] = offset[1].i[0];
 2161      offsets[2] = offset[2].i[0];
 2162    } else {
 2163      assert(inst->Texture.NumOffsets == 0);
 2164      offsets[0] = offsets[1] = offsets[2] = 0;
 2165    }
 2166 }
 2167 
 2168 
 2169 /*
 2170  * Fetch dx and dy values for one channel (s, t or r).
 2171  * Put dx values into one float array, dy values into another.
 2172  */
 2173 static void
 2174 fetch_assign_deriv_channel(struct tgsi_exec_machine *mach,
 2175                            const struct tgsi_full_instruction *inst,
 2176                            unsigned regdsrcx,
 2177                            unsigned chan,
 2178                            float derivs[2][TGSI_QUAD_SIZE])
 2179 {
 2180    union tgsi_exec_channel d;
 2181    FETCH(&d, regdsrcx, chan);
 2182    derivs[0][0] = d.f[0];
 2183    derivs[0][1] = d.f[1];
 2184    derivs[0][2] = d.f[2];
 2185    derivs[0][3] = d.f[3];
 2186    FETCH(&d, regdsrcx + 1, chan);
 2187    derivs[1][0] = d.f[0];
 2188    derivs[1][1] = d.f[1];
 2189    derivs[1][2] = d.f[2];
 2190    derivs[1][3] = d.f[3];
 2191 }
 2192 
 2193 static uint
 2194 fetch_sampler_unit(struct tgsi_exec_machine *mach,
 2195                    const struct tgsi_full_instruction *inst,
 2196                    uint sampler)
 2197 {
 2198    uint unit = 0;
 2199    int i;
 2200    if (inst->Src[sampler].Register.Indirect) {
 2201       const struct tgsi_full_src_register *reg = &inst->Src[sampler];
 2202       union tgsi_exec_channel indir_index, index2;
 2203       const uint execmask = mach->ExecMask;
 2204       index2.i[0] =
 2205       index2.i[1] =
 2206       index2.i[2] =
 2207       index2.i[3] = reg->Indirect.Index;
 2208 
 2209       fetch_src_file_channel(mach,
 2210                              reg->Indirect.File,
 2211                              reg->Indirect.Swizzle,
 2212                              &index2,
 2213                              &ZeroVec,
 2214                              &indir_index);
 2215       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
 2216          if (execmask & (1 << i)) {
 2217             unit = inst->Src[sampler].Register.Index + indir_index.i[i];
 2218             break;
 2219          }
 2220       }
 2221 
 2222    } else {
 2223       unit = inst->Src[sampler].Register.Index;
 2224    }
 2225    return unit;
 2226 }
 2227 
 2228 /*
 2229  * execute a texture instruction.
 2230  *
 2231  * modifier is used to control the channel routing for the
 2232  * instruction variants like proj, lod, and texture with lod bias.
 2233  * sampler indicates which src register the sampler is contained in.
 2234  */
 2235 static void
 2236 exec_tex(struct tgsi_exec_machine *mach,
 2237          const struct tgsi_full_instruction *inst,
 2238          uint modifier, uint sampler)
 2239 {
 2240    const union tgsi_exec_channel *args[5], *proj = NULL;
 2241    union tgsi_exec_channel r[5];
 2242    enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
 2243    uint chan;
 2244    uint unit;
 2245    int8_t offsets[3];
 2246    int dim, shadow_ref, i;
 2247 
 2248    unit = fetch_sampler_unit(mach, inst, sampler);
 2249    /* always fetch all 3 offsets, overkill but keeps code simple */
 2250    fetch_texel_offsets(mach, inst, offsets);
 2251 
 2252    assert(modifier != TEX_MODIFIER_LEVEL_ZERO);
 2253    assert(inst->Texture.Texture != TGSI_TEXTURE_BUFFER);
 2254 
 2255    dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
 2256    shadow_ref = tgsi_util_get_shadow_ref_src_index(inst->Texture.Texture);
 2257 
 2258    assert(dim <= 4);
 2259    if (shadow_ref >= 0)
 2260       assert(shadow_ref >= dim && shadow_ref < (int)ARRAY_SIZE(args));
 2261 
 2262    /* fetch modifier to the last argument */
 2263    if (modifier != TEX_MODIFIER_NONE) {
 2264       const int last = ARRAY_SIZE(args) - 1;
 2265 
 2266       /* fetch modifier from src0.w or src1.x */
 2267       if (sampler == 1) {
 2268          assert(dim <= TGSI_CHAN_W && shadow_ref != TGSI_CHAN_W);
 2269          FETCH(&r[last], 0, TGSI_CHAN_W);
 2270       }
 2271       else {
 2272          FETCH(&r[last], 1, TGSI_CHAN_X);
 2273       }
 2274 
 2275       if (modifier != TEX_MODIFIER_PROJECTED) {
 2276          args[last] = &r[last];
 2277       }
 2278       else {
 2279          proj = &r[last];
 2280          args[last] = &ZeroVec;
 2281       }
 2282 
 2283       /* point unused arguments to zero vector */
 2284       for (i = dim; i < last; i++)
 2285          args[i] = &ZeroVec;
 2286 
 2287       if (modifier == TEX_MODIFIER_EXPLICIT_LOD)
 2288          control = TGSI_SAMPLER_LOD_EXPLICIT;
 2289       else if (modifier == TEX_MODIFIER_LOD_BIAS)
 2290          control = TGSI_SAMPLER_LOD_BIAS;
 2291       else if (modifier == TEX_MODIFIER_GATHER)
 2292          control = TGSI_SAMPLER_GATHER;
 2293    }
 2294    else {
 2295       for (i = dim; i < (int)ARRAY_SIZE(args); i++)
 2296          args[i] = &ZeroVec;
 2297    }
 2298 
 2299    /* fetch coordinates */
 2300    for (i = 0; i < dim; i++) {
 2301       FETCH(&r[i], 0, TGSI_CHAN_X + i);
 2302 
 2303       if (proj)
 2304          micro_div(&r[i], &r[i], proj);
 2305 
 2306       args[i] = &r[i];
 2307    }
 2308 
 2309    /* fetch reference value */
 2310    if (shadow_ref >= 0) {
 2311       FETCH(&r[shadow_ref], shadow_ref / 4, TGSI_CHAN_X + (shadow_ref % 4));
 2312 
 2313       if (proj)
 2314          micro_div(&r[shadow_ref], &r[shadow_ref], proj);
 2315 
 2316       args[shadow_ref] = &r[shadow_ref];
 2317    }
 2318 
 2319    fetch_texel(mach->Sampler, unit, unit,
 2320          args[0], args[1], args[2], args[3], args[4],
 2321          NULL, offsets, control,
 2322          &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
 2323 
 2324 #if 0
 2325    debug_printf("fetch r: %g %g %g %g\n",
 2326          r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
 2327    debug_printf("fetch g: %g %g %g %g\n",
 2328          r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
 2329    debug_printf("fetch b: %g %g %g %g\n",
 2330          r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
 2331    debug_printf("fetch a: %g %g %g %g\n",
 2332          r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
 2333 #endif
 2334 
 2335    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 2336       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 2337          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
 2338       }
 2339    }
 2340 }
 2341 
 2342 static void
 2343 exec_lodq(struct tgsi_exec_machine *mach,
 2344           const struct tgsi_full_instruction *inst)
 2345 {
 2346    uint resource_unit, sampler_unit;
 2347    unsigned dim;
 2348    unsigned i;
 2349    union tgsi_exec_channel coords[4];
 2350    const union tgsi_exec_channel *args[ARRAY_SIZE(coords)];
 2351    union tgsi_exec_channel r[2];
 2352 
 2353    resource_unit = fetch_sampler_unit(mach, inst, 1);
 2354    if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
 2355       uint target = mach->SamplerViews[resource_unit].Resource;
 2356       dim = tgsi_util_get_texture_coord_dim(target);
 2357       sampler_unit = fetch_sampler_unit(mach, inst, 2);
 2358    } else {
 2359       dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
 2360       sampler_unit = resource_unit;
 2361    }
 2362    assert(dim <= ARRAY_SIZE(coords));
 2363    /* fetch coordinates */
 2364    for (i = 0; i < dim; i++) {
 2365       FETCH(&coords[i], 0, TGSI_CHAN_X + i);
 2366       args[i] = &coords[i];
 2367    }
 2368    for (i = dim; i < ARRAY_SIZE(coords); i++) {
 2369       args[i] = &ZeroVec;
 2370    }
 2371    mach->Sampler->query_lod(mach->Sampler, resource_unit, sampler_unit,
 2372                             args[0]->f,
 2373                             args[1]->f,
 2374                             args[2]->f,
 2375                             args[3]->f,
 2376                             TGSI_SAMPLER_LOD_NONE,
 2377                             r[0].f,
 2378                             r[1].f);
 2379 
 2380    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
 2381       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
 2382                  TGSI_EXEC_DATA_FLOAT);
 2383    }
 2384    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
 2385       store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
 2386                  TGSI_EXEC_DATA_FLOAT);
 2387    }
 2388    if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
 2389       unsigned char swizzles[4];
 2390       unsigned chan;
 2391       swizzles[0] = inst->Src[1].Register.SwizzleX;
 2392       swizzles[1] = inst->Src[1].Register.SwizzleY;
 2393       swizzles[2] = inst->Src[1].Register.SwizzleZ;
 2394       swizzles[3] = inst->Src[1].Register.SwizzleW;
 2395 
 2396       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 2397          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 2398             if (swizzles[chan] >= 2) {
 2399                store_dest(mach, &ZeroVec,
 2400                           &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
 2401             } else {
 2402                store_dest(mach, &r[swizzles[chan]],
 2403                           &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
 2404             }
 2405          }
 2406       }
 2407    } else {
 2408       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
 2409          store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
 2410                     TGSI_EXEC_DATA_FLOAT);
 2411       }
 2412       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
 2413          store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
 2414                     TGSI_EXEC_DATA_FLOAT);
 2415       }
 2416    }
 2417 }
 2418 
 2419 static void
 2420 exec_txd(struct tgsi_exec_machine *mach,
 2421          const struct tgsi_full_instruction *inst)
 2422 {
 2423    union tgsi_exec_channel r[4];
 2424    float derivs[3][2][TGSI_QUAD_SIZE];
 2425    uint chan;
 2426    uint unit;
 2427    int8_t offsets[3];
 2428 
 2429    unit = fetch_sampler_unit(mach, inst, 3);
 2430    /* always fetch all 3 offsets, overkill but keeps code simple */
 2431    fetch_texel_offsets(mach, inst, offsets);
 2432 
 2433    switch (inst->Texture.Texture) {
 2434    case TGSI_TEXTURE_1D:
 2435       FETCH(&r[0], 0, TGSI_CHAN_X);
 2436 
 2437       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
 2438 
 2439       fetch_texel(mach->Sampler, unit, unit,
 2440                   &r[0], &ZeroVec, &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
 2441                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
 2442                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
 2443       break;
 2444 
 2445    case TGSI_TEXTURE_SHADOW1D:
 2446    case TGSI_TEXTURE_1D_ARRAY:
 2447    case TGSI_TEXTURE_SHADOW1D_ARRAY:
 2448       /* SHADOW1D/1D_ARRAY would not need Y/Z respectively, but don't bother */
 2449       FETCH(&r[0], 0, TGSI_CHAN_X);
 2450       FETCH(&r[1], 0, TGSI_CHAN_Y);
 2451       FETCH(&r[2], 0, TGSI_CHAN_Z);
 2452 
 2453       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
 2454 
 2455       fetch_texel(mach->Sampler, unit, unit,
 2456                   &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
 2457                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
 2458                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
 2459       break;
 2460 
 2461    case TGSI_TEXTURE_2D:
 2462    case TGSI_TEXTURE_RECT:
 2463       FETCH(&r[0], 0, TGSI_CHAN_X);
 2464       FETCH(&r[1], 0, TGSI_CHAN_Y);
 2465 
 2466       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
 2467       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
 2468 
 2469       fetch_texel(mach->Sampler, unit, unit,
 2470                   &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
 2471                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
 2472                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
 2473       break;
 2474 
 2475 
 2476    case TGSI_TEXTURE_SHADOW2D:
 2477    case TGSI_TEXTURE_SHADOWRECT:
 2478    case TGSI_TEXTURE_2D_ARRAY:
 2479    case TGSI_TEXTURE_SHADOW2D_ARRAY:
 2480       /* only SHADOW2D_ARRAY actually needs W */
 2481       FETCH(&r[0], 0, TGSI_CHAN_X);
 2482       FETCH(&r[1], 0, TGSI_CHAN_Y);
 2483       FETCH(&r[2], 0, TGSI_CHAN_Z);
 2484       FETCH(&r[3], 0, TGSI_CHAN_W);
 2485 
 2486       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
 2487       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
 2488 
 2489       fetch_texel(mach->Sampler, unit, unit,
 2490                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
 2491                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
 2492                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
 2493       break;
 2494 
 2495    case TGSI_TEXTURE_3D:
 2496    case TGSI_TEXTURE_CUBE:
 2497    case TGSI_TEXTURE_CUBE_ARRAY:
 2498    case TGSI_TEXTURE_SHADOWCUBE:
 2499       /* only TEXTURE_CUBE_ARRAY and TEXTURE_SHADOWCUBE actually need W */
 2500       FETCH(&r[0], 0, TGSI_CHAN_X);
 2501       FETCH(&r[1], 0, TGSI_CHAN_Y);
 2502       FETCH(&r[2], 0, TGSI_CHAN_Z);
 2503       FETCH(&r[3], 0, TGSI_CHAN_W);
 2504 
 2505       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
 2506       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
 2507       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Z, derivs[2]);
 2508 
 2509       fetch_texel(mach->Sampler, unit, unit,
 2510                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
 2511                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
 2512                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
 2513       break;
 2514 
 2515    default:
 2516       assert(0);
 2517    }
 2518 
 2519    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 2520       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 2521          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
 2522       }
 2523    }
 2524 }
 2525 
 2526 
 2527 static void
 2528 exec_txf(struct tgsi_exec_machine *mach,
 2529          const struct tgsi_full_instruction *inst)
 2530 {
 2531    union tgsi_exec_channel r[4];
 2532    uint chan;
 2533    uint unit;
 2534    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
 2535    int j;
 2536    int8_t offsets[3];
 2537    unsigned target;
 2538 
 2539    unit = fetch_sampler_unit(mach, inst, 1);
 2540    /* always fetch all 3 offsets, overkill but keeps code simple */
 2541    fetch_texel_offsets(mach, inst, offsets);
 2542 
 2543    IFETCH(&r[3], 0, TGSI_CHAN_W);
 2544 
 2545    if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
 2546        inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
 2547       target = mach->SamplerViews[unit].Resource;
 2548    }
 2549    else {
 2550       target = inst->Texture.Texture;
 2551    }
 2552    switch(target) {
 2553    case TGSI_TEXTURE_3D:
 2554    case TGSI_TEXTURE_2D_ARRAY:
 2555    case TGSI_TEXTURE_SHADOW2D_ARRAY:
 2556    case TGSI_TEXTURE_2D_ARRAY_MSAA:
 2557       IFETCH(&r[2], 0, TGSI_CHAN_Z);
 2558       /* fallthrough */
 2559    case TGSI_TEXTURE_2D:
 2560    case TGSI_TEXTURE_RECT:
 2561    case TGSI_TEXTURE_SHADOW1D_ARRAY:
 2562    case TGSI_TEXTURE_SHADOW2D:
 2563    case TGSI_TEXTURE_SHADOWRECT:
 2564    case TGSI_TEXTURE_1D_ARRAY:
 2565    case TGSI_TEXTURE_2D_MSAA:
 2566       IFETCH(&r[1], 0, TGSI_CHAN_Y);
 2567       /* fallthrough */
 2568    case TGSI_TEXTURE_BUFFER:
 2569    case TGSI_TEXTURE_1D:
 2570    case TGSI_TEXTURE_SHADOW1D:
 2571       IFETCH(&r[0], 0, TGSI_CHAN_X);
 2572       break;
 2573    default:
 2574       assert(0);
 2575       break;
 2576    }      
 2577 
 2578    mach->Sampler->get_texel(mach->Sampler, unit, r[0].i, r[1].i, r[2].i, r[3].i,
 2579                             offsets, rgba);
 2580 
 2581    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
 2582       r[0].f[j] = rgba[0][j];
 2583       r[1].f[j] = rgba[1][j];
 2584       r[2].f[j] = rgba[2][j];
 2585       r[3].f[j] = rgba[3][j];
 2586    }
 2587 
 2588    if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
 2589        inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
 2590       unsigned char swizzles[4];
 2591       swizzles[0] = inst->Src[1].Register.SwizzleX;
 2592       swizzles[1] = inst->Src[1].Register.SwizzleY;
 2593       swizzles[2] = inst->Src[1].Register.SwizzleZ;
 2594       swizzles[3] = inst->Src[1].Register.SwizzleW;
 2595 
 2596       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 2597          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 2598             store_dest(mach, &r[swizzles[chan]],
 2599                        &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
 2600          }
 2601       }
 2602    }
 2603    else {
 2604       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 2605          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 2606             store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
 2607          }
 2608       }
 2609    }
 2610 }
 2611 
 2612 static void
 2613 exec_txq(struct tgsi_exec_machine *mach,
 2614          const struct tgsi_full_instruction *inst)
 2615 {
 2616    int result[4];
 2617    union tgsi_exec_channel r[4], src;
 2618    uint chan;
 2619    uint unit;
 2620    int i,j;
 2621 
 2622    unit = fetch_sampler_unit(mach, inst, 1);
 2623 
 2624    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
 2625 
 2626    /* XXX: This interface can't return per-pixel values */
 2627    mach->Sampler->get_dims(mach->Sampler, unit, src.i[0], result);
 2628 
 2629    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
 2630       for (j = 0; j < 4; j++) {
 2631          r[j].i[i] = result[j];
 2632       }
 2633    }
 2634 
 2635    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 2636       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 2637          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
 2638                     TGSI_EXEC_DATA_INT);
 2639       }
 2640    }
 2641 }
 2642 
 2643 static void
 2644 exec_sample(struct tgsi_exec_machine *mach,
 2645             const struct tgsi_full_instruction *inst,
 2646             uint modifier, boolean compare)
 2647 {
 2648    const uint resource_unit = inst->Src[1].Register.Index;
 2649    const uint sampler_unit = inst->Src[2].Register.Index;
 2650    union tgsi_exec_channel r[5], c1;
 2651    const union tgsi_exec_channel *lod = &ZeroVec;
 2652    enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
 2653    uint chan;
 2654    unsigned char swizzles[4];
 2655    int8_t offsets[3];
 2656 
 2657    /* always fetch all 3 offsets, overkill but keeps code simple */
 2658    fetch_texel_offsets(mach, inst, offsets);
 2659 
 2660    assert(modifier != TEX_MODIFIER_PROJECTED);
 2661 
 2662    if (modifier != TEX_MODIFIER_NONE) {
 2663       if (modifier == TEX_MODIFIER_LOD_BIAS) {
 2664          FETCH(&c1, 3, TGSI_CHAN_X);
 2665          lod = &c1;
 2666          control = TGSI_SAMPLER_LOD_BIAS;
 2667       }
 2668       else if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
 2669          FETCH(&c1, 3, TGSI_CHAN_X);
 2670          lod = &c1;
 2671          control = TGSI_SAMPLER_LOD_EXPLICIT;
 2672       }
 2673       else if (modifier == TEX_MODIFIER_GATHER) {
 2674          control = TGSI_SAMPLER_GATHER;
 2675       }
 2676       else {
 2677          assert(modifier == TEX_MODIFIER_LEVEL_ZERO);
 2678          control = TGSI_SAMPLER_LOD_ZERO;
 2679       }
 2680    }
 2681 
 2682    FETCH(&r[0], 0, TGSI_CHAN_X);
 2683 
 2684    switch (mach->SamplerViews[resource_unit].Resource) {
 2685    case TGSI_TEXTURE_1D:
 2686       if (compare) {
 2687          FETCH(&r[2], 3, TGSI_CHAN_X);
 2688          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
 2689                      &r[0], &ZeroVec, &r[2], &ZeroVec, lod, /* S, T, P, C, LOD */
 2690                      NULL, offsets, control,
 2691                      &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
 2692       }
 2693       else {
 2694          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
 2695                      &r[0], &ZeroVec, &ZeroVec, &ZeroVec, lod, /* S, T, P, C, LOD */
 2696                      NULL, offsets, control,
 2697                      &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
 2698       }
 2699       break;
 2700 
 2701    case TGSI_TEXTURE_1D_ARRAY:
 2702    case TGSI_TEXTURE_2D:
 2703    case TGSI_TEXTURE_RECT:
 2704       FETCH(&r[1], 0, TGSI_CHAN_Y);
 2705       if (compare) {
 2706          FETCH(&r[2], 3, TGSI_CHAN_X);
 2707          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
 2708                      &r[0], &r[1], &r[2], &ZeroVec, lod,    /* S, T, P, C, LOD */
 2709                      NULL, offsets, control,
 2710                      &r[0], &r[1], &r[2], &r[3]);  /* outputs */
 2711       }
 2712       else {
 2713          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
 2714                      &r[0], &r[1], &ZeroVec, &ZeroVec, lod,    /* S, T, P, C, LOD */
 2715                      NULL, offsets, control,
 2716                      &r[0], &r[1], &r[2], &r[3]);  /* outputs */
 2717       }
 2718       break;
 2719 
 2720    case TGSI_TEXTURE_2D_ARRAY:
 2721    case TGSI_TEXTURE_3D:
 2722    case TGSI_TEXTURE_CUBE:
 2723       FETCH(&r[1], 0, TGSI_CHAN_Y);
 2724       FETCH(&r[2], 0, TGSI_CHAN_Z);
 2725       if(compare) {
 2726          FETCH(&r[3], 3, TGSI_CHAN_X);
 2727          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
 2728                      &r[0], &r[1], &r[2], &r[3], lod,
 2729                      NULL, offsets, control,
 2730                      &r[0], &r[1], &r[2], &r[3]);
 2731       }
 2732       else {
 2733          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
 2734                      &r[0], &r[1], &r[2], &ZeroVec, lod,
 2735                      NULL, offsets, control,
 2736                      &r[0], &r[1], &r[2], &r[3]);
 2737       }
 2738       break;
 2739 
 2740    case TGSI_TEXTURE_CUBE_ARRAY:
 2741       FETCH(&r[1], 0, TGSI_CHAN_Y);
 2742       FETCH(&r[2], 0, TGSI_CHAN_Z);
 2743       FETCH(&r[3], 0, TGSI_CHAN_W);
 2744       if(compare) {
 2745          FETCH(&r[4], 3, TGSI_CHAN_X);
 2746          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
 2747                      &r[0], &r[1], &r[2], &r[3], &r[4],
 2748                      NULL, offsets, control,
 2749                      &r[0], &r[1], &r[2], &r[3]);
 2750       }
 2751       else {
 2752          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
 2753                      &r[0], &r[1], &r[2], &r[3], lod,
 2754                      NULL, offsets, control,
 2755                      &r[0], &r[1], &r[2], &r[3]);
 2756       }
 2757       break;
 2758 
 2759 
 2760    default:
 2761       assert(0);
 2762    }
 2763 
 2764    swizzles[0] = inst->Src[1].Register.SwizzleX;
 2765    swizzles[1] = inst->Src[1].Register.SwizzleY;
 2766    swizzles[2] = inst->Src[1].Register.SwizzleZ;
 2767    swizzles[3] = inst->Src[1].Register.SwizzleW;
 2768 
 2769    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 2770       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 2771          store_dest(mach, &r[swizzles[chan]],
 2772                     &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
 2773       }
 2774    }
 2775 }
 2776 
 2777 static void
 2778 exec_sample_d(struct tgsi_exec_machine *mach,
 2779               const struct tgsi_full_instruction *inst)
 2780 {
 2781    const uint resource_unit = inst->Src[1].Register.Index;
 2782    const uint sampler_unit = inst->Src[2].Register.Index;
 2783    union tgsi_exec_channel r[4];
 2784    float derivs[3][2][TGSI_QUAD_SIZE];
 2785    uint chan;
 2786    unsigned char swizzles[4];
 2787    int8_t offsets[3];
 2788 
 2789    /* always fetch all 3 offsets, overkill but keeps code simple */
 2790    fetch_texel_offsets(mach, inst, offsets);
 2791 
 2792    FETCH(&r[0], 0, TGSI_CHAN_X);
 2793 
 2794    switch (mach->SamplerViews[resource_unit].Resource) {
 2795    case TGSI_TEXTURE_1D:
 2796    case TGSI_TEXTURE_1D_ARRAY:
 2797       /* only 1D array actually needs Y */
 2798       FETCH(&r[1], 0, TGSI_CHAN_Y);
 2799 
 2800       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
 2801 
 2802       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
 2803                   &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
 2804                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
 2805                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
 2806       break;
 2807 
 2808    case TGSI_TEXTURE_2D:
 2809    case TGSI_TEXTURE_RECT:
 2810    case TGSI_TEXTURE_2D_ARRAY:
 2811       /* only 2D array actually needs Z */
 2812       FETCH(&r[1], 0, TGSI_CHAN_Y);
 2813       FETCH(&r[2], 0, TGSI_CHAN_Z);
 2814 
 2815       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
 2816       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
 2817 
 2818       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
 2819                   &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* inputs */
 2820                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
 2821                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
 2822       break;
 2823 
 2824    case TGSI_TEXTURE_3D:
 2825    case TGSI_TEXTURE_CUBE:
 2826    case TGSI_TEXTURE_CUBE_ARRAY:
 2827       /* only cube array actually needs W */
 2828       FETCH(&r[1], 0, TGSI_CHAN_Y);
 2829       FETCH(&r[2], 0, TGSI_CHAN_Z);
 2830       FETCH(&r[3], 0, TGSI_CHAN_W);
 2831 
 2832       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
 2833       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
 2834       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Z, derivs[2]);
 2835 
 2836       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
 2837                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,
 2838                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
 2839                   &r[0], &r[1], &r[2], &r[3]);
 2840       break;
 2841 
 2842    default:
 2843       assert(0);
 2844    }
 2845 
 2846    swizzles[0] = inst->Src[1].Register.SwizzleX;
 2847    swizzles[1] = inst->Src[1].Register.SwizzleY;
 2848    swizzles[2] = inst->Src[1].Register.SwizzleZ;
 2849    swizzles[3] = inst->Src[1].Register.SwizzleW;
 2850 
 2851    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 2852       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 2853          store_dest(mach, &r[swizzles[chan]],
 2854                     &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
 2855       }
 2856    }
 2857 }
 2858 
 2859 
 2860 /**
 2861  * Evaluate a constant-valued coefficient at the position of the
 2862  * current quad.
 2863  */
 2864 static void
 2865 eval_constant_coef(
 2866    struct tgsi_exec_machine *mach,
 2867    unsigned attrib,
 2868    unsigned chan )
 2869 {
 2870    unsigned i;
 2871 
 2872    for( i = 0; i < TGSI_QUAD_SIZE; i++ ) {
 2873       mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
 2874    }
 2875 }
 2876 
 2877 static void
 2878 interp_constant_offset(
 2879       UNUSED const struct tgsi_exec_machine *mach,
 2880       UNUSED unsigned attrib,
 2881       UNUSED unsigned chan,
 2882       UNUSED float ofs_x,
 2883       UNUSED float ofs_y,
 2884       UNUSED union tgsi_exec_channel *out_chan)
 2885 {
 2886 }
 2887 
 2888 /**
 2889  * Evaluate a linear-valued coefficient at the position of the
 2890  * current quad.
 2891  */
 2892 static void
 2893 interp_linear_offset(
 2894       const struct tgsi_exec_machine *mach,
 2895       unsigned attrib,
 2896       unsigned chan,
 2897       float ofs_x,
 2898       float ofs_y,
 2899       union tgsi_exec_channel *out_chan)
 2900 {
 2901    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
 2902    const float dady = mach->InterpCoefs[attrib].dady[chan];
 2903    const float delta = ofs_x * dadx + ofs_y * dady;
 2904    out_chan->f[0] += delta;
 2905    out_chan->f[1] += delta;
 2906    out_chan->f[2] += delta;
 2907    out_chan->f[3] += delta;
 2908 }
 2909 
 2910 static void
 2911 eval_linear_coef(struct tgsi_exec_machine *mach,
 2912                  unsigned attrib,
 2913                  unsigned chan)
 2914 {
 2915    const float x = mach->QuadPos.xyzw[0].f[0];
 2916    const float y = mach->QuadPos.xyzw[1].f[0];
 2917    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
 2918    const float dady = mach->InterpCoefs[attrib].dady[chan];
 2919    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
 2920 
 2921    mach->Inputs[attrib].xyzw[chan].f[0] = a0;
 2922    mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
 2923    mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
 2924    mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
 2925 }
 2926 
 2927 /**
 2928  * Evaluate a perspective-valued coefficient at the position of the
 2929  * current quad.
 2930  */
 2931 
 2932 static void
 2933 interp_perspective_offset(
 2934    const struct tgsi_exec_machine *mach,
 2935    unsigned attrib,
 2936    unsigned chan,
 2937    float ofs_x,
 2938    float ofs_y,
 2939    union tgsi_exec_channel *out_chan)
 2940 {
 2941    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
 2942    const float dady = mach->InterpCoefs[attrib].dady[chan];
 2943    const float *w = mach->QuadPos.xyzw[3].f;
 2944    const float delta = ofs_x * dadx + ofs_y * dady;
 2945    out_chan->f[0] += delta / w[0];
 2946    out_chan->f[1] += delta / w[1];
 2947    out_chan->f[2] += delta / w[2];
 2948    out_chan->f[3] += delta / w[3];
 2949 }
 2950 
 2951 static void
 2952 eval_perspective_coef(
 2953    struct tgsi_exec_machine *mach,
 2954    unsigned attrib,
 2955    unsigned chan )
 2956 {
 2957    const float x = mach->QuadPos.xyzw[0].f[0];
 2958    const float y = mach->QuadPos.xyzw[1].f[0];
 2959    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
 2960    const float dady = mach->InterpCoefs[attrib].dady[chan];
 2961    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
 2962    const float *w = mach->QuadPos.xyzw[3].f;
 2963    /* divide by W here */
 2964    mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
 2965    mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
 2966    mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
 2967    mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
 2968 }
 2969 
 2970 
 2971 typedef void (* eval_coef_func)(
 2972    struct tgsi_exec_machine *mach,
 2973    unsigned attrib,
 2974    unsigned chan );
 2975 
 2976 static void
 2977 exec_declaration(struct tgsi_exec_machine *mach,
 2978                  const struct tgsi_full_declaration *decl)
 2979 {
 2980    if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
 2981       mach->SamplerViews[decl->Range.First] = decl->SamplerView;
 2982       return;
 2983    }
 2984 
 2985    if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
 2986       if (decl->Declaration.File == TGSI_FILE_INPUT) {
 2987          uint first, last, mask;
 2988 
 2989          first = decl->Range.First;
 2990          last = decl->Range.Last;
 2991          mask = decl->Declaration.UsageMask;
 2992 
 2993          /* XXX we could remove this special-case code since
 2994           * mach->InterpCoefs[first].a0 should already have the
 2995           * front/back-face value.  But we should first update the
 2996           * ureg code to emit the right UsageMask value (WRITEMASK_X).
 2997           * Then, we could remove the tgsi_exec_machine::Face field.
 2998           */
 2999          /* XXX make FACE a system value */
 3000          if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
 3001             uint i;
 3002 
 3003             assert(decl->Semantic.Index == 0);
 3004             assert(first == last);
 3005 
 3006             for (i = 0; i < TGSI_QUAD_SIZE; i++) {
 3007                mach->Inputs[first].xyzw[0].f[i] = mach->Face;
 3008             }
 3009          } else {
 3010             eval_coef_func eval;
 3011             apply_sample_offset_func interp;
 3012             uint i, j;
 3013 
 3014             switch (decl->Interp.Interpolate) {
 3015             case TGSI_INTERPOLATE_CONSTANT:
 3016                eval = eval_constant_coef;
 3017                interp = interp_constant_offset;
 3018                break;
 3019 
 3020             case TGSI_INTERPOLATE_LINEAR:
 3021                eval = eval_linear_coef;
 3022                interp = interp_linear_offset;
 3023                break;
 3024 
 3025             case TGSI_INTERPOLATE_PERSPECTIVE:
 3026                eval = eval_perspective_coef;
 3027                interp = interp_perspective_offset;
 3028                break;
 3029 
 3030             case TGSI_INTERPOLATE_COLOR:
 3031                eval = mach->flatshade_color ? eval_constant_coef : eval_perspective_coef;
 3032                interp = mach->flatshade_color ? interp_constant_offset : interp_perspective_offset;
 3033                break;
 3034 
 3035             default:
 3036                assert(0);
 3037                return;
 3038             }
 3039 
 3040             for (i = first; i <= last; i++)
 3041                mach->InputSampleOffsetApply[i] = interp;
 3042 
 3043             for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
 3044                if (mask & (1 << j)) {
 3045                   for (i = first; i <= last; i++) {
 3046                      eval(mach, i, j);
 3047                   }
 3048                }
 3049             }
 3050          }
 3051 
 3052          if (DEBUG_EXECUTION) {
 3053             uint i, j;
 3054             for (i = first; i <= last; ++i) {
 3055                debug_printf("IN[%2u] = ", i);
 3056                for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
 3057                   if (j > 0) {
 3058                      debug_printf("         ");
 3059                   }
 3060                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
 3061                                mach->Inputs[i].xyzw[0].f[j], mach->Inputs[i].xyzw[0].u[j],
 3062                                mach->Inputs[i].xyzw[1].f[j], mach->Inputs[i].xyzw[1].u[j],
 3063                                mach->Inputs[i].xyzw[2].f[j], mach->Inputs[i].xyzw[2].u[j],
 3064                                mach->Inputs[i].xyzw[3].f[j], mach->Inputs[i].xyzw[3].u[j]);
 3065                }
 3066             }
 3067          }
 3068       }
 3069    }
 3070 
 3071 }
 3072 
 3073 typedef void (* micro_unary_op)(union tgsi_exec_channel *dst,
 3074                                 const union tgsi_exec_channel *src);
 3075 
 3076 static void
 3077 exec_scalar_unary(struct tgsi_exec_machine *mach,
 3078                   const struct tgsi_full_instruction *inst,
 3079                   micro_unary_op op,
 3080                   enum tgsi_exec_datatype dst_datatype,
 3081                   enum tgsi_exec_datatype src_datatype)
 3082 {
 3083    unsigned int chan;
 3084    union tgsi_exec_channel src;
 3085    union tgsi_exec_channel dst;
 3086 
 3087    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
 3088    op(&dst, &src);
 3089    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3090       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3091          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
 3092       }
 3093    }
 3094 }
 3095 
 3096 static void
 3097 exec_vector_unary(struct tgsi_exec_machine *mach,
 3098                   const struct tgsi_full_instruction *inst,
 3099                   micro_unary_op op,
 3100                   enum tgsi_exec_datatype dst_datatype,
 3101                   enum tgsi_exec_datatype src_datatype)
 3102 {
 3103    unsigned int chan;
 3104    struct tgsi_exec_vector dst;
 3105 
 3106    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3107       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3108          union tgsi_exec_channel src;
 3109 
 3110          fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
 3111          op(&dst.xyzw[chan], &src);
 3112       }
 3113    }
 3114    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3115       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3116          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
 3117       }
 3118    }
 3119 }
 3120 
 3121 typedef void (* micro_binary_op)(union tgsi_exec_channel *dst,
 3122                                  const union tgsi_exec_channel *src0,
 3123                                  const union tgsi_exec_channel *src1);
 3124 
 3125 static void
 3126 exec_scalar_binary(struct tgsi_exec_machine *mach,
 3127                    const struct tgsi_full_instruction *inst,
 3128                    micro_binary_op op,
 3129                    enum tgsi_exec_datatype dst_datatype,
 3130                    enum tgsi_exec_datatype src_datatype)
 3131 {
 3132    unsigned int chan;
 3133    union tgsi_exec_channel src[2];
 3134    union tgsi_exec_channel dst;
 3135 
 3136    fetch_source(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, src_datatype);
 3137    fetch_source(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, src_datatype);
 3138    op(&dst, &src[0], &src[1]);
 3139    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3140       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3141          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
 3142       }
 3143    }
 3144 }
 3145 
 3146 static void
 3147 exec_vector_binary(struct tgsi_exec_machine *mach,
 3148                    const struct tgsi_full_instruction *inst,
 3149                    micro_binary_op op,
 3150                    enum tgsi_exec_datatype dst_datatype,
 3151                    enum tgsi_exec_datatype src_datatype)
 3152 {
 3153    unsigned int chan;
 3154    struct tgsi_exec_vector dst;
 3155 
 3156    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3157       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3158          union tgsi_exec_channel src[2];
 3159 
 3160          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
 3161          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
 3162          op(&dst.xyzw[chan], &src[0], &src[1]);
 3163       }
 3164    }
 3165    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3166       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3167          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
 3168       }
 3169    }
 3170 }
 3171 
 3172 typedef void (* micro_trinary_op)(union tgsi_exec_channel *dst,
 3173                                   const union tgsi_exec_channel *src0,
 3174                                   const union tgsi_exec_channel *src1,
 3175                                   const union tgsi_exec_channel *src2);
 3176 
 3177 static void
 3178 exec_vector_trinary(struct tgsi_exec_machine *mach,
 3179                     const struct tgsi_full_instruction *inst,
 3180                     micro_trinary_op op,
 3181                     enum tgsi_exec_datatype dst_datatype,
 3182                     enum tgsi_exec_datatype src_datatype)
 3183 {
 3184    unsigned int chan;
 3185    struct tgsi_exec_vector dst;
 3186 
 3187    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3188       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3189          union tgsi_exec_channel src[3];
 3190 
 3191          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
 3192          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
 3193          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
 3194          op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
 3195       }
 3196    }
 3197    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3198       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3199          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
 3200       }
 3201    }
 3202 }
 3203 
 3204 typedef void (* micro_quaternary_op)(union tgsi_exec_channel *dst,
 3205                                      const union tgsi_exec_channel *src0,
 3206                                      const union tgsi_exec_channel *src1,
 3207                                      const union tgsi_exec_channel *src2,
 3208                                      const union tgsi_exec_channel *src3);
 3209 
 3210 static void
 3211 exec_vector_quaternary(struct tgsi_exec_machine *mach,
 3212                        const struct tgsi_full_instruction *inst,
 3213                        micro_quaternary_op op,
 3214                        enum tgsi_exec_datatype dst_datatype,
 3215                        enum tgsi_exec_datatype src_datatype)
 3216 {
 3217    unsigned int chan;
 3218    struct tgsi_exec_vector dst;
 3219 
 3220    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3221       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3222          union tgsi_exec_channel src[4];
 3223 
 3224          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
 3225          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
 3226          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
 3227          fetch_source(mach, &src[3], &inst->Src[3], chan, src_datatype);
 3228          op(&dst.xyzw[chan], &src[0], &src[1], &src[2], &src[3]);
 3229       }
 3230    }
 3231    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3232       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3233          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
 3234       }
 3235    }
 3236 }
 3237 
 3238 static void
 3239 exec_dp3(struct tgsi_exec_machine *mach,
 3240          const struct tgsi_full_instruction *inst)
 3241 {
 3242    unsigned int chan;
 3243    union tgsi_exec_channel arg[3];
 3244 
 3245    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
 3246    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
 3247    micro_mul(&arg[2], &arg[0], &arg[1]);
 3248 
 3249    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_Z; chan++) {
 3250       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
 3251       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
 3252       micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
 3253    }
 3254 
 3255    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3256       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3257          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
 3258       }
 3259    }
 3260 }
 3261 
 3262 static void
 3263 exec_dp4(struct tgsi_exec_machine *mach,
 3264          const struct tgsi_full_instruction *inst)
 3265 {
 3266    unsigned int chan;
 3267    union tgsi_exec_channel arg[3];
 3268 
 3269    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
 3270    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
 3271    micro_mul(&arg[2], &arg[0], &arg[1]);
 3272 
 3273    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_W; chan++) {
 3274       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
 3275       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
 3276       micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
 3277    }
 3278 
 3279    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3280       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3281          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
 3282       }
 3283    }
 3284 }
 3285 
 3286 static void
 3287 exec_dp2(struct tgsi_exec_machine *mach,
 3288          const struct tgsi_full_instruction *inst)
 3289 {
 3290    unsigned int chan;
 3291    union tgsi_exec_channel arg[3];
 3292 
 3293    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
 3294    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
 3295    micro_mul(&arg[2], &arg[0], &arg[1]);
 3296 
 3297    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
 3298    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
 3299    micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
 3300 
 3301    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3302       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3303          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
 3304       }
 3305    }
 3306 }
 3307 
 3308 static void
 3309 exec_pk2h(struct tgsi_exec_machine *mach,
 3310           const struct tgsi_full_instruction *inst)
 3311 {
 3312    unsigned chan;
 3313    union tgsi_exec_channel arg[2], dst;
 3314 
 3315    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
 3316    fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
 3317    for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
 3318       dst.u[chan] = util_float_to_half(arg[0].f[chan]) |
 3319          (util_float_to_half(arg[1].f[chan]) << 16);
 3320    }
 3321    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3322       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3323          store_dest(mach, &dst, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_UINT);
 3324       }
 3325    }
 3326 }
 3327 
 3328 static void
 3329 exec_up2h(struct tgsi_exec_machine *mach,
 3330           const struct tgsi_full_instruction *inst)
 3331 {
 3332    unsigned chan;
 3333    union tgsi_exec_channel arg, dst[2];
 3334 
 3335    fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
 3336    for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
 3337       dst[0].f[chan] = util_half_to_float(arg.u[chan] & 0xffff);
 3338       dst[1].f[chan] = util_half_to_float(arg.u[chan] >> 16);
 3339    }
 3340    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3341       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3342          store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
 3343       }
 3344    }
 3345 }
 3346 
 3347 static void
 3348 micro_ucmp(union tgsi_exec_channel *dst,
 3349            const union tgsi_exec_channel *src0,
 3350            const union tgsi_exec_channel *src1,
 3351            const union tgsi_exec_channel *src2)
 3352 {
 3353    dst->f[0] = src0->u[0] ? src1->f[0] : src2->f[0];
 3354    dst->f[1] = src0->u[1] ? src1->f[1] : src2->f[1];
 3355    dst->f[2] = src0->u[2] ? src1->f[2] : src2->f[2];
 3356    dst->f[3] = src0->u[3] ? src1->f[3] : src2->f[3];
 3357 }
 3358 
 3359 static void
 3360 exec_ucmp(struct tgsi_exec_machine *mach,
 3361           const struct tgsi_full_instruction *inst)
 3362 {
 3363    unsigned int chan;
 3364    struct tgsi_exec_vector dst;
 3365 
 3366    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3367       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3368          union tgsi_exec_channel src[3];
 3369 
 3370          fetch_source(mach, &src[0], &inst->Src[0], chan,
 3371                       TGSI_EXEC_DATA_UINT);
 3372          fetch_source(mach, &src[1], &inst->Src[1], chan,
 3373                       TGSI_EXEC_DATA_FLOAT);
 3374          fetch_source(mach, &src[2], &inst->Src[2], chan,
 3375                       TGSI_EXEC_DATA_FLOAT);
 3376          micro_ucmp(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
 3377       }
 3378    }
 3379    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3380       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3381          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan,
 3382                     TGSI_EXEC_DATA_FLOAT);
 3383       }
 3384    }
 3385 }
 3386 
 3387 static void
 3388 exec_dst(struct tgsi_exec_machine *mach,
 3389          const struct tgsi_full_instruction *inst)
 3390 {
 3391    union tgsi_exec_channel r[2];
 3392    union tgsi_exec_channel d[4];
 3393 
 3394    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
 3395       fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
 3396       fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
 3397       micro_mul(&d[TGSI_CHAN_Y], &r[0], &r[1]);
 3398    }
 3399    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
 3400       fetch_source(mach, &d[TGSI_CHAN_Z], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
 3401    }
 3402    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
 3403       fetch_source(mach, &d[TGSI_CHAN_W], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
 3404    }
 3405 
 3406    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
 3407       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
 3408    }
 3409    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
 3410       store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
 3411    }
 3412    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
 3413       store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
 3414    }
 3415    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
 3416       store_dest(mach, &d[TGSI_CHAN_W], &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
 3417    }
 3418 }
 3419 
 3420 static void
 3421 exec_log(struct tgsi_exec_machine *mach,
 3422          const struct tgsi_full_instruction *inst)
 3423 {
 3424    union tgsi_exec_channel r[3];
 3425 
 3426    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
 3427    micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
 3428    micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
 3429    micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
 3430    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
 3431       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
 3432    }
 3433    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
 3434       micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
 3435       micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
 3436       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
 3437    }
 3438    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
 3439       store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
 3440    }
 3441    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
 3442       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
 3443    }
 3444 }
 3445 
 3446 static void
 3447 exec_exp(struct tgsi_exec_machine *mach,
 3448          const struct tgsi_full_instruction *inst)
 3449 {
 3450    union tgsi_exec_channel r[3];
 3451 
 3452    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
 3453    micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
 3454    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
 3455       micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
 3456       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
 3457    }
 3458    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
 3459       micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
 3460       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
 3461    }
 3462    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
 3463       micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
 3464       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
 3465    }
 3466    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
 3467       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
 3468    }
 3469 }
 3470 
 3471 static void
 3472 exec_lit(struct tgsi_exec_machine *mach,
 3473          const struct tgsi_full_instruction *inst)
 3474 {
 3475    union tgsi_exec_channel r[3];
 3476    union tgsi_exec_channel d[3];
 3477 
 3478    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
 3479       fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
 3480       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
 3481          fetch_source(mach, &r[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
 3482          micro_max(&r[1], &r[1], &ZeroVec);
 3483 
 3484          fetch_source(mach, &r[2], &inst->Src[0], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
 3485          micro_min(&r[2], &r[2], &P128Vec);
 3486          micro_max(&r[2], &r[2], &M128Vec);
 3487          micro_pow(&r[1], &r[1], &r[2]);
 3488          micro_lt(&d[TGSI_CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
 3489          store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
 3490       }
 3491       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
 3492          micro_max(&d[TGSI_CHAN_Y], &r[0], &ZeroVec);
 3493          store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
 3494       }
 3495    }
 3496    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
 3497       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
 3498    }
 3499 
 3500    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
 3501       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
 3502    }
 3503 }
 3504 
 3505 static void
 3506 exec_break(struct tgsi_exec_machine *mach)
 3507 {
 3508    if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
 3509       /* turn off loop channels for each enabled exec channel */
 3510       mach->LoopMask &= ~mach->ExecMask;
 3511       /* Todo: if mach->LoopMask == 0, jump to end of loop */
 3512       UPDATE_EXEC_MASK(mach);
 3513    } else {
 3514       assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
 3515 
 3516       mach->Switch.mask = 0x0;
 3517 
 3518       UPDATE_EXEC_MASK(mach);
 3519    }
 3520 }
 3521 
 3522 static void
 3523 exec_switch(struct tgsi_exec_machine *mach,
 3524             const struct tgsi_full_instruction *inst)
 3525 {
 3526    assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
 3527    assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
 3528 
 3529    mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
 3530    fetch_source(mach, &mach->Switch.selector, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
 3531    mach->Switch.mask = 0x0;
 3532    mach->Switch.defaultMask = 0x0;
 3533 
 3534    mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
 3535    mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
 3536 
 3537    UPDATE_EXEC_MASK(mach);
 3538 }
 3539 
 3540 static void
 3541 exec_case(struct tgsi_exec_machine *mach,
 3542           const struct tgsi_full_instruction *inst)
 3543 {
 3544    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
 3545    union tgsi_exec_channel src;
 3546    uint mask = 0;
 3547 
 3548    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
 3549 
 3550    if (mach->Switch.selector.u[0] == src.u[0]) {
 3551       mask |= 0x1;
 3552    }
 3553    if (mach->Switch.selector.u[1] == src.u[1]) {
 3554       mask |= 0x2;
 3555    }
 3556    if (mach->Switch.selector.u[2] == src.u[2]) {
 3557       mask |= 0x4;
 3558    }
 3559    if (mach->Switch.selector.u[3] == src.u[3]) {
 3560       mask |= 0x8;
 3561    }
 3562 
 3563    mach->Switch.defaultMask |= mask;
 3564 
 3565    mach->Switch.mask |= mask & prevMask;
 3566 
 3567    UPDATE_EXEC_MASK(mach);
 3568 }
 3569 
 3570 /* FIXME: this will only work if default is last */
 3571 static void
 3572 exec_default(struct tgsi_exec_machine *mach)
 3573 {
 3574    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
 3575 
 3576    mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
 3577 
 3578    UPDATE_EXEC_MASK(mach);
 3579 }
 3580 
 3581 static void
 3582 exec_endswitch(struct tgsi_exec_machine *mach)
 3583 {
 3584    mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
 3585    mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
 3586 
 3587    UPDATE_EXEC_MASK(mach);
 3588 }
 3589 
 3590 typedef void (* micro_dop)(union tgsi_double_channel *dst,
 3591                            const union tgsi_double_channel *src);
 3592 
 3593 typedef void (* micro_dop_sop)(union tgsi_double_channel *dst,
 3594                                const union tgsi_double_channel *src0,
 3595                                union tgsi_exec_channel *src1);
 3596 
 3597 typedef void (* micro_dop_s)(union tgsi_double_channel *dst,
 3598                              const union tgsi_exec_channel *src);
 3599 
 3600 typedef void (* micro_sop_d)(union tgsi_exec_channel *dst,
 3601                              const union tgsi_double_channel *src);
 3602 
 3603 static void
 3604 fetch_double_channel(struct tgsi_exec_machine *mach,
 3605                      union tgsi_double_channel *chan,
 3606                      const struct tgsi_full_src_register *reg,
 3607                      uint chan_0,
 3608                      uint chan_1)
 3609 {
 3610    union tgsi_exec_channel src[2];
 3611    uint i;
 3612 
 3613    fetch_source_d(mach, &src[0], reg, chan_0);
 3614    fetch_source_d(mach, &src[1], reg, chan_1);
 3615 
 3616    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
 3617       chan->u[i][0] = src[0].u[i];
 3618       chan->u[i][1] = src[1].u[i];
 3619    }
 3620    if (reg->Register.Absolute) {
 3621       micro_dabs(chan, chan);
 3622    }
 3623    if (reg->Register.Negate) {
 3624       micro_dneg(chan, chan);
 3625    }
 3626 }
 3627 
 3628 static void
 3629 store_double_channel(struct tgsi_exec_machine *mach,
 3630                      const union tgsi_double_channel *chan,
 3631                      const struct tgsi_full_dst_register *reg,
 3632                      const struct tgsi_full_instruction *inst,
 3633                      uint chan_0,
 3634                      uint chan_1)
 3635 {
 3636    union tgsi_exec_channel dst[2];
 3637    uint i;
 3638    union tgsi_double_channel temp;
 3639    const uint execmask = mach->ExecMask;
 3640 
 3641    if (!inst->Instruction.Saturate) {
 3642       for (i = 0; i < TGSI_QUAD_SIZE; i++)
 3643          if (execmask & (1 << i)) {
 3644             dst[0].u[i] = chan->u[i][0];
 3645             dst[1].u[i] = chan->u[i][1];
 3646          }
 3647    }
 3648    else {
 3649       for (i = 0; i < TGSI_QUAD_SIZE; i++)
 3650          if (execmask & (1 << i)) {
 3651             if (chan->d[i] < 0.0)
 3652                temp.d[i] = 0.0;
 3653             else if (chan->d[i] > 1.0)
 3654                temp.d[i] = 1.0;
 3655             else
 3656                temp.d[i] = chan->d[i];
 3657 
 3658             dst[0].u[i] = temp.u[i][0];
 3659             dst[1].u[i] = temp.u[i][1];
 3660          }
 3661    }
 3662 
 3663    store_dest_double(mach, &dst[0], reg, chan_0, TGSI_EXEC_DATA_UINT);
 3664    if (chan_1 != (unsigned)-1)
 3665       store_dest_double(mach, &dst[1], reg, chan_1, TGSI_EXEC_DATA_UINT);
 3666 }
 3667 
 3668 static void
 3669 exec_double_unary(struct tgsi_exec_machine *mach,
 3670                   const struct tgsi_full_instruction *inst,
 3671                   micro_dop op)
 3672 {
 3673    union tgsi_double_channel src;
 3674    union tgsi_double_channel dst;
 3675 
 3676    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
 3677       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
 3678       op(&dst, &src);
 3679       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
 3680    }
 3681    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
 3682       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
 3683       op(&dst, &src);
 3684       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
 3685    }
 3686 }
 3687 
 3688 static void
 3689 exec_double_binary(struct tgsi_exec_machine *mach,
 3690                    const struct tgsi_full_instruction *inst,
 3691                    micro_dop op,
 3692                    enum tgsi_exec_datatype dst_datatype)
 3693 {
 3694    union tgsi_double_channel src[2];
 3695    union tgsi_double_channel dst;
 3696    int first_dest_chan, second_dest_chan;
 3697    int wmask;
 3698 
 3699    wmask = inst->Dst[0].Register.WriteMask;
 3700    /* these are & because of the way DSLT etc store their destinations */
 3701    if (wmask & TGSI_WRITEMASK_XY) {
 3702       first_dest_chan = TGSI_CHAN_X;
 3703       second_dest_chan = TGSI_CHAN_Y;
 3704       if (dst_datatype == TGSI_EXEC_DATA_UINT) {
 3705          first_dest_chan = (wmask & TGSI_WRITEMASK_X) ? TGSI_CHAN_X : TGSI_CHAN_Y;
 3706          second_dest_chan = -1;
 3707       }
 3708 
 3709       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
 3710       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
 3711       op(&dst, src);
 3712       store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
 3713    }
 3714 
 3715    if (wmask & TGSI_WRITEMASK_ZW) {
 3716       first_dest_chan = TGSI_CHAN_Z;
 3717       second_dest_chan = TGSI_CHAN_W;
 3718       if (dst_datatype == TGSI_EXEC_DATA_UINT) {
 3719          first_dest_chan = (wmask & TGSI_WRITEMASK_Z) ? TGSI_CHAN_Z : TGSI_CHAN_W;
 3720          second_dest_chan = -1;
 3721       }
 3722 
 3723       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
 3724       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
 3725       op(&dst, src);
 3726       store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
 3727    }
 3728 }
 3729 
 3730 static void
 3731 exec_double_trinary(struct tgsi_exec_machine *mach,
 3732                     const struct tgsi_full_instruction *inst,
 3733                     micro_dop op)
 3734 {
 3735    union tgsi_double_channel src[3];
 3736    union tgsi_double_channel dst;
 3737 
 3738    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
 3739       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
 3740       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
 3741       fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_X, TGSI_CHAN_Y);
 3742       op(&dst, src);
 3743       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
 3744    }
 3745    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
 3746       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
 3747       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
 3748       fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_Z, TGSI_CHAN_W);
 3749       op(&dst, src);
 3750       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
 3751    }
 3752 }
 3753 
 3754 static void
 3755 exec_dldexp(struct tgsi_exec_machine *mach,
 3756             const struct tgsi_full_instruction *inst)
 3757 {
 3758    union tgsi_double_channel src0;
 3759    union tgsi_exec_channel src1;
 3760    union tgsi_double_channel dst;
 3761    int wmask;
 3762 
 3763    wmask = inst->Dst[0].Register.WriteMask;
 3764    if (wmask & TGSI_WRITEMASK_XY) {
 3765       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
 3766       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
 3767       micro_dldexp(&dst, &src0, &src1);
 3768       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
 3769    }
 3770 
 3771    if (wmask & TGSI_WRITEMASK_ZW) {
 3772       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
 3773       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
 3774       micro_dldexp(&dst, &src0, &src1);
 3775       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
 3776    }
 3777 }
 3778 
 3779 static void
 3780 exec_dfracexp(struct tgsi_exec_machine *mach,
 3781               const struct tgsi_full_instruction *inst)
 3782 {
 3783    union tgsi_double_channel src;
 3784    union tgsi_double_channel dst;
 3785    union tgsi_exec_channel dst_exp;
 3786 
 3787    fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
 3788    micro_dfracexp(&dst, &dst_exp, &src);
 3789    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY)
 3790       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
 3791    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW)
 3792       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
 3793    for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3794       if (inst->Dst[1].Register.WriteMask & (1 << chan))
 3795          store_dest(mach, &dst_exp, &inst->Dst[1], inst, chan, TGSI_EXEC_DATA_INT);
 3796    }
 3797 }
 3798 
 3799 static void
 3800 exec_arg0_64_arg1_32(struct tgsi_exec_machine *mach,
 3801             const struct tgsi_full_instruction *inst,
 3802             micro_dop_sop op)
 3803 {
 3804    union tgsi_double_channel src0;
 3805    union tgsi_exec_channel src1;
 3806    union tgsi_double_channel dst;
 3807    int wmask;
 3808 
 3809    wmask = inst->Dst[0].Register.WriteMask;
 3810    if (wmask & TGSI_WRITEMASK_XY) {
 3811       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
 3812       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
 3813       op(&dst, &src0, &src1);
 3814       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
 3815    }
 3816 
 3817    if (wmask & TGSI_WRITEMASK_ZW) {
 3818       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
 3819       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
 3820       op(&dst, &src0, &src1);
 3821       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
 3822    }
 3823 }
 3824 
 3825 static int
 3826 get_image_coord_dim(unsigned tgsi_tex)
 3827 {
 3828    int dim;
 3829    switch (tgsi_tex) {
 3830    case TGSI_TEXTURE_BUFFER:
 3831    case TGSI_TEXTURE_1D:
 3832       dim = 1;
 3833       break;
 3834    case TGSI_TEXTURE_2D:
 3835    case TGSI_TEXTURE_RECT:
 3836    case TGSI_TEXTURE_1D_ARRAY:
 3837    case TGSI_TEXTURE_2D_MSAA:
 3838       dim = 2;
 3839       break;
 3840    case TGSI_TEXTURE_3D:
 3841    case TGSI_TEXTURE_CUBE:
 3842    case TGSI_TEXTURE_2D_ARRAY:
 3843    case TGSI_TEXTURE_2D_ARRAY_MSAA:
 3844    case TGSI_TEXTURE_CUBE_ARRAY:
 3845       dim = 3;
 3846       break;
 3847    default:
 3848       assert(!"unknown texture target");
 3849       dim = 0;
 3850       break;
 3851    }
 3852 
 3853    return dim;
 3854 }
 3855 
 3856 static int
 3857 get_image_coord_sample(unsigned tgsi_tex)
 3858 {
 3859    int sample = 0;
 3860    switch (tgsi_tex) {
 3861    case TGSI_TEXTURE_2D_MSAA:
 3862       sample = 3;
 3863       break;
 3864    case TGSI_TEXTURE_2D_ARRAY_MSAA:
 3865       sample = 4;
 3866       break;
 3867    default:
 3868       break;
 3869    }
 3870    return sample;
 3871 }
 3872 
 3873 static void
 3874 exec_load_img(struct tgsi_exec_machine *mach,
 3875               const struct tgsi_full_instruction *inst)
 3876 {
 3877    union tgsi_exec_channel r[4], sample_r;
 3878    uint unit;
 3879    int sample;
 3880    int i, j;
 3881    int dim;
 3882    uint chan;
 3883    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
 3884    struct tgsi_image_params params;
 3885    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
 3886 
 3887    unit = fetch_sampler_unit(mach, inst, 0);
 3888    dim = get_image_coord_dim(inst->Memory.Texture);
 3889    sample = get_image_coord_sample(inst->Memory.Texture);
 3890    assert(dim <= 3);
 3891 
 3892    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
 3893    params.unit = unit;
 3894    params.tgsi_tex_instr = inst->Memory.Texture;
 3895    params.format = inst->Memory.Format;
 3896 
 3897    for (i = 0; i < dim; i++) {
 3898       IFETCH(&r[i], 1, TGSI_CHAN_X + i);
 3899    }
 3900 
 3901    if (sample)
 3902       IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
 3903 
 3904    mach->Image->load(mach->Image, &params,
 3905                      r[0].i, r[1].i, r[2].i, sample_r.i,
 3906                      rgba);
 3907    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
 3908       r[0].f[j] = rgba[0][j];
 3909       r[1].f[j] = rgba[1][j];
 3910       r[2].f[j] = rgba[2][j];
 3911       r[3].f[j] = rgba[3][j];
 3912    }
 3913    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3914       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3915          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
 3916       }
 3917    }
 3918 }
 3919 
 3920 static void
 3921 exec_load_buf(struct tgsi_exec_machine *mach,
 3922               const struct tgsi_full_instruction *inst)
 3923 {
 3924    union tgsi_exec_channel r[4];
 3925    uint unit;
 3926    int j;
 3927    uint chan;
 3928    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
 3929    struct tgsi_buffer_params params;
 3930    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
 3931 
 3932    unit = fetch_sampler_unit(mach, inst, 0);
 3933 
 3934    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
 3935    params.unit = unit;
 3936    IFETCH(&r[0], 1, TGSI_CHAN_X);
 3937 
 3938    mach->Buffer->load(mach->Buffer, &params,
 3939                       r[0].i, rgba);
 3940    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
 3941       r[0].f[j] = rgba[0][j];
 3942       r[1].f[j] = rgba[1][j];
 3943       r[2].f[j] = rgba[2][j];
 3944       r[3].f[j] = rgba[3][j];
 3945    }
 3946    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3947       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3948          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
 3949       }
 3950    }
 3951 }
 3952 
 3953 static void
 3954 exec_load_mem(struct tgsi_exec_machine *mach,
 3955               const struct tgsi_full_instruction *inst)
 3956 {
 3957    union tgsi_exec_channel r[4];
 3958    uint chan;
 3959    char *ptr = mach->LocalMem;
 3960    uint32_t offset;
 3961    int j;
 3962 
 3963    IFETCH(&r[0], 1, TGSI_CHAN_X);
 3964    if (r[0].u[0] >= mach->LocalMemSize)
 3965       return;
 3966 
 3967    offset = r[0].u[0];
 3968    ptr += offset;
 3969 
 3970    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
 3971       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3972          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3973             memcpy(&r[chan].u[j], ptr + (4 * chan), 4);
 3974          }
 3975       }
 3976    }
 3977 
 3978    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 3979       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 3980          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
 3981       }
 3982    }
 3983 }
 3984 
 3985 static void
 3986 exec_load(struct tgsi_exec_machine *mach,
 3987           const struct tgsi_full_instruction *inst)
 3988 {
 3989    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
 3990       exec_load_img(mach, inst);
 3991    else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
 3992       exec_load_buf(mach, inst);
 3993    else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
 3994       exec_load_mem(mach, inst);
 3995 }
 3996 
 3997 static uint
 3998 fetch_store_img_unit(struct tgsi_exec_machine *mach,
 3999                      const struct tgsi_full_dst_register *dst)
 4000 {
 4001    uint unit = 0;
 4002    int i;
 4003    if (dst->Register.Indirect) {
 4004       union tgsi_exec_channel indir_index, index2;
 4005       const uint execmask = mach->ExecMask;
 4006       index2.i[0] =
 4007       index2.i[1] =
 4008       index2.i[2] =
 4009       index2.i[3] = dst->Indirect.Index;
 4010 
 4011       fetch_src_file_channel(mach,
 4012                              dst->Indirect.File,
 4013                              dst->Indirect.Swizzle,
 4014                              &index2,
 4015                              &ZeroVec,
 4016                              &indir_index);
 4017       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
 4018          if (execmask & (1 << i)) {
 4019             unit = dst->Register.Index + indir_index.i[i];
 4020             break;
 4021          }
 4022       }
 4023    } else {
 4024       unit = dst->Register.Index;
 4025    }
 4026    return unit;
 4027 }
 4028 
 4029 static void
 4030 exec_store_img(struct tgsi_exec_machine *mach,
 4031                const struct tgsi_full_instruction *inst)
 4032 {
 4033    union tgsi_exec_channel r[3], sample_r;
 4034    union tgsi_exec_channel value[4];
 4035    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
 4036    struct tgsi_image_params params;
 4037    int dim;
 4038    int sample;
 4039    int i, j;
 4040    uint unit;
 4041    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
 4042    unit = fetch_store_img_unit(mach, &inst->Dst[0]);
 4043    dim = get_image_coord_dim(inst->Memory.Texture);
 4044    sample = get_image_coord_sample(inst->Memory.Texture);
 4045    assert(dim <= 3);
 4046 
 4047    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
 4048    params.unit = unit;
 4049    params.tgsi_tex_instr = inst->Memory.Texture;
 4050    params.format = inst->Memory.Format;
 4051 
 4052    for (i = 0; i < dim; i++) {
 4053       IFETCH(&r[i], 0, TGSI_CHAN_X + i);
 4054    }
 4055 
 4056    for (i = 0; i < 4; i++) {
 4057       FETCH(&value[i], 1, TGSI_CHAN_X + i);
 4058    }
 4059    if (sample)
 4060       IFETCH(&sample_r, 0, TGSI_CHAN_X + sample);
 4061 
 4062    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
 4063       rgba[0][j] = value[0].f[j];
 4064       rgba[1][j] = value[1].f[j];
 4065       rgba[2][j] = value[2].f[j];
 4066       rgba[3][j] = value[3].f[j];
 4067    }
 4068 
 4069    mach->Image->store(mach->Image, &params,
 4070                       r[0].i, r[1].i, r[2].i, sample_r.i,
 4071                       rgba);
 4072 }
 4073 
 4074 static void
 4075 exec_store_buf(struct tgsi_exec_machine *mach,
 4076                const struct tgsi_full_instruction *inst)
 4077 {
 4078    union tgsi_exec_channel r[3];
 4079    union tgsi_exec_channel value[4];
 4080    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
 4081    struct tgsi_buffer_params params;
 4082    int i, j;
 4083    uint unit;
 4084    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
 4085 
 4086    unit = fetch_store_img_unit(mach, &inst->Dst[0]);
 4087 
 4088    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
 4089    params.unit = unit;
 4090    params.writemask = inst->Dst[0].Register.WriteMask;
 4091 
 4092    IFETCH(&r[0], 0, TGSI_CHAN_X);
 4093    for (i = 0; i < 4; i++) {
 4094       FETCH(&value[i], 1, TGSI_CHAN_X + i);
 4095    }
 4096 
 4097    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
 4098       rgba[0][j] = value[0].f[j];
 4099       rgba[1][j] = value[1].f[j];
 4100       rgba[2][j] = value[2].f[j];
 4101       rgba[3][j] = value[3].f[j];
 4102    }
 4103 
 4104    mach->Buffer->store(mach->Buffer, &params,
 4105                       r[0].i,
 4106                       rgba);
 4107 }
 4108 
 4109 static void
 4110 exec_store_mem(struct tgsi_exec_machine *mach,
 4111                const struct tgsi_full_instruction *inst)
 4112 {
 4113    union tgsi_exec_channel r[3];
 4114    union tgsi_exec_channel value[4];
 4115    uint i, chan;
 4116    char *ptr = mach->LocalMem;
 4117    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
 4118    int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
 4119 
 4120    IFETCH(&r[0], 0, TGSI_CHAN_X);
 4121 
 4122    for (i = 0; i < 4; i++) {
 4123       FETCH(&value[i], 1, TGSI_CHAN_X + i);
 4124    }
 4125 
 4126    if (r[0].u[0] >= mach->LocalMemSize)
 4127       return;
 4128    ptr += r[0].u[0];
 4129 
 4130    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
 4131       if (execmask & (1 << i)) {
 4132          for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 4133             if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 4134                memcpy(ptr + (chan * 4), &value[chan].u[0], 4);
 4135             }
 4136          }
 4137       }
 4138    }
 4139 }
 4140 
 4141 static void
 4142 exec_store(struct tgsi_exec_machine *mach,
 4143            const struct tgsi_full_instruction *inst)
 4144 {
 4145    if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE)
 4146       exec_store_img(mach, inst);
 4147    else if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
 4148       exec_store_buf(mach, inst);
 4149    else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
 4150       exec_store_mem(mach, inst);
 4151 }
 4152 
 4153 static void
 4154 exec_atomop_img(struct tgsi_exec_machine *mach,
 4155                 const struct tgsi_full_instruction *inst)
 4156 {
 4157    union tgsi_exec_channel r[4], sample_r;
 4158    union tgsi_exec_channel value[4], value2[4];
 4159    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
 4160    float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
 4161    struct tgsi_image_params params;
 4162    int dim;
 4163    int sample;
 4164    int i, j;
 4165    uint unit, chan;
 4166    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
 4167    unit = fetch_sampler_unit(mach, inst, 0);
 4168    dim = get_image_coord_dim(inst->Memory.Texture);
 4169    sample = get_image_coord_sample(inst->Memory.Texture);
 4170    assert(dim <= 3);
 4171 
 4172    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
 4173    params.unit = unit;
 4174    params.tgsi_tex_instr = inst->Memory.Texture;
 4175    params.format = inst->Memory.Format;
 4176 
 4177    for (i = 0; i < dim; i++) {
 4178       IFETCH(&r[i], 1, TGSI_CHAN_X + i);
 4179    }
 4180 
 4181    for (i = 0; i < 4; i++) {
 4182       FETCH(&value[i], 2, TGSI_CHAN_X + i);
 4183       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
 4184          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
 4185    }
 4186    if (sample)
 4187       IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
 4188 
 4189    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
 4190       rgba[0][j] = value[0].f[j];
 4191       rgba[1][j] = value[1].f[j];
 4192       rgba[2][j] = value[2].f[j];
 4193       rgba[3][j] = value[3].f[j];
 4194    }
 4195    if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
 4196       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
 4197          rgba2[0][j] = value2[0].f[j];
 4198          rgba2[1][j] = value2[1].f[j];
 4199          rgba2[2][j] = value2[2].f[j];
 4200          rgba2[3][j] = value2[3].f[j];
 4201       }
 4202    }
 4203 
 4204    mach->Image->op(mach->Image, &params, inst->Instruction.Opcode,
 4205                    r[0].i, r[1].i, r[2].i, sample_r.i,
 4206                    rgba, rgba2);
 4207 
 4208    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
 4209       r[0].f[j] = rgba[0][j];
 4210       r[1].f[j] = rgba[1][j];
 4211       r[2].f[j] = rgba[2][j];
 4212       r[3].f[j] = rgba[3][j];
 4213    }
 4214    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 4215       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 4216          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
 4217       }
 4218    }
 4219 }
 4220 
 4221 static void
 4222 exec_atomop_buf(struct tgsi_exec_machine *mach,
 4223                 const struct tgsi_full_instruction *inst)
 4224 {
 4225    union tgsi_exec_channel r[4];
 4226    union tgsi_exec_channel value[4], value2[4];
 4227    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
 4228    float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
 4229    struct tgsi_buffer_params params;
 4230    int i, j;
 4231    uint unit, chan;
 4232    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
 4233 
 4234    unit = fetch_sampler_unit(mach, inst, 0);
 4235 
 4236    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
 4237    params.unit = unit;
 4238    params.writemask = inst->Dst[0].Register.WriteMask;
 4239 
 4240    IFETCH(&r[0], 1, TGSI_CHAN_X);
 4241 
 4242    for (i = 0; i < 4; i++) {
 4243       FETCH(&value[i], 2, TGSI_CHAN_X + i);
 4244       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
 4245          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
 4246    }
 4247 
 4248    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
 4249       rgba[0][j] = value[0].f[j];
 4250       rgba[1][j] = value[1].f[j];
 4251       rgba[2][j] = value[2].f[j];
 4252       rgba[3][j] = value[3].f[j];
 4253    }
 4254    if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
 4255       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
 4256          rgba2[0][j] = value2[0].f[j];
 4257          rgba2[1][j] = value2[1].f[j];
 4258          rgba2[2][j] = value2[2].f[j];
 4259          rgba2[3][j] = value2[3].f[j];
 4260       }
 4261    }
 4262 
 4263    mach->Buffer->op(mach->Buffer, &params, inst->Instruction.Opcode,
 4264                    r[0].i,
 4265                    rgba, rgba2);
 4266 
 4267    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
 4268       r[0].f[j] = rgba[0][j];
 4269       r[1].f[j] = rgba[1][j];
 4270       r[2].f[j] = rgba[2][j];
 4271       r[3].f[j] = rgba[3][j];
 4272    }
 4273    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 4274       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
 4275          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
 4276       }
 4277    }
 4278 }
 4279 
 4280 static void
 4281 exec_atomop_mem(struct tgsi_exec_machine *mach,
 4282                 const struct tgsi_full_instruction *inst)
 4283 {
 4284    union tgsi_exec_channel r[4];
 4285    union tgsi_exec_channel value[4], value2[4];
 4286    char *ptr = mach->LocalMem;
 4287    uint32_t val;
 4288    uint chan, i;
 4289    uint32_t offset;
 4290    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
 4291    int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
 4292    IFETCH(&r[0], 1, TGSI_CHAN_X);
 4293 
 4294    if (r[0].u[0] >= mach->LocalMemSize)
 4295       return;
 4296 
 4297    offset = r[0].u[0];
 4298    ptr += offset;
 4299    for (i = 0; i < 4; i++) {
 4300       FETCH(&value[i], 2, TGSI_CHAN_X + i);
 4301       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
 4302          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
 4303    }
 4304 
 4305    memcpy(&r[0].u[0], ptr, 4);
 4306    val = r[0].u[0];
 4307    switch (inst->Instruction.Opcode) {
 4308    case TGSI_OPCODE_ATOMUADD:
 4309       val += value[0].u[0];
 4310       break;
 4311    case TGSI_OPCODE_ATOMXOR:
 4312       val ^= value[0].u[0];
 4313       break;
 4314    case TGSI_OPCODE_ATOMOR:
 4315       val |= value[0].u[0];
 4316       break;
 4317    case TGSI_OPCODE_ATOMAND:
 4318       val &= value[0].u[0];
 4319       break;
 4320    case TGSI_OPCODE_ATOMUMIN:
 4321       val =