"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/gallium/drivers/nouveau/nvc0/nvc0_program.c" (16 Sep 2020, 30633 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "nvc0_program.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 20.1.5_vs_20.2.0-rc1.

    1 /*
    2  * Copyright 2010 Christoph Bumiller
    3  *
    4  * Permission is hereby granted, free of charge, to any person obtaining a
    5  * copy of this software and associated documentation files (the "Software"),
    6  * to deal in the Software without restriction, including without limitation
    7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
    8  * and/or sell copies of the Software, and to permit persons to whom the
    9  * Software is furnished to do so, subject to the following conditions:
   10  *
   11  * The above copyright notice and this permission notice shall be included in
   12  * all copies or substantial portions of the Software.
   13  *
   14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
   17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
   18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
   19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
   20  * OTHER DEALINGS IN THE SOFTWARE.
   21  */
   22 
   23 #include "pipe/p_defines.h"
   24 
   25 #include "compiler/nir/nir.h"
   26 #include "tgsi/tgsi_ureg.h"
   27 
   28 #include "nvc0/nvc0_context.h"
   29 
   30 #include "codegen/nv50_ir_driver.h"
   31 #include "nvc0/nve4_compute.h"
   32 
   33 /* NOTE: Using a[0x270] in FP may cause an error even if we're using less than
   34  * 124 scalar varying values.
   35  */
   36 static uint32_t
   37 nvc0_shader_input_address(unsigned sn, unsigned si)
   38 {
   39    switch (sn) {
   40    case TGSI_SEMANTIC_TESSOUTER:    return 0x000 + si * 0x4;
   41    case TGSI_SEMANTIC_TESSINNER:    return 0x010 + si * 0x4;
   42    case TGSI_SEMANTIC_PATCH:        return 0x020 + si * 0x10;
   43    case TGSI_SEMANTIC_PRIMID:       return 0x060;
   44    case TGSI_SEMANTIC_LAYER:        return 0x064;
   45    case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
   46    case TGSI_SEMANTIC_PSIZE:        return 0x06c;
   47    case TGSI_SEMANTIC_POSITION:     return 0x070;
   48    case TGSI_SEMANTIC_GENERIC:      return 0x080 + si * 0x10;
   49    case TGSI_SEMANTIC_FOG:          return 0x2e8;
   50    case TGSI_SEMANTIC_COLOR:        return 0x280 + si * 0x10;
   51    case TGSI_SEMANTIC_BCOLOR:       return 0x2a0 + si * 0x10;
   52    case TGSI_SEMANTIC_CLIPDIST:     return 0x2c0 + si * 0x10;
   53    case TGSI_SEMANTIC_CLIPVERTEX:   return 0x270;
   54    case TGSI_SEMANTIC_PCOORD:       return 0x2e0;
   55    case TGSI_SEMANTIC_TESSCOORD:    return 0x2f0;
   56    case TGSI_SEMANTIC_INSTANCEID:   return 0x2f8;
   57    case TGSI_SEMANTIC_VERTEXID:     return 0x2fc;
   58    case TGSI_SEMANTIC_TEXCOORD:     return 0x300 + si * 0x10;
   59    default:
   60       assert(!"invalid TGSI input semantic");
   61       return ~0;
   62    }
   63 }
   64 
   65 static uint32_t
   66 nvc0_shader_output_address(unsigned sn, unsigned si)
   67 {
   68    switch (sn) {
   69    case TGSI_SEMANTIC_TESSOUTER:     return 0x000 + si * 0x4;
   70    case TGSI_SEMANTIC_TESSINNER:     return 0x010 + si * 0x4;
   71    case TGSI_SEMANTIC_PATCH:         return 0x020 + si * 0x10;
   72    case TGSI_SEMANTIC_PRIMID:        return 0x060;
   73    case TGSI_SEMANTIC_LAYER:         return 0x064;
   74    case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
   75    case TGSI_SEMANTIC_PSIZE:         return 0x06c;
   76    case TGSI_SEMANTIC_POSITION:      return 0x070;
   77    case TGSI_SEMANTIC_GENERIC:       return 0x080 + si * 0x10;
   78    case TGSI_SEMANTIC_FOG:           return 0x2e8;
   79    case TGSI_SEMANTIC_COLOR:         return 0x280 + si * 0x10;
   80    case TGSI_SEMANTIC_BCOLOR:        return 0x2a0 + si * 0x10;
   81    case TGSI_SEMANTIC_CLIPDIST:      return 0x2c0 + si * 0x10;
   82    case TGSI_SEMANTIC_CLIPVERTEX:    return 0x270;
   83    case TGSI_SEMANTIC_TEXCOORD:      return 0x300 + si * 0x10;
   84    case TGSI_SEMANTIC_VIEWPORT_MASK: return 0x3a0;
   85    case TGSI_SEMANTIC_EDGEFLAG:      return ~0;
   86    default:
   87       assert(!"invalid TGSI output semantic");
   88       return ~0;
   89    }
   90 }
   91 
   92 static int
   93 nvc0_vp_assign_input_slots(struct nv50_ir_prog_info *info)
   94 {
   95    unsigned i, c, n;
   96 
   97    for (n = 0, i = 0; i < info->numInputs; ++i) {
   98       switch (info->in[i].sn) {
   99       case TGSI_SEMANTIC_INSTANCEID: /* for SM4 only, in TGSI they're SVs */
  100       case TGSI_SEMANTIC_VERTEXID:
  101          info->in[i].mask = 0x1;
  102          info->in[i].slot[0] =
  103             nvc0_shader_input_address(info->in[i].sn, 0) / 4;
  104          continue;
  105       default:
  106          break;
  107       }
  108       for (c = 0; c < 4; ++c)
  109          info->in[i].slot[c] = (0x80 + n * 0x10 + c * 0x4) / 4;
  110       ++n;
  111    }
  112 
  113    return 0;
  114 }
  115 
  116 static int
  117 nvc0_sp_assign_input_slots(struct nv50_ir_prog_info *info)
  118 {
  119    unsigned offset;
  120    unsigned i, c;
  121 
  122    for (i = 0; i < info->numInputs; ++i) {
  123       offset = nvc0_shader_input_address(info->in[i].sn, info->in[i].si);
  124 
  125       for (c = 0; c < 4; ++c)
  126          info->in[i].slot[c] = (offset + c * 0x4) / 4;
  127    }
  128 
  129    return 0;
  130 }
  131 
  132 static int
  133 nvc0_fp_assign_output_slots(struct nv50_ir_prog_info *info)
  134 {
  135    unsigned count = info->prop.fp.numColourResults * 4;
  136    unsigned i, c;
  137 
  138    /* Compute the relative position of each color output, since skipped MRT
  139     * positions will not have registers allocated to them.
  140     */
  141    unsigned colors[8] = {0};
  142    for (i = 0; i < info->numOutputs; ++i)
  143       if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
  144          colors[info->out[i].si] = 1;
  145    for (i = 0, c = 0; i < 8; i++)
  146       if (colors[i])
  147          colors[i] = c++;
  148    for (i = 0; i < info->numOutputs; ++i)
  149       if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
  150          for (c = 0; c < 4; ++c)
  151             info->out[i].slot[c] = colors[info->out[i].si] * 4 + c;
  152 
  153    if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)
  154       info->out[info->io.sampleMask].slot[0] = count++;
  155    else
  156    if (info->target >= 0xe0)
  157       count++; /* on Kepler, depth is always last colour reg + 2 */
  158 
  159    if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)
  160       info->out[info->io.fragDepth].slot[2] = count;
  161 
  162    return 0;
  163 }
  164 
  165 static int
  166 nvc0_sp_assign_output_slots(struct nv50_ir_prog_info *info)
  167 {
  168    unsigned offset;
  169    unsigned i, c;
  170 
  171    for (i = 0; i < info->numOutputs; ++i) {
  172       offset = nvc0_shader_output_address(info->out[i].sn, info->out[i].si);
  173 
  174       for (c = 0; c < 4; ++c)
  175          info->out[i].slot[c] = (offset + c * 0x4) / 4;
  176    }
  177 
  178    return 0;
  179 }
  180 
  181 static int
  182 nvc0_program_assign_varying_slots(struct nv50_ir_prog_info *info)
  183 {
  184    int ret;
  185 
  186    if (info->type == PIPE_SHADER_VERTEX)
  187       ret = nvc0_vp_assign_input_slots(info);
  188    else
  189       ret = nvc0_sp_assign_input_slots(info);
  190    if (ret)
  191       return ret;
  192 
  193    if (info->type == PIPE_SHADER_FRAGMENT)
  194       ret = nvc0_fp_assign_output_slots(info);
  195    else
  196       ret = nvc0_sp_assign_output_slots(info);
  197    return ret;
  198 }
  199 
  200 static inline void
  201 nvc0_vtgp_hdr_update_oread(struct nvc0_program *vp, uint8_t slot)
  202 {
  203    uint8_t min = (vp->hdr[4] >> 12) & 0xff;
  204    uint8_t max = (vp->hdr[4] >> 24);
  205 
  206    min = MIN2(min, slot);
  207    max = MAX2(max, slot);
  208 
  209    vp->hdr[4] = (max << 24) | (min << 12);
  210 }
  211 
  212 /* Common part of header generation for VP, TCP, TEP and GP. */
  213 static int
  214 nvc0_vtgp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
  215 {
  216    unsigned i, c, a;
  217 
  218    for (i = 0; i < info->numInputs; ++i) {
  219       if (info->in[i].patch)
  220          continue;
  221       for (c = 0; c < 4; ++c) {
  222          a = info->in[i].slot[c];
  223          if (info->in[i].mask & (1 << c))
  224             vp->hdr[5 + a / 32] |= 1 << (a % 32);
  225       }
  226    }
  227 
  228    for (i = 0; i < info->numOutputs; ++i) {
  229       if (info->out[i].patch)
  230          continue;
  231       for (c = 0; c < 4; ++c) {
  232          if (!(info->out[i].mask & (1 << c)))
  233             continue;
  234          assert(info->out[i].slot[c] >= 0x40 / 4);
  235          a = info->out[i].slot[c] - 0x40 / 4;
  236          vp->hdr[13 + a / 32] |= 1 << (a % 32);
  237          if (info->out[i].oread)
  238             nvc0_vtgp_hdr_update_oread(vp, info->out[i].slot[c]);
  239       }
  240    }
  241 
  242    for (i = 0; i < info->numSysVals; ++i) {
  243       switch (info->sv[i].sn) {
  244       case TGSI_SEMANTIC_PRIMID:
  245          vp->hdr[5] |= 1 << 24;
  246          break;
  247       case TGSI_SEMANTIC_INSTANCEID:
  248          vp->hdr[10] |= 1 << 30;
  249          break;
  250       case TGSI_SEMANTIC_VERTEXID:
  251          vp->hdr[10] |= 1 << 31;
  252          break;
  253       case TGSI_SEMANTIC_TESSCOORD:
  254          /* We don't have the mask, nor the slots populated. While this could
  255           * be achieved, the vast majority of the time if either of the coords
  256           * are read, then both will be read.
  257           */
  258          nvc0_vtgp_hdr_update_oread(vp, 0x2f0 / 4);
  259          nvc0_vtgp_hdr_update_oread(vp, 0x2f4 / 4);
  260          break;
  261       default:
  262          break;
  263       }
  264    }
  265 
  266    vp->vp.clip_enable = (1 << info->io.clipDistances) - 1;
  267    vp->vp.cull_enable =
  268       ((1 << info->io.cullDistances) - 1) << info->io.clipDistances;
  269    for (i = 0; i < info->io.cullDistances; ++i)
  270       vp->vp.clip_mode |= 1 << ((info->io.clipDistances + i) * 4);
  271 
  272    if (info->io.genUserClip < 0)
  273       vp->vp.num_ucps = PIPE_MAX_CLIP_PLANES + 1; /* prevent rebuilding */
  274 
  275    vp->vp.layer_viewport_relative = info->io.layer_viewport_relative;
  276 
  277    return 0;
  278 }
  279 
  280 static int
  281 nvc0_vp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
  282 {
  283    vp->hdr[0] = 0x20061 | (1 << 10);
  284    vp->hdr[4] = 0xff000;
  285 
  286    return nvc0_vtgp_gen_header(vp, info);
  287 }
  288 
  289 static void
  290 nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
  291 {
  292    if (info->prop.tp.outputPrim == PIPE_PRIM_MAX) {
  293       tp->tp.tess_mode = ~0;
  294       return;
  295    }
  296    switch (info->prop.tp.domain) {
  297    case PIPE_PRIM_LINES:
  298       tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_ISOLINES;
  299       break;
  300    case PIPE_PRIM_TRIANGLES:
  301       tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_TRIANGLES;
  302       break;
  303    case PIPE_PRIM_QUADS:
  304       tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_QUADS;
  305       break;
  306    default:
  307       tp->tp.tess_mode = ~0;
  308       return;
  309    }
  310 
  311    /* It seems like lines want the "CW" bit to indicate they're connected, and
  312     * spit out errors in dmesg when the "CONNECTED" bit is set.
  313     */
  314    if (info->prop.tp.outputPrim != PIPE_PRIM_POINTS) {
  315       if (info->prop.tp.domain == PIPE_PRIM_LINES)
  316          tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
  317       else
  318          tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED;
  319    }
  320 
  321    /* Winding only matters for triangles/quads, not lines. */
  322    if (info->prop.tp.domain != PIPE_PRIM_LINES &&
  323        info->prop.tp.outputPrim != PIPE_PRIM_POINTS &&
  324        info->prop.tp.winding > 0)
  325       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
  326 
  327    switch (info->prop.tp.partitioning) {
  328    case PIPE_TESS_SPACING_EQUAL:
  329       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_EQUAL;
  330       break;
  331    case PIPE_TESS_SPACING_FRACTIONAL_ODD:
  332       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_ODD;
  333       break;
  334    case PIPE_TESS_SPACING_FRACTIONAL_EVEN:
  335       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_EVEN;
  336       break;
  337    default:
  338       assert(!"invalid tessellator partitioning");
  339       break;
  340    }
  341 }
  342 
  343 static int
  344 nvc0_tcp_gen_header(struct nvc0_program *tcp, struct nv50_ir_prog_info *info)
  345 {
  346    unsigned opcs = 6; /* output patch constants (at least the TessFactors) */
  347 
  348    if (info->numPatchConstants)
  349       opcs = 8 + info->numPatchConstants * 4;
  350 
  351    tcp->hdr[0] = 0x20061 | (2 << 10);
  352 
  353    tcp->hdr[1] = opcs << 24;
  354    tcp->hdr[2] = info->prop.tp.outputPatchSize << 24;
  355 
  356    tcp->hdr[4] = 0xff000; /* initial min/max parallel output read address */
  357 
  358    nvc0_vtgp_gen_header(tcp, info);
  359 
  360    if (info->target >= NVISA_GM107_CHIPSET) {
  361       /* On GM107+, the number of output patch components has moved in the TCP
  362        * header, but it seems like blob still also uses the old position.
  363        * Also, the high 8-bits are located inbetween the min/max parallel
  364        * field and has to be set after updating the outputs. */
  365       tcp->hdr[3] = (opcs & 0x0f) << 28;
  366       tcp->hdr[4] |= (opcs & 0xf0) << 16;
  367    }
  368 
  369    nvc0_tp_get_tess_mode(tcp, info);
  370 
  371    return 0;
  372 }
  373 
  374 static int
  375 nvc0_tep_gen_header(struct nvc0_program *tep, struct nv50_ir_prog_info *info)
  376 {
  377    tep->hdr[0] = 0x20061 | (3 << 10);
  378    tep->hdr[4] = 0xff000;
  379 
  380    nvc0_vtgp_gen_header(tep, info);
  381 
  382    nvc0_tp_get_tess_mode(tep, info);
  383 
  384    tep->hdr[18] |= 0x3 << 12; /* ? */
  385 
  386    return 0;
  387 }
  388 
  389 static int
  390 nvc0_gp_gen_header(struct nvc0_program *gp, struct nv50_ir_prog_info *info)
  391 {
  392    gp->hdr[0] = 0x20061 | (4 << 10);
  393 
  394    gp->hdr[2] = MIN2(info->prop.gp.instanceCount, 32) << 24;
  395 
  396    switch (info->prop.gp.outputPrim) {
  397    case PIPE_PRIM_POINTS:
  398       gp->hdr[3] = 0x01000000;
  399       gp->hdr[0] |= 0xf0000000;
  400       break;
  401    case PIPE_PRIM_LINE_STRIP:
  402       gp->hdr[3] = 0x06000000;
  403       gp->hdr[0] |= 0x10000000;
  404       break;
  405    case PIPE_PRIM_TRIANGLE_STRIP:
  406       gp->hdr[3] = 0x07000000;
  407       gp->hdr[0] |= 0x10000000;
  408       break;
  409    default:
  410       assert(0);
  411       break;
  412    }
  413 
  414    gp->hdr[4] = CLAMP(info->prop.gp.maxVertices, 1, 1024);
  415 
  416    return nvc0_vtgp_gen_header(gp, info);
  417 }
  418 
  419 #define NVC0_INTERP_FLAT          (1 << 0)
  420 #define NVC0_INTERP_PERSPECTIVE   (2 << 0)
  421 #define NVC0_INTERP_LINEAR        (3 << 0)
  422 #define NVC0_INTERP_CENTROID      (1 << 2)
  423 
  424 static uint8_t
  425 nvc0_hdr_interp_mode(const struct nv50_ir_varying *var)
  426 {
  427    if (var->linear)
  428       return NVC0_INTERP_LINEAR;
  429    if (var->flat)
  430       return NVC0_INTERP_FLAT;
  431    return NVC0_INTERP_PERSPECTIVE;
  432 }
  433 
  434 static int
  435 nvc0_fp_gen_header(struct nvc0_program *fp, struct nv50_ir_prog_info *info)
  436 {
  437    unsigned i, c, a, m;
  438 
  439    /* just 00062 on Kepler */
  440    fp->hdr[0] = 0x20062 | (5 << 10);
  441    fp->hdr[5] = 0x80000000; /* getting a trap if FRAG_COORD_UMASK.w = 0 */
  442 
  443    if (info->prop.fp.usesDiscard)
  444       fp->hdr[0] |= 0x8000;
  445    if (!info->prop.fp.separateFragData)
  446       fp->hdr[0] |= 0x4000;
  447    if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)
  448       fp->hdr[19] |= 0x1;
  449    if (info->prop.fp.writesDepth) {
  450       fp->hdr[19] |= 0x2;
  451       fp->flags[0] = 0x11; /* deactivate ZCULL */
  452    }
  453 
  454    for (i = 0; i < info->numInputs; ++i) {
  455       m = nvc0_hdr_interp_mode(&info->in[i]);
  456       if (info->in[i].sn == TGSI_SEMANTIC_COLOR) {
  457          fp->fp.colors |= 1 << info->in[i].si;
  458          if (info->in[i].sc)
  459             fp->fp.color_interp[info->in[i].si] = m | (info->in[i].mask << 4);
  460       }
  461       for (c = 0; c < 4; ++c) {
  462          if (!(info->in[i].mask & (1 << c)))
  463             continue;
  464          a = info->in[i].slot[c];
  465          if (info->in[i].slot[0] >= (0x060 / 4) &&
  466              info->in[i].slot[0] <= (0x07c / 4)) {
  467             fp->hdr[5] |= 1 << (24 + (a - 0x060 / 4));
  468          } else
  469          if (info->in[i].slot[0] >= (0x2c0 / 4) &&
  470              info->in[i].slot[0] <= (0x2fc / 4)) {
  471             fp->hdr[14] |= (1 << (a - 0x280 / 4)) & 0x07ff0000;
  472          } else {
  473             if (info->in[i].slot[c] < (0x040 / 4) ||
  474                 info->in[i].slot[c] > (0x380 / 4))
  475                continue;
  476             a *= 2;
  477             if (info->in[i].slot[0] >= (0x300 / 4))
  478                a -= 32;
  479             fp->hdr[4 + a / 32] |= m << (a % 32);
  480          }
  481       }
  482    }
  483    /* GM20x+ needs TGSI_SEMANTIC_POSITION to access sample locations */
  484    if (info->prop.fp.readsSampleLocations && info->target >= NVISA_GM200_CHIPSET)
  485       fp->hdr[5] |= 0x30000000;
  486 
  487    for (i = 0; i < info->numOutputs; ++i) {
  488       if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
  489          fp->hdr[18] |= 0xf << (4 * info->out[i].si);
  490    }
  491 
  492    /* There are no "regular" attachments, but the shader still needs to be
  493     * executed. It seems like it wants to think that it has some color
  494     * outputs in order to actually run.
  495     */
  496    if (info->prop.fp.numColourResults == 0 && !info->prop.fp.writesDepth)
  497       fp->hdr[18] |= 0xf;
  498 
  499    fp->fp.early_z = info->prop.fp.earlyFragTests;
  500    fp->fp.sample_mask_in = info->prop.fp.usesSampleMaskIn;
  501    fp->fp.reads_framebuffer = info->prop.fp.readsFramebuffer;
  502    fp->fp.post_depth_coverage = info->prop.fp.postDepthCoverage;
  503 
  504    /* Mark position xy and layer as read */
  505    if (fp->fp.reads_framebuffer)
  506       fp->hdr[5] |= 0x32000000;
  507 
  508    return 0;
  509 }
  510 
  511 static struct nvc0_transform_feedback_state *
  512 nvc0_program_create_tfb_state(const struct nv50_ir_prog_info *info,
  513                               const struct pipe_stream_output_info *pso)
  514 {
  515    struct nvc0_transform_feedback_state *tfb;
  516    unsigned b, i, c;
  517 
  518    tfb = MALLOC_STRUCT(nvc0_transform_feedback_state);
  519    if (!tfb)
  520       return NULL;
  521    for (b = 0; b < 4; ++b) {
  522       tfb->stride[b] = pso->stride[b] * 4;
  523       tfb->varying_count[b] = 0;
  524    }
  525    memset(tfb->varying_index, 0xff, sizeof(tfb->varying_index)); /* = skip */
  526 
  527    for (i = 0; i < pso->num_outputs; ++i) {
  528       unsigned s = pso->output[i].start_component;
  529       unsigned p = pso->output[i].dst_offset;
  530       const unsigned r = pso->output[i].register_index;
  531       b = pso->output[i].output_buffer;
  532 
  533       if (r >= info->numOutputs)
  534          continue;
  535 
  536       for (c = 0; c < pso->output[i].num_components; ++c)
  537          tfb->varying_index[b][p++] = info->out[r].slot[s + c];
  538 
  539       tfb->varying_count[b] = MAX2(tfb->varying_count[b], p);
  540       tfb->stream[b] = pso->output[i].stream;
  541    }
  542    for (b = 0; b < 4; ++b) // zero unused indices (looks nicer)
  543       for (c = tfb->varying_count[b]; c & 3; ++c)
  544          tfb->varying_index[b][c] = 0;
  545 
  546    return tfb;
  547 }
  548 
  549 #ifndef NDEBUG
  550 static void
  551 nvc0_program_dump(struct nvc0_program *prog)
  552 {
  553    unsigned pos;
  554 
  555    if (prog->type != PIPE_SHADER_COMPUTE) {
  556       _debug_printf("dumping HDR for type %i\n", prog->type);
  557       for (pos = 0; pos < ARRAY_SIZE(prog->hdr); ++pos)
  558          _debug_printf("HDR[%02"PRIxPTR"] = 0x%08x\n",
  559                       pos * sizeof(prog->hdr[0]), prog->hdr[pos]);
  560    }
  561    _debug_printf("shader binary code (0x%x bytes):", prog->code_size);
  562    for (pos = 0; pos < prog->code_size / 4; ++pos) {
  563       if ((pos % 8) == 0)
  564          _debug_printf("\n");
  565       _debug_printf("%08x ", prog->code[pos]);
  566    }
  567    _debug_printf("\n");
  568 }
  569 #endif
  570 
  571 bool
  572 nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
  573                        struct pipe_debug_callback *debug)
  574 {
  575    struct nv50_ir_prog_info *info;
  576    int ret;
  577 
  578    info = CALLOC_STRUCT(nv50_ir_prog_info);
  579    if (!info)
  580       return false;
  581 
  582    info->type = prog->type;
  583    info->target = chipset;
  584 
  585    info->bin.sourceRep = prog->pipe.type;
  586    switch (prog->pipe.type) {
  587    case PIPE_SHADER_IR_TGSI:
  588       info->bin.source = (void *)prog->pipe.tokens;
  589       break;
  590    case PIPE_SHADER_IR_NIR:
  591       info->bin.source = (void *)nir_shader_clone(NULL, prog->pipe.ir.nir);
  592       break;
  593    default:
  594       assert(!"unsupported IR!");
  595       free(info);
  596       return false;
  597    }
  598 
  599 #ifndef NDEBUG
  600    info->target = debug_get_num_option("NV50_PROG_CHIPSET", chipset);
  601    info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
  602    info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);
  603    info->omitLineNum = debug_get_num_option("NV50_PROG_DEBUG_OMIT_LINENUM", 0);
  604 #else
  605    info->optLevel = 3;
  606 #endif
  607 
  608    info->bin.smemSize = prog->cp.smem_size;
  609    info->io.genUserClip = prog->vp.num_ucps;
  610    info->io.auxCBSlot = 15;
  611    info->io.msInfoCBSlot = 15;
  612    info->io.ucpBase = NVC0_CB_AUX_UCP_INFO;
  613    info->io.drawInfoBase = NVC0_CB_AUX_DRAW_INFO;
  614    info->io.msInfoBase = NVC0_CB_AUX_MS_INFO;
  615    info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0);
  616    info->io.suInfoBase = NVC0_CB_AUX_SU_INFO(0);
  617    if (info->target >= NVISA_GK104_CHIPSET) {
  618       info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);
  619       info->io.fbtexBindBase = NVC0_CB_AUX_FB_TEX_INFO;
  620       info->io.bindlessBase = NVC0_CB_AUX_BINDLESS_INFO(0);
  621    }
  622 
  623    if (prog->type == PIPE_SHADER_COMPUTE) {
  624       if (info->target >= NVISA_GK104_CHIPSET) {
  625          info->io.auxCBSlot = 7;
  626          info->io.msInfoCBSlot = 7;
  627          info->io.uboInfoBase = NVC0_CB_AUX_UBO_INFO(0);
  628       }
  629       info->prop.cp.gridInfoBase = NVC0_CB_AUX_GRID_INFO(0);
  630    } else {
  631       info->io.sampleInfoBase = NVC0_CB_AUX_SAMPLE_INFO;
  632    }
  633 
  634    info->assignSlots = nvc0_program_assign_varying_slots;
  635 
  636    ret = nv50_ir_generate_code(info);
  637    if (ret) {
  638       NOUVEAU_ERR("shader translation failed: %i\n", ret);
  639       goto out;
  640    }
  641    if (prog->type != PIPE_SHADER_COMPUTE)
  642       FREE(info->bin.syms);
  643 
  644    prog->code = info->bin.code;
  645    prog->code_size = info->bin.codeSize;
  646    prog->relocs = info->bin.relocData;
  647    prog->fixups = info->bin.fixupData;
  648    prog->num_gprs = MAX2(4, (info->bin.maxGPR + 1));
  649    prog->cp.smem_size = info->bin.smemSize;
  650    prog->num_barriers = info->numBarriers;
  651 
  652    prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;
  653    prog->vp.need_draw_parameters = info->prop.vp.usesDrawParameters;
  654 
  655    if (info->io.edgeFlagOut < PIPE_MAX_ATTRIBS)
  656       info->out[info->io.edgeFlagOut].mask = 0; /* for headergen */
  657    prog->vp.edgeflag = info->io.edgeFlagIn;
  658 
  659    switch (prog->type) {
  660    case PIPE_SHADER_VERTEX:
  661       ret = nvc0_vp_gen_header(prog, info);
  662       break;
  663    case PIPE_SHADER_TESS_CTRL:
  664       ret = nvc0_tcp_gen_header(prog, info);
  665       break;
  666    case PIPE_SHADER_TESS_EVAL:
  667       ret = nvc0_tep_gen_header(prog, info);
  668       break;
  669    case PIPE_SHADER_GEOMETRY:
  670       ret = nvc0_gp_gen_header(prog, info);
  671       break;
  672    case PIPE_SHADER_FRAGMENT:
  673       ret = nvc0_fp_gen_header(prog, info);
  674       break;
  675    case PIPE_SHADER_COMPUTE:
  676       prog->cp.syms = info->bin.syms;
  677       prog->cp.num_syms = info->bin.numSyms;
  678       break;
  679    default:
  680       ret = -1;
  681       NOUVEAU_ERR("unknown program type: %u\n", prog->type);
  682       break;
  683    }
  684    if (ret)
  685       goto out;
  686 
  687    if (info->bin.tlsSpace) {
  688       assert(info->bin.tlsSpace < (1 << 24));
  689       prog->hdr[0] |= 1 << 26;
  690       prog->hdr[1] |= align(info->bin.tlsSpace, 0x10); /* l[] size */
  691       prog->need_tls = true;
  692    }
  693    /* TODO: factor 2 only needed where joinat/precont is used,
  694     *       and we only have to count non-uniform branches
  695     */
  696    /*
  697    if ((info->maxCFDepth * 2) > 16) {
  698       prog->hdr[2] |= (((info->maxCFDepth * 2) + 47) / 48) * 0x200;
  699       prog->need_tls = true;
  700    }
  701    */
  702    if (info->io.globalAccess)
  703       prog->hdr[0] |= 1 << 26;
  704    if (info->io.globalAccess & 0x2)
  705       prog->hdr[0] |= 1 << 16;
  706    if (info->io.fp64)
  707       prog->hdr[0] |= 1 << 27;
  708 
  709    if (prog->pipe.stream_output.num_outputs)
  710       prog->tfb = nvc0_program_create_tfb_state(info,
  711                                                 &prog->pipe.stream_output);
  712 
  713    pipe_debug_message(debug, SHADER_INFO,
  714                       "type: %d, local: %d, shared: %d, gpr: %d, inst: %d, bytes: %d",
  715                       prog->type, info->bin.tlsSpace, info->bin.smemSize,
  716                       prog->num_gprs, info->bin.instructions,
  717                       info->bin.codeSize);
  718 
  719 #ifndef NDEBUG
  720    if (debug_get_option("NV50_PROG_CHIPSET", NULL) && info->dbgFlags)
  721       nvc0_program_dump(prog);
  722 #endif
  723 
  724 out:
  725    if (info->bin.sourceRep == PIPE_SHADER_IR_NIR)
  726       ralloc_free((void *)info->bin.source);
  727    FREE(info);
  728    return !ret;
  729 }
  730 
  731 static inline int
  732 nvc0_program_alloc_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
  733 {
  734    struct nvc0_screen *screen = nvc0->screen;
  735    const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;
  736    int ret;
  737    uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);
  738 
  739    /* On Fermi, SP_START_ID must be aligned to 0x40.
  740     * On Kepler, the first instruction must be aligned to 0x80 because
  741     * latency information is expected only at certain positions.
  742     */
  743    if (screen->base.class_3d >= NVE4_3D_CLASS)
  744       size = size + (is_cp ? 0x40 : 0x70);
  745    size = align(size, 0x40);
  746 
  747    ret = nouveau_heap_alloc(screen->text_heap, size, prog, &prog->mem);
  748    if (ret)
  749       return ret;
  750    prog->code_base = prog->mem->start;
  751 
  752    if (!is_cp) {
  753       if (screen->base.class_3d >= NVE4_3D_CLASS) {
  754          switch (prog->mem->start & 0xff) {
  755          case 0x40: prog->code_base += 0x70; break;
  756          case 0x80: prog->code_base += 0x30; break;
  757          case 0xc0: prog->code_base += 0x70; break;
  758          default:
  759             prog->code_base += 0x30;
  760             assert((prog->mem->start & 0xff) == 0x00);
  761             break;
  762          }
  763       }
  764    } else {
  765       if (screen->base.class_3d >= NVE4_3D_CLASS) {
  766          if (prog->mem->start & 0x40)
  767             prog->code_base += 0x40;
  768          assert((prog->code_base & 0x7f) == 0x00);
  769       }
  770    }
  771 
  772    return 0;
  773 }
  774 
  775 static inline void
  776 nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
  777 {
  778    struct nvc0_screen *screen = nvc0->screen;
  779    const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;
  780    uint32_t code_pos = prog->code_base + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);
  781 
  782    if (prog->relocs)
  783       nv50_ir_relocate_code(prog->relocs, prog->code, code_pos,
  784                             screen->lib_code->start, 0);
  785    if (prog->fixups) {
  786       nv50_ir_apply_fixups(prog->fixups, prog->code,
  787                            prog->fp.force_persample_interp,
  788                            prog->fp.flatshade,
  789                            0 /* alphatest */);
  790       for (int i = 0; i < 2; i++) {
  791          unsigned mask = prog->fp.color_interp[i] >> 4;
  792          unsigned interp = prog->fp.color_interp[i] & 3;
  793          if (!mask)
  794             continue;
  795          prog->hdr[14] &= ~(0xff << (8 * i));
  796          if (prog->fp.flatshade)
  797             interp = NVC0_INTERP_FLAT;
  798          for (int c = 0; c < 4; c++)
  799             if (mask & (1 << c))
  800                prog->hdr[14] |= interp << (2 * (4 * i + c));
  801       }
  802    }
  803 
  804    if (!is_cp)
  805       nvc0->base.push_data(&nvc0->base, screen->text, prog->code_base,
  806                            NV_VRAM_DOMAIN(&screen->base),
  807                            NVC0_SHADER_HEADER_SIZE, prog->hdr);
  808 
  809    nvc0->base.push_data(&nvc0->base, screen->text, code_pos,
  810                         NV_VRAM_DOMAIN(&screen->base), prog->code_size,
  811                         prog->code);
  812 }
  813 
  814 bool
  815 nvc0_program_upload(struct nvc0_context *nvc0, struct nvc0_program *prog)
  816 {
  817    struct nvc0_screen *screen = nvc0->screen;
  818    const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;
  819    int ret;
  820    uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);
  821 
  822    ret = nvc0_program_alloc_code(nvc0, prog);
  823    if (ret) {
  824       struct nouveau_heap *heap = screen->text_heap;
  825       struct nvc0_program *progs[] = { /* Sorted accordingly to SP_START_ID */
  826          nvc0->compprog, nvc0->vertprog, nvc0->tctlprog,
  827          nvc0->tevlprog, nvc0->gmtyprog, nvc0->fragprog
  828       };
  829 
  830       /* Note that the code library, which is allocated before anything else,
  831        * does not have a priv pointer. We can stop once we hit it.
  832        */
  833       while (heap->next && heap->next->priv) {
  834          struct nvc0_program *evict = heap->next->priv;
  835          nouveau_heap_free(&evict->mem);
  836       }
  837       debug_printf("WARNING: out of code space, evicting all shaders.\n");
  838 
  839       /* Make sure to synchronize before deleting the code segment. */
  840       IMMED_NVC0(nvc0->base.pushbuf, NVC0_3D(SERIALIZE), 0);
  841 
  842       if ((screen->text->size << 1) <= (1 << 23)) {
  843          ret = nvc0_screen_resize_text_area(screen, screen->text->size << 1);
  844          if (ret) {
  845             NOUVEAU_ERR("Error allocating TEXT area: %d\n", ret);
  846             return false;
  847          }
  848 
  849          /* Re-upload the builtin function into the new code segment. */
  850          nvc0_program_library_upload(nvc0);
  851       }
  852 
  853       ret = nvc0_program_alloc_code(nvc0, prog);
  854       if (ret) {
  855          NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
  856          return false;
  857       }
  858 
  859       /* All currently bound shaders have to be reuploaded. */
  860       for (int i = 0; i < ARRAY_SIZE(progs); i++) {
  861          if (!progs[i] || progs[i] == prog)
  862             continue;
  863 
  864          ret = nvc0_program_alloc_code(nvc0, progs[i]);
  865          if (ret) {
  866             NOUVEAU_ERR("failed to re-upload a shader after code eviction.\n");
  867             return false;
  868          }
  869          nvc0_program_upload_code(nvc0, progs[i]);
  870 
  871          if (progs[i]->type == PIPE_SHADER_COMPUTE) {
  872             /* Caches have to be invalidated but the CP_START_ID will be
  873              * updated in the launch_grid functions. */
  874             BEGIN_NVC0(nvc0->base.pushbuf, NVC0_CP(FLUSH), 1);
  875             PUSH_DATA (nvc0->base.pushbuf, NVC0_COMPUTE_FLUSH_CODE);
  876          } else {
  877             BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(SP_START_ID(i)), 1);
  878             PUSH_DATA (nvc0->base.pushbuf, progs[i]->code_base);
  879          }
  880       }
  881    }
  882 
  883    nvc0_program_upload_code(nvc0, prog);
  884 
  885 #ifndef NDEBUG
  886    if (debug_get_bool_option("NV50_PROG_DEBUG", false))
  887       nvc0_program_dump(prog);
  888 #endif
  889 
  890    BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(MEM_BARRIER), 1);
  891    PUSH_DATA (nvc0->base.pushbuf, 0x1011);
  892 
  893    return true;
  894 }
  895 
  896 /* Upload code for builtin functions like integer division emulation. */
  897 void
  898 nvc0_program_library_upload(struct nvc0_context *nvc0)
  899 {
  900    struct nvc0_screen *screen = nvc0->screen;
  901    int ret;
  902    uint32_t size;
  903    const uint32_t *code;
  904 
  905    if (screen->lib_code)
  906       return;
  907 
  908    nv50_ir_get_target_library(screen->base.device->chipset, &code, &size);
  909    if (!size)
  910       return;
  911 
  912    ret = nouveau_heap_alloc(screen->text_heap, align(size, 0x100), NULL,
  913                             &screen->lib_code);
  914    if (ret)
  915       return;
  916 
  917    nvc0->base.push_data(&nvc0->base,
  918                         screen->text, screen->lib_code->start, NV_VRAM_DOMAIN(&screen->base),
  919                         size, code);
  920    /* no need for a memory barrier, will be emitted with first program */
  921 }
  922 
  923 void
  924 nvc0_program_destroy(struct nvc0_context *nvc0, struct nvc0_program *prog)
  925 {
  926    const struct pipe_shader_state pipe = prog->pipe;
  927    const ubyte type = prog->type;
  928 
  929    if (prog->mem)
  930       nouveau_heap_free(&prog->mem);
  931    FREE(prog->code); /* may be 0 for hardcoded shaders */
  932    FREE(prog->relocs);
  933    FREE(prog->fixups);
  934    if (prog->type == PIPE_SHADER_COMPUTE && prog->cp.syms)
  935       FREE(prog->cp.syms);
  936    if (prog->tfb) {
  937       if (nvc0->state.tfb == prog->tfb)
  938          nvc0->state.tfb = NULL;
  939       FREE(prog->tfb);
  940    }
  941 
  942    memset(prog, 0, sizeof(*prog));
  943 
  944    prog->pipe = pipe;
  945    prog->type = type;
  946 }
  947 
  948 uint32_t
  949 nvc0_program_symbol_offset(const struct nvc0_program *prog, uint32_t label)
  950 {
  951    const struct nv50_ir_prog_symbol *syms =
  952       (const struct nv50_ir_prog_symbol *)prog->cp.syms;
  953    unsigned base = 0;
  954    unsigned i;
  955    if (prog->type != PIPE_SHADER_COMPUTE)
  956       base = NVC0_SHADER_HEADER_SIZE;
  957    for (i = 0; i < prog->cp.num_syms; ++i)
  958       if (syms[i].label == label)
  959          return prog->code_base + base + syms[i].offset;
  960    return prog->code_base; /* no symbols or symbol not found */
  961 }
  962 
  963 void
  964 nvc0_program_init_tcp_empty(struct nvc0_context *nvc0)
  965 {
  966    struct ureg_program *ureg;
  967 
  968    ureg = ureg_create(PIPE_SHADER_TESS_CTRL);
  969    if (!ureg)
  970       return;
  971 
  972    ureg_property(ureg, TGSI_PROPERTY_TCS_VERTICES_OUT, 1);
  973    ureg_END(ureg);
  974 
  975    nvc0->tcp_empty = ureg_create_shader_and_destroy(ureg, &nvc0->base.pipe);
  976 }