"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/gallium/drivers/r600/r600_hw_context.c" (16 Sep 2020, 20675 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Jerome Glisse
 */
#include "r600_pipe.h"
#include "r600d.h"
#include "util/u_memory.h"
#include <errno.h>
#include <unistd.h>

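/* Make sure the gfx CS has enough space for num_dw dwords plus everything
 * that must still fit at the end of the CS (dirty state atoms when
 * count_draw_in is set, atomic counters, query suspends, streamout end,
 * cache flushes and the fence); flush the CS asynchronously otherwise.
 */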
void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
            boolean count_draw_in, unsigned num_atomics)
{
    /* Flush the DMA IB if it's not empty. */
    if (radeon_emitted(ctx->b.dma.cs, 0))
        ctx->b.dma.flush(ctx, PIPE_FLUSH_ASYNC, NULL);

    if (!radeon_cs_memory_below_limit(ctx->b.screen, ctx->b.gfx.cs,
                      ctx->b.vram, ctx->b.gtt)) {
        ctx->b.gtt = 0;
        ctx->b.vram = 0;
        ctx->b.gfx.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
        return;
    }
    /* All will be accounted for once the relocations are emitted. */
    ctx->b.gtt = 0;
    ctx->b.vram = 0;

    /* Check available space in CS. */
    if (count_draw_in) {
        uint64_t mask;

        /* The number of dwords all the dirty states would take. */
        mask = ctx->dirty_atoms;
        while (mask != 0)
            num_dw += ctx->atoms[u_bit_scan64(&mask)]->num_dw;

        /* The upper-bound of how much space a draw command would take. */
        num_dw += R600_MAX_FLUSH_CS_DWORDS + R600_MAX_DRAW_CS_DWORDS;
    }

    /* add atomic counters, 8 pre + 8 post per counter + 16 post if any counters */
    num_dw += (num_atomics * 16) + (num_atomics ? 16 : 0);

    /* Count in r600_suspend_queries. */
    num_dw += ctx->b.num_cs_dw_queries_suspend;

    /* Count in streamout_end at the end of CS. */
    if (ctx->b.streamout.begin_emitted) {
        num_dw += ctx->b.streamout.num_dw_for_end;
    }

    /* SX_MISC */
    if (ctx->b.chip_class == R600) {
        num_dw += 3;
    }

    /* Count in framebuffer cache flushes at the end of CS. */
    num_dw += R600_MAX_FLUSH_CS_DWORDS;

    /* The fence at the end of CS. */
    num_dw += 10;

    /* Flush if there's not enough space. */
    if (!ctx->b.ws->cs_check_space(ctx->b.gfx.cs, num_dw, false)) {
        ctx->b.gfx.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
    }
}

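/* Emit the wait, partial-flush and SURFACE_SYNC packets requested by the
 * flags accumulated in rctx->b.flags, then clear the flags.
 */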
void r600_flush_emit(struct r600_context *rctx)
{
    struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
    unsigned cp_coher_cntl = 0;
    unsigned wait_until = 0;

    if (!rctx->b.flags) {
        return;
    }

    /* Ensure coherency between streamout and shaders. */
    if (rctx->b.flags & R600_CONTEXT_STREAMOUT_FLUSH)
        rctx->b.flags |= r600_get_flush_flags(R600_COHERENCY_SHADER);

    if (rctx->b.flags & R600_CONTEXT_WAIT_3D_IDLE) {
        wait_until |= S_008040_WAIT_3D_IDLE(1);
    }
    if (rctx->b.flags & R600_CONTEXT_WAIT_CP_DMA_IDLE) {
        wait_until |= S_008040_WAIT_CP_DMA_IDLE(1);
    }

    if (wait_until) {
        /* Use of WAIT_UNTIL is deprecated on Cayman+ */
        if (rctx->b.family >= CHIP_CAYMAN) {
            /* emit a PS partial flush on Cayman/TN */
            rctx->b.flags |= R600_CONTEXT_PS_PARTIAL_FLUSH;
        }
    }

    /* Wait packets must be executed first, because SURFACE_SYNC doesn't
     * wait for shaders if it's not flushing CB or DB.
     */
    if (rctx->b.flags & R600_CONTEXT_PS_PARTIAL_FLUSH) {
        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
        radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
    }

    if (rctx->b.flags & R600_CONTEXT_CS_PARTIAL_FLUSH) {
        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
        radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
    }

    if (wait_until) {
        /* Use of WAIT_UNTIL is deprecated on Cayman+ */
        if (rctx->b.family < CHIP_CAYMAN) {
            /* wait for things to settle */
            radeon_set_config_reg(cs, R_008040_WAIT_UNTIL, wait_until);
        }
    }

    if (rctx->b.chip_class >= R700 &&
        (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV_CB_META)) {
        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
        radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
    }

    if (rctx->b.chip_class >= R700 &&
        (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV_DB_META)) {
        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
        radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));

        /* Set FULL_CACHE_ENA for DB META flushes on r7xx and later.
         *
         * This hack predates use of FLUSH_AND_INV_DB_META, so it's
         * unclear whether it's still needed or even whether it has
         * any effect.
         */
        cp_coher_cntl |= S_0085F0_FULL_CACHE_ENA(1);
    }

    if (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV ||
        (rctx->b.chip_class == R600 && rctx->b.flags & R600_CONTEXT_STREAMOUT_FLUSH)) {
        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
        radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT) | EVENT_INDEX(0));
    }

    if (rctx->b.flags & R600_CONTEXT_INV_CONST_CACHE) {
        /* Direct constant addressing uses the shader cache.
         * Indirect constant addressing uses the vertex cache. */
        cp_coher_cntl |= S_0085F0_SH_ACTION_ENA(1) |
                 (rctx->has_vertex_cache ? S_0085F0_VC_ACTION_ENA(1)
                             : S_0085F0_TC_ACTION_ENA(1));
    }
    if (rctx->b.flags & R600_CONTEXT_INV_VERTEX_CACHE) {
        cp_coher_cntl |= rctx->has_vertex_cache ? S_0085F0_VC_ACTION_ENA(1)
                            : S_0085F0_TC_ACTION_ENA(1);
    }
    if (rctx->b.flags & R600_CONTEXT_INV_TEX_CACHE) {
        /* Textures use the texture cache.
         * Texture buffer objects use the vertex cache. */
        cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1) |
                 (rctx->has_vertex_cache ? S_0085F0_VC_ACTION_ENA(1) : 0);
    }

    /* Don't use the DB CP COHER logic on r6xx.
     * There are hw bugs.
     */
    if (rctx->b.chip_class >= R700 &&
        (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV_DB)) {
        cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
                S_0085F0_DB_DEST_BASE_ENA(1) |
                S_0085F0_SMX_ACTION_ENA(1);
    }

    /* Don't use the CB CP COHER logic on r6xx.
     * There are hw bugs.
     */
    if (rctx->b.chip_class >= R700 &&
        (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV_CB)) {
        cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
                S_0085F0_CB0_DEST_BASE_ENA(1) |
                S_0085F0_CB1_DEST_BASE_ENA(1) |
                S_0085F0_CB2_DEST_BASE_ENA(1) |
                S_0085F0_CB3_DEST_BASE_ENA(1) |
                S_0085F0_CB4_DEST_BASE_ENA(1) |
                S_0085F0_CB5_DEST_BASE_ENA(1) |
                S_0085F0_CB6_DEST_BASE_ENA(1) |
                S_0085F0_CB7_DEST_BASE_ENA(1) |
                S_0085F0_SMX_ACTION_ENA(1);
        if (rctx->b.chip_class >= EVERGREEN)
            cp_coher_cntl |= S_0085F0_CB8_DEST_BASE_ENA(1) |
                    S_0085F0_CB9_DEST_BASE_ENA(1) |
                    S_0085F0_CB10_DEST_BASE_ENA(1) |
                    S_0085F0_CB11_DEST_BASE_ENA(1);
    }

    if (rctx->b.chip_class >= R700 &&
        rctx->b.flags & R600_CONTEXT_STREAMOUT_FLUSH) {
        cp_coher_cntl |= S_0085F0_SO0_DEST_BASE_ENA(1) |
                S_0085F0_SO1_DEST_BASE_ENA(1) |
                S_0085F0_SO2_DEST_BASE_ENA(1) |
                S_0085F0_SO3_DEST_BASE_ENA(1) |
                S_0085F0_SMX_ACTION_ENA(1);
    }

    /* Workaround for buggy flushing on some R6xx chipsets. */
    if ((rctx->b.flags & (R600_CONTEXT_FLUSH_AND_INV |
                  R600_CONTEXT_STREAMOUT_FLUSH)) &&
        (rctx->b.family == CHIP_RV670 ||
         rctx->b.family == CHIP_RS780 ||
         rctx->b.family == CHIP_RS880)) {
        cp_coher_cntl |=  S_0085F0_CB1_DEST_BASE_ENA(1) |
                  S_0085F0_DEST_BASE_0_ENA(1);
    }

    if (cp_coher_cntl) {
        radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
        radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */
        radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */
        radeon_emit(cs, 0);               /* CP_COHER_BASE */
        radeon_emit(cs, 0x0000000A);      /* POLL_INTERVAL */
    }

    if (rctx->b.flags & R600_CONTEXT_START_PIPELINE_STATS) {
        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
        radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_START) |
                    EVENT_INDEX(0));
    } else if (rctx->b.flags & R600_CONTEXT_STOP_PIPELINE_STATS) {
        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
        radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_STOP) |
                    EVENT_INDEX(0));
    }

    /* everything is properly flushed */
    rctx->b.flags = 0;
}

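/* Flush and submit the gfx CS: flush/invalidate the framebuffer caches,
 * wait for idle, hand the IB to the winsys (optionally returning a fence)
 * and begin a new CS.
 */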
void r600_context_gfx_flush(void *context, unsigned flags,
                struct pipe_fence_handle **fence)
{
    struct r600_context *ctx = context;
    struct radeon_cmdbuf *cs = ctx->b.gfx.cs;
    struct radeon_winsys *ws = ctx->b.ws;

    if (!radeon_emitted(cs, ctx->b.initial_gfx_cs_size))
        return;

    if (r600_check_device_reset(&ctx->b))
        return;

    r600_preflush_suspend_features(&ctx->b);

    /* flush the framebuffer cache */
    ctx->b.flags |= R600_CONTEXT_FLUSH_AND_INV |
              R600_CONTEXT_FLUSH_AND_INV_CB |
              R600_CONTEXT_FLUSH_AND_INV_DB |
              R600_CONTEXT_FLUSH_AND_INV_CB_META |
              R600_CONTEXT_FLUSH_AND_INV_DB_META |
              R600_CONTEXT_WAIT_3D_IDLE |
              R600_CONTEXT_WAIT_CP_DMA_IDLE;

    r600_flush_emit(ctx);

    if (ctx->trace_buf)
        eg_trace_emit(ctx);
    /* old kernels and userspace don't set SX_MISC, so we must reset it to 0 here */
    if (ctx->b.chip_class == R600) {
        radeon_set_context_reg(cs, R_028350_SX_MISC, 0);
    }

    if (ctx->is_debug) {
        /* Save the IB for debug contexts. */
        radeon_clear_saved_cs(&ctx->last_gfx);
        radeon_save_cs(ws, cs, &ctx->last_gfx, true);
        r600_resource_reference(&ctx->last_trace_buf, ctx->trace_buf);
        r600_resource_reference(&ctx->trace_buf, NULL);
    }
    /* Flush the CS. */
    ws->cs_flush(cs, flags, &ctx->b.last_gfx_fence);
    if (fence)
        ws->fence_reference(fence, ctx->b.last_gfx_fence);
    ctx->b.num_gfx_cs_flushes++;

    if (ctx->is_debug) {
        if (!ws->fence_wait(ws, ctx->b.last_gfx_fence, 10000000)) {
            const char *fname = getenv("R600_TRACE");
            if (!fname)
                exit(-1);
            FILE *fl = fopen(fname, "w+");
            if (fl) {
                eg_dump_debug_state(&ctx->b.b, fl, 0);
                fclose(fl);
            } else
                perror(fname);
            exit(-1);
        }
    }
    r600_begin_new_cs(ctx);
}

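/* Start a new gfx CS: emit the start-of-CS command buffer and mark all state
 * atoms, shader resources and draw state dirty so they are re-emitted into
 * the new command stream.
 */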
void r600_begin_new_cs(struct r600_context *ctx)
{
    unsigned shader;

    if (ctx->is_debug) {
        uint32_t zero = 0;

        /* Create a buffer used for writing trace IDs and initialize it to 0. */
        assert(!ctx->trace_buf);
        ctx->trace_buf = (struct r600_resource*)
            pipe_buffer_create(ctx->b.b.screen, 0,
                       PIPE_USAGE_STAGING, 4);
        if (ctx->trace_buf)
            pipe_buffer_write_nooverlap(&ctx->b.b, &ctx->trace_buf->b.b,
                            0, sizeof(zero), &zero);
        ctx->trace_id = 0;
    }

    if (ctx->trace_buf)
        eg_trace_emit(ctx);

    ctx->b.flags = 0;
    ctx->b.gtt = 0;
    ctx->b.vram = 0;

    /* Begin a new CS. */
    r600_emit_command_buffer(ctx->b.gfx.cs, &ctx->start_cs_cmd);

    /* Re-emit states. */
    r600_mark_atom_dirty(ctx, &ctx->alphatest_state.atom);
    r600_mark_atom_dirty(ctx, &ctx->blend_color.atom);
    r600_mark_atom_dirty(ctx, &ctx->cb_misc_state.atom);
    r600_mark_atom_dirty(ctx, &ctx->clip_misc_state.atom);
    r600_mark_atom_dirty(ctx, &ctx->clip_state.atom);
    r600_mark_atom_dirty(ctx, &ctx->db_misc_state.atom);
    r600_mark_atom_dirty(ctx, &ctx->db_state.atom);
    r600_mark_atom_dirty(ctx, &ctx->framebuffer.atom);
    if (ctx->b.chip_class >= EVERGREEN) {
        r600_mark_atom_dirty(ctx, &ctx->fragment_images.atom);
        r600_mark_atom_dirty(ctx, &ctx->fragment_buffers.atom);
        r600_mark_atom_dirty(ctx, &ctx->compute_images.atom);
        r600_mark_atom_dirty(ctx, &ctx->compute_buffers.atom);
    }
    r600_mark_atom_dirty(ctx, &ctx->hw_shader_stages[R600_HW_STAGE_PS].atom);
    r600_mark_atom_dirty(ctx, &ctx->poly_offset_state.atom);
    r600_mark_atom_dirty(ctx, &ctx->vgt_state.atom);
    r600_mark_atom_dirty(ctx, &ctx->sample_mask.atom);
    ctx->b.scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
    r600_mark_atom_dirty(ctx, &ctx->b.scissors.atom);
    ctx->b.viewports.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
    ctx->b.viewports.depth_range_dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
    r600_mark_atom_dirty(ctx, &ctx->b.viewports.atom);
    if (ctx->b.chip_class <= EVERGREEN) {
        r600_mark_atom_dirty(ctx, &ctx->config_state.atom);
    }
    r600_mark_atom_dirty(ctx, &ctx->stencil_ref.atom);
    r600_mark_atom_dirty(ctx, &ctx->vertex_fetch_shader.atom);
    r600_mark_atom_dirty(ctx, &ctx->hw_shader_stages[R600_HW_STAGE_ES].atom);
    r600_mark_atom_dirty(ctx, &ctx->shader_stages.atom);
    if (ctx->gs_shader) {
        r600_mark_atom_dirty(ctx, &ctx->hw_shader_stages[R600_HW_STAGE_GS].atom);
        r600_mark_atom_dirty(ctx, &ctx->gs_rings.atom);
    }
    if (ctx->tes_shader) {
        r600_mark_atom_dirty(ctx, &ctx->hw_shader_stages[EG_HW_STAGE_HS].atom);
        r600_mark_atom_dirty(ctx, &ctx->hw_shader_stages[EG_HW_STAGE_LS].atom);
    }
    r600_mark_atom_dirty(ctx, &ctx->hw_shader_stages[R600_HW_STAGE_VS].atom);
    r600_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
    r600_mark_atom_dirty(ctx, &ctx->b.render_cond_atom);

    if (ctx->blend_state.cso)
        r600_mark_atom_dirty(ctx, &ctx->blend_state.atom);
    if (ctx->dsa_state.cso)
        r600_mark_atom_dirty(ctx, &ctx->dsa_state.atom);
    if (ctx->rasterizer_state.cso)
        r600_mark_atom_dirty(ctx, &ctx->rasterizer_state.atom);

    if (ctx->b.chip_class <= R700) {
        r600_mark_atom_dirty(ctx, &ctx->seamless_cube_map.atom);
    }

    ctx->vertex_buffer_state.dirty_mask = ctx->vertex_buffer_state.enabled_mask;
    r600_vertex_buffers_dirty(ctx);

    /* Re-emit shader resources. */
    for (shader = 0; shader < PIPE_SHADER_TYPES; shader++) {
        struct r600_constbuf_state *constbuf = &ctx->constbuf_state[shader];
        struct r600_textures_info *samplers = &ctx->samplers[shader];

        constbuf->dirty_mask = constbuf->enabled_mask;
        samplers->views.dirty_mask = samplers->views.enabled_mask;
        samplers->states.dirty_mask = samplers->states.enabled_mask;

        r600_constant_buffers_dirty(ctx, constbuf);
        r600_sampler_views_dirty(ctx, &samplers->views);
        r600_sampler_states_dirty(ctx, &samplers->states);
    }

    for (shader = 0; shader < ARRAY_SIZE(ctx->scratch_buffers); shader++) {
        ctx->scratch_buffers[shader].dirty = true;
    }

    r600_postflush_resume_features(&ctx->b);

    /* Re-emit the draw state. */
    ctx->last_primitive_type = -1;
    ctx->last_start_instance = -1;
    ctx->last_rast_prim      = -1;
    ctx->current_rast_prim   = -1;

    assert(!ctx->b.gfx.cs->prev_dw);
    ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->current.cdw;
}

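/* Make the PFP wait until the ME has caught up, either with the PFP_SYNC_ME
 * packet (Evergreen+ with drm minor >= 46) or by emulating it with a
 * MEM_WRITE from the ME followed by a WAIT_REG_MEM in the PFP.
 */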
void r600_emit_pfp_sync_me(struct r600_context *rctx)
{
    struct radeon_cmdbuf *cs = rctx->b.gfx.cs;

    if (rctx->b.chip_class >= EVERGREEN &&
        rctx->b.screen->info.drm_minor >= 46) {
        radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
        radeon_emit(cs, 0);
    } else {
        /* Emulate PFP_SYNC_ME by writing a value to memory in ME and
         * waiting for it in PFP.
         */
        struct r600_resource *buf = NULL;
        unsigned offset, reloc;
        uint64_t va;

        /* 16-byte address alignment is required by WAIT_REG_MEM. */
        u_suballocator_alloc(rctx->b.allocator_zeroed_memory, 4, 16,
                     &offset, (struct pipe_resource**)&buf);
        if (!buf) {
            /* This is too heavyweight, but will work. */
            rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
            return;
        }

        reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, buf,
                          RADEON_USAGE_READWRITE,
                          RADEON_PRIO_FENCE);

        va = buf->gpu_address + offset;
        assert(va % 16 == 0);

        /* Write 1 to memory in ME. */
        radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0));
        radeon_emit(cs, va);
        radeon_emit(cs, ((va >> 32) & 0xff) | MEM_WRITE_32_BITS);
        radeon_emit(cs, 1);
        radeon_emit(cs, 0);

        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
        radeon_emit(cs, reloc);

        /* Wait in PFP (PFP can only do GEQUAL against memory). */
        radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
        radeon_emit(cs, WAIT_REG_MEM_GEQUAL |
                    WAIT_REG_MEM_MEMORY |
                    WAIT_REG_MEM_PFP);
        radeon_emit(cs, va);
        radeon_emit(cs, va >> 32);
        radeon_emit(cs, 1); /* reference value */
        radeon_emit(cs, 0xffffffff); /* mask */
        radeon_emit(cs, 4); /* poll interval */

        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
        radeon_emit(cs, reloc);

        r600_resource_reference(&buf, NULL);
    }
}

/* The max number of bytes to copy per packet. */
#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)

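/* Copy a buffer range using CP DMA on the gfx ring, splitting the copy into
 * packets of at most CP_DMA_MAX_BYTE_COUNT bytes and requesting CP_SYNC on
 * the last one.
 */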
void r600_cp_dma_copy_buffer(struct r600_context *rctx,
                 struct pipe_resource *dst, uint64_t dst_offset,
                 struct pipe_resource *src, uint64_t src_offset,
                 unsigned size)
{
    struct radeon_cmdbuf *cs = rctx->b.gfx.cs;

    assert(size);
    assert(rctx->screen->b.has_cp_dma);

    /* Mark the buffer range of destination as valid (initialized),
     * so that transfer_map knows it should wait for the GPU when mapping
     * that range. */
    util_range_add(dst, &r600_resource(dst)->valid_buffer_range, dst_offset,
               dst_offset + size);

    dst_offset += r600_resource(dst)->gpu_address;
    src_offset += r600_resource(src)->gpu_address;

    /* Flush the caches where the resources are bound. */
    rctx->b.flags |= r600_get_flush_flags(R600_COHERENCY_SHADER) |
             R600_CONTEXT_WAIT_3D_IDLE;

    /* There are differences between R700 and EG in CP DMA,
     * but we only use the common bits here. */
    while (size) {
        unsigned sync = 0;
        unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
        unsigned src_reloc, dst_reloc;

        r600_need_cs_space(rctx,
                   10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
                   3 + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE, 0);

        /* Flush the caches for the first copy only. */
        if (rctx->b.flags) {
            r600_flush_emit(rctx);
        }

        /* Do the synchronization after the last copy, so that all data is written to memory. */
        if (size == byte_count) {
            sync = PKT3_CP_DMA_CP_SYNC;
        }

        /* This must be done after r600_need_cs_space. */
        src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)src,
                          RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
        dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)dst,
                          RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);

        radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
        radeon_emit(cs, src_offset);    /* SRC_ADDR_LO [31:0] */
        radeon_emit(cs, sync | ((src_offset >> 32) & 0xff));        /* CP_SYNC [31] | SRC_ADDR_HI [7:0] */
        radeon_emit(cs, dst_offset);    /* DST_ADDR_LO [31:0] */
        radeon_emit(cs, (dst_offset >> 32) & 0xff);     /* DST_ADDR_HI [7:0] */
        radeon_emit(cs, byte_count);    /* COMMAND [29:22] | BYTE_COUNT [20:0] */

        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
        radeon_emit(cs, src_reloc);
        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
        radeon_emit(cs, dst_reloc);

        size -= byte_count;
        src_offset += byte_count;
        dst_offset += byte_count;
    }

    /* CP_DMA_CP_SYNC doesn't wait for idle on R6xx, but this does. */
    if (rctx->b.chip_class == R600)
        radeon_set_config_reg(cs, R_008040_WAIT_UNTIL,
                      S_008040_WAIT_CP_DMA_IDLE(1));

    /* CP DMA is executed in ME, but index buffers are read by PFP.
     * This ensures that ME (CP DMA) is idle before PFP starts fetching
     * indices. If we wanted to execute CP DMA in PFP, this packet
     * should precede it.
     */
    r600_emit_pfp_sync_me(rctx);
}

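/* Copy a dword-aligned buffer range on the DMA ring, split into
 * DMA_PACKET_COPY packets of at most R600_DMA_COPY_MAX_SIZE_DW dwords each.
 */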
void r600_dma_copy_buffer(struct r600_context *rctx,
              struct pipe_resource *dst,
              struct pipe_resource *src,
              uint64_t dst_offset,
              uint64_t src_offset,
              uint64_t size)
{
    struct radeon_cmdbuf *cs = rctx->b.dma.cs;
    unsigned i, ncopy, csize;
    struct r600_resource *rdst = (struct r600_resource*)dst;
    struct r600_resource *rsrc = (struct r600_resource*)src;

    /* Mark the buffer range of destination as valid (initialized),
     * so that transfer_map knows it should wait for the GPU when mapping
     * that range. */
    util_range_add(&rdst->b.b, &rdst->valid_buffer_range, dst_offset,
               dst_offset + size);

    size >>= 2; /* convert to dwords */
    ncopy = (size / R600_DMA_COPY_MAX_SIZE_DW) + !!(size % R600_DMA_COPY_MAX_SIZE_DW);

    r600_need_dma_space(&rctx->b, ncopy * 5, rdst, rsrc);
    for (i = 0; i < ncopy; i++) {
        csize = size < R600_DMA_COPY_MAX_SIZE_DW ? size : R600_DMA_COPY_MAX_SIZE_DW;
        /* Emit the relocs before writing the CS so that the CS is always in a consistent state. */
        radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc, RADEON_USAGE_READ, 0);
        radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst, RADEON_USAGE_WRITE, 0);
        radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize));
        radeon_emit(cs, dst_offset & 0xfffffffc);
        radeon_emit(cs, src_offset & 0xfffffffc);
        radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
        radeon_emit(cs, (src_offset >> 32UL) & 0xff);
        dst_offset += csize << 2;
        src_offset += csize << 2;
        size -= csize;
    }
}