"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/broadcom/common/v3d_cpu_tiling.h" (16 Sep 2020, 11923 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file. For more information about "v3d_cpu_tiling.h" see the Fossies "Dox" file reference documentation.

    1 /*
    2  * Copyright © 2017 Broadcom
    3  *
    4  * Permission is hereby granted, free of charge, to any person obtaining a
    5  * copy of this software and associated documentation files (the "Software"),
    6  * to deal in the Software without restriction, including without limitation
    7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
    8  * and/or sell copies of the Software, and to permit persons to whom the
    9  * Software is furnished to do so, subject to the following conditions:
   10  *
   11  * The above copyright notice and this permission notice (including the next
   12  * paragraph) shall be included in all copies or substantial portions of the
   13  * Software.
   14  *
   15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
   18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
   20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
   21  * IN THE SOFTWARE.
   22  */
   23 
   24 /** @file v3d_cpu_tiling.h
   25  *
   26  * Contains load/store functions common to both v3d and vc4.  The utile layout
   27  * stayed the same, though the way utiles get laid out has changed.
   28  */
   29 
   30 static inline void
   31 v3d_load_utile(void *cpu, uint32_t cpu_stride,
   32                void *gpu, uint32_t gpu_stride)
   33 {
   34 #if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
   35         if (gpu_stride == 8) {
   36                 __asm__ volatile (
   37                         /* Load from the GPU in one shot, no interleave, to
   38                          * d0-d7.
   39                          */
   40                         "vldm %[gpu], {q0, q1, q2, q3}\n"
   41                         /* Store each 8-byte line to cpu-side destination,
   42                          * incrementing it by the stride each time.
   43                          */
   44                         "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
   45                         "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
   46                         "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
   47                         "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
   48                         "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
   49                         "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
   50                         "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
   51                         "vst1.8 d7, [%[cpu]]\n"
   52                         : [cpu]         "+r"(cpu)
   53                         : [gpu]         "r"(gpu),
   54                           [cpu_stride]  "r"(cpu_stride)
   55                         : "q0", "q1", "q2", "q3");
   56                 return;
   57         } else if (gpu_stride == 16) {
   58                 void *cpu2 = cpu + 8;
   59                 __asm__ volatile (
   60                         /* Load from the GPU in one shot, no interleave, to
   61                          * d0-d7.
   62                          */
   63                         "vldm %[gpu], {q0, q1, q2, q3};\n"
   64                         /* Store each 16-byte line in 2 parts to the cpu-side
   65                          * destination.  (vld1 can only store one d-register
   66                          * at a time).
   67                          */
   68                         "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
   69                         "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
   70                         "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
   71                         "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
   72                         "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
   73                         "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
   74                         "vst1.8 d6, [%[cpu]]\n"
   75                         "vst1.8 d7, [%[cpu2]]\n"
   76                         : [cpu]         "+r"(cpu),
   77                           [cpu2]        "+r"(cpu2)
   78                         : [gpu]         "r"(gpu),
   79                           [cpu_stride]  "r"(cpu_stride)
   80                         : "q0", "q1", "q2", "q3");
   81                 return;
   82         }
   83 #elif defined (PIPE_ARCH_AARCH64)
   84         if (gpu_stride == 8) {
   85                 __asm__ volatile (
   86                         /* Load from the GPU in one shot, no interleave, to
   87                          * d0-d7.
   88                          */
   89                         "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
   90                         /* Store each 8-byte line to cpu-side destination,
   91                          * incrementing it by the stride each time.
   92                          */
   93                         "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
   94                         "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
   95                         "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
   96                         "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
   97                         "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
   98                         "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
   99                         "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
  100                         "st1 {v3.D}[1], [%[cpu]]\n"
  101                         : [cpu]         "+r"(cpu)
  102                         : [gpu]         "r"(gpu),
  103                           [cpu_stride]  "r"(cpu_stride)
  104                         : "v0", "v1", "v2", "v3");
  105                 return;
  106         } else if (gpu_stride == 16) {
  107                 void *cpu2 = cpu + 8;
  108                 __asm__ volatile (
  109                         /* Load from the GPU in one shot, no interleave, to
  110                          * d0-d7.
  111                          */
  112                         "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
  113                         /* Store each 16-byte line in 2 parts to the cpu-side
  114                          * destination.  (vld1 can only store one d-register
  115                          * at a time).
  116                          */
  117                         "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
  118                         "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
  119                         "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
  120                         "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
  121                         "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
  122                         "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
  123                         "st1 {v3.D}[0], [%[cpu]]\n"
  124                         "st1 {v3.D}[1], [%[cpu2]]\n"
  125                         : [cpu]         "+r"(cpu),
  126                           [cpu2]        "+r"(cpu2)
  127                         : [gpu]         "r"(gpu),
  128                           [cpu_stride]  "r"(cpu_stride)
  129                         : "v0", "v1", "v2", "v3");
  130                 return;
  131         }
  132 #endif
  133 
  134         for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
  135                 memcpy(cpu, gpu + gpu_offset, gpu_stride);
  136                 cpu += cpu_stride;
  137         }
  138 }
  139 
  140 static inline void
  141 v3d_store_utile(void *gpu, uint32_t gpu_stride,
  142                 void *cpu, uint32_t cpu_stride)
  143 {
  144 #if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
  145         if (gpu_stride == 8) {
  146                 __asm__ volatile (
  147                         /* Load each 8-byte line from cpu-side source,
  148                          * incrementing it by the stride each time.
  149                          */
  150                         "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
  151                         "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
  152                         "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
  153                         "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
  154                         "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
  155                         "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
  156                         "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
  157                         "vld1.8 d7, [%[cpu]]\n"
  158                         /* Load from the GPU in one shot, no interleave, to
  159                          * d0-d7.
  160                          */
  161                         "vstm %[gpu], {q0, q1, q2, q3}\n"
  162                         : [cpu]         "+r"(cpu)
  163                         : [gpu]         "r"(gpu),
  164                           [cpu_stride]  "r"(cpu_stride)
  165                         : "q0", "q1", "q2", "q3");
  166                 return;
  167         } else if (gpu_stride == 16) {
  168                 void *cpu2 = cpu + 8;
  169                 __asm__ volatile (
  170                         /* Load each 16-byte line in 2 parts from the cpu-side
  171                          * destination.  (vld1 can only store one d-register
  172                          * at a time).
  173                          */
  174                         "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
  175                         "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
  176                         "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
  177                         "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
  178                         "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
  179                         "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
  180                         "vld1.8 d6, [%[cpu]]\n"
  181                         "vld1.8 d7, [%[cpu2]]\n"
  182                         /* Store to the GPU in one shot, no interleave. */
  183                         "vstm %[gpu], {q0, q1, q2, q3}\n"
  184                         : [cpu]         "+r"(cpu),
  185                           [cpu2]        "+r"(cpu2)
  186                         : [gpu]         "r"(gpu),
  187                           [cpu_stride]  "r"(cpu_stride)
  188                         : "q0", "q1", "q2", "q3");
  189                 return;
  190         }
  191 #elif defined (PIPE_ARCH_AARCH64)
  192         if (gpu_stride == 8) {
  193                 __asm__ volatile (
  194                         /* Load each 8-byte line from cpu-side source,
  195                          * incrementing it by the stride each time.
  196                          */
  197                         "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
  198                         "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
  199                         "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
  200                         "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
  201                         "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
  202                         "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
  203                         "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
  204                         "ld1 {v3.D}[1], [%[cpu]]\n"
  205                         /* Store to the GPU in one shot, no interleave. */
  206                         "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
  207                         : [cpu]         "+r"(cpu)
  208                         : [gpu]         "r"(gpu),
  209                           [cpu_stride]  "r"(cpu_stride)
  210                         : "v0", "v1", "v2", "v3");
  211                 return;
  212         } else if (gpu_stride == 16) {
  213                 void *cpu2 = cpu + 8;
  214                 __asm__ volatile (
  215                         /* Load each 16-byte line in 2 parts from the cpu-side
  216                          * destination.  (vld1 can only store one d-register
  217                          * at a time).
  218                          */
  219                         "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
  220                         "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
  221                         "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
  222                         "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
  223                         "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
  224                         "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
  225                         "ld1 {v3.D}[0], [%[cpu]]\n"
  226                         "ld1 {v3.D}[1], [%[cpu2]]\n"
  227                         /* Store to the GPU in one shot, no interleave. */
  228                         "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
  229                         : [cpu]         "+r"(cpu),
  230                           [cpu2]        "+r"(cpu2)
  231                         : [gpu]         "r"(gpu),
  232                           [cpu_stride]  "r"(cpu_stride)
  233                         : "v0", "v1", "v2", "v3");
  234                 return;
  235         }
  236 #endif
  237 
  238         for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
  239                 memcpy(gpu + gpu_offset, cpu, gpu_stride);
  240                 cpu += cpu_stride;
  241         }
  242 }