"Fossies" - the Fresh Open Source Software Archive

Member "pngcrush-1.8.13/filter_neon_intrinsics.c" (27 Aug 2016, 11303 Bytes) of package /linux/privat/pngcrush-1.8.13.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "filter_neon_intrinsics.c", see the Fossies "Dox" file reference documentation.

    1 
    2 /* filter_neon_intrinsics.c - NEON optimised filter functions
    3  *
    4  * Copyright (c) 2014,2016 Glenn Randers-Pehrson
    5  * Written by James Yu <james.yu at linaro.org>, October 2013.
    6  * Based on filter_neon.S, written by Mans Rullgard, 2011.
    7  *
    8  * Last changed in libpng 1.6.22 [May 26, 2016]
    9  *
   10  * This code is released under the libpng license.
   11  * For conditions of distribution and use, see the disclaimer
   12  * and license in png.h
   13  */
   14 
   15 #include "pngpriv.h"
   16 
   17 #ifdef PNG_READ_SUPPORTED
   18 
   19 /* This code requires -mfpu=neon on the command line: */
   20 #if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */
   21 
   22 #include <arm_neon.h>
   23 
   24 /* libpng row pointers are not necessarily aligned to any particular boundary,
   25  * however this code will only work with appropriate alignment.  arm/arm_init.c
   26  * checks for this (and will not compile unless it is done). This code uses
   27  * variants of png_aligncast to avoid compiler warnings.
   28  */
   29 #define png_ptr(type,pointer) png_aligncast(type *,pointer)
   30 #define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer)
   31 
   32 /* The following relies on a variable 'temp_pointer' being declared with type
   33  * 'type'.  This is written this way just to hide the GCC strict aliasing
   34  * warning; note that the code is safe because there never is an alias between
   35  * the input and output pointers.
   36  */
   37 #define png_ldr(type,pointer)\
   38    (temp_pointer = png_ptr(type,pointer), *temp_pointer)
   39 
   40 #if PNG_ARM_NEON_OPT > 0
   41 
   42 void
   43 png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
   44    png_const_bytep prev_row)
   45 {
   46    png_bytep rp = row;
   47    png_bytep rp_stop = row + row_info->rowbytes;
   48    png_const_bytep pp = prev_row;
   49 
   50    png_debug(1, "in png_read_filter_row_up_neon");
   51 
   52    for (; rp < rp_stop; rp += 16, pp += 16)
   53    {
   54       uint8x16_t qrp, qpp;
   55 
   56       qrp = vld1q_u8(rp);
   57       qpp = vld1q_u8(pp);
   58       qrp = vaddq_u8(qrp, qpp);
   59       vst1q_u8(rp, qrp);
   60    }
   61 }
   62 
   63 void
   64 png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
   65    png_const_bytep prev_row)
   66 {
   67    png_bytep rp = row;
   68    png_bytep rp_stop = row + row_info->rowbytes;
   69 
   70    uint8x16_t vtmp = vld1q_u8(rp);
   71    uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
   72    uint8x8x2_t vrp = *vrpt;
   73 
   74    uint8x8x4_t vdest;
   75    vdest.val[3] = vdup_n_u8(0);
   76 
   77    png_debug(1, "in png_read_filter_row_sub3_neon");
   78 
   79    for (; rp < rp_stop;)
   80    {
   81       uint8x8_t vtmp1, vtmp2;
   82       uint32x2_t *temp_pointer;
   83 
   84       vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
   85       vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
   86       vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);
   87       vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
   88 
   89       vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
   90       vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);
   91       vdest.val[3] = vadd_u8(vdest.val[2], vtmp1);
   92 
   93       vtmp = vld1q_u8(rp + 12);
   94       vrpt = png_ptr(uint8x8x2_t, &vtmp);
   95       vrp = *vrpt;
   96 
   97       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
   98       rp += 3;
   99       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
  100       rp += 3;
  101       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
  102       rp += 3;
  103       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
  104       rp += 3;
  105    }
  106 
  107    PNG_UNUSED(prev_row)
  108 }
  109 
  110 void
  111 png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
  112    png_const_bytep prev_row)
  113 {
  114    png_bytep rp = row;
  115    png_bytep rp_stop = row + row_info->rowbytes;
  116 
  117    uint8x8x4_t vdest;
  118    vdest.val[3] = vdup_n_u8(0);
  119 
  120    png_debug(1, "in png_read_filter_row_sub4_neon");
  121 
  122    for (; rp < rp_stop; rp += 16)
  123    {
  124       uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp));
  125       uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp);
  126       uint8x8x4_t vrp = *vrpt;
  127       uint32x2x4_t *temp_pointer;
  128 
  129       vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
  130       vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
  131       vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]);
  132       vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]);
  133       vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
  134    }
  135 
  136    PNG_UNUSED(prev_row)
  137 }
  138 
  139 void
  140 png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
  141    png_const_bytep prev_row)
  142 {
  143    png_bytep rp = row;
  144    png_const_bytep pp = prev_row;
  145    png_bytep rp_stop = row + row_info->rowbytes;
  146 
  147    uint8x16_t vtmp;
  148    uint8x8x2_t *vrpt;
  149    uint8x8x2_t vrp;
  150    uint8x8x4_t vdest;
  151    vdest.val[3] = vdup_n_u8(0);
  152 
  153    vtmp = vld1q_u8(rp);
  154    vrpt = png_ptr(uint8x8x2_t,&vtmp);
  155    vrp = *vrpt;
  156 
  157    png_debug(1, "in png_read_filter_row_avg3_neon");
  158 
  159    for (; rp < rp_stop; pp += 12)
  160    {
  161       uint8x8_t vtmp1, vtmp2, vtmp3;
  162 
  163       uint8x8x2_t *vppt;
  164       uint8x8x2_t vpp;
  165 
  166       uint32x2_t *temp_pointer;
  167 
  168       vtmp = vld1q_u8(pp);
  169       vppt = png_ptr(uint8x8x2_t,&vtmp);
  170       vpp = *vppt;
  171 
  172       vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
  173       vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
  174       vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
  175 
  176       vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
  177       vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
  178       vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
  179       vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
  180 
  181       vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
  182       vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
  183 
  184       vtmp = vld1q_u8(rp + 12);
  185       vrpt = png_ptr(uint8x8x2_t,&vtmp);
  186       vrp = *vrpt;
  187 
  188       vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
  189       vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);
  190 
  191       vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
  192 
  193       vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
  194       vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);
  195 
  196       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
  197       rp += 3;
  198       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
  199       rp += 3;
  200       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
  201       rp += 3;
  202       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
  203       rp += 3;
  204    }
  205 }
  206 
  207 void
  208 png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
  209    png_const_bytep prev_row)
  210 {
  211    png_bytep rp = row;
  212    png_bytep rp_stop = row + row_info->rowbytes;
  213    png_const_bytep pp = prev_row;
  214 
  215    uint8x8x4_t vdest;
  216    vdest.val[3] = vdup_n_u8(0);
  217 
  218    png_debug(1, "in png_read_filter_row_avg4_neon");
  219 
  220    for (; rp < rp_stop; rp += 16, pp += 16)
  221    {
  222       uint32x2x4_t vtmp;
  223       uint8x8x4_t *vrpt, *vppt;
  224       uint8x8x4_t vrp, vpp;
  225       uint32x2x4_t *temp_pointer;
  226 
  227       vtmp = vld4_u32(png_ptr(uint32_t,rp));
  228       vrpt = png_ptr(uint8x8x4_t,&vtmp);
  229       vrp = *vrpt;
  230       vtmp = vld4_u32(png_ptrc(uint32_t,pp));
  231       vppt = png_ptr(uint8x8x4_t,&vtmp);
  232       vpp = *vppt;
  233 
  234       vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
  235       vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
  236       vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
  237       vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
  238       vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]);
  239       vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
  240       vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
  241       vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
  242 
  243       vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
  244    }
  245 }
  246 
  247 static uint8x8_t
  248 paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
  249 {
  250    uint8x8_t d, e;
  251    uint16x8_t p1, pa, pb, pc;
  252 
  253    p1 = vaddl_u8(a, b); /* a + b */
  254    pc = vaddl_u8(c, c); /* c * 2 */
  255    pa = vabdl_u8(b, c); /* pa */
  256    pb = vabdl_u8(a, c); /* pb */
  257    pc = vabdq_u16(p1, pc); /* pc */
  258 
  259    p1 = vcleq_u16(pa, pb); /* pa <= pb */
  260    pa = vcleq_u16(pa, pc); /* pa <= pc */
  261    pb = vcleq_u16(pb, pc); /* pb <= pc */
  262 
  263    p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */
  264 
  265    d = vmovn_u16(pb);
  266    e = vmovn_u16(p1);
  267 
  268    d = vbsl_u8(d, b, c);
  269    e = vbsl_u8(e, a, d);
  270 
  271    return e;
  272 }
  273 
  274 void
  275 png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
  276    png_const_bytep prev_row)
  277 {
  278    png_bytep rp = row;
  279    png_const_bytep pp = prev_row;
  280    png_bytep rp_stop = row + row_info->rowbytes;
  281 
  282    uint8x16_t vtmp;
  283    uint8x8x2_t *vrpt;
  284    uint8x8x2_t vrp;
  285    uint8x8_t vlast = vdup_n_u8(0);
  286    uint8x8x4_t vdest;
  287    vdest.val[3] = vdup_n_u8(0);
  288 
  289    vtmp = vld1q_u8(rp);
  290    vrpt = png_ptr(uint8x8x2_t,&vtmp);
  291    vrp = *vrpt;
  292 
  293    png_debug(1, "in png_read_filter_row_paeth3_neon");
  294 
  295    for (; rp < rp_stop; pp += 12)
  296    {
  297       uint8x8x2_t *vppt;
  298       uint8x8x2_t vpp;
  299       uint8x8_t vtmp1, vtmp2, vtmp3;
  300       uint32x2_t *temp_pointer;
  301 
  302       vtmp = vld1q_u8(pp);
  303       vppt = png_ptr(uint8x8x2_t,&vtmp);
  304       vpp = *vppt;
  305 
  306       vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
  307       vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
  308 
  309       vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
  310       vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
  311       vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
  312       vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
  313 
  314       vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
  315       vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
  316       vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
  317       vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);
  318 
  319       vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
  320       vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
  321 
  322       vtmp = vld1q_u8(rp + 12);
  323       vrpt = png_ptr(uint8x8x2_t,&vtmp);
  324       vrp = *vrpt;
  325 
  326       vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3);
  327       vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);
  328 
  329       vlast = vtmp2;
  330 
  331       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
  332       rp += 3;
  333       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
  334       rp += 3;
  335       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
  336       rp += 3;
  337       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
  338       rp += 3;
  339    }
  340 }
  341 
  342 void
  343 png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
  344    png_const_bytep prev_row)
  345 {
  346    png_bytep rp = row;
  347    png_bytep rp_stop = row + row_info->rowbytes;
  348    png_const_bytep pp = prev_row;
  349 
  350    uint8x8_t vlast = vdup_n_u8(0);
  351    uint8x8x4_t vdest;
  352    vdest.val[3] = vdup_n_u8(0);
  353 
  354    png_debug(1, "in png_read_filter_row_paeth4_neon");
  355 
  356    for (; rp < rp_stop; rp += 16, pp += 16)
  357    {
  358       uint32x2x4_t vtmp;
  359       uint8x8x4_t *vrpt, *vppt;
  360       uint8x8x4_t vrp, vpp;
  361       uint32x2x4_t *temp_pointer;
  362 
  363       vtmp = vld4_u32(png_ptr(uint32_t,rp));
  364       vrpt = png_ptr(uint8x8x4_t,&vtmp);
  365       vrp = *vrpt;
  366       vtmp = vld4_u32(png_ptrc(uint32_t,pp));
  367       vppt = png_ptr(uint8x8x4_t,&vtmp);
  368       vpp = *vppt;
  369 
  370       vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
  371       vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
  372       vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
  373       vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
  374       vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]);
  375       vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
  376       vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]);
  377       vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
  378 
  379       vlast = vpp.val[3];
  380 
  381       vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
  382    }
  383 }
  384 
  385 #endif /* PNG_ARM_NEON_OPT > 0 */
  386 #endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */
  387 #endif /* READ */