"Fossies" - the Fresh Open Source Software Archive

Member "pngcrush-1.8.13/filter_vsx_intrinsics.c" (16 Mar 2017, 25480 Bytes) of package /linux/privat/pngcrush-1.8.13.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "filter_vsx_intrinsics.c" see the Fossies "Dox" file reference documentation.

/* filter_vsx_intrinsics.c - PowerPC optimised filter functions
 *
 * Copyright (c) 2017 Glenn Randers-Pehrson
 * Written by Vadim Barkov, 2017.
 * Last changed in libpng 1.6.29 [(PENDING RELEASE)]
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */
#include <stdio.h>
#include <stdint.h>
#include "pngpriv.h"

#ifdef PNG_READ_SUPPORTED

/* This code requires -maltivec and -mvsx on the command line: */
#if PNG_POWERPC_VSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */

#include <altivec.h>

#if PNG_POWERPC_VSX_OPT > 0

#ifndef __VSX__
#  error "This code requires VSX support (POWER7 and later). Please provide -mvsx compiler flag."
#endif

#define vec_ld_unaligned(vec,data) vec = vec_vsx_ld(0,data)
#define vec_st_unaligned(vec,data) vec_vsx_st(vec,0,data)
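
/* vec_ld/vec_st require 16-byte aligned addresses (the low four address
 * bits are ignored), whereas vec_vsx_ld/vec_vsx_st accept unaligned
 * pointers; these wrappers are therefore used for the possibly-unaligned
 * previous-row data.
 */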


/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
 * They're positioned like this:
 *    prev:  c b
 *    row:   a d
 * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
 * whichever of a, b, or c is closest to p=a+b-c.
 * (this is taken from ../intel/filter_sse2_intrinsics.c)
 */
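
/* For example, with a=10, b=20, c=5 the Paeth base is p = a+b-c = 25; the
 * distances are |p-a| = 15, |p-b| = 5, |p-c| = 20, so b is chosen.  Ties
 * are broken in favour of a, then b, then c.
 */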

#define vsx_declare_common_vars(row_info,row,prev_row,offset) \
   png_byte i;\
   png_bytep rp = row + offset;\
   png_const_bytep pp = prev_row;\
   png_size_t unaligned_top = 16 - (((png_size_t)rp % 16));\
   png_size_t istop;\
   if(unaligned_top == 16)\
      unaligned_top = 0;\
   istop = row_info->rowbytes;\
   if((unaligned_top < istop))\
      istop -= unaligned_top;\
   else{\
      unaligned_top = istop;\
      istop = 0;\
   }
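
/* After this macro runs, unaligned_top holds the number of leading bytes
 * that must be handled with scalar code before rp reaches a 16-byte
 * boundary, and istop holds the number of bytes remaining after that
 * scalar prefix (the part eligible for vector processing).
 */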

void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row,
                                png_const_bytep prev_row)
{
   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vsx_declare_common_vars(row_info,row,prev_row,0)

   /* Altivec operations require 16-byte aligned data
    * but input can be unaligned. So we process the
    * unaligned leading bytes with scalar code.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      rp_vec = vec_ld(0,rp);
      vec_ld_unaligned(pp_vec,pp);

      rp_vec = vec_add(rp_vec,pp_vec);

      vec_st(rp_vec,0,rp);

      pp += 16;
      rp += 16;
      istop -= 16;
   }

   if(istop > 0)
   {
      /* If the byte count of the row is not divisible by 16,
       * process the remaining bytes with scalar code.
       */
      for (i = 0; i < istop; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
         rp++;
      }
   }
}

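/* The vec_perm masks below select bytes from the concatenation of their
 * first operand (indices 0-15) and VSX_CHAR_ZERO (indices 16-31), so an
 * index of 16 always produces zero.  VSX_LEFTSHIFTEDn_bpp copies pixel n-1
 * of the row vector into the byte positions of pixel n, and
 * VSX_NOT_SHIFTEDn_bpp selects pixel n of the previous row in place; the
 * filters below use these to reconstruct one pixel of the vector at a time.
 */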
static const vector unsigned char VSX_LEFTSHIFTED1_4 = {16,16,16,16, 0, 1, 2, 3,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED2_4 = {16,16,16,16,16,16,16,16, 4, 5, 6, 7,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 8, 9,10,11};

static const vector unsigned char VSX_LEFTSHIFTED1_3 = {16,16,16, 0, 1, 2,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED2_3 = {16,16,16,16,16,16, 3, 4, 5,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 6, 7, 8,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 9,10,11,16};

static const vector unsigned char VSX_NOT_SHIFTED1_4 = {16,16,16,16, 4, 5, 6, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED2_4 = {16,16,16,16,16,16,16,16, 8, 9,10,11,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,15};

static const vector unsigned char VSX_NOT_SHIFTED1_3 = {16,16,16, 3, 4, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED2_3 = {16,16,16,16,16,16, 6, 7, 8,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 9,10,11,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16};

static const vector unsigned char VSX_CHAR_ZERO = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
#ifdef __LITTLE_ENDIAN__

static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = { 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = { 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {12,16,13,16,14,16,15,16,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 0, 2, 4, 6,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 0, 2, 4, 6,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4, 6};

static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = { 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = { 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = { 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {12,16,13,16,14,16,16,16,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 0, 2, 4,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 0, 2, 4,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 0, 2, 4,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4,16};

#elif defined(__BIG_ENDIAN__)

static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = {16, 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = {16, 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {16,12,16,13,16,14,16,15,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 1, 3, 5, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 1, 3, 5, 7,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5, 7};

static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = {16, 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = {16, 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = {16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {16,12,16,13,16,14,16,16,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 1, 3, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 1, 3, 5,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 1, 3, 5,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5,16};

#endif

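/* The CHAR_TO_SHORT masks widen one pixel's bytes into 16-bit lanes (the
 * Paeth code needs signed, wider-than-byte arithmetic), and the
 * SHORT_TO_CHAR masks narrow the results back into that pixel's byte
 * positions; both are applied through the macros below.
 */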
#define vsx_char_to_short(vec,offset,bpp) (vector unsigned short)vec_perm((vec),VSX_CHAR_ZERO,VSX_CHAR_TO_SHORT##offset##_##bpp)
#define vsx_short_to_char(vec,offset,bpp) vec_perm(((vector unsigned char)(vec)),VSX_CHAR_ZERO,VSX_SHORT_TO_CHAR##offset##_##bpp)

#ifdef PNG_USE_ABS
#  define vsx_abs(number) abs(number)
#else
#  define vsx_abs(number) ((number) > 0 ? (number) : -(number))
#endif

void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   const png_byte bpp = 4;

   vector unsigned char rp_vec;
   vector unsigned char part_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)

   PNG_UNUSED(pp)

   /* Altivec operations require 16-byte aligned data
    * but input can be unaligned. So we process the
    * unaligned leading bytes with scalar code.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      for(i=0;i < bpp ; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
         rp++;
      }
      rp -= bpp;
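
      /* The first pixel of this 16-byte block was completed above with
       * scalar code, so back up, load the block, and let each perm+add
       * step add the just-completed pixel to the next one (a running sum,
       * which is exactly the Sub filter).
       */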

      rp_vec = vec_ld(0,rp);
      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4);
      rp_vec = vec_add(rp_vec,part_vec);

      vec_st(rp_vec,0,rp);

      rp += 16;
      istop -= 16;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp - bpp))) & 0xff);
         rp++;
      }

}

void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   const png_byte bpp = 3;

   vector unsigned char rp_vec;
   vector unsigned char part_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)

   PNG_UNUSED(pp)

   /* Altivec operations require 16-byte aligned data
    * but input can be unaligned. So we process the
    * unaligned leading bytes with scalar code.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      for(i=0;i < bpp ; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
         rp++;
      }
      rp -= bpp;

      rp_vec = vec_ld(0,rp);
      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3);
      rp_vec = vec_add(rp_vec,part_vec);

      vec_st(rp_vec,0,rp);
      rp += 15;
      istop -= 16;

      /* Since 16 % bpp = 16 % 3 = 1, the last byte of this block must
       * be processed manually.
       */
      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
      rp++;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
         rp++;
      }
}

void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   const png_byte bpp = 4;

   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned char pp_part_vec;
   vector unsigned char rp_part_vec;
   vector unsigned char avg_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if(istop >= bpp)
      istop -= bpp;

   for (i = 0; i < bpp; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         ((int)(*pp++) / 2 )) & 0xff);

      rp++;
   }

   /* Altivec operations require 16-byte aligned data
    * but input can be unaligned. So we process the
    * unaligned leading bytes with scalar code.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      for(i=0;i < bpp ; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
      rp -= bpp;
      pp -= bpp;

      vec_ld_unaligned(pp_vec,pp);
      rp_vec = vec_ld(0,rp);

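      /* vec_avg() rounds up ((x + y + 1) >> 1), but the PNG Average filter
       * needs the truncating average (x + y) >> 1, so the rounding bit
       * ((x ^ y) & 1) is subtracted from each lane after averaging.  As in
       * the Sub filter, each perm+add step completes one pixel using the
       * pixel just reconstructed to its left and the pixel above from
       * prev_row.
       */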
      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_4);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_4);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_4);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      vec_st(rp_vec,0,rp);

      rp += 16;
      pp += 16;
      istop -= 16;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
}

void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
  const png_byte bpp = 3;

  vector unsigned char rp_vec;
  vector unsigned char pp_vec;
  vector unsigned char pp_part_vec;
  vector unsigned char rp_part_vec;
  vector unsigned char avg_vec;

  vsx_declare_common_vars(row_info,row,prev_row,bpp)
  rp -= bpp;
  if(istop >= bpp)
     istop -= bpp;

  for (i = 0; i < bpp; i++)
  {
     *rp = (png_byte)(((int)(*rp) +
        ((int)(*pp++) / 2 )) & 0xff);

     rp++;
  }

  /* Altivec operations require 16-byte aligned data
   * but input can be unaligned. So we process the
   * unaligned leading bytes with scalar code.
   */
  for (i = 0; i < unaligned_top; i++)
  {
     *rp = (png_byte)(((int)(*rp) +
        (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

     rp++;
  }

  /* Using SIMD while we can */
  while( istop >= 16 )
  {
     for(i=0;i < bpp ; i++)
     {
        *rp = (png_byte)(((int)(*rp) +
           (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

        rp++;
     }
     rp -= bpp;
     pp -= bpp;

     vec_ld_unaligned(pp_vec,pp);
     rp_vec = vec_ld(0,rp);

     rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3);
     pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_3);
     avg_vec = vec_avg(rp_part_vec,pp_part_vec);
     avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
     rp_vec = vec_add(rp_vec,avg_vec);

     rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3);
     pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_3);
     avg_vec = vec_avg(rp_part_vec,pp_part_vec);
     avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
     rp_vec = vec_add(rp_vec,avg_vec);

     rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3);
     pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_3);
     avg_vec = vec_avg(rp_part_vec,pp_part_vec);
     avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
     rp_vec = vec_add(rp_vec,avg_vec);

     rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3);
     pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED4_3);
     avg_vec = vec_avg(rp_part_vec,pp_part_vec);
     avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
     rp_vec = vec_add(rp_vec,avg_vec);

     vec_st(rp_vec,0,rp);

     rp += 15;
     pp += 15;
     istop -= 16;

     /* Since 16 % bpp = 16 % 3 = 1, the last byte of this block must
      * be processed manually.
      */
     *rp = (png_byte)(((int)(*rp) +
        (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
     rp++;
  }

  if(istop > 0)
     for (i = 0; i < istop % 16; i++)
     {
        *rp = (png_byte)(((int)(*rp) +
           (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

        rp++;
     }
}

/* Bytewise c ? t : e. */
#define if_then_else(c,t,e) vec_sel(e,t,c)

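/* Scalar Paeth step for one byte: pick whichever of a (left), b (above) or
 * c (above-left) is closest to p = a + b - c, using the distances
 * pa = |b-c|, pb = |a-c|, pc = |a+b-2c|, then add it to *rp.
 */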
#define vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) {\
      c = *(pp - bpp);\
      a = *(rp - bpp);\
      b = *pp++;\
      p = b - c;\
      pc = a - c;\
      pa = vsx_abs(p);\
      pb = vsx_abs(pc);\
      pc = vsx_abs(p + pc);\
      if (pb < pa) pa = pb, a = b;\
      if (pc < pa) a = c;\
      a += *rp;\
      *rp++ = (png_byte)a;\
      }

void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   const png_byte bpp = 4;

   int a, b, c, pa, pb, pc, p;
   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
   vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if(istop >= bpp)
      istop -= bpp;

   /* Process the first pixel in the row completely (this is the same as 'up'
    * because there is only one candidate predictor for the first pixel).
    */
   for(i = 0; i < bpp ; i++)
   {
      *rp = (png_byte)( *rp + *pp);
      rp++;
      pp++;
   }

   for(i = 0; i < unaligned_top ; i++)
   {
      vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
   }

   while( istop >= 16)
   {
      for(i = 0; i < bpp ; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }

      rp -= bpp;
      pp -= bpp;
      rp_vec = vec_ld(0,rp);
      vec_ld_unaligned(pp_vec,pp);

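      /* Each block below handles one pixel: widen a (the pixel just
       * completed to the left), b (above) and c (above-left) to 16-bit
       * lanes, compute pa = |b-c|, pb = |a-c| and pc = |a+b-2c|, select
       * whichever of a, b, c has the smallest distance (ties favouring a,
       * then b), and add it to the filtered bytes.
       */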
      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_4),1,4);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec =  if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
              vec_cmpeq(pb_vec,smallest_vec),
              b_vec,
              c_vec
              )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,4)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_4),2,4);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec =  if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
              vec_cmpeq(pb_vec,smallest_vec),
              b_vec,
              c_vec
              )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,4)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_4),3,4);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec =  if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
              vec_cmpeq(pb_vec,smallest_vec),
              b_vec,
              c_vec
              )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,4)));

      vec_st(rp_vec,0,rp);

      rp += 16;
      pp += 16;
      istop -= 16;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }
}

void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
  const png_byte bpp = 3;

  int a, b, c, pa, pb, pc, p;
  vector unsigned char rp_vec;
  vector unsigned char pp_vec;
  vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
  vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;

  vsx_declare_common_vars(row_info,row,prev_row,bpp)
  rp -= bpp;
  if(istop >= bpp)
     istop -= bpp;

  /* Process the first pixel in the row completely (this is the same as 'up'
   * because there is only one candidate predictor for the first pixel).
   */
  for(i = 0; i < bpp ; i++)
  {
     *rp = (png_byte)( *rp + *pp);
     rp++;
     pp++;
  }

  for(i = 0; i < unaligned_top ; i++)
  {
     vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
  }

  while( istop >= 16)
  {
     for(i = 0; i < bpp ; i++)
     {
        vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
     }

     rp -= bpp;
     pp -= bpp;
     rp_vec = vec_ld(0,rp);
     vec_ld_unaligned(pp_vec,pp);

     a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3);
     b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_3),1,3);
     c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3);
     pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
     pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
     pc_vec = vec_add(pa_vec,pb_vec);
     pa_vec = vec_abs(pa_vec);
     pb_vec = vec_abs(pb_vec);
     pc_vec = vec_abs(pc_vec);
     smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
     nearest_vec =  if_then_else(
           vec_cmpeq(pa_vec,smallest_vec),
           a_vec,
           if_then_else(
             vec_cmpeq(pb_vec,smallest_vec),
             b_vec,
             c_vec
             )
           );
     rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,3)));

     a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3);
     b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_3),2,3);
     c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3);
     pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
     pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
     pc_vec = vec_add(pa_vec,pb_vec);
     pa_vec = vec_abs(pa_vec);
     pb_vec = vec_abs(pb_vec);
     pc_vec = vec_abs(pc_vec);
     smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
     nearest_vec =  if_then_else(
           vec_cmpeq(pa_vec,smallest_vec),
           a_vec,
           if_then_else(
             vec_cmpeq(pb_vec,smallest_vec),
             b_vec,
             c_vec
             )
           );
     rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,3)));

     a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3);
     b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_3),3,3);
     c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3);
     pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
     pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
     pc_vec = vec_add(pa_vec,pb_vec);
     pa_vec = vec_abs(pa_vec);
     pb_vec = vec_abs(pb_vec);
     pc_vec = vec_abs(pc_vec);
     smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
     nearest_vec =  if_then_else(
           vec_cmpeq(pa_vec,smallest_vec),
           a_vec,
           if_then_else(
             vec_cmpeq(pb_vec,smallest_vec),
             b_vec,
             c_vec
             )
           );
     rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,3)));

     a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3);
     b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED4_3),4,3);
     c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3);
     pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
     pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
     pc_vec = vec_add(pa_vec,pb_vec);
     pa_vec = vec_abs(pa_vec);
     pb_vec = vec_abs(pb_vec);
     pc_vec = vec_abs(pc_vec);
     smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
     nearest_vec =  if_then_else(
           vec_cmpeq(pa_vec,smallest_vec),
           a_vec,
           if_then_else(
             vec_cmpeq(pb_vec,smallest_vec),
             b_vec,
             c_vec
             )
           );
     rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,4,3)));

     vec_st(rp_vec,0,rp);

     rp += 15;
     pp += 15;
     istop -= 16;

     /* Since 16 % bpp = 16 % 3 = 1, the last byte of this block must
      * be processed manually.
      */
     vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
  }

  if(istop > 0)
     for (i = 0; i < istop % 16; i++)
     {
        vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
     }
}

#endif /* PNG_POWERPC_VSX_OPT > 0 */
#endif /* PNG_POWERPC_VSX_IMPLEMENTATION == 1 (intrinsics) */
#endif /* READ */