"Fossies" - the Fresh Open Source Software Archive

Member "darktable-3.6.1/src/iop/amaze_demosaic_RT.cc" (10 Sep 2021, 104063 Bytes) of package /linux/misc/darktable-3.6.1.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "amaze_demosaic_RT.cc" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 3.4.1.1_vs_3.6.0.

    1 /*
    2     This file is part of darktable,
    3     Copyright (C) 2011-2020 darktable developers.
    4 
    5     darktable is free software: you can redistribute it and/or modify
    6     it under the terms of the GNU General Public License as published by
    7     the Free Software Foundation, either version 3 of the License, or
    8     (at your option) any later version.
    9 
   10     darktable is distributed in the hope that it will be useful,
   11     but WITHOUT ANY WARRANTY; without even the implied warranty of
   12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   13     GNU General Public License for more details.
   14 
   15     You should have received a copy of the GNU General Public License
   16     along with darktable.  If not, see <http://www.gnu.org/licenses/>.
   17 */
   18 
   19 #define __STDC_FORMAT_MACROS
   20 
   21 #if defined(__SSE__)
   22 #include <xmmintrin.h>
   23 #endif
   24 
   25 extern "C" {
   26 #include "develop/imageop.h"
   27 #include "develop/imageop_math.h"
   28 
   29 // otherwise the name will be mangled and the linker won't be able to see the function ...
   30 void amaze_demosaic_RT(
   31     dt_dev_pixelpipe_iop_t *piece,
   32     const float *const in,
   33     float *out,
   34     const dt_iop_roi_t *const roi_in,
   35     const dt_iop_roi_t *const roi_out,
   36     const int filters);
   37 }
   38 
   39 #include <algorithm>
   40 #include <cmath>
   41 #include <cstdint>
   42 #include <cstdlib>
   43 #include <cstring>
   44 
   45 static __inline float clampnan(const float x, const float m, const float M)
   46 {
   47   float r;
   48 
   49   // clamp to [m, M] if x is infinite; return average of m and M if x is NaN; else just return x
   50 
   51   if(std::isinf(x))
   52     r = (std::isless(x, m) ? m : (std::isgreater(x, M) ? M : x));
   53   else if(std::isnan(x))
   54     r = (m + M) / 2.0f;
   55   else // normal number
   56     r = x;
   57 
   58   return r;
   59 }
   60 
   61 #ifndef __SSE2__
   62 static __inline float xmul2f(float d)
   63 {
   64   union {
   65       float f;
   66       uint32_t u;
   67   } x;
   68   x.f = d;
   69   if(x.u & 0x7FFFFFFF) // if f==0 do nothing
   70   {
   71     x.u += 1 << 23; // add 1 to the exponent
   72   }
   73   return x.f;
   74 }
   75 #endif
   76 
   77 static __inline float xdiv2f(float d)
   78 {
   79   union {
   80       float f;
   81       uint32_t u;
   82   } x;
   83   x.f = d;
   84   if(x.u & 0x7FFFFFFF) // if f==0 do nothing
   85   {
   86     x.u -= 1 << 23; // sub 1 from the exponent
   87   }
   88   return x.f;
   89 }
   90 
   91 static __inline float xdivf(float d, int n)
   92 {
   93   union {
   94       float f;
   95       uint32_t u;
   96   } x;
   97   x.f = d;
   98   if(x.u & 0x7FFFFFFF) // if f==0 do nothing
   99   {
  100     x.u -= n << 23; // add n to the exponent
  101   }
  102   return x.f;
  103 }
  104 
  105 
  106 /*==================================================================================
  107  * begin raw therapee code, hg checkout of march 03, 2016 branch master.
  108  *==================================================================================*/
  109 
  110 
  111 #ifdef __SSE2__
  112 
  113 #ifdef __GNUC__
  114 #define INLINE __inline
  115 #else
  116 #define INLINE inline
  117 #endif
  118 
  119 #ifdef __GNUC__
  120 #if((__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4) && (!defined(WIN32) || defined(__x86_64__))
  121 #define LVF(x) _mm_load_ps(&x)
  122 #define LVFU(x) _mm_loadu_ps(&x)
  123 #define STVF(x, y) _mm_store_ps(&x, y)
  124 #define STVFU(x, y) _mm_storeu_ps(&x, y)
  125 #else // there is a bug in gcc 4.7.x when using openmp and aligned memory and -O3, also need to map the
  126       // aligned functions to unaligned functions for WIN32 builds
  127 #define LVF(x) _mm_loadu_ps(&x)
  128 #define LVFU(x) _mm_loadu_ps(&x)
  129 #define STVF(x, y) _mm_storeu_ps(&x, y)
  130 #define STVFU(x, y) _mm_storeu_ps(&x, y)
  131 #endif
  132 #else
  133 #define LVF(x) _mm_load_ps(&x)
  134 #define LVFU(x) _mm_loadu_ps(&x)
  135 #define STVF(x, y) _mm_store_ps(&x, y)
  136 #define STVFU(x, y) _mm_storeu_ps(&x, y)
  137 #endif
  138 
  139 #define STC2VFU(a, v)                                                                                        \
  140   {                                                                                                          \
  141     __m128 TST1V = _mm_loadu_ps(&a);                                                                         \
  142     __m128 TST2V = _mm_unpacklo_ps(v, v);                                                                    \
  143     vmask cmask = _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0);                                               \
  144     _mm_storeu_ps(&a, vself(cmask, TST1V, TST2V));                                                           \
  145     TST1V = _mm_loadu_ps((&a) + 4);                                                                          \
  146     TST2V = _mm_unpackhi_ps(v, v);                                                                           \
  147     _mm_storeu_ps((&a) + 4, vself(cmask, TST1V, TST2V));                                                     \
  148   }
  149 
  150 #define ZEROV _mm_setzero_ps()
  151 #define F2V(a) _mm_set1_ps((a))
  152 
  153 typedef __m128i vmask;
  154 typedef __m128 vfloat;
  155 typedef __m128i vint;
  156 
  157 static INLINE vfloat LC2VFU(float &a)
  158 {
  159   // Load 8 floats from a and combine a[0],a[2],a[4] and a[6] into a vector of 4 floats
  160   vfloat a1 = _mm_loadu_ps(&a);
  161   vfloat a2 = _mm_loadu_ps((&a) + 4);
  162   return _mm_shuffle_ps(a1, a2, _MM_SHUFFLE(2, 0, 2, 0));
  163 }
  164 static INLINE vfloat vmaxf(vfloat x, vfloat y)
  165 {
  166   return _mm_max_ps(x, y);
  167 }
  168 static INLINE vfloat vminf(vfloat x, vfloat y)
  169 {
  170   return _mm_min_ps(x, y);
  171 }
  172 static INLINE vfloat vcast_vf_f(float f)
  173 {
  174   return _mm_set_ps(f, f, f, f);
  175 }
  176 static INLINE vmask vorm(vmask x, vmask y)
  177 {
  178   return _mm_or_si128(x, y);
  179 }
  180 static INLINE vmask vandm(vmask x, vmask y)
  181 {
  182   return _mm_and_si128(x, y);
  183 }
  184 static INLINE vmask vandnotm(vmask x, vmask y)
  185 {
  186   return _mm_andnot_si128(x, y);
  187 }
  188 static INLINE vfloat vabsf(vfloat f)
  189 {
  190   return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f);
  191 }
  192 static INLINE vfloat vself(vmask mask, vfloat x, vfloat y)
  193 {
  194   return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
  195 }
  196 static INLINE vmask vmaskf_lt(vfloat x, vfloat y)
  197 {
  198   return (__m128i)_mm_cmplt_ps(x, y);
  199 }
  200 static INLINE vmask vmaskf_gt(vfloat x, vfloat y)
  201 {
  202   return (__m128i)_mm_cmpgt_ps(x, y);
  203 }
  204 static INLINE vfloat ULIMV(vfloat a, vfloat b, vfloat c)
  205 {
  206   // made to clamp a in range [b,c] but in fact it's also the median of a,b,c, which means that the result is
  207   // independent on order of arguments
  208   // ULIMV(a,b,c) = ULIMV(a,c,b) = ULIMV(b,a,c) = ULIMV(b,c,a) = ULIMV(c,a,b) = ULIMV(c,b,a)
  209   return vmaxf(vminf(a, b), vminf(vmaxf(a, b), c));
  210 }
  211 static INLINE vfloat SQRV(vfloat a)
  212 {
  213   return a * a;
  214 }
  215 static INLINE vfloat vintpf(vfloat a, vfloat b, vfloat c)
  216 {
  217   // calculate a * b + (1 - a) * c (interpolate two values)
  218   // following is valid:
  219   // vintpf(a, b+x, c+x) = vintpf(a, b, c) + x
  220   // vintpf(a, b*x, c*x) = vintpf(a, b, c) * x
  221   return a * (b - c) + c;
  222 }
  223 static INLINE vfloat vaddc2vfu(float &a)
  224 {
  225   // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
  226   vfloat a1 = _mm_loadu_ps(&a);
  227   vfloat a2 = _mm_loadu_ps((&a) + 4);
  228   return _mm_shuffle_ps(a1, a2, _MM_SHUFFLE(2, 0, 2, 0)) + _mm_shuffle_ps(a1, a2, _MM_SHUFFLE(3, 1, 3, 1));
  229 }
  230 static INLINE vfloat vadivapb(vfloat a, vfloat b)
  231 {
  232   return a / (a + b);
  233 }
  234 static INLINE vint vselc(vmask mask, vint x, vint y)
  235 {
  236   return vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
  237 }
  238 static INLINE vint vselinotzero(vmask mask, vint x)
  239 {
  240   // returns value of x if corresponding mask bits are 0, else returns 0
  241   // faster than vselc(mask, ZEROV, x)
  242   return _mm_andnot_si128(mask, x);
  243 }
  244 static INLINE vfloat vmul2f(vfloat a)
  245 {
  246   // fastest way to multiply by 2
  247   return a + a;
  248 }
  249 static INLINE vmask vmaskf_ge(vfloat x, vfloat y)
  250 {
  251   return (__m128i)_mm_cmpge_ps(x, y);
  252 }
  253 static INLINE vmask vnotm(vmask x)
  254 {
  255   return _mm_xor_si128(x, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
  256 }
  257 static INLINE vfloat vdup(vfloat a)
  258 {
  259   // returns { a[0],a[0],a[1],a[1] }
  260   return _mm_unpacklo_ps(a, a);
  261 }
  262 
  263 #endif // __SSE2__
  264 
  265 template <typename _Tp> static inline const _Tp SQR(_Tp x)
  266 {
  267   //      return std::pow(x,2); Slower than:
  268   return (x * x);
  269 }
  270 
  271 template <typename _Tp> static inline const _Tp intp(const _Tp a, const _Tp b, const _Tp c)
  272 {
  273   // calculate a * b + (1 - a) * c
  274   // following is valid:
  275   // intp(a, b+x, c+x) = intp(a, b, c) + x
  276   // intp(a, b*x, c*x) = intp(a, b, c) * x
  277   return a * (b - c) + c;
  278 }
  279 
  280 template <typename _Tp> static inline const _Tp LIM(const _Tp a, const _Tp b, const _Tp c)
  281 {
  282   return std::max(b, std::min(a, c));
  283 }
  284 
  285 template <typename _Tp> static inline const _Tp ULIM(const _Tp a, const _Tp b, const _Tp c)
  286 {
  287   return ((b < c) ? LIM(a, b, c) : LIM(a, c, b));
  288 }
  289 
  290 
  291 
  292 ////////////////////////////////////////////////////////////////
  293 //
  294 //          AMaZE demosaic algorithm
  295 // (Aliasing Minimization and Zipper Elimination)
  296 //
  297 //  copyright (c) 2008-2010  Emil Martinec <ejmartin@uchicago.edu>
  298 //  optimized for speed by Ingo Weyrich
  299 //
  300 // incorporating ideas of Luis Sanz Rodrigues and Paul Lee
  301 //
  302 // code dated: May 27, 2010
  303 // latest modification: Ingo Weyrich, January 25, 2016
  304 //
  305 //  amaze_interpolate_RT.cc is free software: you can redistribute it and/or modify
  306 //  it under the terms of the GNU General Public License as published by
  307 //  the Free Software Foundation, either version 3 of the License, or
  308 //  (at your option) any later version.
  309 //
  310 //  This program is distributed in the hope that it will be useful,
  311 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
  312 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  313 //  GNU General Public License for more details.
  314 //
  315 //  You should have received a copy of the GNU General Public License
  316 //  along with this program.  If not, see <http://www.gnu.org/licenses/>.
  317 //
  318 ////////////////////////////////////////////////////////////////
  319 
  320 
  321 void amaze_demosaic_RT(dt_dev_pixelpipe_iop_t *piece, const float *const in,
  322                        float *out, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
  323                        const int filters)
  324 {
  325   int winx = roi_out->x;
  326   int winy = roi_out->y;
  327   int winw = roi_in->width;
  328   int winh = roi_in->height;
  329 
  330   const int width = winw, height = winh;
  331   const float clip_pt = fminf(piece->pipe->dsc.processed_maximum[0],
  332                               fminf(piece->pipe->dsc.processed_maximum[1], piece->pipe->dsc.processed_maximum[2]));
  333   const float clip_pt8 = 0.8f * clip_pt;
  334 
  335 // this allows to pass AMAZETS to the code. On some machines larger AMAZETS is faster
  336 // If AMAZETS is undefined it will be set to 160, which is the fastest on modern x86/64 machines
  337 #ifndef AMAZETS
  338 #define AMAZETS 160
  339 #endif
  340   // Tile size; the image is processed in square tiles to lower memory requirements and facilitate
  341   // multi-threading
  342   // We assure that Tile size is a multiple of 32 in the range [96;992]
  343   constexpr int ts = (AMAZETS & 992) < 96 ? 96 : (AMAZETS & 992);
  344   constexpr int tsh = ts / 2; // half of Tile size
  345 
  346   // offset of R pixel within a Bayer quartet
  347   int ex, ey;
  348 
  349   // determine GRBG coset; (ey,ex) is the offset of the R subarray
  350   if(FC(0, 0, filters) == 1)
  351   { // first pixel is G
  352     if(FC(0, 1, filters) == 0)
  353     {
  354       ey = 0;
  355       ex = 1;
  356     }
  357     else
  358     {
  359       ey = 1;
  360       ex = 0;
  361     }
  362   }
  363   else
  364   { // first pixel is R or B
  365     if(FC(0, 0, filters) == 0)
  366     {
  367       ey = 0;
  368       ex = 0;
  369     }
  370     else
  371     {
  372       ey = 1;
  373       ex = 1;
  374     }
  375   }
  376 
  377   // shifts of pointer value to access pixels in vertical and diagonal directions
  378   constexpr int v1 = ts, v2 = 2 * ts, v3 = 3 * ts, p1 = -ts + 1, p2 = -2 * ts + 2, p3 = -3 * ts + 3,
  379                 m1 = ts + 1, m2 = 2 * ts + 2, m3 = 3 * ts + 3;
  380 
  381   // tolerance to avoid dividing by zero
  382   constexpr float eps = 1e-5, epssq = 1e-10; // tolerance to avoid dividing by zero
  383 
  384   // adaptive ratios threshold
  385   constexpr float arthresh = 0.75;
  386 
  387   // gaussian on 5x5 quincunx, sigma=1.2
  388   constexpr float gaussodd[4]
  389       = { 0.14659727707323927f, 0.103592713382435f, 0.0732036125103057f, 0.0365543548389495f };
  390   // nyquist texture test threshold
  391   constexpr float nyqthresh = 0.5;
  392   // gaussian on 5x5, sigma=1.2, multiplied with nyqthresh to save some time later in loop
  393   // Is this really sigma=1.2????, seems more like sigma = 1.672
  394   constexpr float gaussgrad[6] = { nyqthresh * 0.07384411893421103f, nyqthresh * 0.06207511968171489f,
  395                                    nyqthresh * 0.0521818194747806f,  nyqthresh * 0.03687419286733595f,
  396                                    nyqthresh * 0.03099732204057846f, nyqthresh * 0.018413194161458882f };
  397   // gaussian on 5x5 alt quincunx, sigma=1.5
  398   constexpr float gausseven[2] = { 0.13719494435797422f, 0.05640252782101291f };
  399   // gaussian on quincunx grid
  400   constexpr float gquinc[4] = { 0.169917f, 0.108947f, 0.069855f, 0.0287182f };
  401 
  402   typedef struct
  403   {
  404     float h;
  405     float v;
  406   } s_hv;
  407 
  408 #ifdef _OPENMP
  409 #pragma omp parallel
  410 #endif
  411   {
  412     //     int progresscounter = 0;
  413 
  414     constexpr int cldf = 2; // factor to multiply cache line distance. 1 = 64 bytes, 2 = 128 bytes ...
  415     // assign working space
  416     char *buffer
  417         = (char *)calloc(sizeof(float) * 14 * ts * ts + sizeof(char) * ts * tsh + 18 * cldf * 64 + 63, 1);
  418     // aligned to 64 byte boundary
  419     char *data = (char *)((uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
  420 
  421     // green values
  422     float *rgbgreen = (float(*))data;
  423     // sum of square of horizontal gradient and square of vertical gradient
  424     float *delhvsqsum = (float(*))((char *)rgbgreen + sizeof(float) * ts * ts + cldf * 64); // 1
  425     // gradient based directional weights for interpolation
  426     float *dirwts0 = (float(*))((char *)delhvsqsum + sizeof(float) * ts * ts + cldf * 64); // 1
  427     float *dirwts1 = (float(*))((char *)dirwts0 + sizeof(float) * ts * ts + cldf * 64);    // 1
  428     // vertically interpolated colour differences G-R, G-B
  429     float *vcd = (float(*))((char *)dirwts1 + sizeof(float) * ts * ts + cldf * 64); // 1
  430     // horizontally interpolated colour differences
  431     float *hcd = (float(*))((char *)vcd + sizeof(float) * ts * ts + cldf * 64); // 1
  432     // alternative vertical interpolation
  433     float *vcdalt = (float(*))((char *)hcd + sizeof(float) * ts * ts + cldf * 64); // 1
  434     // alternative horizontal interpolation
  435     float *hcdalt = (float(*))((char *)vcdalt + sizeof(float) * ts * ts + cldf * 64); // 1
  436     // square of average colour difference
  437     float *cddiffsq = (float(*))((char *)hcdalt + sizeof(float) * ts * ts + cldf * 64); // 1
  438     // weight to give horizontal vs vertical interpolation
  439     float *hvwt = (float(*))((char *)cddiffsq + sizeof(float) * ts * ts + 2 * cldf * 64); // 1
  440     // final interpolated colour difference
  441     float(*Dgrb)[ts * tsh] = (float(*)[ts * tsh])vcdalt; // there is no overlap in buffer usage => share
  442     // gradient in plus (NE/SW) direction
  443     float *delp = (float(*))cddiffsq; // there is no overlap in buffer usage => share
  444     // gradient in minus (NW/SE) direction
  445     float *delm = (float(*))((char *)delp + sizeof(float) * ts * tsh + cldf * 64);
  446     // diagonal interpolation of R+B
  447     float *rbint = (float(*))delm; // there is no overlap in buffer usage => share
  448     // horizontal and vertical curvature of interpolated G (used to refine interpolation in Nyquist texture
  449     // regions)
  450     s_hv *Dgrb2 = (s_hv(*))((char *)hvwt + sizeof(float) * ts * tsh + cldf * 64); // 1
  451     // difference between up/down interpolations of G
  452     float *dgintv = (float(*))Dgrb2; // there is no overlap in buffer usage => share
  453     // difference between left/right interpolations of G
  454     float *dginth = (float(*))((char *)dgintv + sizeof(float) * ts * ts + cldf * 64); // 1
  455     // square of diagonal colour differences
  456     float *Dgrbsq1m = (float(*))((char *)dginth + sizeof(float) * ts * ts + cldf * 64);    // 1
  457     float *Dgrbsq1p = (float(*))((char *)Dgrbsq1m + sizeof(float) * ts * tsh + cldf * 64); // 1
  458     // tile raw data
  459     float *cfa = (float(*))((char *)Dgrbsq1p + sizeof(float) * ts * tsh + cldf * 64); // 1
  460     // relative weight for combining plus and minus diagonal interpolations
  461     float *pmwt = (float(*))delhvsqsum; // there is no overlap in buffer usage => share
  462     // interpolated colour difference R-B in minus and plus direction
  463     float *rbm = (float(*))vcd; // there is no overlap in buffer usage => share
  464     float *rbp = (float(*))((char *)rbm + sizeof(float) * ts * tsh + cldf * 64);
  465     // nyquist texture flags 1=nyquist, 0=not nyquist
  466     unsigned char *nyquist = (unsigned char(*))((char *)cfa + sizeof(float) * ts * ts + cldf * 64); // 1
  467     unsigned char *nyquist2 = (unsigned char(*))cddiffsq;
  468     float *nyqutest = (float(*))((char *)nyquist + sizeof(unsigned char) * ts * tsh + cldf * 64); // 1
  469 
  470 // Main algorithm: Tile loop
  471 // use collapse(2) to collapse the 2 loops to one large loop, so there is better scaling
  472 #ifdef _OPENMP
  473 #pragma omp for SIMD() schedule(static) collapse(2) nowait
  474 #endif
  475 
  476     for(int top = winy - 16; top < winy + height; top += ts - 32)
  477     {
  478       for(int left = winx - 16; left < winx + width; left += ts - 32)
  479       {
  480         memset(&nyquist[3 * tsh], 0, sizeof(unsigned char) * (ts - 6) * tsh);
  481         // location of tile bottom edge
  482         int bottom = MIN(top + ts, winy + height + 16);
  483         // location of tile right edge
  484         int right = MIN(left + ts, winx + width + 16);
  485         // tile width  (=ts except for right edge of image)
  486         int rr1 = bottom - top;
  487         // tile height (=ts except for bottom edge of image)
  488         int cc1 = right - left;
  489         // bookkeeping for borders
  490         // min and max row/column in the tile
  491         int rrmin = top < winy ? 16 : 0;
  492         int ccmin = left < winx ? 16 : 0;
  493         int rrmax = bottom > (winy + height) ? winy + height - top : rr1;
  494         int ccmax = right > (winx + width) ? winx + width - left : cc1;
  495 
  496 // rgb from input CFA data
  497 // rgb values should be floating point number between 0 and 1
  498 // after white balance multipliers are applied
  499 // a 16 pixel border is added to each side of the image
  500 
  501 // begin of tile initialization
  502 #ifdef __SSE2__
  503         // fill upper border
  504         if(rrmin > 0)
  505         {
  506           for(int rr = 0; rr < 16; rr++)
  507           {
  508             int row = 32 - rr + top;
  509 
  510             for(int cc = ccmin; cc < ccmax; cc += 4)
  511             {
  512               int indx1 = rr * ts + cc;
  513               vfloat tempv = LVFU(in[row * width + (cc + left)]);
  514               STVF(cfa[indx1], tempv);
  515               STVF(rgbgreen[indx1], tempv);
  516             }
  517           }
  518         }
  519 
  520         // fill inner part
  521         for(int rr = rrmin; rr < rrmax; rr++)
  522         {
  523           int row = rr + top;
  524 
  525           for(int cc = ccmin; cc < ccmax; cc += 4)
  526           {
  527             int indx1 = rr * ts + cc;
  528             vfloat tempv = LVFU(in[row * width + (cc + left)]);
  529             STVF(cfa[indx1], tempv);
  530             STVF(rgbgreen[indx1], tempv);
  531           }
  532         }
  533 
  534         // fill lower border
  535         if(rrmax < rr1)
  536         {
  537           for(int rr = 0; rr < 16; rr++)
  538             for(int cc = ccmin; cc < ccmax; cc += 4)
  539             {
  540               int indx1 = (rrmax + rr) * ts + cc;
  541               vfloat tempv = LVFU(in[(winy + height - rr - 2) * width + (left + cc)]);
  542               STVF(cfa[indx1], tempv);
  543               STVF(rgbgreen[indx1], tempv);
  544             }
  545         }
  546 
  547 #else
  548 
  549         // fill upper border
  550         if(rrmin > 0)
  551         {
  552           for(int rr = 0; rr < 16; rr++)
  553             for(int cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++)
  554             {
  555               cfa[rr * ts + cc] = (in[row * width + (cc + left)]);
  556               rgbgreen[rr * ts + cc] = cfa[rr * ts + cc];
  557             }
  558         }
  559 
  560         // fill inner part
  561         for(int rr = rrmin; rr < rrmax; rr++)
  562         {
  563           int row = rr + top;
  564 
  565           for(int cc = ccmin; cc < ccmax; cc++)
  566           {
  567             int indx1 = rr * ts + cc;
  568             cfa[indx1] = (in[row * width + (cc + left)]);
  569             rgbgreen[indx1] = cfa[indx1];
  570           }
  571         }
  572 
  573         // fill lower border
  574         if(rrmax < rr1)
  575         {
  576           for(int rr = 0; rr < 16; rr++)
  577             for(int cc = ccmin; cc < ccmax; cc++)
  578             {
  579               cfa[(rrmax + rr) * ts + cc] = (in[(winy + height - rr - 2) * width + (left + cc)]);
  580               rgbgreen[(rrmax + rr) * ts + cc] = cfa[(rrmax + rr) * ts + cc];
  581             }
  582         }
  583 
  584 #endif
  585 
  586         // fill left border
  587         if(ccmin > 0)
  588         {
  589           for(int rr = rrmin; rr < rrmax; rr++)
  590             for(int cc = 0, row = rr + top; cc < 16; cc++)
  591             {
  592               cfa[rr * ts + cc] = (in[row * width + (32 - cc + left)]);
  593               rgbgreen[rr * ts + cc] = cfa[rr * ts + cc];
  594             }
  595         }
  596 
  597         // fill right border
  598         if(ccmax < cc1)
  599         {
  600           for(int rr = rrmin; rr < rrmax; rr++)
  601             for(int cc = 0; cc < 16; cc++)
  602             {
  603               cfa[rr * ts + ccmax + cc] = (in[(top + rr) * width + ((winx + width - cc - 2))]);
  604               rgbgreen[rr * ts + ccmax + cc] = cfa[rr * ts + ccmax + cc];
  605             }
  606         }
  607 
  608         // also, fill the image corners
  609         if(rrmin > 0 && ccmin > 0)
  610         {
  611           for(int rr = 0; rr < 16; rr++)
  612             for(int cc = 0; cc < 16; cc++)
  613             {
  614               cfa[(rr)*ts + cc] = (in[(winy + 32 - rr) * width + (winx + 32 - cc)]);
  615               rgbgreen[(rr)*ts + cc] = cfa[(rr)*ts + cc];
  616             }
  617         }
  618 
  619         if(rrmax < rr1 && ccmax < cc1)
  620         {
  621           for(int rr = 0; rr < 16; rr++)
  622             for(int cc = 0; cc < 16; cc++)
  623             {
  624               cfa[(rrmax + rr) * ts + ccmax + cc]
  625                   = (in[(winy + height - rr - 2) * width + ((winx + width - cc - 2))]);
  626               rgbgreen[(rrmax + rr) * ts + ccmax + cc] = cfa[(rrmax + rr) * ts + ccmax + cc];
  627             }
  628         }
  629 
  630         if(rrmin > 0 && ccmax < cc1)
  631         {
  632           for(int rr = 0; rr < 16; rr++)
  633             for(int cc = 0; cc < 16; cc++)
  634             {
  635               cfa[(rr)*ts + ccmax + cc] = (in[(winy + 32 - rr) * width + ((winx + width - cc - 2))]);
  636               rgbgreen[(rr)*ts + ccmax + cc] = cfa[(rr)*ts + ccmax + cc];
  637             }
  638         }
  639 
  640         if(rrmax < rr1 && ccmin > 0)
  641         {
  642           for(int rr = 0; rr < 16; rr++)
  643             for(int cc = 0; cc < 16; cc++)
  644             {
  645               cfa[(rrmax + rr) * ts + cc] = (in[(winy + height - rr - 2) * width + ((winx + 32 - cc))]);
  646               rgbgreen[(rrmax + rr) * ts + cc] = cfa[(rrmax + rr) * ts + cc];
  647             }
  648         }
  649 
  650 // end of tile initialization
  651 
  652 // horizontal and vertical gradients
  653 #ifdef __SSE2__
  654         vfloat epsv = F2V(eps);
  655 
  656         for(int rr = 2; rr < rr1 - 2; rr++)
  657         {
  658           for(int indx = rr * ts; indx < rr * ts + cc1; indx += 4)
  659           {
  660             vfloat delhv = vabsf(LVFU(cfa[indx + 1]) - LVFU(cfa[indx - 1]));
  661             vfloat delvv = vabsf(LVF(cfa[indx + v1]) - LVF(cfa[indx - v1]));
  662             STVF(dirwts1[indx], epsv + vabsf(LVFU(cfa[indx + 2]) - LVF(cfa[indx]))
  663                                     + vabsf(LVF(cfa[indx]) - LVFU(cfa[indx - 2])) + delhv);
  664             STVF(dirwts0[indx], epsv + vabsf(LVF(cfa[indx + v2]) - LVF(cfa[indx]))
  665                                     + vabsf(LVF(cfa[indx]) - LVF(cfa[indx - v2])) + delvv);
  666             STVF(delhvsqsum[indx], SQRV(delhv) + SQRV(delvv));
  667           }
  668         }
  669 
  670 #else
  671 
  672         for(int rr = 2; rr < rr1 - 2; rr++)
  673           for(int cc = 2, indx = (rr)*ts + cc; cc < cc1 - 2; cc++, indx++)
  674           {
  675             float delh = fabsf(cfa[indx + 1] - cfa[indx - 1]);
  676             float delv = fabsf(cfa[indx + v1] - cfa[indx - v1]);
  677             dirwts0[indx]
  678                 = eps + fabsf(cfa[indx + v2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - v2]) + delv;
  679             dirwts1[indx] = eps + fabsf(cfa[indx + 2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - 2]) + delh;
  680             delhvsqsum[indx] = SQR(delh) + SQR(delv);
  681           }
  682 
  683 #endif
  684 
  685 // interpolate vertical and horizontal colour differences
  686 #ifdef __SSE2__
  687         vfloat sgnv;
  688 
  689         if(!(FC(4, 4, filters) & 1))
  690         {
  691           sgnv = _mm_set_ps(1.f, -1.f, 1.f, -1.f);
  692         }
  693         else
  694         {
  695           sgnv = _mm_set_ps(-1.f, 1.f, -1.f, 1.f);
  696         }
  697 
  698         vfloat zd5v = F2V(0.5f);
  699         vfloat onev = F2V(1.f);
  700         vfloat arthreshv = F2V(arthresh);
  701         vfloat clip_pt8v = F2V(clip_pt8);
  702 
  703         for(int rr = 4; rr < rr1 - 4; rr++)
  704         {
  705           sgnv = -sgnv;
  706 
  707           for(int indx = rr * ts + 4; indx < rr * ts + cc1 - 7; indx += 4)
  708           {
  709             // colour ratios in each cardinal direction
  710             vfloat cfav = LVF(cfa[indx]);
  711             vfloat cruv = LVF(cfa[indx - v1]) * (LVF(dirwts0[indx - v2]) + LVF(dirwts0[indx]))
  712                           / (LVF(dirwts0[indx - v2]) * (epsv + cfav)
  713                              + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx - v2])));
  714             vfloat crdv = LVF(cfa[indx + v1]) * (LVF(dirwts0[indx + v2]) + LVF(dirwts0[indx]))
  715                           / (LVF(dirwts0[indx + v2]) * (epsv + cfav)
  716                              + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx + v2])));
  717             vfloat crlv = LVFU(cfa[indx - 1]) * (LVFU(dirwts1[indx - 2]) + LVF(dirwts1[indx]))
  718                           / (LVFU(dirwts1[indx - 2]) * (epsv + cfav)
  719                              + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx - 2])));
  720             vfloat crrv = LVFU(cfa[indx + 1]) * (LVFU(dirwts1[indx + 2]) + LVF(dirwts1[indx]))
  721                           / (LVFU(dirwts1[indx + 2]) * (epsv + cfav)
  722                              + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx + 2])));
  723 
  724             // G interpolated in vert/hor directions using Hamilton-Adams method
  725             vfloat guhav = LVF(cfa[indx - v1]) + zd5v * (cfav - LVF(cfa[indx - v2]));
  726             vfloat gdhav = LVF(cfa[indx + v1]) + zd5v * (cfav - LVF(cfa[indx + v2]));
  727             vfloat glhav = LVFU(cfa[indx - 1]) + zd5v * (cfav - LVFU(cfa[indx - 2]));
  728             vfloat grhav = LVFU(cfa[indx + 1]) + zd5v * (cfav - LVFU(cfa[indx + 2]));
  729 
  730             // G interpolated in vert/hor directions using adaptive ratios
  731             vfloat guarv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), cfav * cruv, guhav);
  732             vfloat gdarv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), cfav * crdv, gdhav);
  733             vfloat glarv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), cfav * crlv, glhav);
  734             vfloat grarv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), cfav * crrv, grhav);
  735 
  736             // adaptive weights for vertical/horizontal directions
  737             vfloat hwtv = LVFU(dirwts1[indx - 1]) / (LVFU(dirwts1[indx - 1]) + LVFU(dirwts1[indx + 1]));
  738             vfloat vwtv = LVF(dirwts0[indx - v1]) / (LVF(dirwts0[indx + v1]) + LVF(dirwts0[indx - v1]));
  739 
  740             // interpolated G via adaptive weights of cardinal evaluations
  741             vfloat Ginthhav = vintpf(hwtv, grhav, glhav);
  742             vfloat Gintvhav = vintpf(vwtv, gdhav, guhav);
  743 
  744             // interpolated colour differences
  745             vfloat hcdaltv = sgnv * (Ginthhav - cfav);
  746             vfloat vcdaltv = sgnv * (Gintvhav - cfav);
  747             STVF(hcdalt[indx], hcdaltv);
  748             STVF(vcdalt[indx], vcdaltv);
  749 
  750             vmask clipmask = vorm(vorm(vmaskf_gt(cfav, clip_pt8v), vmaskf_gt(Gintvhav, clip_pt8v)),
  751                                   vmaskf_gt(Ginthhav, clip_pt8v));
  752             guarv = vself(clipmask, guhav, guarv);
  753             gdarv = vself(clipmask, gdhav, gdarv);
  754             glarv = vself(clipmask, glhav, glarv);
  755             grarv = vself(clipmask, grhav, grarv);
  756 
  757             // use HA if highlights are (nearly) clipped
  758             STVF(vcd[indx], vself(clipmask, vcdaltv, sgnv * (vintpf(vwtv, gdarv, guarv) - cfav)));
  759             STVF(hcd[indx], vself(clipmask, hcdaltv, sgnv * (vintpf(hwtv, grarv, glarv) - cfav)));
  760 
  761             // differences of interpolations in opposite directions
  762             STVF(dgintv[indx], vminf(SQRV(guhav - gdhav), SQRV(guarv - gdarv)));
  763             STVF(dginth[indx], vminf(SQRV(glhav - grhav), SQRV(glarv - grarv)));
  764           }
  765         }
  766 
  767 #else
  768 
  769         for(int rr = 4; rr < rr1 - 4; rr++)
  770         {
  771           bool fcswitch = FC(rr, 4, filters) & 1;
  772 
  773           for(int cc = 4, indx = rr * ts + cc; cc < cc1 - 4; cc++, indx++)
  774           {
  775 
  776             // colour ratios in each cardinal direction
  777             float cru = cfa[indx - v1] * (dirwts0[indx - v2] + dirwts0[indx])
  778                         / (dirwts0[indx - v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx - v2]));
  779             float crd = cfa[indx + v1] * (dirwts0[indx + v2] + dirwts0[indx])
  780                         / (dirwts0[indx + v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx + v2]));
  781             float crl = cfa[indx - 1] * (dirwts1[indx - 2] + dirwts1[indx])
  782                         / (dirwts1[indx - 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx - 2]));
  783             float crr = cfa[indx + 1] * (dirwts1[indx + 2] + dirwts1[indx])
  784                         / (dirwts1[indx + 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx + 2]));
  785 
  786             // G interpolated in vert/hor directions using Hamilton-Adams method
  787             float guha = cfa[indx - v1] + xdiv2f(cfa[indx] - cfa[indx - v2]);
  788             float gdha = cfa[indx + v1] + xdiv2f(cfa[indx] - cfa[indx + v2]);
  789             float glha = cfa[indx - 1] + xdiv2f(cfa[indx] - cfa[indx - 2]);
  790             float grha = cfa[indx + 1] + xdiv2f(cfa[indx] - cfa[indx + 2]);
  791 
  792             // G interpolated in vert/hor directions using adaptive ratios
  793             float guar, gdar, glar, grar;
  794 
  795             if(fabsf(1.f - cru) < arthresh)
  796             {
  797               guar = cfa[indx] * cru;
  798             }
  799             else
  800             {
  801               guar = guha;
  802             }
  803 
  804             if(fabsf(1.f - crd) < arthresh)
  805             {
  806               gdar = cfa[indx] * crd;
  807             }
  808             else
  809             {
  810               gdar = gdha;
  811             }
  812 
  813             if(fabsf(1.f - crl) < arthresh)
  814             {
  815               glar = cfa[indx] * crl;
  816             }
  817             else
  818             {
  819               glar = glha;
  820             }
  821 
  822             if(fabsf(1.f - crr) < arthresh)
  823             {
  824               grar = cfa[indx] * crr;
  825             }
  826             else
  827             {
  828               grar = grha;
  829             }
  830 
  831             // adaptive weights for vertical/horizontal directions
  832             float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
  833             float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
  834 
  835             // interpolated G via adaptive weights of cardinal evaluations
  836             float Gintvha = vwt * gdha + (1.f - vwt) * guha;
  837             float Ginthha = hwt * grha + (1.f - hwt) * glha;
  838 
  839             // interpolated colour differences
  840             if(fcswitch)
  841             {
  842               vcd[indx] = cfa[indx] - (vwt * gdar + (1.f - vwt) * guar);
  843               hcd[indx] = cfa[indx] - (hwt * grar + (1.f - hwt) * glar);
  844               vcdalt[indx] = cfa[indx] - Gintvha;
  845               hcdalt[indx] = cfa[indx] - Ginthha;
  846             }
  847             else
  848             {
  849               // interpolated colour differences
  850               vcd[indx] = (vwt * gdar + (1.f - vwt) * guar) - cfa[indx];
  851               hcd[indx] = (hwt * grar + (1.f - hwt) * glar) - cfa[indx];
  852               vcdalt[indx] = Gintvha - cfa[indx];
  853               hcdalt[indx] = Ginthha - cfa[indx];
  854             }
  855 
  856             fcswitch = !fcswitch;
  857 
  858             if(cfa[indx] > clip_pt8 || Gintvha > clip_pt8 || Ginthha > clip_pt8)
  859             {
  860               // use HA if highlights are (nearly) clipped
  861               guar = guha;
  862               gdar = gdha;
  863               glar = glha;
  864               grar = grha;
  865               vcd[indx] = vcdalt[indx];
  866               hcd[indx] = hcdalt[indx];
  867             }
  868 
  869             // differences of interpolations in opposite directions
  870             dgintv[indx] = MIN(SQR(guha - gdha), SQR(guar - gdar));
  871             dginth[indx] = MIN(SQR(glha - grha), SQR(glar - grar));
  872           }
  873         }
  874 
  875 #endif
  876 
  877 
  878 
  879 #ifdef __SSE2__
  880         vfloat clip_ptv = F2V(clip_pt);
  881         vfloat sgn3v;
  882 
  883         if(!(FC(4, 4, filters) & 1))
  884         {
  885           sgnv = _mm_set_ps(1.f, -1.f, 1.f, -1.f);
  886         }
  887         else
  888         {
  889           sgnv = _mm_set_ps(-1.f, 1.f, -1.f, 1.f);
  890         }
  891 
  892         sgn3v = sgnv + sgnv + sgnv;
  893 
  894         for(int rr = 4; rr < rr1 - 4; rr++)
  895         {
  896           vfloat nsgnv = sgnv;
  897           sgnv = -sgnv;
  898           sgn3v = -sgn3v;
  899 
  900           for(int indx = rr * ts + 4; indx < rr * ts + cc1 - 4; indx += 4)
  901           {
  902             vfloat hcdv = LVF(hcd[indx]);
  903             vfloat hcdvarv = SQRV(LVFU(hcd[indx - 2]) - hcdv)
  904                              + SQRV(LVFU(hcd[indx - 2]) - LVFU(hcd[indx + 2]))
  905                              + SQRV(hcdv - LVFU(hcd[indx + 2]));
  906             vfloat hcdaltv = LVF(hcdalt[indx]);
  907             vfloat hcdaltvarv = SQRV(LVFU(hcdalt[indx - 2]) - hcdaltv)
  908                                 + SQRV(LVFU(hcdalt[indx - 2]) - LVFU(hcdalt[indx + 2]))
  909                                 + SQRV(hcdaltv - LVFU(hcdalt[indx + 2]));
  910             vfloat vcdv = LVF(vcd[indx]);
  911             vfloat vcdvarv = SQRV(LVF(vcd[indx - v2]) - vcdv)
  912                              + SQRV(LVF(vcd[indx - v2]) - LVF(vcd[indx + v2]))
  913                              + SQRV(vcdv - LVF(vcd[indx + v2]));
  914             vfloat vcdaltv = LVF(vcdalt[indx]);
  915             vfloat vcdaltvarv = SQRV(LVF(vcdalt[indx - v2]) - vcdaltv)
  916                                 + SQRV(LVF(vcdalt[indx - v2]) - LVF(vcdalt[indx + v2]))
  917                                 + SQRV(vcdaltv - LVF(vcdalt[indx + v2]));
  918 
  919             // choose the smallest variance; this yields a smoother interpolation
  920             hcdv = vself(vmaskf_lt(hcdaltvarv, hcdvarv), hcdaltv, hcdv);
  921             vcdv = vself(vmaskf_lt(vcdaltvarv, vcdvarv), vcdaltv, vcdv);
  922 
  923             // bound the interpolation in regions of high saturation
  924             // vertical and horizontal G interpolations
  925             vfloat Ginthv = sgnv * hcdv + LVF(cfa[indx]);
  926             vfloat temp2v = sgn3v * hcdv;
  927             vfloat hwtv = onev + temp2v / (epsv + Ginthv + LVF(cfa[indx]));
  928             vmask hcdmask = vmaskf_gt(nsgnv * hcdv, ZEROV);
  929             vfloat hcdoldv = hcdv;
  930             vfloat tempv = nsgnv * (LVF(cfa[indx]) - ULIMV(Ginthv, LVFU(cfa[indx - 1]), LVFU(cfa[indx + 1])));
  931             hcdv = vself(vmaskf_lt(temp2v, -(LVF(cfa[indx]) + Ginthv)), tempv, vintpf(hwtv, hcdv, tempv));
  932             hcdv = vself(hcdmask, hcdv, hcdoldv);
  933             hcdv = vself(vmaskf_gt(Ginthv, clip_ptv), tempv, hcdv);
  934             STVF(hcd[indx], hcdv);
  935 
  936             vfloat Gintvv = sgnv * vcdv + LVF(cfa[indx]);
  937             temp2v = sgn3v * vcdv;
  938             vfloat vwtv = onev + temp2v / (epsv + Gintvv + LVF(cfa[indx]));
  939             vmask vcdmask = vmaskf_gt(nsgnv * vcdv, ZEROV);
  940             vfloat vcdoldv = vcdv;
  941             tempv = nsgnv * (LVF(cfa[indx]) - ULIMV(Gintvv, LVF(cfa[indx - v1]), LVF(cfa[indx + v1])));
  942             vcdv = vself(vmaskf_lt(temp2v, -(LVF(cfa[indx]) + Gintvv)), tempv, vintpf(vwtv, vcdv, tempv));
  943             vcdv = vself(vcdmask, vcdv, vcdoldv);
  944             vcdv = vself(vmaskf_gt(Gintvv, clip_ptv), tempv, vcdv);
  945             STVF(vcd[indx], vcdv);
  946             STVFU(cddiffsq[indx], SQRV(vcdv - hcdv));
  947           }
  948         }
  949 
  950 #else
  951 
  952         for(int rr = 4; rr < rr1 - 4; rr++)
  953         {
  954           for(int cc = 4, indx = rr * ts + cc, c = FC(rr, cc, filters) & 1; cc < cc1 - 4; cc++, indx++)
  955           {
  956             float hcdvar = 3.f * (SQR(hcd[indx - 2]) + SQR(hcd[indx]) + SQR(hcd[indx + 2]))
  957                            - SQR(hcd[indx - 2] + hcd[indx] + hcd[indx + 2]);
  958             float hcdaltvar = 3.f * (SQR(hcdalt[indx - 2]) + SQR(hcdalt[indx]) + SQR(hcdalt[indx + 2]))
  959                               - SQR(hcdalt[indx - 2] + hcdalt[indx] + hcdalt[indx + 2]);
  960             float vcdvar = 3.f * (SQR(vcd[indx - v2]) + SQR(vcd[indx]) + SQR(vcd[indx + v2]))
  961                            - SQR(vcd[indx - v2] + vcd[indx] + vcd[indx + v2]);
  962             float vcdaltvar = 3.f * (SQR(vcdalt[indx - v2]) + SQR(vcdalt[indx]) + SQR(vcdalt[indx + v2]))
  963                               - SQR(vcdalt[indx - v2] + vcdalt[indx] + vcdalt[indx + v2]);
  964 
  965             // choose the smallest variance; this yields a smoother interpolation
  966             if(hcdaltvar < hcdvar)
  967             {
  968               hcd[indx] = hcdalt[indx];
  969             }
  970 
  971             if(vcdaltvar < vcdvar)
  972             {
  973               vcd[indx] = vcdalt[indx];
  974             }
  975 
  976             // bound the interpolation in regions of high saturation
  977             // vertical and horizontal G interpolations
  978             float Gintv, Ginth;
  979 
  980             if(c)
  981             {                                 // G site
  982               Ginth = -hcd[indx] + cfa[indx]; // R or B
  983               Gintv = -vcd[indx] + cfa[indx]; // B or R
  984 
  985               if(hcd[indx] > 0)
  986               {
  987                 if(3.f * hcd[indx] > (Ginth + cfa[indx]))
  988                 {
  989                   hcd[indx] = -ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
  990                 }
  991                 else
  992                 {
  993                   float hwt = 1.f - 3.f * hcd[indx] / (eps + Ginth + cfa[indx]);
  994                   hcd[indx] = hwt * hcd[indx]
  995                               + (1.f - hwt) * (-ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx]);
  996                 }
  997               }
  998 
  999               if(vcd[indx] > 0)
 1000               {
 1001                 if(3.f * vcd[indx] > (Gintv + cfa[indx]))
 1002                 {
 1003                   vcd[indx] = -ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
 1004                 }
 1005                 else
 1006                 {
 1007                   float vwt = 1.f - 3.f * vcd[indx] / (eps + Gintv + cfa[indx]);
 1008                   vcd[indx] = vwt * vcd[indx]
 1009                               + (1.f - vwt) * (-ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx]);
 1010                 }
 1011               }
 1012 
 1013               if(Ginth > clip_pt)
 1014               {
 1015                 hcd[indx] = -ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
 1016               }
 1017 
 1018               if(Gintv > clip_pt)
 1019               {
 1020                 vcd[indx] = -ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
 1021               }
 1022             }
 1023             else
 1024             { // R or B site
 1025 
 1026               Ginth = hcd[indx] + cfa[indx]; // interpolated G
 1027               Gintv = vcd[indx] + cfa[indx];
 1028 
 1029               if(hcd[indx] < 0)
 1030               {
 1031                 if(3.f * hcd[indx] < -(Ginth + cfa[indx]))
 1032                 {
 1033                   hcd[indx] = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
 1034                 }
 1035                 else
 1036                 {
 1037                   float hwt = 1.f + 3.f * hcd[indx] / (eps + Ginth + cfa[indx]);
 1038                   hcd[indx] = hwt * hcd[indx]
 1039                               + (1.f - hwt) * (ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx]);
 1040                 }
 1041               }
 1042 
 1043               if(vcd[indx] < 0)
 1044               {
 1045                 if(3.f * vcd[indx] < -(Gintv + cfa[indx]))
 1046                 {
 1047                   vcd[indx] = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
 1048                 }
 1049                 else
 1050                 {
 1051                   float vwt = 1.f + 3.f * vcd[indx] / (eps + Gintv + cfa[indx]);
 1052                   vcd[indx] = vwt * vcd[indx]
 1053                               + (1.f - vwt) * (ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx]);
 1054                 }
 1055               }
 1056 
 1057               if(Ginth > clip_pt)
 1058               {
 1059                 hcd[indx] = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
 1060               }
 1061 
 1062               if(Gintv > clip_pt)
 1063               {
 1064                 vcd[indx] = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
 1065               }
 1066 
 1067               cddiffsq[indx] = SQR(vcd[indx] - hcd[indx]);
 1068             }
 1069 
 1070             c = !c;
 1071           }
 1072         }
 1073 
 1074 #endif
 1075 
 1076 
 1077 
 1078 #ifdef __SSE2__
 1079         vfloat epssqv = F2V(epssq);
 1080 
 1081         for(int rr = 6; rr < rr1 - 6; rr++)
 1082         {
 1083           for(int indx = rr * ts + 6 + (FC(rr, 2, filters) & 1); indx < rr * ts + cc1 - 6; indx += 8)
 1084           {
 1085             // compute colour difference variances in cardinal directions
 1086             vfloat tempv = LC2VFU(vcd[indx]);
 1087             vfloat uavev = tempv + LC2VFU(vcd[indx - v1]) + LC2VFU(vcd[indx - v2]) + LC2VFU(vcd[indx - v3]);
 1088             vfloat davev = tempv + LC2VFU(vcd[indx + v1]) + LC2VFU(vcd[indx + v2]) + LC2VFU(vcd[indx + v3]);
 1089             vfloat Dgrbvvaruv = SQRV(tempv - uavev) + SQRV(LC2VFU(vcd[indx - v1]) - uavev)
 1090                                 + SQRV(LC2VFU(vcd[indx - v2]) - uavev) + SQRV(LC2VFU(vcd[indx - v3]) - uavev);
 1091             vfloat Dgrbvvardv = SQRV(tempv - davev) + SQRV(LC2VFU(vcd[indx + v1]) - davev)
 1092                                 + SQRV(LC2VFU(vcd[indx + v2]) - davev) + SQRV(LC2VFU(vcd[indx + v3]) - davev);
 1093 
 1094             vfloat hwtv = vadivapb(LC2VFU(dirwts1[indx - 1]), LC2VFU(dirwts1[indx + 1]));
 1095             vfloat vwtv = vadivapb(LC2VFU(dirwts0[indx - v1]), LC2VFU(dirwts0[indx + v1]));
 1096 
 1097             tempv = LC2VFU(hcd[indx]);
 1098             vfloat lavev = tempv + vaddc2vfu(hcd[indx - 3]) + LC2VFU(hcd[indx - 1]);
 1099             vfloat ravev = tempv + vaddc2vfu(hcd[indx + 1]) + LC2VFU(hcd[indx + 3]);
 1100 
 1101             vfloat Dgrbhvarlv = SQRV(tempv - lavev) + SQRV(LC2VFU(hcd[indx - 1]) - lavev)
 1102                                 + SQRV(LC2VFU(hcd[indx - 2]) - lavev) + SQRV(LC2VFU(hcd[indx - 3]) - lavev);
 1103             vfloat Dgrbhvarrv = SQRV(tempv - ravev) + SQRV(LC2VFU(hcd[indx + 1]) - ravev)
 1104                                 + SQRV(LC2VFU(hcd[indx + 2]) - ravev) + SQRV(LC2VFU(hcd[indx + 3]) - ravev);
 1105 
 1106 
 1107             vfloat vcdvarv = epssqv + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
 1108             vfloat hcdvarv = epssqv + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);
 1109 
 1110             // compute fluctuations in up/down and left/right interpolations of colours
 1111             Dgrbvvaruv = LC2VFU(dgintv[indx - v1]) + LC2VFU(dgintv[indx - v2]);
 1112             Dgrbvvardv = LC2VFU(dgintv[indx + v1]) + LC2VFU(dgintv[indx + v2]);
 1113 
 1114             Dgrbhvarlv = vaddc2vfu(dginth[indx - 2]);
 1115             Dgrbhvarrv = vaddc2vfu(dginth[indx + 1]);
 1116 
 1117             vfloat vcdvar1v = epssqv + LC2VFU(dgintv[indx]) + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
 1118             vfloat hcdvar1v = epssqv + LC2VFU(dginth[indx]) + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);
 1119 
 1120             // determine adaptive weights for G interpolation
 1121             vfloat varwtv = hcdvarv / (vcdvarv + hcdvarv);
 1122             vfloat diffwtv = hcdvar1v / (vcdvar1v + hcdvar1v);
 1123 
 1124             // if both agree on interpolation direction, choose the one with strongest directional
 1125             // discrimination;
 1126             // otherwise, choose the u/d and l/r difference fluctuation weights
 1127             vmask decmask = vandm(vmaskf_gt((zd5v - varwtv) * (zd5v - diffwtv), ZEROV),
 1128                                   vmaskf_lt(vabsf(zd5v - diffwtv), vabsf(zd5v - varwtv)));
 1129             STVFU(hvwt[indx >> 1], vself(decmask, varwtv, diffwtv));
 1130           }
 1131         }
 1132 
 1133 #else
 1134 
 1135         for(int rr = 6; rr < rr1 - 6; rr++)
 1136         {
 1137           for(int cc = 6 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2)
 1138           {
 1139 
 1140             // compute colour difference variances in cardinal directions
 1141 
 1142             float uave = vcd[indx] + vcd[indx - v1] + vcd[indx - v2] + vcd[indx - v3];
 1143             float dave = vcd[indx] + vcd[indx + v1] + vcd[indx + v2] + vcd[indx + v3];
 1144             float lave = hcd[indx] + hcd[indx - 1] + hcd[indx - 2] + hcd[indx - 3];
 1145             float rave = hcd[indx] + hcd[indx + 1] + hcd[indx + 2] + hcd[indx + 3];
 1146 
 1147             // colour difference (G-R or G-B) variance in up/down/left/right directions
 1148             float Dgrbvvaru = SQR(vcd[indx] - uave) + SQR(vcd[indx - v1] - uave) + SQR(vcd[indx - v2] - uave)
 1149                               + SQR(vcd[indx - v3] - uave);
 1150             float Dgrbvvard = SQR(vcd[indx] - dave) + SQR(vcd[indx + v1] - dave) + SQR(vcd[indx + v2] - dave)
 1151                               + SQR(vcd[indx + v3] - dave);
 1152             float Dgrbhvarl = SQR(hcd[indx] - lave) + SQR(hcd[indx - 1] - lave) + SQR(hcd[indx - 2] - lave)
 1153                               + SQR(hcd[indx - 3] - lave);
 1154             float Dgrbhvarr = SQR(hcd[indx] - rave) + SQR(hcd[indx + 1] - rave) + SQR(hcd[indx + 2] - rave)
 1155                               + SQR(hcd[indx + 3] - rave);
 1156 
 1157             float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
 1158             float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
 1159 
 1160             float vcdvar = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru;
 1161             float hcdvar = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl;
 1162 
 1163             // compute fluctuations in up/down and left/right interpolations of colours
 1164             Dgrbvvaru = (dgintv[indx]) + (dgintv[indx - v1]) + (dgintv[indx - v2]);
 1165             Dgrbvvard = (dgintv[indx]) + (dgintv[indx + v1]) + (dgintv[indx + v2]);
 1166             Dgrbhvarl = (dginth[indx]) + (dginth[indx - 1]) + (dginth[indx - 2]);
 1167             Dgrbhvarr = (dginth[indx]) + (dginth[indx + 1]) + (dginth[indx + 2]);
 1168 
 1169             float vcdvar1 = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru;
 1170             float hcdvar1 = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl;
 1171 
 1172             // determine adaptive weights for G interpolation
 1173             float varwt = hcdvar / (vcdvar + hcdvar);
 1174             float diffwt = hcdvar1 / (vcdvar1 + hcdvar1);
 1175 
 1176             // if both agree on interpolation direction, choose the one with strongest directional
 1177             // discrimination;
 1178             // otherwise, choose the u/d and l/r difference fluctuation weights
 1179             if((0.5 - varwt) * (0.5 - diffwt) > 0 && fabsf(0.5f - diffwt) < fabsf(0.5f - varwt))
 1180             {
 1181               hvwt[indx >> 1] = varwt;
 1182             }
 1183             else
 1184             {
 1185               hvwt[indx >> 1] = diffwt;
 1186             }
 1187           }
 1188         }
 1189 
 1190 #endif
 1191 
 1192 #ifdef __SSE2__
 1193         vfloat gaussg0 = F2V(gaussgrad[0]);
 1194         vfloat gaussg1 = F2V(gaussgrad[1]);
 1195         vfloat gaussg2 = F2V(gaussgrad[2]);
 1196         vfloat gaussg3 = F2V(gaussgrad[3]);
 1197         vfloat gaussg4 = F2V(gaussgrad[4]);
 1198         vfloat gaussg5 = F2V(gaussgrad[5]);
 1199         vfloat gausso0 = F2V(gaussodd[0]);
 1200         vfloat gausso1 = F2V(gaussodd[1]);
 1201         vfloat gausso2 = F2V(gaussodd[2]);
 1202         vfloat gausso3 = F2V(gaussodd[3]);
 1203 
 1204 #endif
 1205 
 1206         // precompute nyquist
 1207         for(int rr = 6; rr < rr1 - 6; rr++)
 1208         {
 1209           int cc = 6 + (FC(rr, 2, filters) & 1);
 1210           int indx = rr * ts + cc;
 1211 
 1212 #ifdef __SSE2__
 1213 
 1214           for(; cc < cc1 - 7; cc += 8, indx += 8)
 1215           {
 1216             vfloat valv
 1217                 = (gausso0 * LC2VFU(cddiffsq[indx])
 1218                    + gausso1 * (LC2VFU(cddiffsq[(indx - m1)]) + LC2VFU(cddiffsq[(indx + p1)])
 1219                                 + LC2VFU(cddiffsq[(indx - p1)]) + LC2VFU(cddiffsq[(indx + m1)]))
 1220                    + gausso2 * (LC2VFU(cddiffsq[(indx - v2)]) + LC2VFU(cddiffsq[(indx - 2)])
 1221                                 + LC2VFU(cddiffsq[(indx + 2)]) + LC2VFU(cddiffsq[(indx + v2)]))
 1222                    + gausso3 * (LC2VFU(cddiffsq[(indx - m2)]) + LC2VFU(cddiffsq[(indx + p2)])
 1223                                 + LC2VFU(cddiffsq[(indx - p2)]) + LC2VFU(cddiffsq[(indx + m2)])))
 1224                   - (gaussg0 * LC2VFU(delhvsqsum[indx])
 1225                      + gaussg1 * (LC2VFU(delhvsqsum[indx - v1]) + LC2VFU(delhvsqsum[indx - 1])
 1226                                   + LC2VFU(delhvsqsum[indx + 1]) + LC2VFU(delhvsqsum[indx + v1]))
 1227                      + gaussg2 * (LC2VFU(delhvsqsum[indx - m1]) + LC2VFU(delhvsqsum[indx + p1])
 1228                                   + LC2VFU(delhvsqsum[indx - p1]) + LC2VFU(delhvsqsum[indx + m1]))
 1229                      + gaussg3 * (LC2VFU(delhvsqsum[indx - v2]) + LC2VFU(delhvsqsum[indx - 2])
 1230                                   + LC2VFU(delhvsqsum[indx + 2]) + LC2VFU(delhvsqsum[indx + v2]))
 1231                      + gaussg4 * (LC2VFU(delhvsqsum[indx - v2 - 1]) + LC2VFU(delhvsqsum[indx - v2 + 1])
 1232                                   + LC2VFU(delhvsqsum[indx - ts - 2]) + LC2VFU(delhvsqsum[indx - ts + 2])
 1233                                   + LC2VFU(delhvsqsum[indx + ts - 2]) + LC2VFU(delhvsqsum[indx + ts + 2])
 1234                                   + LC2VFU(delhvsqsum[indx + v2 - 1]) + LC2VFU(delhvsqsum[indx + v2 + 1]))
 1235                      + gaussg5 * (LC2VFU(delhvsqsum[indx - m2]) + LC2VFU(delhvsqsum[indx + p2])
 1236                                   + LC2VFU(delhvsqsum[indx - p2]) + LC2VFU(delhvsqsum[indx + m2])));
 1237             STVFU(nyqutest[indx >> 1], valv);
 1238           }
 1239 
 1240 #endif
 1241 
 1242           for(; cc < cc1 - 6; cc += 2, indx += 2)
 1243           {
 1244             nyqutest[indx >> 1]
 1245                 = (gaussodd[0] * cddiffsq[indx]
 1246                    + gaussodd[1] * (cddiffsq[(indx - m1)] + cddiffsq[(indx + p1)] + cddiffsq[(indx - p1)]
 1247                                     + cddiffsq[(indx + m1)])
 1248                    + gaussodd[2] * (cddiffsq[(indx - v2)] + cddiffsq[(indx - 2)] + cddiffsq[(indx + 2)]
 1249                                     + cddiffsq[(indx + v2)])
 1250                    + gaussodd[3] * (cddiffsq[(indx - m2)] + cddiffsq[(indx + p2)] + cddiffsq[(indx - p2)]
 1251                                     + cddiffsq[(indx + m2)]))
 1252                   - (gaussgrad[0] * delhvsqsum[indx]
 1253                      + gaussgrad[1] * (delhvsqsum[indx - v1] + delhvsqsum[indx + 1] + delhvsqsum[indx - 1]
 1254                                        + delhvsqsum[indx + v1])
 1255                      + gaussgrad[2] * (delhvsqsum[indx - m1] + delhvsqsum[indx + p1] + delhvsqsum[indx - p1]
 1256                                        + delhvsqsum[indx + m1])
 1257                      + gaussgrad[3] * (delhvsqsum[indx - v2] + delhvsqsum[indx - 2] + delhvsqsum[indx + 2]
 1258                                        + delhvsqsum[indx + v2])
 1259                      + gaussgrad[4] * (delhvsqsum[indx - v2 - 1] + delhvsqsum[indx - v2 + 1]
 1260                                        + delhvsqsum[indx - ts - 2] + delhvsqsum[indx - ts + 2]
 1261                                        + delhvsqsum[indx + ts - 2] + delhvsqsum[indx + ts + 2]
 1262                                        + delhvsqsum[indx + v2 - 1] + delhvsqsum[indx + v2 + 1])
 1263                      + gaussgrad[5] * (delhvsqsum[indx - m2] + delhvsqsum[indx + p2] + delhvsqsum[indx - p2]
 1264                                        + delhvsqsum[indx + m2]));
 1265           }
 1266         }
 1267 
 1268         // Nyquist test
 1269         int nystartrow = 0;
 1270         int nyendrow = 0;
 1271         int nystartcol = ts + 1;
 1272         int nyendcol = 0;
 1273 
 1274         for(int rr = 6; rr < rr1 - 6; rr++)
 1275         {
 1276           for(int cc = 6 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2)
 1277           {
 1278 
 1279             // nyquist texture test: ask if difference of vcd compared to hcd is larger or smaller than RGGB
 1280             // gradients
 1281             if(nyqutest[indx >> 1] > 0.f)
 1282             {
 1283               nyquist[indx >> 1] = 1; // nyquist=1 for nyquist region
 1284               nystartrow = nystartrow ? nystartrow : rr;
 1285               nyendrow = rr;
 1286               nystartcol = nystartcol > cc ? cc : nystartcol;
 1287               nyendcol = nyendcol < cc ? cc : nyendcol;
 1288             }
 1289           }
 1290         }
 1291 
 1292 
 1293         bool doNyquist = nystartrow != nyendrow && nystartcol != nyendcol;
 1294 
 1295         if(doNyquist)
 1296         {
 1297           nyendrow++; // because of < condition
 1298           nyendcol++; // because of < condition
 1299           nystartcol -= (nystartcol & 1);
 1300           nystartrow = std::max(8, nystartrow);
 1301           nyendrow = std::min(rr1 - 8, nyendrow);
 1302           nystartcol = std::max(8, nystartcol);
 1303           nyendcol = std::min(cc1 - 8, nyendcol);
 1304           memset(&nyquist2[4 * tsh], 0, sizeof(char) * (ts - 8) * tsh);
 1305 
 1306 #ifdef __SSE2__
 1307           vint fourvb = _mm_set1_epi8(4);
 1308           vint onevb = _mm_set1_epi8(1);
 1309 
 1310 #endif
 1311 
 1312           for(int rr = nystartrow; rr < nyendrow; rr++)
 1313           {
 1314 #ifdef __SSE2__
 1315 
 1316             for(int indx = rr * ts; indx < rr * ts + cc1; indx += 32)
 1317             {
 1318               vint nyquisttemp1v = _mm_adds_epi8(_mm_load_si128((vint *)&nyquist[(indx - v2) >> 1]),
 1319                                                  _mm_loadu_si128((vint *)&nyquist[(indx - m1) >> 1]));
 1320               vint nyquisttemp2v = _mm_adds_epi8(_mm_loadu_si128((vint *)&nyquist[(indx + p1) >> 1]),
 1321                                                  _mm_loadu_si128((vint *)&nyquist[(indx - 2) >> 1]));
 1322               vint nyquisttemp3v = _mm_adds_epi8(_mm_loadu_si128((vint *)&nyquist[(indx + 2) >> 1]),
 1323                                                  _mm_loadu_si128((vint *)&nyquist[(indx - p1) >> 1]));
 1324               vint valv = _mm_load_si128((vint *)&nyquist[indx >> 1]);
 1325               vint nyquisttemp4v = _mm_adds_epi8(_mm_loadu_si128((vint *)&nyquist[(indx + m1) >> 1]),
 1326                                                  _mm_load_si128((vint *)&nyquist[(indx + v2) >> 1]));
 1327               nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp3v);
 1328               nyquisttemp2v = _mm_adds_epi8(nyquisttemp2v, nyquisttemp4v);
 1329               nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp2v);
 1330               valv = vselc(_mm_cmpgt_epi8(nyquisttemp1v, fourvb), onevb, valv);
 1331               valv = vselinotzero(_mm_cmplt_epi8(nyquisttemp1v, fourvb), valv);
 1332               _mm_store_si128((vint *)&nyquist2[indx >> 1], valv);
 1333             }
 1334 
 1335 #else
 1336 
 1337             for(int indx = rr * ts + nystartcol + (FC(rr, 2, filters) & 1); indx < rr * ts + nyendcol;
 1338                 indx += 2)
 1339             {
 1340               unsigned int nyquisttemp
 1341                   = (nyquist[(indx - v2) >> 1] + nyquist[(indx - m1) >> 1] + nyquist[(indx + p1) >> 1]
 1342                      + nyquist[(indx - 2) >> 1] + nyquist[(indx + 2) >> 1] + nyquist[(indx - p1) >> 1]
 1343                      + nyquist[(indx + m1) >> 1] + nyquist[(indx + v2) >> 1]);
 1344               // if most of your neighbours are named Nyquist, it's likely that you're one too, or not
 1345               nyquist2[indx >> 1] = nyquisttemp > 4 ? 1 : (nyquisttemp < 4 ? 0 : nyquist[indx >> 1]);
 1346             }
 1347 
 1348 #endif
 1349           }
 1350 
 1351           // end of Nyquist test
 1352 
 1353           // in areas of Nyquist texture, do area interpolation
 1354           for(int rr = nystartrow; rr < nyendrow; rr++)
 1355             for(int indx = rr * ts + nystartcol + (FC(rr, 2, filters) & 1); indx < rr * ts + nyendcol;
 1356                 indx += 2)
 1357             {
 1358 
 1359               if(nyquist2[indx >> 1])
 1360               {
 1361                 // area interpolation
 1362 
 1363                 float sumcfa = 0.f, sumh = 0.f, sumv = 0.f, sumsqh = 0.f, sumsqv = 0.f, areawt = 0.f;
 1364 
 1365                 for(int i = -6; i < 7; i += 2)
 1366                 {
 1367                   int indx1 = indx + (i * ts) - 6;
 1368 
 1369                   for(int j = -6; j < 7; j += 2, indx1 += 2)
 1370                   {
 1371                     if(nyquist2[indx1 >> 1])
 1372                     {
 1373                       float cfatemp = cfa[indx1];
 1374                       sumcfa += cfatemp;
 1375                       sumh += (cfa[indx1 - 1] + cfa[indx1 + 1]);
 1376                       sumv += (cfa[indx1 - v1] + cfa[indx1 + v1]);
 1377                       sumsqh += SQR(cfatemp - cfa[indx1 - 1]) + SQR(cfatemp - cfa[indx1 + 1]);
 1378                       sumsqv += SQR(cfatemp - cfa[indx1 - v1]) + SQR(cfatemp - cfa[indx1 + v1]);
 1379                       areawt += 1;
 1380                     }
 1381                   }
 1382                 }
 1383 
 1384                 // horizontal and vertical colour differences, and adaptive weight
 1385                 sumh = sumcfa - xdiv2f(sumh);
 1386                 sumv = sumcfa - xdiv2f(sumv);
 1387                 areawt = xdiv2f(areawt);
 1388                 float hcdvar = epssq + fabsf(areawt * sumsqh - sumh * sumh);
 1389                 float vcdvar = epssq + fabsf(areawt * sumsqv - sumv * sumv);
 1390                 hvwt[indx >> 1] = hcdvar / (vcdvar + hcdvar);
 1391 
 1392                 // end of area interpolation
 1393               }
 1394             }
 1395         }
 1396 
 1397 
 1398         // populate G at R/B sites
 1399         for(int rr = 8; rr < rr1 - 8; rr++)
 1400           for(int indx = rr * ts + 8 + (FC(rr, 2, filters) & 1); indx < rr * ts + cc1 - 8; indx += 2)
 1401           {
 1402 
 1403             // first ask if one gets more directional discrimination from nearby B/R sites
 1404             float hvwtalt = xdivf(hvwt[(indx - m1) >> 1] + hvwt[(indx + p1) >> 1] + hvwt[(indx - p1) >> 1]
 1405                                       + hvwt[(indx + m1) >> 1],
 1406                                   2);
 1407 
 1408             hvwt[indx >> 1]
 1409                 = fabsf(0.5f - hvwt[indx >> 1]) < fabsf(0.5f - hvwtalt) ? hvwtalt : hvwt[indx >> 1];
 1410             // a better result was obtained from the neighbours
 1411 
 1412             Dgrb[0][indx >> 1] = intp(hvwt[indx >> 1], vcd[indx], hcd[indx]); // evaluate colour differences
 1413 
 1414             rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1]; // evaluate G (finally!)
 1415 
 1416             // local curvature in G (preparation for nyquist refinement step)
 1417             Dgrb2[indx >> 1].h = nyquist2[indx >> 1]
 1418                                      ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - 1] + rgbgreen[indx + 1]))
 1419                                      : 0.f;
 1420             Dgrb2[indx >> 1].v = nyquist2[indx >> 1]
 1421                                      ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - v1] + rgbgreen[indx + v1]))
 1422                                      : 0.f;
 1423           }
 1424 
 1425 
 1426         // end of standard interpolation
 1427 
 1428         // refine Nyquist areas using G curvatures
 1429         if(doNyquist)
 1430         {
 1431           for(int rr = nystartrow; rr < nyendrow; rr++)
 1432             for(int indx = rr * ts + nystartcol + (FC(rr, 2, filters) & 1); indx < rr * ts + nyendcol;
 1433                 indx += 2)
 1434             {
 1435 
 1436               if(nyquist2[indx >> 1])
 1437               {
 1438                 // local averages (over Nyquist pixels only) of G curvature squared
 1439                 float gvarh
 1440                     = epssq + (gquinc[0] * Dgrb2[indx >> 1].h
 1441                                + gquinc[1] * (Dgrb2[(indx - m1) >> 1].h + Dgrb2[(indx + p1) >> 1].h
 1442                                               + Dgrb2[(indx - p1) >> 1].h + Dgrb2[(indx + m1) >> 1].h)
 1443                                + gquinc[2] * (Dgrb2[(indx - v2) >> 1].h + Dgrb2[(indx - 2) >> 1].h
 1444                                               + Dgrb2[(indx + 2) >> 1].h + Dgrb2[(indx + v2) >> 1].h)
 1445                                + gquinc[3] * (Dgrb2[(indx - m2) >> 1].h + Dgrb2[(indx + p2) >> 1].h
 1446                                               + Dgrb2[(indx - p2) >> 1].h + Dgrb2[(indx + m2) >> 1].h));
 1447                 float gvarv
 1448                     = epssq + (gquinc[0] * Dgrb2[indx >> 1].v
 1449                                + gquinc[1] * (Dgrb2[(indx - m1) >> 1].v + Dgrb2[(indx + p1) >> 1].v
 1450                                               + Dgrb2[(indx - p1) >> 1].v + Dgrb2[(indx + m1) >> 1].v)
 1451                                + gquinc[2] * (Dgrb2[(indx - v2) >> 1].v + Dgrb2[(indx - 2) >> 1].v
 1452                                               + Dgrb2[(indx + 2) >> 1].v + Dgrb2[(indx + v2) >> 1].v)
 1453                                + gquinc[3] * (Dgrb2[(indx - m2) >> 1].v + Dgrb2[(indx + p2) >> 1].v
 1454                                               + Dgrb2[(indx - p2) >> 1].v + Dgrb2[(indx + m2) >> 1].v));
 1455                 // use the results as weights for refined G interpolation
 1456                 Dgrb[0][indx >> 1] = (hcd[indx] * gvarv + vcd[indx] * gvarh) / (gvarv + gvarh);
 1457                 rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1];
 1458               }
 1459             }
 1460         }
 1461 
 1462 
 1463 #ifdef __SSE2__
 1464 
 1465         for(int rr = 6; rr < rr1 - 6; rr++)
 1466         {
 1467           if((FC(rr, 2, filters) & 1) == 0)
 1468           {
 1469             for(int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8)
 1470             {
 1471               vfloat tempv = LC2VFU(cfa[indx + 1]);
 1472               vfloat Dgrbsq1pv
 1473                   = (SQRV(tempv - LC2VFU(cfa[indx + 1 - p1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + p1])));
 1474               STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + p1]) - LC2VFU(cfa[indx - p1])));
 1475               STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + m1]) - LC2VFU(cfa[indx - m1])));
 1476               vfloat Dgrbsq1mv
 1477                   = (SQRV(tempv - LC2VFU(cfa[indx + 1 - m1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + m1])));
 1478               STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv);
 1479               STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv);
 1480             }
 1481           }
 1482           else
 1483           {
 1484             for(int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8)
 1485             {
 1486               vfloat tempv = LC2VFU(cfa[indx]);
 1487               vfloat Dgrbsq1pv
 1488                   = (SQRV(tempv - LC2VFU(cfa[indx - p1])) + SQRV(tempv - LC2VFU(cfa[indx + p1])));
 1489               STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + p1]) - LC2VFU(cfa[indx + 1 - p1])));
 1490               STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + m1]) - LC2VFU(cfa[indx + 1 - m1])));
 1491               vfloat Dgrbsq1mv
 1492                   = (SQRV(tempv - LC2VFU(cfa[indx - m1])) + SQRV(tempv - LC2VFU(cfa[indx + m1])));
 1493               STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv);
 1494               STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv);
 1495             }
 1496           }
 1497         }
 1498 
 1499 #else
 1500 
 1501         for(int rr = 6; rr < rr1 - 6; rr++)
 1502         {
 1503           if((FC(rr, 2, filters) & 1) == 0)
 1504           {
 1505             for(int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2)
 1506             {
 1507               delp[indx >> 1] = fabsf(cfa[indx + p1] - cfa[indx - p1]);
 1508               delm[indx >> 1] = fabsf(cfa[indx + m1] - cfa[indx - m1]);
 1509               Dgrbsq1p[indx >> 1]
 1510                   = (SQR(cfa[indx + 1] - cfa[indx + 1 - p1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + p1]));
 1511               Dgrbsq1m[indx >> 1]
 1512                   = (SQR(cfa[indx + 1] - cfa[indx + 1 - m1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + m1]));
 1513             }
 1514           }
 1515           else
 1516           {
 1517             for(int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2)
 1518             {
 1519               Dgrbsq1p[indx >> 1] = (SQR(cfa[indx] - cfa[indx - p1]) + SQR(cfa[indx] - cfa[indx + p1]));
 1520               Dgrbsq1m[indx >> 1] = (SQR(cfa[indx] - cfa[indx - m1]) + SQR(cfa[indx] - cfa[indx + m1]));
 1521               delp[indx >> 1] = fabsf(cfa[indx + 1 + p1] - cfa[indx + 1 - p1]);
 1522               delm[indx >> 1] = fabsf(cfa[indx + 1 + m1] - cfa[indx + 1 - m1]);
 1523             }
 1524           }
 1525         }
 1526 
 1527 #endif
 1528 
 1529 // diagonal interpolation correction
 1530 
 1531 #ifdef __SSE2__
 1532         vfloat gausseven0v = F2V(gausseven[0]);
 1533         vfloat gausseven1v = F2V(gausseven[1]);
 1534 #endif
 1535 
 1536         for(int rr = 8; rr < rr1 - 8; rr++)
 1537         {
 1538 #ifdef __SSE2__
 1539 
 1540           for(int indx = rr * ts + 8 + (FC(rr, 2, filters) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 8;
 1541               indx += 8, indx1 += 4)
 1542           {
 1543 
 1544             // diagonal colour ratios
 1545             vfloat cfav = LC2VFU(cfa[indx]);
 1546 
 1547             vfloat temp1v = LC2VFU(cfa[indx + m1]);
 1548             vfloat temp2v = LC2VFU(cfa[indx + m2]);
 1549             vfloat rbsev = vmul2f(temp1v) / (epsv + cfav + temp2v);
 1550             rbsev = vself(vmaskf_lt(vabsf(onev - rbsev), arthreshv), cfav * rbsev,
 1551                           temp1v + zd5v * (cfav - temp2v));
 1552 
 1553             temp1v = LC2VFU(cfa[indx - m1]);
 1554             temp2v = LC2VFU(cfa[indx - m2]);
 1555             vfloat rbnwv = vmul2f(temp1v) / (epsv + cfav + temp2v);
 1556             rbnwv = vself(vmaskf_lt(vabsf(onev - rbnwv), arthreshv), cfav * rbnwv,
 1557                           temp1v + zd5v * (cfav - temp2v));
 1558 
 1559             temp1v = epsv + LVFU(delm[indx1]);
 1560             vfloat wtsev = temp1v + LVFU(delm[(indx + m1) >> 1])
 1561                            + LVFU(delm[(indx + m2) >> 1]); // same as for wtu,wtd,wtl,wtr
 1562             vfloat wtnwv = temp1v + LVFU(delm[(indx - m1) >> 1]) + LVFU(delm[(indx - m2) >> 1]);
 1563 
 1564             vfloat rbmv = (wtsev * rbnwv + wtnwv * rbsev) / (wtsev + wtnwv);
 1565 
 1566             temp1v = ULIMV(rbmv, LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1]));
 1567             vfloat wtv = vmul2f(cfav - rbmv) / (epsv + rbmv + cfav);
 1568             temp2v = vintpf(wtv, rbmv, temp1v);
 1569 
 1570             temp2v = vself(vmaskf_lt(rbmv + rbmv, cfav), temp1v, temp2v);
 1571             temp2v = vself(vmaskf_lt(rbmv, cfav), temp2v, rbmv);
 1572             STVFU(rbm[indx1], vself(vmaskf_gt(temp2v, clip_ptv),
 1573                                     ULIMV(temp2v, LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1])), temp2v));
 1574 
 1575 
 1576             temp1v = LC2VFU(cfa[indx + p1]);
 1577             temp2v = LC2VFU(cfa[indx + p2]);
 1578             vfloat rbnev = vmul2f(temp1v) / (epsv + cfav + temp2v);
 1579             rbnev = vself(vmaskf_lt(vabsf(onev - rbnev), arthreshv), cfav * rbnev,
 1580                           temp1v + zd5v * (cfav - temp2v));
 1581 
 1582             temp1v = LC2VFU(cfa[indx - p1]);
 1583             temp2v = LC2VFU(cfa[indx - p2]);
 1584             vfloat rbswv = vmul2f(temp1v) / (epsv + cfav + temp2v);
 1585             rbswv = vself(vmaskf_lt(vabsf(onev - rbswv), arthreshv), cfav * rbswv,
 1586                           temp1v + zd5v * (cfav - temp2v));
 1587 
 1588             temp1v = epsv + LVFU(delp[indx1]);
 1589             vfloat wtnev = temp1v + LVFU(delp[(indx + p1) >> 1]) + LVFU(delp[(indx + p2) >> 1]);
 1590             vfloat wtswv = temp1v + LVFU(delp[(indx - p1) >> 1]) + LVFU(delp[(indx - p2) >> 1]);
 1591 
 1592             vfloat rbpv = (wtnev * rbswv + wtswv * rbnev) / (wtnev + wtswv);
 1593 
 1594             temp1v = ULIMV(rbpv, LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1]));
 1595             wtv = vmul2f(cfav - rbpv) / (epsv + rbpv + cfav);
 1596             temp2v = vintpf(wtv, rbpv, temp1v);
 1597 
 1598             temp2v = vself(vmaskf_lt(rbpv + rbpv, cfav), temp1v, temp2v);
 1599             temp2v = vself(vmaskf_lt(rbpv, cfav), temp2v, rbpv);
 1600             STVFU(rbp[indx1], vself(vmaskf_gt(temp2v, clip_ptv),
 1601                                     ULIMV(temp2v, LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1])), temp2v));
 1602 
 1603             vfloat rbvarmv
 1604                 = epssqv
 1605                   + (gausseven0v * (LVFU(Dgrbsq1m[(indx - v1) >> 1]) + LVFU(Dgrbsq1m[(indx - 1) >> 1])
 1606                                     + LVFU(Dgrbsq1m[(indx + 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v1) >> 1]))
 1607                      + gausseven1v
 1608                            * (LVFU(Dgrbsq1m[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx - v2 + 1) >> 1])
 1609                               + LVFU(Dgrbsq1m[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 - v1) >> 1])
 1610                               + LVFU(Dgrbsq1m[(indx - 2 + v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 + v1) >> 1])
 1611                               + LVFU(Dgrbsq1m[(indx + v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v2 + 1) >> 1])));
 1612             STVFU(pmwt[indx1],
 1613                   rbvarmv / ((epssqv
 1614                               + (gausseven0v
 1615                                      * (LVFU(Dgrbsq1p[(indx - v1) >> 1]) + LVFU(Dgrbsq1p[(indx - 1) >> 1])
 1616                                         + LVFU(Dgrbsq1p[(indx + 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v1) >> 1]))
 1617                                  + gausseven1v * (LVFU(Dgrbsq1p[(indx - v2 - 1) >> 1])
 1618                                                   + LVFU(Dgrbsq1p[(indx - v2 + 1) >> 1])
 1619                                                   + LVFU(Dgrbsq1p[(indx - 2 - v1) >> 1])
 1620                                                   + LVFU(Dgrbsq1p[(indx + 2 - v1) >> 1])
 1621                                                   + LVFU(Dgrbsq1p[(indx - 2 + v1) >> 1])
 1622                                                   + LVFU(Dgrbsq1p[(indx + 2 + v1) >> 1])
 1623                                                   + LVFU(Dgrbsq1p[(indx + v2 - 1) >> 1])
 1624                                                   + LVFU(Dgrbsq1p[(indx + v2 + 1) >> 1]))))
 1625                              + rbvarmv));
 1626           }
 1627 
 1628 #else
 1629 
 1630           for(int cc = 8 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 8;
 1631               cc += 2, indx += 2, indx1++)
 1632           {
 1633 
 1634             // diagonal colour ratios
 1635             float crse = xmul2f(cfa[indx + m1]) / (eps + cfa[indx] + (cfa[indx + m2]));
 1636             float crnw = xmul2f(cfa[indx - m1]) / (eps + cfa[indx] + (cfa[indx - m2]));
 1637             float crne = xmul2f(cfa[indx + p1]) / (eps + cfa[indx] + (cfa[indx + p2]));
 1638             float crsw = xmul2f(cfa[indx - p1]) / (eps + cfa[indx] + (cfa[indx - p2]));
 1639             // colour differences in diagonal directions
 1640             float rbse, rbnw, rbne, rbsw;
 1641 
 1642             // assign B/R at R/B sites
 1643             if(fabsf(1.f - crse) < arthresh)
 1644             {
 1645               rbse = cfa[indx] * crse; // use this if more precise diag interp is necessary
 1646             }
 1647             else
 1648             {
 1649               rbse = (cfa[indx + m1]) + xdiv2f(cfa[indx] - cfa[indx + m2]);
 1650             }
 1651 
 1652             if(fabsf(1.f - crnw) < arthresh)
 1653             {
 1654               rbnw = cfa[indx] * crnw;
 1655             }
 1656             else
 1657             {
 1658               rbnw = (cfa[indx - m1]) + xdiv2f(cfa[indx] - cfa[indx - m2]);
 1659             }
 1660 
 1661             if(fabsf(1.f - crne) < arthresh)
 1662             {
 1663               rbne = cfa[indx] * crne;
 1664             }
 1665             else
 1666             {
 1667               rbne = (cfa[indx + p1]) + xdiv2f(cfa[indx] - cfa[indx + p2]);
 1668             }
 1669 
 1670             if(fabsf(1.f - crsw) < arthresh)
 1671             {
 1672               rbsw = cfa[indx] * crsw;
 1673             }
 1674             else
 1675             {
 1676               rbsw = (cfa[indx - p1]) + xdiv2f(cfa[indx] - cfa[indx - p2]);
 1677             }
 1678 
 1679             float wtse = eps + delm[indx1] + delm[(indx + m1) >> 1]
 1680                          + delm[(indx + m2) >> 1]; // same as for wtu,wtd,wtl,wtr
 1681             float wtnw = eps + delm[indx1] + delm[(indx - m1) >> 1] + delm[(indx - m2) >> 1];
 1682             float wtne = eps + delp[indx1] + delp[(indx + p1) >> 1] + delp[(indx + p2) >> 1];
 1683             float wtsw = eps + delp[indx1] + delp[(indx - p1) >> 1] + delp[(indx - p2) >> 1];
 1684 
 1685 
 1686             rbm[indx1] = (wtse * rbnw + wtnw * rbse) / (wtse + wtnw);
 1687             rbp[indx1] = (wtne * rbsw + wtsw * rbne) / (wtne + wtsw);
 1688 
 1689             // variance of R-B in plus/minus directions
 1690             float rbvarm
 1691                 = epssq
 1692                   + (gausseven[0] * (Dgrbsq1m[(indx - v1) >> 1] + Dgrbsq1m[(indx - 1) >> 1]
 1693                                      + Dgrbsq1m[(indx + 1) >> 1] + Dgrbsq1m[(indx + v1) >> 1])
 1694                      + gausseven[1] * (Dgrbsq1m[(indx - v2 - 1) >> 1] + Dgrbsq1m[(indx - v2 + 1) >> 1]
 1695                                        + Dgrbsq1m[(indx - 2 - v1) >> 1] + Dgrbsq1m[(indx + 2 - v1) >> 1]
 1696                                        + Dgrbsq1m[(indx - 2 + v1) >> 1] + Dgrbsq1m[(indx + 2 + v1) >> 1]
 1697                                        + Dgrbsq1m[(indx + v2 - 1) >> 1] + Dgrbsq1m[(indx + v2 + 1) >> 1]));
 1698             pmwt[indx1]
 1699                 = rbvarm
 1700                   / ((epssq + (gausseven[0] * (Dgrbsq1p[(indx - v1) >> 1] + Dgrbsq1p[(indx - 1) >> 1]
 1701                                                + Dgrbsq1p[(indx + 1) >> 1] + Dgrbsq1p[(indx + v1) >> 1])
 1702                                + gausseven[1]
 1703                                      * (Dgrbsq1p[(indx - v2 - 1) >> 1] + Dgrbsq1p[(indx - v2 + 1) >> 1]
 1704                                         + Dgrbsq1p[(indx - 2 - v1) >> 1] + Dgrbsq1p[(indx + 2 - v1) >> 1]
 1705                                         + Dgrbsq1p[(indx - 2 + v1) >> 1] + Dgrbsq1p[(indx + 2 + v1) >> 1]
 1706                                         + Dgrbsq1p[(indx + v2 - 1) >> 1] + Dgrbsq1p[(indx + v2 + 1) >> 1])))
 1707                      + rbvarm);
 1708 
 1709             // bound the interpolation in regions of high saturation
 1710 
 1711             if(rbp[indx1] < cfa[indx])
 1712             {
 1713               if(xmul2f(rbp[indx1]) < cfa[indx])
 1714               {
 1715                 rbp[indx1] = ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
 1716               }
 1717               else
 1718               {
 1719                 float pwt = xmul2f(cfa[indx] - rbp[indx1]) / (eps + rbp[indx1] + cfa[indx]);
 1720                 rbp[indx1]
 1721                     = pwt * rbp[indx1] + (1.f - pwt) * ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
 1722               }
 1723             }
 1724 
 1725             if(rbm[indx1] < cfa[indx])
 1726             {
 1727               if(xmul2f(rbm[indx1]) < cfa[indx])
 1728               {
 1729                 rbm[indx1] = ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
 1730               }
 1731               else
 1732               {
 1733                 float mwt = xmul2f(cfa[indx] - rbm[indx1]) / (eps + rbm[indx1] + cfa[indx]);
 1734                 rbm[indx1]
 1735                     = mwt * rbm[indx1] + (1.f - mwt) * ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
 1736               }
 1737             }
 1738 
 1739             if(rbp[indx1] > clip_pt)
 1740             {
 1741               rbp[indx1] = ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
 1742             }
 1743 
 1744             if(rbm[indx1] > clip_pt)
 1745             {
 1746               rbm[indx1] = ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
 1747             }
 1748           }
 1749 
 1750 #endif
 1751         }
 1752 
 1753 #ifdef __SSE2__
 1754         vfloat zd25v = F2V(0.25f);
 1755 #endif
 1756 
 1757         for(int rr = 10; rr < rr1 - 10; rr++)
 1758 #ifdef __SSE2__
 1759           for(int indx = rr * ts + 10 + (FC(rr, 2, filters) & 1), indx1 = indx >> 1;
 1760               indx < rr * ts + cc1 - 10; indx += 8, indx1 += 4)
 1761           {
 1762 
 1763             // first ask if one gets more directional discrimination from nearby B/R sites
 1764             vfloat pmwtaltv = zd25v * (LVFU(pmwt[(indx - m1) >> 1]) + LVFU(pmwt[(indx + p1) >> 1])
 1765                                        + LVFU(pmwt[(indx - p1) >> 1]) + LVFU(pmwt[(indx + m1) >> 1]));
 1766             vfloat tempv = LVFU(pmwt[indx1]);
 1767             tempv = vself(vmaskf_lt(vabsf(zd5v - tempv), vabsf(zd5v - pmwtaltv)), pmwtaltv, tempv);
 1768             STVFU(pmwt[indx1], tempv);
 1769             STVFU(rbint[indx1],
 1770                   zd5v * (LC2VFU(cfa[indx]) + vintpf(tempv, LVFU(rbp[indx1]), LVFU(rbm[indx1]))));
 1771           }
 1772 
 1773 #else
 1774 
 1775           for(int cc = 10 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 10;
 1776               cc += 2, indx += 2, indx1++)
 1777           {
 1778 
 1779             // first ask if one gets more directional discrimination from nearby B/R sites
 1780             float pmwtalt = xdivf(pmwt[(indx - m1) >> 1] + pmwt[(indx + p1) >> 1] + pmwt[(indx - p1) >> 1]
 1781                                       + pmwt[(indx + m1) >> 1],
 1782                                   2);
 1783 
 1784             if(fabsf(0.5f - pmwt[indx1]) < fabsf(0.5f - pmwtalt))
 1785             {
 1786               pmwt[indx1] = pmwtalt; // a better result was obtained from the neighbours
 1787             }
 1788 
 1789             rbint[indx1] = xdiv2f(cfa[indx] + rbm[indx1] * (1.f - pmwt[indx1])
 1790                                   + rbp[indx1] * pmwt[indx1]); // this is R+B, interpolated
 1791           }
 1792 
 1793 #endif
 1794 
 1795         for(int rr = 12; rr < rr1 - 12; rr++)
 1796 #ifdef __SSE2__
 1797           for(int indx = rr * ts + 12 + (FC(rr, 2, filters) & 1), indx1 = indx >> 1;
 1798               indx < rr * ts + cc1 - 12; indx += 8, indx1 += 4)
 1799           {
 1800             vmask copymask = vmaskf_ge(vabsf(zd5v - LVFU(pmwt[indx1])), vabsf(zd5v - LVFU(hvwt[indx1])));
 1801 
 1802             if(_mm_movemask_ps((vfloat)copymask))
 1803             { // if for any of the 4 pixels the condition is true, do the maths for all 4 pixels and mask the
 1804               // unused out at the end
 1805               // now interpolate G vertically/horizontally using R+B values
 1806               // unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts
 1807               // colour ratios for G interpolation
 1808               vfloat rbintv = LVFU(rbint[indx1]);
 1809 
 1810               // interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
 1811               vfloat cruv = vmul2f(LC2VFU(cfa[indx - v1])) / (epsv + rbintv + LVFU(rbint[(indx1 - v1)]));
 1812               vfloat guv = rbintv * cruv;
 1813               vfloat gu2v = LC2VFU(cfa[indx - v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - v1)]));
 1814               guv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), guv, gu2v);
 1815 
 1816               vfloat crdv = vmul2f(LC2VFU(cfa[indx + v1])) / (epsv + rbintv + LVFU(rbint[(indx1 + v1)]));
 1817               vfloat gdv = rbintv * crdv;
 1818               vfloat gd2v = LC2VFU(cfa[indx + v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + v1)]));
 1819               gdv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), gdv, gd2v);
 1820 
 1821               vfloat Gintvv = (LC2VFU(dirwts0[indx - v1]) * gdv + LC2VFU(dirwts0[indx + v1]) * guv)
 1822                               / (LC2VFU(dirwts0[indx + v1]) + LC2VFU(dirwts0[indx - v1]));
 1823               vfloat Gint1v = ULIMV(Gintvv, LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1]));
 1824               vfloat vwtv = vmul2f(rbintv - Gintvv) / (epsv + Gintvv + rbintv);
 1825               vfloat Gint2v = vintpf(vwtv, Gintvv, Gint1v);
 1826               Gint1v = vself(vmaskf_lt(vmul2f(Gintvv), rbintv), Gint1v, Gint2v);
 1827               Gintvv = vself(vmaskf_lt(Gintvv, rbintv), Gint1v, Gintvv);
 1828               Gintvv = vself(vmaskf_gt(Gintvv, clip_ptv),
 1829                              ULIMV(Gintvv, LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1])), Gintvv);
 1830 
 1831               vfloat crlv = vmul2f(LC2VFU(cfa[indx - 1])) / (epsv + rbintv + LVFU(rbint[(indx1 - 1)]));
 1832               vfloat glv = rbintv * crlv;
 1833               vfloat gl2v = LC2VFU(cfa[indx - 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - 1)]));
 1834               glv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), glv, gl2v);
 1835 
 1836               vfloat crrv = vmul2f(LC2VFU(cfa[indx + 1])) / (epsv + rbintv + LVFU(rbint[(indx1 + 1)]));
 1837               vfloat grv = rbintv * crrv;
 1838               vfloat gr2v = LC2VFU(cfa[indx + 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + 1)]));
 1839               grv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), grv, gr2v);
 1840 
 1841               vfloat Ginthv = (LC2VFU(dirwts1[indx - 1]) * grv + LC2VFU(dirwts1[indx + 1]) * glv)
 1842                               / (LC2VFU(dirwts1[indx - 1]) + LC2VFU(dirwts1[indx + 1]));
 1843               vfloat Gint1h = ULIMV(Ginthv, LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1]));
 1844               vfloat hwtv = vmul2f(rbintv - Ginthv) / (epsv + Ginthv + rbintv);
 1845               vfloat Gint2h = vintpf(hwtv, Ginthv, Gint1h);
 1846               Gint1h = vself(vmaskf_lt(vmul2f(Ginthv), rbintv), Gint1h, Gint2h);
 1847               Ginthv = vself(vmaskf_lt(Ginthv, rbintv), Gint1h, Ginthv);
 1848               Ginthv = vself(vmaskf_gt(Ginthv, clip_ptv),
 1849                              ULIMV(Ginthv, LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1])), Ginthv);
 1850 
 1851               vfloat greenv
 1852                   = vself(copymask, vintpf(LVFU(hvwt[indx1]), Gintvv, Ginthv), LC2VFU(rgbgreen[indx]));
 1853               STC2VFU(rgbgreen[indx], greenv);
 1854 
 1855               STVFU(Dgrb[0][indx1], vself(copymask, greenv - LC2VFU(cfa[indx]), LVFU(Dgrb[0][indx1])));
 1856             }
 1857           }
 1858 
 1859 #else
 1860 
 1861           for(int cc = 12 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 12;
 1862               cc += 2, indx += 2, indx1++)
 1863           {
 1864 
 1865             if(fabsf(0.5f - pmwt[indx >> 1]) < fabsf(0.5f - hvwt[indx >> 1]))
 1866             {
 1867               continue;
 1868             }
 1869 
 1870             // now interpolate G vertically/horizontally using R+B values
 1871             // unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts
 1872 
 1873             // colour ratios for G interpolation
 1874             float cru = cfa[indx - v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - v1)]);
 1875             float crd = cfa[indx + v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + v1)]);
 1876             float crl = cfa[indx - 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - 1)]);
 1877             float crr = cfa[indx + 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + 1)]);
 1878 
 1879             // interpolation of G in four directions
 1880             float gu, gd, gl, gr;
 1881 
 1882             // interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
 1883             if(fabsf(1.f - cru) < arthresh)
 1884             {
 1885               gu = rbint[indx1] * cru;
 1886             }
 1887             else
 1888             {
 1889               gu = cfa[indx - v1] + xdiv2f(rbint[indx1] - rbint[(indx1 - v1)]);
 1890             }
 1891 
 1892             if(fabsf(1.f - crd) < arthresh)
 1893             {
 1894               gd = rbint[indx1] * crd;
 1895             }
 1896             else
 1897             {
 1898               gd = cfa[indx + v1] + xdiv2f(rbint[indx1] - rbint[(indx1 + v1)]);
 1899             }
 1900 
 1901             if(fabsf(1.f - crl) < arthresh)
 1902             {
 1903               gl = rbint[indx1] * crl;
 1904             }
 1905             else
 1906             {
 1907               gl = cfa[indx - 1] + xdiv2f(rbint[indx1] - rbint[(indx1 - 1)]);
 1908             }
 1909 
 1910             if(fabsf(1.f - crr) < arthresh)
 1911             {
 1912               gr = rbint[indx1] * crr;
 1913             }
 1914             else
 1915             {
 1916               gr = cfa[indx + 1] + xdiv2f(rbint[indx1] - rbint[(indx1 + 1)]);
 1917             }
 1918 
 1919             // interpolated G via adaptive weights of cardinal evaluations
 1920             float Gintv = (dirwts0[indx - v1] * gd + dirwts0[indx + v1] * gu)
 1921                           / (dirwts0[indx + v1] + dirwts0[indx - v1]);
 1922             float Ginth
 1923                 = (dirwts1[indx - 1] * gr + dirwts1[indx + 1] * gl) / (dirwts1[indx - 1] + dirwts1[indx + 1]);
 1924 
 1925             // bound the interpolation in regions of high saturation
 1926             if(Gintv < rbint[indx1])
 1927             {
 1928               if(2 * Gintv < rbint[indx1])
 1929               {
 1930                 Gintv = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]);
 1931               }
 1932               else
 1933               {
 1934                 float vwt = 2.0 * (rbint[indx1] - Gintv) / (eps + Gintv + rbint[indx1]);
 1935                 Gintv = vwt * Gintv + (1.f - vwt) * ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]);
 1936               }
 1937             }
 1938 
 1939             if(Ginth < rbint[indx1])
 1940             {
 1941               if(2 * Ginth < rbint[indx1])
 1942               {
 1943                 Ginth = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]);
 1944               }
 1945               else
 1946               {
 1947                 float hwt = 2.0 * (rbint[indx1] - Ginth) / (eps + Ginth + rbint[indx1]);
 1948                 Ginth = hwt * Ginth + (1.f - hwt) * ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]);
 1949               }
 1950             }
 1951 
 1952             if(Ginth > clip_pt)
 1953             {
 1954               Ginth = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]);
 1955             }
 1956 
 1957             if(Gintv > clip_pt)
 1958             {
 1959               Gintv = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]);
 1960             }
 1961 
 1962             rgbgreen[indx] = Ginth * (1.f - hvwt[indx1]) + Gintv * hvwt[indx1];
 1963             Dgrb[0][indx >> 1] = rgbgreen[indx] - cfa[indx];
 1964           }
 1965 
 1966 #endif
 1967 
 1968         // end of diagonal interpolation correction
 1969 
 1970         // fancy chrominance interpolation
 1971         //(ey,ex) is location of R site
 1972         for(int rr = 13 - ey; rr < rr1 - 12; rr += 2)
 1973           for(int indx1 = (rr * ts + 13 - ex) >> 1; indx1<(rr * ts + cc1 - 12)>> 1; indx1++)
 1974           {                                  // B coset
 1975             Dgrb[1][indx1] = Dgrb[0][indx1]; // split out G-B from G-R
 1976             Dgrb[0][indx1] = 0;
 1977           }
 1978 
 1979 #ifdef __SSE2__
 1980         vfloat oned325v = F2V(1.325f);
 1981         vfloat zd175v = F2V(0.175f);
 1982         vfloat zd075v = F2V(0.075f);
 1983 #endif
 1984 
 1985         for(int rr = 14; rr < rr1 - 14; rr++)
 1986 #ifdef __SSE2__
 1987           for(int cc = 14 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, c = 1 - FC(rr, cc, filters) / 2;
 1988               cc < cc1 - 14; cc += 8, indx += 8)
 1989           {
 1990             vfloat tempv = epsv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m1) >> 1]));
 1991             vfloat temp2v = epsv + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p1) >> 1]));
 1992             vfloat wtnwv
 1993                 = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1]))
 1994                           + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])));
 1995             vfloat wtnev
 1996                 = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1]))
 1997                           + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1])));
 1998             vfloat wtswv
 1999                 = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1]))
 2000                           + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1])));
 2001             vfloat wtsev
 2002                 = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1]))
 2003                           + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1])));
 2004 
 2005             STVFU(Dgrb[c][indx >> 1], (wtnwv * (oned325v * LVFU(Dgrb[c][(indx - m1) >> 1])
 2006                                                 - zd175v * LVFU(Dgrb[c][(indx - m3) >> 1])
 2007                                                 - zd075v * (LVFU(Dgrb[c][(indx - m1 - 2) >> 1])
 2008                                                             + LVFU(Dgrb[c][(indx - m1 - v2) >> 1])))
 2009                                        + wtnev * (oned325v * LVFU(Dgrb[c][(indx + p1) >> 1])
 2010                                                   - zd175v * LVFU(Dgrb[c][(indx + p3) >> 1])
 2011                                                   - zd075v * (LVFU(Dgrb[c][(indx + p1 + 2) >> 1])
 2012                                                               + LVFU(Dgrb[c][(indx + p1 + v2) >> 1])))
 2013                                        + wtswv * (oned325v * LVFU(Dgrb[c][(indx - p1) >> 1])
 2014                                                   - zd175v * LVFU(Dgrb[c][(indx - p3) >> 1])
 2015                                                   - zd075v * (LVFU(Dgrb[c][(indx - p1 - 2) >> 1])
 2016                                                               + LVFU(Dgrb[c][(indx - p1 - v2) >> 1])))
 2017                                        + wtsev * (oned325v * LVFU(Dgrb[c][(indx + m1) >> 1])
 2018                                                   - zd175v * LVFU(Dgrb[c][(indx + m3) >> 1])
 2019                                                   - zd075v * (LVFU(Dgrb[c][(indx + m1 + 2) >> 1])
 2020                                                               + LVFU(Dgrb[c][(indx + m1 + v2) >> 1]))))
 2021                                           / (wtnwv + wtnev + wtswv + wtsev));
 2022           }
 2023 
 2024 #else
 2025 
 2026           for(int cc = 14 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, c = 1 - FC(rr, cc, filters) / 2;
 2027               cc < cc1 - 14; cc += 2, indx += 2)
 2028           {
 2029             float wtnw = 1.f / (eps + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m1) >> 1])
 2030                                 + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx - m3) >> 1])
 2031                                 + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m3) >> 1]));
 2032             float wtne = 1.f / (eps + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p1) >> 1])
 2033                                 + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx + p3) >> 1])
 2034                                 + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p3) >> 1]));
 2035             float wtsw = 1.f / (eps + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p1) >> 1])
 2036                                 + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + m3) >> 1])
 2037                                 + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p3) >> 1]));
 2038             float wtse = 1.f / (eps + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m1) >> 1])
 2039                                 + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - p3) >> 1])
 2040                                 + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m3) >> 1]));
 2041 
 2042             Dgrb[c][indx >> 1]
 2043                 = (wtnw * (1.325f * Dgrb[c][(indx - m1) >> 1] - 0.175f * Dgrb[c][(indx - m3) >> 1]
 2044                            - 0.075f * Dgrb[c][(indx - m1 - 2) >> 1] - 0.075f * Dgrb[c][(indx - m1 - v2) >> 1])
 2045                    + wtne * (1.325f * Dgrb[c][(indx + p1) >> 1] - 0.175f * Dgrb[c][(indx + p3) >> 1]
 2046                              - 0.075f * Dgrb[c][(indx + p1 + 2) >> 1]
 2047                              - 0.075f * Dgrb[c][(indx + p1 + v2) >> 1])
 2048                    + wtsw * (1.325f * Dgrb[c][(indx - p1) >> 1] - 0.175f * Dgrb[c][(indx - p3) >> 1]
 2049                              - 0.075f * Dgrb[c][(indx - p1 - 2) >> 1]
 2050                              - 0.075f * Dgrb[c][(indx - p1 - v2) >> 1])
 2051                    + wtse * (1.325f * Dgrb[c][(indx + m1) >> 1] - 0.175f * Dgrb[c][(indx + m3) >> 1]
 2052                              - 0.075f * Dgrb[c][(indx + m1 + 2) >> 1]
 2053                              - 0.075f * Dgrb[c][(indx + m1 + v2) >> 1]))
 2054                   / (wtnw + wtne + wtsw + wtse);
 2055           }
 2056 
 2057 #endif
 2058 
 2059 #ifdef __SSE2__
 2060         int offset;
 2061         vfloat twov = F2V(2.f);
 2062         vmask selmask;
 2063 
 2064         if((FC(16, 2, filters) & 1) == 1)
 2065         {
 2066           selmask = _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0);
 2067           offset = 1;
 2068         }
 2069         else
 2070         {
 2071           selmask = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
 2072           offset = 0;
 2073         }
 2074 
 2075 #endif
 2076 
 2077         for(int rr = 16; rr < rr1 - 16; rr++)
 2078         {
 2079           int row = rr + top;
 2080           int col = left + 16;
 2081           int indx = rr * ts + 16;
 2082 #ifdef __SSE2__
 2083           offset = 1 - offset;
 2084           selmask = vnotm(selmask);
 2085 
 2086           for(; indx < rr * ts + cc1 - 18 - (cc1 & 1); indx += 4, col += 4)
 2087           {
 2088             if(col < roi_out->width && row < roi_out->height)
 2089             {
 2090               vfloat greenv = LVF(rgbgreen[indx]);
 2091               vfloat temp00v = vdup(LVFU(hvwt[(indx - v1) >> 1]));
 2092               vfloat temp01v = vdup(LVFU(hvwt[(indx + v1) >> 1]));
 2093               vfloat tempv = onev / (temp00v + twov - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1]))
 2094                                      - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1])) + temp01v);
 2095 
 2096               vfloat redv1 = greenv
 2097                              - (temp00v * vdup(LVFU(Dgrb[0][(indx - v1) >> 1]))
 2098                                 + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1])))
 2099                                       * vdup(LVFU(Dgrb[0][(indx + 1 + offset) >> 1]))
 2100                                 + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1])))
 2101                                       * vdup(LVFU(Dgrb[0][(indx - 1 + offset) >> 1]))
 2102                                 + temp01v * vdup(LVFU(Dgrb[0][(indx + v1) >> 1])))
 2103                                    * tempv;
 2104               vfloat bluev1 = greenv
 2105                               - (temp00v * vdup(LVFU(Dgrb[1][(indx - v1) >> 1]))
 2106                                  + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1])))
 2107                                        * vdup(LVFU(Dgrb[1][(indx + 1 + offset) >> 1]))
 2108                                  + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1])))
 2109                                        * vdup(LVFU(Dgrb[1][(indx - 1 + offset) >> 1]))
 2110                                  + temp01v * vdup(LVFU(Dgrb[1][(indx + v1) >> 1])))
 2111                                     * tempv;
 2112               vfloat redv2 = greenv - vdup(LVFU(Dgrb[0][indx >> 1]));
 2113               vfloat bluev2 = greenv - vdup(LVFU(Dgrb[1][indx >> 1]));
 2114               __attribute__((aligned(64))) float _r[4];
 2115               __attribute__((aligned(64))) float _b[4];
 2116               STVF(*_r, vself(selmask, redv1, redv2));
 2117               STVF(*_b, vself(selmask, bluev1, bluev2));
 2118               for(int c = 0; c < 4; c++)
 2119               {
 2120                 out[(row * roi_out->width + col + c) * 4] = clampnan(_r[c], 0.0, 1.0);
 2121                 out[(row * roi_out->width + col + c) * 4 + 2] = clampnan(_b[c], 0.0, 1.0);
 2122               }
 2123             }
 2124           }
 2125 
 2126           if(offset == 0)
 2127           {
 2128             for(; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++)
 2129             {
 2130               if(col < roi_out->width && row < roi_out->height)
 2131               {
 2132                 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
 2133                                     - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
 2134                 out[(row * roi_out->width + col) * 4]
 2135                     = clampnan(rgbgreen[indx]
 2136                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
 2137                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
 2138                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
 2139                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
 2140                                          * temp,
 2141                                0.0, 1.0);
 2142                 out[(row * roi_out->width + col) * 4 + 2]
 2143                     = clampnan(rgbgreen[indx]
 2144                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
 2145                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
 2146                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
 2147                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
 2148                                          * temp,
 2149                                0.0, 1.0);
 2150               }
 2151 
 2152               indx++;
 2153               col++;
 2154               if(col < roi_out->width && row < roi_out->height)
 2155               {
 2156                 out[(row * roi_out->width + col) * 4]
 2157                     = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0, 1.0);
 2158                 out[(row * roi_out->width + col) * 4 + 2]
 2159                     = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0, 1.0);
 2160               }
 2161             }
 2162 
 2163             if(cc1 & 1)
 2164             { // width of tile is odd
 2165               if(col < roi_out->width && row < roi_out->height)
 2166               {
 2167                 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
 2168                                     - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
 2169                 out[(row * roi_out->width + col) * 4]
 2170                     = clampnan(rgbgreen[indx]
 2171                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
 2172                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
 2173                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
 2174                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
 2175                                          * temp,
 2176                                0.0, 1.0);
 2177                 out[(row * roi_out->width + col) * 4 + 2]
 2178                     = clampnan(rgbgreen[indx]
 2179                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
 2180                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
 2181                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
 2182                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
 2183                                          * temp,
 2184                                0.0, 1.0);
 2185               }
 2186             }
 2187           }
 2188           else
 2189           {
 2190             for(; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++)
 2191             {
 2192               if(col < roi_out->width && row < roi_out->height)
 2193               {
 2194                 out[(row * roi_out->width + col) * 4]
 2195                     = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0, 1.0);
 2196                 out[(row * roi_out->width + col) * 4 + 2]
 2197                     = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0, 1.0);
 2198               }
 2199 
 2200               indx++;
 2201               col++;
 2202               if(col < roi_out->width && row < roi_out->height)
 2203               {
 2204                 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
 2205                                     - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
 2206                 out[(row * roi_out->width + col) * 4]
 2207                     = clampnan(rgbgreen[indx]
 2208                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
 2209                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
 2210                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
 2211                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
 2212                                          * temp,
 2213                                0.0, 1.0);
 2214                 out[(row * roi_out->width + col) * 4 + 2]
 2215                     = clampnan(rgbgreen[indx]
 2216                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
 2217                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
 2218                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
 2219                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
 2220                                          * temp,
 2221                                0.0, 1.0);
 2222               }
 2223             }
 2224 
 2225             if(cc1 & 1)
 2226             { // width of tile is odd
 2227               if(col < roi_out->width && row < roi_out->height)
 2228               {
 2229                 out[(row * roi_out->width + col) * 4]
 2230                     = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0, 1.0);
 2231                 out[(row * roi_out->width + col) * 4 + 2]
 2232                     = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0, 1.0);
 2233               }
 2234             }
 2235           }
 2236 
 2237 #else
 2238 
 2239           if((FC(rr, 2, filters) & 1) == 1)
 2240           {
 2241             for(; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++)
 2242             {
 2243               if(col < roi_out->width && row < roi_out->height)
 2244               {
 2245                 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
 2246                                     - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
 2247                 out[(row * roi_out->width + col) * 4]
 2248                     = clampnan(rgbgreen[indx]
 2249                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
 2250                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
 2251                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
 2252                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
 2253                                          * temp,
 2254                                0.0, 1.0);
 2255                 out[(row * roi_out->width + col) * 4 + 2]
 2256                     = clampnan(rgbgreen[indx]
 2257                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
 2258                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
 2259                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
 2260                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
 2261                                          * temp,
 2262                                0.0, 1.0);
 2263               }
 2264 
 2265               indx++;
 2266               col++;
 2267               if(col < roi_out->width && row < roi_out->height)
 2268               {
 2269                 out[(row * roi_out->width + col) * 4]
 2270                     = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0, 1.0);
 2271                 out[(row * roi_out->width + col) * 4 + 2]
 2272                     = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0, 1.0);
 2273               }
 2274             }
 2275 
 2276             if(cc1 & 1)
 2277             { // width of tile is odd
 2278               if(col < roi_out->width && row < roi_out->height)
 2279               {
 2280                 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
 2281                                     - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
 2282                 out[(row * roi_out->width + col) * 4]
 2283                     = clampnan(rgbgreen[indx]
 2284                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
 2285                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
 2286                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
 2287                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
 2288                                          * temp,
 2289                                0.0, 1.0);
 2290                 out[(row * roi_out->width + col) * 4 + 2]
 2291                     = clampnan(rgbgreen[indx]
 2292                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
 2293                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
 2294                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
 2295                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
 2296                                          * temp,
 2297                                0.0, 1.0);
 2298               }
 2299             }
 2300           }
 2301           else
 2302           {
 2303             for(; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++)
 2304             {
 2305               if(col < roi_out->width && row < roi_out->height)
 2306               {
 2307                 out[(row * roi_out->width + col) * 4]
 2308                     = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0f, 1.0f);
 2309                 out[(row * roi_out->width + col) * 4 + 2]
 2310                     = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0f, 1.0f);
 2311               }
 2312 
 2313               indx++;
 2314               col++;
 2315               if(col < roi_out->width && row < roi_out->height)
 2316               {
 2317                 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
 2318                                     - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
 2319 
 2320                 out[(row * roi_out->width + col) * 4]
 2321                     = clampnan(rgbgreen[indx]
 2322                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
 2323                                       + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
 2324                                       + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
 2325                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
 2326                                          * temp,
 2327                                0.0f, 1.0f);
 2328 
 2329                 out[(row * roi_out->width + col) * 4 + 2]
 2330                     = clampnan(rgbgreen[indx]
 2331                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
 2332                                       + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
 2333                                       + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
 2334                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
 2335                                          * temp,
 2336                                0.0f, 1.0f);
 2337               }
 2338             }
 2339 
 2340             if(cc1 & 1)
 2341             { // width of tile is odd
 2342               if(col < roi_out->width && row < roi_out->height)
 2343               {
 2344                 out[(row * roi_out->width + col) * 4]
 2345                     = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0f, 1.0f);
 2346                 out[(row * roi_out->width + col) * 4 + 2]
 2347                     = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0f, 1.0f);
 2348               }
 2349             }
 2350           }
 2351 
 2352 #endif
 2353         }
 2354 
 2355         // copy smoothed results back to image matrix
 2356         for(int rr = 16; rr < rr1 - 16; rr++)
 2357         {
 2358           int row = rr + top;
 2359           int cc = 16;
 2360           // TODO (darktable): we have the pixel colors interleaved so writing them in blocks using SSE2 is
 2361           // not possible. or is it?
 2362           // #ifdef __SSE2__
 2363           //
 2364           //           for(; cc < cc1 - 19; cc += 4)
 2365           //           {
 2366           //             STVFU(out[(row * roi_out->width + (cc + left))  * 4 + 1], LVF(rgbgreen[rr * ts +
 2367           //             cc]));
 2368           //           }
 2369           //
 2370           // #endif
 2371 
 2372           for(; cc < cc1 - 16; cc++)
 2373           {
 2374             int col = cc + left;
 2375             int indx = rr * ts + cc;
 2376             if(col < roi_out->width && row < roi_out->height)
 2377               out[(row * roi_out->width + col) * 4 + 1] = clampnan(rgbgreen[indx], 0.0f, 1.0f);
 2378           }
 2379         }
 2380       }
 2381     } // end of main loop
 2382 
 2383     // clean up
 2384     free(buffer);
 2385   }
 2386 }
 2387 
 2388 /*==================================================================================
 2389  * end of raw therapee code
 2390  *==================================================================================*/
 2391 
 2392 // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.sh
 2393 // vim: shiftwidth=2 expandtab tabstop=2 cindent
 2394 // kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;