"Fossies" - the Fresh Open Source Software Archive

Member "darktable-2.6.3/src/iop/amaze_demosaic_RT.cc" (20 Oct 2019, 105199 Bytes) of package /linux/misc/darktable-2.6.3.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "amaze_demosaic_RT.cc" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 2.6.3_vs_3.0.0.rc0.

    1 /*
    2     This file is part of darktable,
    3     copyright (c) 2009--2011 johannes hanika.
    4 
    5     darktable is free software: you can redistribute it and/or modify
    6     it under the terms of the GNU General Public License as published by
    7     the Free Software Foundation, either version 3 of the License, or
    8     (at your option) any later version.
    9 
   10     darktable is distributed in the hope that it will be useful,
   11     but WITHOUT ANY WARRANTY; without even the implied warranty of
   12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   13     GNU General Public License for more details.
   14 
   15     You should have received a copy of the GNU General Public License
   16     along with darktable.  If not, see <http://www.gnu.org/licenses/>.
   17 */
   18 
   19 #define __STDC_FORMAT_MACROS
   20 
   21 #if defined(__SSE__)
   22 #include <xmmintrin.h>
   23 #endif
   24 
   25 extern "C" {
   26 #include "develop/imageop.h"
   27 #include "develop/imageop_math.h"
   28 
   29 // otherwise the name will be mangled and the linker won't be able to see the function ...
   30 void amaze_demosaic_RT(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const float *const in,
   31                        float *out, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
   32                        const int filters);
   33 }
   34 
   35 #include <algorithm>
   36 #include <cmath>
   37 #include <cstdint>
   38 #include <cstdlib>
   39 #include <cstring>
   40 
   41 static __inline float clampnan(const float x, const float m, const float M)
   42 {
   43   float r;
   44 
   45   // clamp to [m, M] if x is infinite; return average of m and M if x is NaN; else just return x
   46 
   47   if(std::isinf(x))
   48     r = (std::isless(x, m) ? m : (std::isgreater(x, M) ? M : x));
   49   else if(std::isnan(x))
   50     r = (m + M) / 2.0f;
   51   else // normal number
   52     r = x;
   53 
   54   return r;
   55 }
   56 
   57 #ifndef __SSE2__
   58 static __inline float xmul2f(float d)
   59 {
   60   if(*(int *)&d & 0x7FFFFFFF) // if f==0 do nothing
   61   {
   62     *(int *)&d += 1 << 23; // add 1 to the exponent
   63   }
   64   return d;
   65 }
   66 #endif
   67 
   68 static __inline float xdiv2f(float d)
   69 {
   70   if(*(int *)&d & 0x7FFFFFFF) // if f==0 do nothing
   71   {
   72     *(int *)&d -= 1 << 23; // sub 1 from the exponent
   73   }
   74   return d;
   75 }
   76 
   77 static __inline float xdivf(float d, int n)
   78 {
   79   if(*(int *)&d & 0x7FFFFFFF) // if f==0 do nothing
   80   {
   81     *(int *)&d -= n << 23; // add n to the exponent
   82   }
   83   return d;
   84 }
   85 
   86 
   87 /*==================================================================================
   88  * begin raw therapee code, hg checkout of march 03, 2016 branch master.
   89  *==================================================================================*/
   90 
   91 
   92 #ifdef __SSE2__
   93 
   94 #ifdef __GNUC__
   95 #define INLINE __inline
   96 #else
   97 #define INLINE inline
   98 #endif
   99 
  100 #ifdef __GNUC__
  101 #if((__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4) && (!defined(WIN32) || defined(__x86_64__))
  102 #define LVF(x) _mm_load_ps(&x)
  103 #define LVFU(x) _mm_loadu_ps(&x)
  104 #define STVF(x, y) _mm_store_ps(&x, y)
  105 #define STVFU(x, y) _mm_storeu_ps(&x, y)
  106 #else // there is a bug in gcc 4.7.x when using openmp and aligned memory and -O3, also need to map the
  107       // aligned functions to unaligned functions for WIN32 builds
  108 #define LVF(x) _mm_loadu_ps(&x)
  109 #define LVFU(x) _mm_loadu_ps(&x)
  110 #define STVF(x, y) _mm_storeu_ps(&x, y)
  111 #define STVFU(x, y) _mm_storeu_ps(&x, y)
  112 #endif
  113 #else
  114 #define LVF(x) _mm_load_ps(&x)
  115 #define LVFU(x) _mm_loadu_ps(&x)
  116 #define STVF(x, y) _mm_store_ps(&x, y)
  117 #define STVFU(x, y) _mm_storeu_ps(&x, y)
  118 #endif
  119 
  120 #define STC2VFU(a, v)                                                                                        \
  121   {                                                                                                          \
  122     __m128 TST1V = _mm_loadu_ps(&a);                                                                         \
  123     __m128 TST2V = _mm_unpacklo_ps(v, v);                                                                    \
  124     vmask cmask = _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0);                                               \
  125     _mm_storeu_ps(&a, vself(cmask, TST1V, TST2V));                                                           \
  126     TST1V = _mm_loadu_ps((&a) + 4);                                                                          \
  127     TST2V = _mm_unpackhi_ps(v, v);                                                                           \
  128     _mm_storeu_ps((&a) + 4, vself(cmask, TST1V, TST2V));                                                     \
  129   }
  130 
  131 #define ZEROV _mm_setzero_ps()
  132 #define F2V(a) _mm_set1_ps((a))
  133 
  134 typedef __m128i vmask;
  135 typedef __m128 vfloat;
  136 typedef __m128i vint;
  137 
  138 static INLINE vfloat LC2VFU(float &a)
  139 {
  140   // Load 8 floats from a and combine a[0],a[2],a[4] and a[6] into a vector of 4 floats
  141   vfloat a1 = _mm_loadu_ps(&a);
  142   vfloat a2 = _mm_loadu_ps((&a) + 4);
  143   return _mm_shuffle_ps(a1, a2, _MM_SHUFFLE(2, 0, 2, 0));
  144 }
  145 static INLINE vfloat vmaxf(vfloat x, vfloat y)
  146 {
  147   return _mm_max_ps(x, y);
  148 }
  149 static INLINE vfloat vminf(vfloat x, vfloat y)
  150 {
  151   return _mm_min_ps(x, y);
  152 }
  153 static INLINE vfloat vcast_vf_f(float f)
  154 {
  155   return _mm_set_ps(f, f, f, f);
  156 }
  157 static INLINE vmask vorm(vmask x, vmask y)
  158 {
  159   return _mm_or_si128(x, y);
  160 }
  161 static INLINE vmask vandm(vmask x, vmask y)
  162 {
  163   return _mm_and_si128(x, y);
  164 }
  165 static INLINE vmask vandnotm(vmask x, vmask y)
  166 {
  167   return _mm_andnot_si128(x, y);
  168 }
  169 static INLINE vfloat vabsf(vfloat f)
  170 {
  171   return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f);
  172 }
  173 static INLINE vfloat vself(vmask mask, vfloat x, vfloat y)
  174 {
  175   return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
  176 }
  177 static INLINE vmask vmaskf_lt(vfloat x, vfloat y)
  178 {
  179   return (__m128i)_mm_cmplt_ps(x, y);
  180 }
  181 static INLINE vmask vmaskf_gt(vfloat x, vfloat y)
  182 {
  183   return (__m128i)_mm_cmpgt_ps(x, y);
  184 }
  185 static INLINE vfloat ULIMV(vfloat a, vfloat b, vfloat c)
  186 {
  187   // made to clamp a in range [b,c] but in fact it's also the median of a,b,c, which means that the result is
  188   // independent on order of arguments
  189   // ULIMV(a,b,c) = ULIMV(a,c,b) = ULIMV(b,a,c) = ULIMV(b,c,a) = ULIMV(c,a,b) = ULIMV(c,b,a)
  190   return vmaxf(vminf(a, b), vminf(vmaxf(a, b), c));
  191 }
  192 static INLINE vfloat SQRV(vfloat a)
  193 {
  194   return a * a;
  195 }
  196 static INLINE vfloat vintpf(vfloat a, vfloat b, vfloat c)
  197 {
  198   // calculate a * b + (1 - a) * c (interpolate two values)
  199   // following is valid:
  200   // vintpf(a, b+x, c+x) = vintpf(a, b, c) + x
  201   // vintpf(a, b*x, c*x) = vintpf(a, b, c) * x
  202   return a * (b - c) + c;
  203 }
  204 static INLINE vfloat vaddc2vfu(float &a)
  205 {
  206   // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
  207   vfloat a1 = _mm_loadu_ps(&a);
  208   vfloat a2 = _mm_loadu_ps((&a) + 4);
  209   return _mm_shuffle_ps(a1, a2, _MM_SHUFFLE(2, 0, 2, 0)) + _mm_shuffle_ps(a1, a2, _MM_SHUFFLE(3, 1, 3, 1));
  210 }
  211 static INLINE vfloat vadivapb(vfloat a, vfloat b)
  212 {
  213   return a / (a + b);
  214 }
  215 static INLINE vint vselc(vmask mask, vint x, vint y)
  216 {
  217   return vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
  218 }
  219 static INLINE vint vselinotzero(vmask mask, vint x)
  220 {
  221   // returns value of x if corresponding mask bits are 0, else returns 0
  222   // faster than vselc(mask, ZEROV, x)
  223   return _mm_andnot_si128(mask, x);
  224 }
  225 static INLINE vfloat vmul2f(vfloat a)
  226 {
  227   // fastest way to multiply by 2
  228   return a + a;
  229 }
  230 static INLINE vmask vmaskf_ge(vfloat x, vfloat y)
  231 {
  232   return (__m128i)_mm_cmpge_ps(x, y);
  233 }
  234 static INLINE vmask vnotm(vmask x)
  235 {
  236   return _mm_xor_si128(x, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
  237 }
  238 static INLINE vfloat vdup(vfloat a)
  239 {
  240   // returns { a[0],a[0],a[1],a[1] }
  241   return _mm_unpacklo_ps(a, a);
  242 }
  243 
  244 #endif // __SSE2__
  245 
  246 template <typename _Tp> static inline const _Tp SQR(_Tp x)
  247 {
  248   //      return std::pow(x,2); Slower than:
  249   return (x * x);
  250 }
  251 
  252 template <typename _Tp> static inline const _Tp intp(const _Tp a, const _Tp b, const _Tp c)
  253 {
  254   // calculate a * b + (1 - a) * c
  255   // following is valid:
  256   // intp(a, b+x, c+x) = intp(a, b, c) + x
  257   // intp(a, b*x, c*x) = intp(a, b, c) * x
  258   return a * (b - c) + c;
  259 }
  260 
  261 template <typename _Tp> static inline const _Tp LIM(const _Tp a, const _Tp b, const _Tp c)
  262 {
  263   return std::max(b, std::min(a, c));
  264 }
  265 
  266 template <typename _Tp> static inline const _Tp ULIM(const _Tp a, const _Tp b, const _Tp c)
  267 {
  268   return ((b < c) ? LIM(a, b, c) : LIM(a, c, b));
  269 }
  270 
  271 
  272 
  273 ////////////////////////////////////////////////////////////////
  274 //
  275 //          AMaZE demosaic algorithm
  276 // (Aliasing Minimization and Zipper Elimination)
  277 //
  278 //  copyright (c) 2008-2010  Emil Martinec <ejmartin@uchicago.edu>
  279 //  optimized for speed by Ingo Weyrich
  280 //
  281 // incorporating ideas of Luis Sanz Rodrigues and Paul Lee
  282 //
  283 // code dated: May 27, 2010
  284 // latest modification: Ingo Weyrich, January 25, 2016
  285 //
  286 //  amaze_interpolate_RT.cc is free software: you can redistribute it and/or modify
  287 //  it under the terms of the GNU General Public License as published by
  288 //  the Free Software Foundation, either version 3 of the License, or
  289 //  (at your option) any later version.
  290 //
  291 //  This program is distributed in the hope that it will be useful,
  292 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
  293 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  294 //  GNU General Public License for more details.
  295 //
  296 //  You should have received a copy of the GNU General Public License
  297 //  along with this program.  If not, see <http://www.gnu.org/licenses/>.
  298 //
  299 ////////////////////////////////////////////////////////////////
  300 
  301 // namespace rtengine
  302 // {
  303 
  304 // SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, int winh)
  305 void amaze_demosaic_RT(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const float *const in,
  306                        float *out, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
  307                        const int filters)
  308 {
  309   //   BENCHFUN
  310 
  311   //   volatile double progress = 0.0;
  312 
  313   //   if(plistener)
  314   //   {
  315   //     plistener->setProgressStr(Glib::ustring::compose(
  316   //         M("TP_RAW_DMETHOD_PROGRESSBAR"),
  317   //         RAWParams::BayerSensor::methodstring[RAWParams::BayerSensor::amaze]));
  318   //     plistener->setProgress(0.0);
  319   //   }
  320 
  321   int winx = roi_out->x;
  322   int winy = roi_out->y;
  323   int winw = roi_in->width;
  324   int winh = roi_in->height;
  325 
  326   const int width = winw, height = winh;
  327   //   const float clip_pt = 1.0 / initialGain;
  328   //   const float clip_pt8 = 0.8 / initialGain;
  329   const float clip_pt = fminf(piece->pipe->dsc.processed_maximum[0],
  330                               fminf(piece->pipe->dsc.processed_maximum[1], piece->pipe->dsc.processed_maximum[2]));
  331   const float clip_pt8 = 0.8f * clip_pt;
  332 
  333 // this allows to pass AMAZETS to the code. On some machines larger AMAZETS is faster
  334 // If AMAZETS is undefined it will be set to 160, which is the fastest on modern x86/64 machines
  335 #ifndef AMAZETS
  336 #define AMAZETS 160
  337 #endif
  338   // Tile size; the image is processed in square tiles to lower memory requirements and facilitate
  339   // multi-threading
  340   // We assure that Tile size is a multiple of 32 in the range [96;992]
  341   constexpr int ts = (AMAZETS & 992) < 96 ? 96 : (AMAZETS & 992);
  342   constexpr int tsh = ts / 2; // half of Tile size
  343 
  344   // offset of R pixel within a Bayer quartet
  345   int ex, ey;
  346 
  347   // determine GRBG coset; (ey,ex) is the offset of the R subarray
  348   if(FC(0, 0, filters) == 1)
  349   { // first pixel is G
  350     if(FC(0, 1, filters) == 0)
  351     {
  352       ey = 0;
  353       ex = 1;
  354     }
  355     else
  356     {
  357       ey = 1;
  358       ex = 0;
  359     }
  360   }
  361   else
  362   { // first pixel is R or B
  363     if(FC(0, 0, filters) == 0)
  364     {
  365       ey = 0;
  366       ex = 0;
  367     }
  368     else
  369     {
  370       ey = 1;
  371       ex = 1;
  372     }
  373   }
  374 
  375   // shifts of pointer value to access pixels in vertical and diagonal directions
  376   constexpr int v1 = ts, v2 = 2 * ts, v3 = 3 * ts, p1 = -ts + 1, p2 = -2 * ts + 2, p3 = -3 * ts + 3,
  377                 m1 = ts + 1, m2 = 2 * ts + 2, m3 = 3 * ts + 3;
  378 
  379   // tolerance to avoid dividing by zero
  380   constexpr float eps = 1e-5, epssq = 1e-10; // tolerance to avoid dividing by zero
  381 
  382   // adaptive ratios threshold
  383   constexpr float arthresh = 0.75;
  384 
  385   // gaussian on 5x5 quincunx, sigma=1.2
  386   constexpr float gaussodd[4]
  387       = { 0.14659727707323927f, 0.103592713382435f, 0.0732036125103057f, 0.0365543548389495f };
  388   // nyquist texture test threshold
  389   constexpr float nyqthresh = 0.5;
  390   // gaussian on 5x5, sigma=1.2, multiplied with nyqthresh to save some time later in loop
  391   // Is this really sigma=1.2????, seems more like sigma = 1.672
  392   constexpr float gaussgrad[6] = { nyqthresh * 0.07384411893421103f, nyqthresh * 0.06207511968171489f,
  393                                    nyqthresh * 0.0521818194747806f,  nyqthresh * 0.03687419286733595f,
  394                                    nyqthresh * 0.03099732204057846f, nyqthresh * 0.018413194161458882f };
  395   // gaussian on 5x5 alt quincunx, sigma=1.5
  396   constexpr float gausseven[2] = { 0.13719494435797422f, 0.05640252782101291f };
  397   // gaussian on quincunx grid
  398   constexpr float gquinc[4] = { 0.169917f, 0.108947f, 0.069855f, 0.0287182f };
  399 
  400   typedef struct
  401   {
  402     float h;
  403     float v;
  404   } s_hv;
  405 
  406 #ifdef _OPENMP
  407 #pragma omp parallel
  408 #endif
  409   {
  410     //     int progresscounter = 0;
  411 
  412     constexpr int cldf = 2; // factor to multiply cache line distance. 1 = 64 bytes, 2 = 128 bytes ...
  413     // assign working space
  414     char *buffer
  415         = (char *)calloc(14 * sizeof(float) * ts * ts + sizeof(char) * ts * tsh + 18 * cldf * 64 + 63, 1);
  416     // aligned to 64 byte boundary
  417     char *data = (char *)((uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
  418 
  419     // green values
  420     float *rgbgreen = (float(*))data;
  421     // sum of square of horizontal gradient and square of vertical gradient
  422     float *delhvsqsum = (float(*))((char *)rgbgreen + sizeof(float) * ts * ts + cldf * 64); // 1
  423     // gradient based directional weights for interpolation
  424     float *dirwts0 = (float(*))((char *)delhvsqsum + sizeof(float) * ts * ts + cldf * 64); // 1
  425     float *dirwts1 = (float(*))((char *)dirwts0 + sizeof(float) * ts * ts + cldf * 64);    // 1
  426     // vertically interpolated colour differences G-R, G-B
  427     float *vcd = (float(*))((char *)dirwts1 + sizeof(float) * ts * ts + cldf * 64); // 1
  428     // horizontally interpolated colour differences
  429     float *hcd = (float(*))((char *)vcd + sizeof(float) * ts * ts + cldf * 64); // 1
  430     // alternative vertical interpolation
  431     float *vcdalt = (float(*))((char *)hcd + sizeof(float) * ts * ts + cldf * 64); // 1
  432     // alternative horizontal interpolation
  433     float *hcdalt = (float(*))((char *)vcdalt + sizeof(float) * ts * ts + cldf * 64); // 1
  434     // square of average colour difference
  435     float *cddiffsq = (float(*))((char *)hcdalt + sizeof(float) * ts * ts + cldf * 64); // 1
  436     // weight to give horizontal vs vertical interpolation
  437     float *hvwt = (float(*))((char *)cddiffsq + sizeof(float) * ts * ts + 2 * cldf * 64); // 1
  438     // final interpolated colour difference
  439     float(*Dgrb)[ts * tsh] = (float(*)[ts * tsh])vcdalt; // there is no overlap in buffer usage => share
  440     // gradient in plus (NE/SW) direction
  441     float *delp = (float(*))cddiffsq; // there is no overlap in buffer usage => share
  442     // gradient in minus (NW/SE) direction
  443     float *delm = (float(*))((char *)delp + sizeof(float) * ts * tsh + cldf * 64);
  444     // diagonal interpolation of R+B
  445     float *rbint = (float(*))delm; // there is no overlap in buffer usage => share
  446     // horizontal and vertical curvature of interpolated G (used to refine interpolation in Nyquist texture
  447     // regions)
  448     s_hv *Dgrb2 = (s_hv(*))((char *)hvwt + sizeof(float) * ts * tsh + cldf * 64); // 1
  449     // difference between up/down interpolations of G
  450     float *dgintv = (float(*))Dgrb2; // there is no overlap in buffer usage => share
  451     // difference between left/right interpolations of G
  452     float *dginth = (float(*))((char *)dgintv + sizeof(float) * ts * ts + cldf * 64); // 1
  453     // square of diagonal colour differences
  454     float *Dgrbsq1m = (float(*))((char *)dginth + sizeof(float) * ts * ts + cldf * 64);    // 1
  455     float *Dgrbsq1p = (float(*))((char *)Dgrbsq1m + sizeof(float) * ts * tsh + cldf * 64); // 1
  456     // tile raw data
  457     float *cfa = (float(*))((char *)Dgrbsq1p + sizeof(float) * ts * tsh + cldf * 64); // 1
  458     // relative weight for combining plus and minus diagonal interpolations
  459     float *pmwt = (float(*))delhvsqsum; // there is no overlap in buffer usage => share
  460     // interpolated colour difference R-B in minus and plus direction
  461     float *rbm = (float(*))vcd; // there is no overlap in buffer usage => share
  462     float *rbp = (float(*))((char *)rbm + sizeof(float) * ts * tsh + cldf * 64);
  463     // nyquist texture flags 1=nyquist, 0=not nyquist
  464     unsigned char *nyquist = (unsigned char(*))((char *)cfa + sizeof(float) * ts * ts + cldf * 64); // 1
  465     unsigned char *nyquist2 = (unsigned char(*))cddiffsq;
  466     float *nyqutest = (float(*))((char *)nyquist + sizeof(unsigned char) * ts * tsh + cldf * 64); // 1
  467 
  468 // Main algorithm: Tile loop
  469 // use collapse(2) to collapse the 2 loops to one large loop, so there is better scaling
  470 #ifdef _OPENMP
  471 #pragma omp for schedule(dynamic) collapse(2) nowait
  472 #endif
  473 
  474     for(int top = winy - 16; top < winy + height; top += ts - 32)
  475     {
  476       for(int left = winx - 16; left < winx + width; left += ts - 32)
  477       {
  478         memset(&nyquist[3 * tsh], 0, sizeof(unsigned char) * (ts - 6) * tsh);
  479         // location of tile bottom edge
  480         int bottom = MIN(top + ts, winy + height + 16);
  481         // location of tile right edge
  482         int right = MIN(left + ts, winx + width + 16);
  483         // tile width  (=ts except for right edge of image)
  484         int rr1 = bottom - top;
  485         // tile height (=ts except for bottom edge of image)
  486         int cc1 = right - left;
  487         // bookkeeping for borders
  488         // min and max row/column in the tile
  489         int rrmin = top < winy ? 16 : 0;
  490         int ccmin = left < winx ? 16 : 0;
  491         int rrmax = bottom > (winy + height) ? winy + height - top : rr1;
  492         int ccmax = right > (winx + width) ? winx + width - left : cc1;
  493 
  494 // rgb from input CFA data
  495 // rgb values should be floating point number between 0 and 1
  496 // after white balance multipliers are applied
  497 // a 16 pixel border is added to each side of the image
  498 
  499 // begin of tile initialization
  500 #ifdef __SSE2__
  501         // fill upper border
  502         if(rrmin > 0)
  503         {
  504           for(int rr = 0; rr < 16; rr++)
  505           {
  506             int row = 32 - rr + top;
  507 
  508             for(int cc = ccmin; cc < ccmax; cc += 4)
  509             {
  510               int indx1 = rr * ts + cc;
  511               vfloat tempv = LVFU(in[row * width + (cc + left)]);
  512               STVF(cfa[indx1], tempv);
  513               STVF(rgbgreen[indx1], tempv);
  514             }
  515           }
  516         }
  517 
  518         // fill inner part
  519         for(int rr = rrmin; rr < rrmax; rr++)
  520         {
  521           int row = rr + top;
  522 
  523           for(int cc = ccmin; cc < ccmax; cc += 4)
  524           {
  525             int indx1 = rr * ts + cc;
  526             vfloat tempv = LVFU(in[row * width + (cc + left)]);
  527             STVF(cfa[indx1], tempv);
  528             STVF(rgbgreen[indx1], tempv);
  529           }
  530         }
  531 
  532         // fill lower border
  533         if(rrmax < rr1)
  534         {
  535           for(int rr = 0; rr < 16; rr++)
  536             for(int cc = ccmin; cc < ccmax; cc += 4)
  537             {
  538               int indx1 = (rrmax + rr) * ts + cc;
  539               vfloat tempv = LVFU(in[(winy + height - rr - 2) * width + (left + cc)]);
  540               STVF(cfa[indx1], tempv);
  541               STVF(rgbgreen[indx1], tempv);
  542             }
  543         }
  544 
  545 #else
  546 
  547         // fill upper border
  548         if(rrmin > 0)
  549         {
  550           for(int rr = 0; rr < 16; rr++)
  551             for(int cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++)
  552             {
  553               cfa[rr * ts + cc] = (in[row * width + (cc + left)]);
  554               rgbgreen[rr * ts + cc] = cfa[rr * ts + cc];
  555             }
  556         }
  557 
  558         // fill inner part
  559         for(int rr = rrmin; rr < rrmax; rr++)
  560         {
  561           int row = rr + top;
  562 
  563           for(int cc = ccmin; cc < ccmax; cc++)
  564           {
  565             int indx1 = rr * ts + cc;
  566             cfa[indx1] = (in[row * width + (cc + left)]);
  567             rgbgreen[indx1] = cfa[indx1];
  568           }
  569         }
  570 
  571         // fill lower border
  572         if(rrmax < rr1)
  573         {
  574           for(int rr = 0; rr < 16; rr++)
  575             for(int cc = ccmin; cc < ccmax; cc++)
  576             {
  577               cfa[(rrmax + rr) * ts + cc] = (in[(winy + height - rr - 2) * width + (left + cc)]);
  578               rgbgreen[(rrmax + rr) * ts + cc] = cfa[(rrmax + rr) * ts + cc];
  579             }
  580         }
  581 
  582 #endif
  583 
  584         // fill left border
  585         if(ccmin > 0)
  586         {
  587           for(int rr = rrmin; rr < rrmax; rr++)
  588             for(int cc = 0, row = rr + top; cc < 16; cc++)
  589             {
  590               cfa[rr * ts + cc] = (in[row * width + (32 - cc + left)]);
  591               rgbgreen[rr * ts + cc] = cfa[rr * ts + cc];
  592             }
  593         }
  594 
  595         // fill right border
  596         if(ccmax < cc1)
  597         {
  598           for(int rr = rrmin; rr < rrmax; rr++)
  599             for(int cc = 0; cc < 16; cc++)
  600             {
  601               cfa[rr * ts + ccmax + cc] = (in[(top + rr) * width + ((winx + width - cc - 2))]);
  602               rgbgreen[rr * ts + ccmax + cc] = cfa[rr * ts + ccmax + cc];
  603             }
  604         }
  605 
  606         // also, fill the image corners
  607         if(rrmin > 0 && ccmin > 0)
  608         {
  609           for(int rr = 0; rr < 16; rr++)
  610             for(int cc = 0; cc < 16; cc++)
  611             {
  612               cfa[(rr)*ts + cc] = (in[(winy + 32 - rr) * width + (winx + 32 - cc)]);
  613               rgbgreen[(rr)*ts + cc] = cfa[(rr)*ts + cc];
  614             }
  615         }
  616 
  617         if(rrmax < rr1 && ccmax < cc1)
  618         {
  619           for(int rr = 0; rr < 16; rr++)
  620             for(int cc = 0; cc < 16; cc++)
  621             {
  622               cfa[(rrmax + rr) * ts + ccmax + cc]
  623                   = (in[(winy + height - rr - 2) * width + ((winx + width - cc - 2))]);
  624               rgbgreen[(rrmax + rr) * ts + ccmax + cc] = cfa[(rrmax + rr) * ts + ccmax + cc];
  625             }
  626         }
  627 
  628         if(rrmin > 0 && ccmax < cc1)
  629         {
  630           for(int rr = 0; rr < 16; rr++)
  631             for(int cc = 0; cc < 16; cc++)
  632             {
  633               cfa[(rr)*ts + ccmax + cc] = (in[(winy + 32 - rr) * width + ((winx + width - cc - 2))]);
  634               rgbgreen[(rr)*ts + ccmax + cc] = cfa[(rr)*ts + ccmax + cc];
  635             }
  636         }
  637 
  638         if(rrmax < rr1 && ccmin > 0)
  639         {
  640           for(int rr = 0; rr < 16; rr++)
  641             for(int cc = 0; cc < 16; cc++)
  642             {
  643               cfa[(rrmax + rr) * ts + cc] = (in[(winy + height - rr - 2) * width + ((winx + 32 - cc))]);
  644               rgbgreen[(rrmax + rr) * ts + cc] = cfa[(rrmax + rr) * ts + cc];
  645             }
  646         }
  647 
  648 // end of tile initialization
  649 
  650 // horizontal and vertical gradients
  651 #ifdef __SSE2__
  652         vfloat epsv = F2V(eps);
  653 
  654         for(int rr = 2; rr < rr1 - 2; rr++)
  655         {
  656           for(int indx = rr * ts; indx < rr * ts + cc1; indx += 4)
  657           {
  658             vfloat delhv = vabsf(LVFU(cfa[indx + 1]) - LVFU(cfa[indx - 1]));
  659             vfloat delvv = vabsf(LVF(cfa[indx + v1]) - LVF(cfa[indx - v1]));
  660             STVF(dirwts1[indx], epsv + vabsf(LVFU(cfa[indx + 2]) - LVF(cfa[indx]))
  661                                     + vabsf(LVF(cfa[indx]) - LVFU(cfa[indx - 2])) + delhv);
  662             STVF(dirwts0[indx], epsv + vabsf(LVF(cfa[indx + v2]) - LVF(cfa[indx]))
  663                                     + vabsf(LVF(cfa[indx]) - LVF(cfa[indx - v2])) + delvv);
  664             STVF(delhvsqsum[indx], SQRV(delhv) + SQRV(delvv));
  665           }
  666         }
  667 
  668 #else
  669 
  670         for(int rr = 2; rr < rr1 - 2; rr++)
  671           for(int cc = 2, indx = (rr)*ts + cc; cc < cc1 - 2; cc++, indx++)
  672           {
  673             float delh = fabsf(cfa[indx + 1] - cfa[indx - 1]);
  674             float delv = fabsf(cfa[indx + v1] - cfa[indx - v1]);
  675             dirwts0[indx]
  676                 = eps + fabsf(cfa[indx + v2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - v2]) + delv;
  677             dirwts1[indx] = eps + fabsf(cfa[indx + 2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - 2]) + delh;
  678             delhvsqsum[indx] = SQR(delh) + SQR(delv);
  679           }
  680 
  681 #endif
  682 
  683 // interpolate vertical and horizontal colour differences
  684 #ifdef __SSE2__
  685         vfloat sgnv;
  686 
  687         if(!(FC(4, 4, filters) & 1))
  688         {
  689           sgnv = _mm_set_ps(1.f, -1.f, 1.f, -1.f);
  690         }
  691         else
  692         {
  693           sgnv = _mm_set_ps(-1.f, 1.f, -1.f, 1.f);
  694         }
  695 
  696         vfloat zd5v = F2V(0.5f);
  697         vfloat onev = F2V(1.f);
  698         vfloat arthreshv = F2V(arthresh);
  699         vfloat clip_pt8v = F2V(clip_pt8);
  700 
  701         for(int rr = 4; rr < rr1 - 4; rr++)
  702         {
  703           sgnv = -sgnv;
  704 
  705           for(int indx = rr * ts + 4; indx < rr * ts + cc1 - 7; indx += 4)
  706           {
  707             // colour ratios in each cardinal direction
  708             vfloat cfav = LVF(cfa[indx]);
  709             vfloat cruv = LVF(cfa[indx - v1]) * (LVF(dirwts0[indx - v2]) + LVF(dirwts0[indx]))
  710                           / (LVF(dirwts0[indx - v2]) * (epsv + cfav)
  711                              + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx - v2])));
  712             vfloat crdv = LVF(cfa[indx + v1]) * (LVF(dirwts0[indx + v2]) + LVF(dirwts0[indx]))
  713                           / (LVF(dirwts0[indx + v2]) * (epsv + cfav)
  714                              + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx + v2])));
  715             vfloat crlv = LVFU(cfa[indx - 1]) * (LVFU(dirwts1[indx - 2]) + LVF(dirwts1[indx]))
  716                           / (LVFU(dirwts1[indx - 2]) * (epsv + cfav)
  717                              + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx - 2])));
  718             vfloat crrv = LVFU(cfa[indx + 1]) * (LVFU(dirwts1[indx + 2]) + LVF(dirwts1[indx]))
  719                           / (LVFU(dirwts1[indx + 2]) * (epsv + cfav)
  720                              + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx + 2])));
  721 
  722             // G interpolated in vert/hor directions using Hamilton-Adams method
  723             vfloat guhav = LVF(cfa[indx - v1]) + zd5v * (cfav - LVF(cfa[indx - v2]));
  724             vfloat gdhav = LVF(cfa[indx + v1]) + zd5v * (cfav - LVF(cfa[indx + v2]));
  725             vfloat glhav = LVFU(cfa[indx - 1]) + zd5v * (cfav - LVFU(cfa[indx - 2]));
  726             vfloat grhav = LVFU(cfa[indx + 1]) + zd5v * (cfav - LVFU(cfa[indx + 2]));
  727 
  728             // G interpolated in vert/hor directions using adaptive ratios
  729             vfloat guarv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), cfav * cruv, guhav);
  730             vfloat gdarv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), cfav * crdv, gdhav);
  731             vfloat glarv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), cfav * crlv, glhav);
  732             vfloat grarv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), cfav * crrv, grhav);
  733 
  734             // adaptive weights for vertical/horizontal directions
  735             vfloat hwtv = LVFU(dirwts1[indx - 1]) / (LVFU(dirwts1[indx - 1]) + LVFU(dirwts1[indx + 1]));
  736             vfloat vwtv = LVF(dirwts0[indx - v1]) / (LVF(dirwts0[indx + v1]) + LVF(dirwts0[indx - v1]));
  737 
  738             // interpolated G via adaptive weights of cardinal evaluations
  739             vfloat Ginthhav = vintpf(hwtv, grhav, glhav);
  740             vfloat Gintvhav = vintpf(vwtv, gdhav, guhav);
  741 
  742             // interpolated colour differences
  743             vfloat hcdaltv = sgnv * (Ginthhav - cfav);
  744             vfloat vcdaltv = sgnv * (Gintvhav - cfav);
  745             STVF(hcdalt[indx], hcdaltv);
  746             STVF(vcdalt[indx], vcdaltv);
  747 
  748             vmask clipmask = vorm(vorm(vmaskf_gt(cfav, clip_pt8v), vmaskf_gt(Gintvhav, clip_pt8v)),
  749                                   vmaskf_gt(Ginthhav, clip_pt8v));
  750             guarv = vself(clipmask, guhav, guarv);
  751             gdarv = vself(clipmask, gdhav, gdarv);
  752             glarv = vself(clipmask, glhav, glarv);
  753             grarv = vself(clipmask, grhav, grarv);
  754 
  755             // use HA if highlights are (nearly) clipped
  756             STVF(vcd[indx], vself(clipmask, vcdaltv, sgnv * (vintpf(vwtv, gdarv, guarv) - cfav)));
  757             STVF(hcd[indx], vself(clipmask, hcdaltv, sgnv * (vintpf(hwtv, grarv, glarv) - cfav)));
  758 
  759             // differences of interpolations in opposite directions
  760             STVF(dgintv[indx], vminf(SQRV(guhav - gdhav), SQRV(guarv - gdarv)));
  761             STVF(dginth[indx], vminf(SQRV(glhav - grhav), SQRV(glarv - grarv)));
  762           }
  763         }
  764 
  765 #else
  766 
  767         for(int rr = 4; rr < rr1 - 4; rr++)
  768         {
  769           bool fcswitch = FC(rr, 4, filters) & 1;
  770 
  771           for(int cc = 4, indx = rr * ts + cc; cc < cc1 - 4; cc++, indx++)
  772           {
  773 
  774             // colour ratios in each cardinal direction
  775             float cru = cfa[indx - v1] * (dirwts0[indx - v2] + dirwts0[indx])
  776                         / (dirwts0[indx - v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx - v2]));
  777             float crd = cfa[indx + v1] * (dirwts0[indx + v2] + dirwts0[indx])
  778                         / (dirwts0[indx + v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx + v2]));
  779             float crl = cfa[indx - 1] * (dirwts1[indx - 2] + dirwts1[indx])
  780                         / (dirwts1[indx - 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx - 2]));
  781             float crr = cfa[indx + 1] * (dirwts1[indx + 2] + dirwts1[indx])
  782                         / (dirwts1[indx + 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx + 2]));
  783 
  784             // G interpolated in vert/hor directions using Hamilton-Adams method
  785             float guha = cfa[indx - v1] + xdiv2f(cfa[indx] - cfa[indx - v2]);
  786             float gdha = cfa[indx + v1] + xdiv2f(cfa[indx] - cfa[indx + v2]);
  787             float glha = cfa[indx - 1] + xdiv2f(cfa[indx] - cfa[indx - 2]);
  788             float grha = cfa[indx + 1] + xdiv2f(cfa[indx] - cfa[indx + 2]);
  789 
  790             // G interpolated in vert/hor directions using adaptive ratios
  791             float guar, gdar, glar, grar;
  792 
  793             if(fabsf(1.f - cru) < arthresh)
  794             {
  795               guar = cfa[indx] * cru;
  796             }
  797             else
  798             {
  799               guar = guha;
  800             }
  801 
  802             if(fabsf(1.f - crd) < arthresh)
  803             {
  804               gdar = cfa[indx] * crd;
  805             }
  806             else
  807             {
  808               gdar = gdha;
  809             }
  810 
  811             if(fabsf(1.f - crl) < arthresh)
  812             {
  813               glar = cfa[indx] * crl;
  814             }
  815             else
  816             {
  817               glar = glha;
  818             }
  819 
  820             if(fabsf(1.f - crr) < arthresh)
  821             {
  822               grar = cfa[indx] * crr;
  823             }
  824             else
  825             {
  826               grar = grha;
  827             }
  828 
  829             // adaptive weights for vertical/horizontal directions
  830             float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
  831             float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
  832 
  833             // interpolated G via adaptive weights of cardinal evaluations
  834             float Gintvha = vwt * gdha + (1.f - vwt) * guha;
  835             float Ginthha = hwt * grha + (1.f - hwt) * glha;
  836 
  837             // interpolated colour differences
  838             if(fcswitch)
  839             {
  840               vcd[indx] = cfa[indx] - (vwt * gdar + (1.f - vwt) * guar);
  841               hcd[indx] = cfa[indx] - (hwt * grar + (1.f - hwt) * glar);
  842               vcdalt[indx] = cfa[indx] - Gintvha;
  843               hcdalt[indx] = cfa[indx] - Ginthha;
  844             }
  845             else
  846             {
  847               // interpolated colour differences
  848               vcd[indx] = (vwt * gdar + (1.f - vwt) * guar) - cfa[indx];
  849               hcd[indx] = (hwt * grar + (1.f - hwt) * glar) - cfa[indx];
  850               vcdalt[indx] = Gintvha - cfa[indx];
  851               hcdalt[indx] = Ginthha - cfa[indx];
  852             }
  853 
  854             fcswitch = !fcswitch;
  855 
  856             if(cfa[indx] > clip_pt8 || Gintvha > clip_pt8 || Ginthha > clip_pt8)
  857             {
  858               // use HA if highlights are (nearly) clipped
  859               guar = guha;
  860               gdar = gdha;
  861               glar = glha;
  862               grar = grha;
  863               vcd[indx] = vcdalt[indx];
  864               hcd[indx] = hcdalt[indx];
  865             }
  866 
  867             // differences of interpolations in opposite directions
  868             dgintv[indx] = MIN(SQR(guha - gdha), SQR(guar - gdar));
  869             dginth[indx] = MIN(SQR(glha - grha), SQR(glar - grar));
  870           }
  871         }
  872 
  873 #endif
  874 
  875 
  876 
  877 #ifdef __SSE2__
  878         vfloat clip_ptv = F2V(clip_pt);
  879         vfloat sgn3v;
  880 
  881         if(!(FC(4, 4, filters) & 1))
  882         {
  883           sgnv = _mm_set_ps(1.f, -1.f, 1.f, -1.f);
  884         }
  885         else
  886         {
  887           sgnv = _mm_set_ps(-1.f, 1.f, -1.f, 1.f);
  888         }
  889 
  890         sgn3v = sgnv + sgnv + sgnv;
  891 
  892         for(int rr = 4; rr < rr1 - 4; rr++)
  893         {
  894           vfloat nsgnv = sgnv;
  895           sgnv = -sgnv;
  896           sgn3v = -sgn3v;
  897 
  898           for(int indx = rr * ts + 4; indx < rr * ts + cc1 - 4; indx += 4)
  899           {
  900             vfloat hcdv = LVF(hcd[indx]);
  901             vfloat hcdvarv = SQRV(LVFU(hcd[indx - 2]) - hcdv)
  902                              + SQRV(LVFU(hcd[indx - 2]) - LVFU(hcd[indx + 2]))
  903                              + SQRV(hcdv - LVFU(hcd[indx + 2]));
  904             vfloat hcdaltv = LVF(hcdalt[indx]);
  905             vfloat hcdaltvarv = SQRV(LVFU(hcdalt[indx - 2]) - hcdaltv)
  906                                 + SQRV(LVFU(hcdalt[indx - 2]) - LVFU(hcdalt[indx + 2]))
  907                                 + SQRV(hcdaltv - LVFU(hcdalt[indx + 2]));
  908             vfloat vcdv = LVF(vcd[indx]);
  909             vfloat vcdvarv = SQRV(LVF(vcd[indx - v2]) - vcdv)
  910                              + SQRV(LVF(vcd[indx - v2]) - LVF(vcd[indx + v2]))
  911                              + SQRV(vcdv - LVF(vcd[indx + v2]));
  912             vfloat vcdaltv = LVF(vcdalt[indx]);
  913             vfloat vcdaltvarv = SQRV(LVF(vcdalt[indx - v2]) - vcdaltv)
  914                                 + SQRV(LVF(vcdalt[indx - v2]) - LVF(vcdalt[indx + v2]))
  915                                 + SQRV(vcdaltv - LVF(vcdalt[indx + v2]));
  916 
  917             // choose the smallest variance; this yields a smoother interpolation
  918             hcdv = vself(vmaskf_lt(hcdaltvarv, hcdvarv), hcdaltv, hcdv);
  919             vcdv = vself(vmaskf_lt(vcdaltvarv, vcdvarv), vcdaltv, vcdv);
  920 
  921             // bound the interpolation in regions of high saturation
  922             // vertical and horizontal G interpolations
  923             vfloat Ginthv = sgnv * hcdv + LVF(cfa[indx]);
  924             vfloat temp2v = sgn3v * hcdv;
  925             vfloat hwtv = onev + temp2v / (epsv + Ginthv + LVF(cfa[indx]));
  926             vmask hcdmask = vmaskf_gt(nsgnv * hcdv, ZEROV);
  927             vfloat hcdoldv = hcdv;
  928             vfloat tempv = nsgnv * (LVF(cfa[indx]) - ULIMV(Ginthv, LVFU(cfa[indx - 1]), LVFU(cfa[indx + 1])));
  929             hcdv = vself(vmaskf_lt(temp2v, -(LVF(cfa[indx]) + Ginthv)), tempv, vintpf(hwtv, hcdv, tempv));
  930             hcdv = vself(hcdmask, hcdv, hcdoldv);
  931             hcdv = vself(vmaskf_gt(Ginthv, clip_ptv), tempv, hcdv);
  932             STVF(hcd[indx], hcdv);
  933 
  934             vfloat Gintvv = sgnv * vcdv + LVF(cfa[indx]);
  935             temp2v = sgn3v * vcdv;
  936             vfloat vwtv = onev + temp2v / (epsv + Gintvv + LVF(cfa[indx]));
  937             vmask vcdmask = vmaskf_gt(nsgnv * vcdv, ZEROV);
  938             vfloat vcdoldv = vcdv;
  939             tempv = nsgnv * (LVF(cfa[indx]) - ULIMV(Gintvv, LVF(cfa[indx - v1]), LVF(cfa[indx + v1])));
  940             vcdv = vself(vmaskf_lt(temp2v, -(LVF(cfa[indx]) + Gintvv)), tempv, vintpf(vwtv, vcdv, tempv));
  941             vcdv = vself(vcdmask, vcdv, vcdoldv);
  942             vcdv = vself(vmaskf_gt(Gintvv, clip_ptv), tempv, vcdv);
  943             STVF(vcd[indx], vcdv);
  944             STVFU(cddiffsq[indx], SQRV(vcdv - hcdv));
  945           }
  946         }
  947 
  948 #else
  949 
  950         for(int rr = 4; rr < rr1 - 4; rr++)
  951         {
  952           for(int cc = 4, indx = rr * ts + cc, c = FC(rr, cc, filters) & 1; cc < cc1 - 4; cc++, indx++)
  953           {
  954             float hcdvar = 3.f * (SQR(hcd[indx - 2]) + SQR(hcd[indx]) + SQR(hcd[indx + 2]))
  955                            - SQR(hcd[indx - 2] + hcd[indx] + hcd[indx + 2]);
  956             float hcdaltvar = 3.f * (SQR(hcdalt[indx - 2]) + SQR(hcdalt[indx]) + SQR(hcdalt[indx + 2]))
  957                               - SQR(hcdalt[indx - 2] + hcdalt[indx] + hcdalt[indx + 2]);
  958             float vcdvar = 3.f * (SQR(vcd[indx - v2]) + SQR(vcd[indx]) + SQR(vcd[indx + v2]))
  959                            - SQR(vcd[indx - v2] + vcd[indx] + vcd[indx + v2]);
  960             float vcdaltvar = 3.f * (SQR(vcdalt[indx - v2]) + SQR(vcdalt[indx]) + SQR(vcdalt[indx + v2]))
  961                               - SQR(vcdalt[indx - v2] + vcdalt[indx] + vcdalt[indx + v2]);
  962 
  963             // choose the smallest variance; this yields a smoother interpolation
  964             if(hcdaltvar < hcdvar)
  965             {
  966               hcd[indx] = hcdalt[indx];
  967             }
  968 
  969             if(vcdaltvar < vcdvar)
  970             {
  971               vcd[indx] = vcdalt[indx];
  972             }
  973 
  974             // bound the interpolation in regions of high saturation
  975             // vertical and horizontal G interpolations
  976             float Gintv, Ginth;
  977 
  978             if(c)
  979             {                                 // G site
  980               Ginth = -hcd[indx] + cfa[indx]; // R or B
  981               Gintv = -vcd[indx] + cfa[indx]; // B or R
  982 
  983               if(hcd[indx] > 0)
  984               {
  985                 if(3.f * hcd[indx] > (Ginth + cfa[indx]))
  986                 {
  987                   hcd[indx] = -ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
  988                 }
  989                 else
  990                 {
  991                   float hwt = 1.f - 3.f * hcd[indx] / (eps + Ginth + cfa[indx]);
  992                   hcd[indx] = hwt * hcd[indx]
  993                               + (1.f - hwt) * (-ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx]);
  994                 }
  995               }
  996 
  997               if(vcd[indx] > 0)
  998               {
  999                 if(3.f * vcd[indx] > (Gintv + cfa[indx]))
 1000                 {
 1001                   vcd[indx] = -ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
 1002                 }
 1003                 else
 1004                 {
 1005                   float vwt = 1.f - 3.f * vcd[indx] / (eps + Gintv + cfa[indx]);
 1006                   vcd[indx] = vwt * vcd[indx]
 1007                               + (1.f - vwt) * (-ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx]);
 1008                 }
 1009               }
 1010 
 1011               if(Ginth > clip_pt)
 1012               {
 1013                 hcd[indx] = -ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
 1014               }
 1015 
 1016               if(Gintv > clip_pt)
 1017               {
 1018                 vcd[indx] = -ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
 1019               }
 1020             }
 1021             else
 1022             { // R or B site
 1023 
 1024               Ginth = hcd[indx] + cfa[indx]; // interpolated G
 1025               Gintv = vcd[indx] + cfa[indx];
 1026 
 1027               if(hcd[indx] < 0)
 1028               {
 1029                 if(3.f * hcd[indx] < -(Ginth + cfa[indx]))
 1030                 {
 1031                   hcd[indx] = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
 1032                 }
 1033                 else
 1034                 {
 1035                   float hwt = 1.f + 3.f * hcd[indx] / (eps + Ginth + cfa[indx]);
 1036                   hcd[indx] = hwt * hcd[indx]
 1037                               + (1.f - hwt) * (ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx]);
 1038                 }
 1039               }
 1040 
 1041               if(vcd[indx] < 0)
 1042               {
 1043                 if(3.f * vcd[indx] < -(Gintv + cfa[indx]))
 1044                 {
 1045                   vcd[indx] = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
 1046                 }
 1047                 else
 1048                 {
 1049                   float vwt = 1.f + 3.f * vcd[indx] / (eps + Gintv + cfa[indx]);
 1050                   vcd[indx] = vwt * vcd[indx]
 1051                               + (1.f - vwt) * (ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx]);
 1052                 }
 1053               }
 1054 
 1055               if(Ginth > clip_pt)
 1056               {
 1057                 hcd[indx] = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
 1058               }
 1059 
 1060               if(Gintv > clip_pt)
 1061               {
 1062                 vcd[indx] = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
 1063               }
 1064 
 1065               cddiffsq[indx] = SQR(vcd[indx] - hcd[indx]);
 1066             }
 1067 
 1068             c = !c;
 1069           }
 1070         }
 1071 
 1072 #endif
 1073 
 1074 
 1075 
 1076 #ifdef __SSE2__
 1077         vfloat epssqv = F2V(epssq);
 1078 
 1079         for(int rr = 6; rr < rr1 - 6; rr++)
 1080         {
 1081           for(int indx = rr * ts + 6 + (FC(rr, 2, filters) & 1); indx < rr * ts + cc1 - 6; indx += 8)
 1082           {
 1083             // compute colour difference variances in cardinal directions
 1084             vfloat tempv = LC2VFU(vcd[indx]);
 1085             vfloat uavev = tempv + LC2VFU(vcd[indx - v1]) + LC2VFU(vcd[indx - v2]) + LC2VFU(vcd[indx - v3]);
 1086             vfloat davev = tempv + LC2VFU(vcd[indx + v1]) + LC2VFU(vcd[indx + v2]) + LC2VFU(vcd[indx + v3]);
 1087             vfloat Dgrbvvaruv = SQRV(tempv - uavev) + SQRV(LC2VFU(vcd[indx - v1]) - uavev)
 1088                                 + SQRV(LC2VFU(vcd[indx - v2]) - uavev) + SQRV(LC2VFU(vcd[indx - v3]) - uavev);
 1089             vfloat Dgrbvvardv = SQRV(tempv - davev) + SQRV(LC2VFU(vcd[indx + v1]) - davev)
 1090                                 + SQRV(LC2VFU(vcd[indx + v2]) - davev) + SQRV(LC2VFU(vcd[indx + v3]) - davev);
 1091 
 1092             vfloat hwtv = vadivapb(LC2VFU(dirwts1[indx - 1]), LC2VFU(dirwts1[indx + 1]));
 1093             vfloat vwtv = vadivapb(LC2VFU(dirwts0[indx - v1]), LC2VFU(dirwts0[indx + v1]));
 1094 
 1095             tempv = LC2VFU(hcd[indx]);
 1096             vfloat lavev = tempv + vaddc2vfu(hcd[indx - 3]) + LC2VFU(hcd[indx - 1]);
 1097             vfloat ravev = tempv + vaddc2vfu(hcd[indx + 1]) + LC2VFU(hcd[indx + 3]);
 1098 
 1099             vfloat Dgrbhvarlv = SQRV(tempv - lavev) + SQRV(LC2VFU(hcd[indx - 1]) - lavev)
 1100                                 + SQRV(LC2VFU(hcd[indx - 2]) - lavev) + SQRV(LC2VFU(hcd[indx - 3]) - lavev);
 1101             vfloat Dgrbhvarrv = SQRV(tempv - ravev) + SQRV(LC2VFU(hcd[indx + 1]) - ravev)
 1102                                 + SQRV(LC2VFU(hcd[indx + 2]) - ravev) + SQRV(LC2VFU(hcd[indx + 3]) - ravev);
 1103 
 1104 
 1105             vfloat vcdvarv = epssqv + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
 1106             vfloat hcdvarv = epssqv + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);
 1107 
 1108             // compute fluctuations in up/down and left/right interpolations of colours
 1109             Dgrbvvaruv = LC2VFU(dgintv[indx - v1]) + LC2VFU(dgintv[indx - v2]);
 1110             Dgrbvvardv = LC2VFU(dgintv[indx + v1]) + LC2VFU(dgintv[indx + v2]);
 1111 
 1112             Dgrbhvarlv = vaddc2vfu(dginth[indx - 2]);
 1113             Dgrbhvarrv = vaddc2vfu(dginth[indx + 1]);
 1114 
 1115             vfloat vcdvar1v = epssqv + LC2VFU(dgintv[indx]) + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
 1116             vfloat hcdvar1v = epssqv + LC2VFU(dginth[indx]) + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);
 1117 
 1118             // determine adaptive weights for G interpolation
 1119             vfloat varwtv = hcdvarv / (vcdvarv + hcdvarv);
 1120             vfloat diffwtv = hcdvar1v / (vcdvar1v + hcdvar1v);
 1121 
 1122             // if both agree on interpolation direction, choose the one with strongest directional
 1123             // discrimination;
 1124             // otherwise, choose the u/d and l/r difference fluctuation weights
 1125             vmask decmask = vandm(vmaskf_gt((zd5v - varwtv) * (zd5v - diffwtv), ZEROV),
 1126                                   vmaskf_lt(vabsf(zd5v - diffwtv), vabsf(zd5v - varwtv)));
 1127             STVFU(hvwt[indx >> 1], vself(decmask, varwtv, diffwtv));
 1128           }
 1129         }
 1130 
 1131 #else
 1132 
 1133         for(int rr = 6; rr < rr1 - 6; rr++)
 1134         {
 1135           for(int cc = 6 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2)
 1136           {
 1137 
 1138             // compute colour difference variances in cardinal directions
 1139 
 1140             float uave = vcd[indx] + vcd[indx - v1] + vcd[indx - v2] + vcd[indx - v3];
 1141             float dave = vcd[indx] + vcd[indx + v1] + vcd[indx + v2] + vcd[indx + v3];
 1142             float lave = hcd[indx] + hcd[indx - 1] + hcd[indx - 2] + hcd[indx - 3];
 1143             float rave = hcd[indx] + hcd[indx + 1] + hcd[indx + 2] + hcd[indx + 3];
 1144 
 1145             // colour difference (G-R or G-B) variance in up/down/left/right directions
 1146             float Dgrbvvaru = SQR(vcd[indx] - uave) + SQR(vcd[indx - v1] - uave) + SQR(vcd[indx - v2] - uave)
 1147                               + SQR(vcd[indx - v3] - uave);
 1148             float Dgrbvvard = SQR(vcd[indx] - dave) + SQR(vcd[indx + v1] - dave) + SQR(vcd[indx + v2] - dave)
 1149                               + SQR(vcd[indx + v3] - dave);
 1150             float Dgrbhvarl = SQR(hcd[indx] - lave) + SQR(hcd[indx - 1] - lave) + SQR(hcd[indx - 2] - lave)
 1151                               + SQR(hcd[indx - 3] - lave);
 1152             float Dgrbhvarr = SQR(hcd[indx] - rave) + SQR(hcd[indx + 1] - rave) + SQR(hcd[indx + 2] - rave)
 1153                               + SQR(hcd[indx + 3] - rave);
 1154 
 1155             float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
 1156             float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
 1157 
 1158             float vcdvar = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru;
 1159             float hcdvar = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl;
 1160 
 1161             // compute fluctuations in up/down and left/right interpolations of colours
 1162             Dgrbvvaru = (dgintv[indx]) + (dgintv[indx - v1]) + (dgintv[indx - v2]);
 1163             Dgrbvvard = (dgintv[indx]) + (dgintv[indx + v1]) + (dgintv[indx + v2]);
 1164             Dgrbhvarl = (dginth[indx]) + (dginth[indx - 1]) + (dginth[indx - 2]);
 1165             Dgrbhvarr = (dginth[indx]) + (dginth[indx + 1]) + (dginth[indx + 2]);
 1166 
 1167             float vcdvar1 = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru;
 1168             float hcdvar1 = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl;
 1169 
 1170             // determine adaptive weights for G interpolation
 1171             float varwt = hcdvar / (vcdvar + hcdvar);
 1172             float diffwt = hcdvar1 / (vcdvar1 + hcdvar1);
 1173 
 1174             // if both agree on interpolation direction, choose the one with strongest directional
 1175             // discrimination;
 1176             // otherwise, choose the u/d and l/r difference fluctuation weights
 1177             if((0.5 - varwt) * (0.5 - diffwt) > 0 && fabsf(0.5f - diffwt) < fabsf(0.5f - varwt))
 1178             {
 1179               hvwt[indx >> 1] = varwt;
 1180             }
 1181             else
 1182             {
 1183               hvwt[indx >> 1] = diffwt;
 1184             }
 1185           }
 1186         }
 1187 
 1188 #endif
 1189 
 1190 #ifdef __SSE2__
 1191         vfloat gaussg0 = F2V(gaussgrad[0]);
 1192         vfloat gaussg1 = F2V(gaussgrad[1]);
 1193         vfloat gaussg2 = F2V(gaussgrad[2]);
 1194         vfloat gaussg3 = F2V(gaussgrad[3]);
 1195         vfloat gaussg4 = F2V(gaussgrad[4]);
 1196         vfloat gaussg5 = F2V(gaussgrad[5]);
 1197         vfloat gausso0 = F2V(gaussodd[0]);
 1198         vfloat gausso1 = F2V(gaussodd[1]);
 1199         vfloat gausso2 = F2V(gaussodd[2]);
 1200         vfloat gausso3 = F2V(gaussodd[3]);
 1201 
 1202 #endif
 1203 
 1204         // precompute nyquist
 1205         for(int rr = 6; rr < rr1 - 6; rr++)
 1206         {
 1207           int cc = 6 + (FC(rr, 2, filters) & 1);
 1208           int indx = rr * ts + cc;
 1209 
 1210 #ifdef __SSE2__
 1211 
 1212           for(; cc < cc1 - 7; cc += 8, indx += 8)
 1213           {
 1214             vfloat valv
 1215                 = (gausso0 * LC2VFU(cddiffsq[indx])
 1216                    + gausso1 * (LC2VFU(cddiffsq[(indx - m1)]) + LC2VFU(cddiffsq[(indx + p1)])
 1217                                 + LC2VFU(cddiffsq[(indx - p1)]) + LC2VFU(cddiffsq[(indx + m1)]))
 1218                    + gausso2 * (LC2VFU(cddiffsq[(indx - v2)]) + LC2VFU(cddiffsq[(indx - 2)])
 1219                                 + LC2VFU(cddiffsq[(indx + 2)]) + LC2VFU(cddiffsq[(indx + v2)]))
 1220                    + gausso3 * (LC2VFU(cddiffsq[(indx - m2)]) + LC2VFU(cddiffsq[(indx + p2)])
 1221                                 + LC2VFU(cddiffsq[(indx - p2)]) + LC2VFU(cddiffsq[(indx + m2)])))
 1222                   - (gaussg0 * LC2VFU(delhvsqsum[indx])
 1223                      + gaussg1 * (LC2VFU(delhvsqsum[indx - v1]) + LC2VFU(delhvsqsum[indx - 1])
 1224                                   + LC2VFU(delhvsqsum[indx + 1]) + LC2VFU(delhvsqsum[indx + v1]))
 1225                      + gaussg2 * (LC2VFU(delhvsqsum[indx - m1]) + LC2VFU(delhvsqsum[indx + p1])
 1226                                   + LC2VFU(delhvsqsum[indx - p1]) + LC2VFU(delhvsqsum[indx + m1]))
 1227                      + gaussg3 * (LC2VFU(delhvsqsum[indx - v2]) + LC2VFU(delhvsqsum[indx - 2])
 1228                                   + LC2VFU(delhvsqsum[indx + 2]) + LC2VFU(delhvsqsum[indx + v2]))
 1229                      + gaussg4 * (LC2VFU(delhvsqsum[indx - v2 - 1]) + LC2VFU(delhvsqsum[indx - v2 + 1])
 1230                                   + LC2VFU(delhvsqsum[indx - ts - 2]) + LC2VFU(delhvsqsum[indx - ts + 2])
 1231                                   + LC2VFU(delhvsqsum[indx + ts - 2]) + LC2VFU(delhvsqsum[indx + ts + 2])
 1232                                   + LC2VFU(delhvsqsum[indx + v2 - 1]) + LC2VFU(delhvsqsum[indx + v2 + 1]))
 1233                      + gaussg5 * (LC2VFU(delhvsqsum[indx - m2]) + LC2VFU(delhvsqsum[indx + p2])
 1234                                   + LC2VFU(delhvsqsum[indx - p2]) + LC2VFU(delhvsqsum[indx + m2])));
 1235             STVFU(nyqutest[indx >> 1], valv);
 1236           }
 1237 
 1238 #endif
 1239 
 1240           for(; cc < cc1 - 6; cc += 2, indx += 2)
 1241           {
 1242             nyqutest[indx >> 1]
 1243                 = (gaussodd[0] * cddiffsq[indx]
 1244                    + gaussodd[1] * (cddiffsq[(indx - m1)] + cddiffsq[(indx + p1)] + cddiffsq[(indx - p1)]
 1245                                     + cddiffsq[(indx + m1)])
 1246                    + gaussodd[2] * (cddiffsq[(indx - v2)] + cddiffsq[(indx - 2)] + cddiffsq[(indx + 2)]
 1247                                     + cddiffsq[(indx + v2)])
 1248                    + gaussodd[3] * (cddiffsq[(indx - m2)] + cddiffsq[(indx + p2)] + cddiffsq[(indx - p2)]
 1249                                     + cddiffsq[(indx + m2)]))
 1250                   - (gaussgrad[0] * delhvsqsum[indx]
 1251                      + gaussgrad[1] * (delhvsqsum[indx - v1] + delhvsqsum[indx + 1] + delhvsqsum[indx - 1]
 1252                                        + delhvsqsum[indx + v1])
 1253                      + gaussgrad[2] * (delhvsqsum[indx - m1] + delhvsqsum[indx + p1] + delhvsqsum[indx - p1]
 1254                                        + delhvsqsum[indx + m1])
 1255                      + gaussgrad[3] * (delhvsqsum[indx - v2] + delhvsqsum[indx - 2] + delhvsqsum[indx + 2]
 1256                                        + delhvsqsum[indx + v2])
 1257                      + gaussgrad[4] * (delhvsqsum[indx - v2 - 1] + delhvsqsum[indx - v2 + 1]
 1258                                        + delhvsqsum[indx - ts - 2] + delhvsqsum[indx - ts + 2]
 1259                                        + delhvsqsum[indx + ts - 2] + delhvsqsum[indx + ts + 2]
 1260                                        + delhvsqsum[indx + v2 - 1] + delhvsqsum[indx + v2 + 1])
 1261                      + gaussgrad[5] * (delhvsqsum[indx - m2] + delhvsqsum[indx + p2] + delhvsqsum[indx - p2]
 1262                                        + delhvsqsum[indx + m2]));
 1263           }
 1264         }
 1265 
 1266         // Nyquist test
 1267         int nystartrow = 0;
 1268         int nyendrow = 0;
 1269         int nystartcol = ts + 1;
 1270         int nyendcol = 0;
 1271 
 1272         for(int rr = 6; rr < rr1 - 6; rr++)
 1273         {
 1274           for(int cc = 6 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2)
 1275           {
 1276 
 1277             // nyquist texture test: ask if difference of vcd compared to hcd is larger or smaller than RGGB
 1278             // gradients
 1279             if(nyqutest[indx >> 1] > 0.f)
 1280             {
 1281               nyquist[indx >> 1] = 1; // nyquist=1 for nyquist region
 1282               nystartrow = nystartrow ? nystartrow : rr;
 1283               nyendrow = rr;
 1284               nystartcol = nystartcol > cc ? cc : nystartcol;
 1285               nyendcol = nyendcol < cc ? cc : nyendcol;
 1286             }
 1287           }
 1288         }
 1289 
 1290 
 1291         bool doNyquist = nystartrow != nyendrow && nystartcol != nyendcol;
 1292 
 1293         if(doNyquist)
 1294         {
 1295           nyendrow++; // because of < condition
 1296           nyendcol++; // because of < condition
 1297           nystartcol -= (nystartcol & 1);
 1298           nystartrow = std::max(8, nystartrow);
 1299           nyendrow = std::min(rr1 - 8, nyendrow);
 1300           nystartcol = std::max(8, nystartcol);
 1301           nyendcol = std::min(cc1 - 8, nyendcol);
 1302           memset(&nyquist2[4 * tsh], 0, sizeof(char) * (ts - 8) * tsh);
 1303 
 1304 #ifdef __SSE2__
 1305           vint fourvb = _mm_set1_epi8(4);
 1306           vint onevb = _mm_set1_epi8(1);
 1307 
 1308 #endif
 1309 
 1310           for(int rr = nystartrow; rr < nyendrow; rr++)
 1311           {
 1312 #ifdef __SSE2__
 1313 
 1314             for(int indx = rr * ts; indx < rr * ts + cc1; indx += 32)
 1315             {
 1316               vint nyquisttemp1v = _mm_adds_epi8(_mm_load_si128((vint *)&nyquist[(indx - v2) >> 1]),
 1317                                                  _mm_loadu_si128((vint *)&nyquist[(indx - m1) >> 1]));
 1318               vint nyquisttemp2v = _mm_adds_epi8(_mm_loadu_si128((vint *)&nyquist[(indx + p1) >> 1]),
 1319                                                  _mm_loadu_si128((vint *)&nyquist[(indx - 2) >> 1]));
 1320               vint nyquisttemp3v = _mm_adds_epi8(_mm_loadu_si128((vint *)&nyquist[(indx + 2) >> 1]),
 1321                                                  _mm_loadu_si128((vint *)&nyquist[(indx - p1) >> 1]));
 1322               vint valv = _mm_load_si128((vint *)&nyquist[indx >> 1]);
 1323               vint nyquisttemp4v = _mm_adds_epi8(_mm_loadu_si128((vint *)&nyquist[(indx + m1) >> 1]),
 1324                                                  _mm_load_si128((vint *)&nyquist[(indx + v2) >> 1]));
 1325               nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp3v);
 1326               nyquisttemp2v = _mm_adds_epi8(nyquisttemp2v, nyquisttemp4v);
 1327               nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp2v);
 1328               valv = vselc(_mm_cmpgt_epi8(nyquisttemp1v, fourvb), onevb, valv);
 1329               valv = vselinotzero(_mm_cmplt_epi8(nyquisttemp1v, fourvb), valv);
 1330               _mm_store_si128((vint *)&nyquist2[indx >> 1], valv);
 1331             }
 1332 
 1333 #else
 1334 
 1335             for(int indx = rr * ts + nystartcol + (FC(rr, 2, filters) & 1); indx < rr * ts + nyendcol;
 1336                 indx += 2)
 1337             {
 1338               unsigned int nyquisttemp
 1339                   = (nyquist[(indx - v2) >> 1] + nyquist[(indx - m1) >> 1] + nyquist[(indx + p1) >> 1]
 1340                      + nyquist[(indx - 2) >> 1] + nyquist[(indx + 2) >> 1] + nyquist[(indx - p1) >> 1]
 1341                      + nyquist[(indx + m1) >> 1] + nyquist[(indx + v2) >> 1]);
 1342               // if most of your neighbours are named Nyquist, it's likely that you're one too, or not
 1343               nyquist2[indx >> 1] = nyquisttemp > 4 ? 1 : (nyquisttemp < 4 ? 0 : nyquist[indx >> 1]);
 1344             }
 1345 
 1346 #endif
 1347           }
 1348 
 1349           // end of Nyquist test
 1350 
 1351           // in areas of Nyquist texture, do area interpolation
 1352           for(int rr = nystartrow; rr < nyendrow; rr++)
 1353             for(int indx = rr * ts + nystartcol + (FC(rr, 2, filters) & 1); indx < rr * ts + nyendcol;
 1354                 indx += 2)
 1355             {
 1356 
 1357               if(nyquist2[indx >> 1])
 1358               {
 1359                 // area interpolation
 1360 
 1361                 float sumcfa = 0.f, sumh = 0.f, sumv = 0.f, sumsqh = 0.f, sumsqv = 0.f, areawt = 0.f;
 1362 
 1363                 for(int i = -6; i < 7; i += 2)
 1364                 {
 1365                   int indx1 = indx + (i * ts) - 6;
 1366 
 1367                   for(int j = -6; j < 7; j += 2, indx1 += 2)
 1368                   {
 1369                     if(nyquist2[indx1 >> 1])
 1370                     {
 1371                       float cfatemp = cfa[indx1];
 1372                       sumcfa += cfatemp;
 1373                       sumh += (cfa[indx1 - 1] + cfa[indx1 + 1]);
 1374                       sumv += (cfa[indx1 - v1] + cfa[indx1 + v1]);
 1375                       sumsqh += SQR(cfatemp - cfa[indx1 - 1]) + SQR(cfatemp - cfa[indx1 + 1]);
 1376                       sumsqv += SQR(cfatemp - cfa[indx1 - v1]) + SQR(cfatemp - cfa[indx1 + v1]);
 1377                       areawt += 1;
 1378                     }
 1379                   }
 1380                 }
 1381 
 1382                 // horizontal and vertical colour differences, and adaptive weight
 1383                 sumh = sumcfa - xdiv2f(sumh);
 1384                 sumv = sumcfa - xdiv2f(sumv);
 1385                 areawt = xdiv2f(areawt);
 1386                 float hcdvar = epssq + fabsf(areawt * sumsqh - sumh * sumh);
 1387                 float vcdvar = epssq + fabsf(areawt * sumsqv - sumv * sumv);
 1388                 hvwt[indx >> 1] = hcdvar / (vcdvar + hcdvar);
 1389 
 1390                 // end of area interpolation
 1391               }
 1392             }
 1393         }
 1394 
 1395 
 1396         // populate G at R/B sites
 1397         for(int rr = 8; rr < rr1 - 8; rr++)
 1398           for(int indx = rr * ts + 8 + (FC(rr, 2, filters) & 1); indx < rr * ts + cc1 - 8; indx += 2)
 1399           {
 1400 
 1401             // first ask if one gets more directional discrimination from nearby B/R sites
 1402             float hvwtalt = xdivf(hvwt[(indx - m1) >> 1] + hvwt[(indx + p1) >> 1] + hvwt[(indx - p1) >> 1]
 1403                                       + hvwt[(indx + m1) >> 1],
 1404                                   2);
 1405 
 1406             hvwt[indx >> 1]
 1407                 = fabsf(0.5f - hvwt[indx >> 1]) < fabsf(0.5f - hvwtalt) ? hvwtalt : hvwt[indx >> 1];
 1408             // a better result was obtained from the neighbours
 1409 
 1410             Dgrb[0][indx >> 1] = intp(hvwt[indx >> 1], vcd[indx], hcd[indx]); // evaluate colour differences
 1411 
 1412             rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1]; // evaluate G (finally!)
 1413 
 1414             // local curvature in G (preparation for nyquist refinement step)
 1415             Dgrb2[indx >> 1].h = nyquist2[indx >> 1]
 1416                                      ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - 1] + rgbgreen[indx + 1]))
 1417                                      : 0.f;
 1418             Dgrb2[indx >> 1].v = nyquist2[indx >> 1]
 1419                                      ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - v1] + rgbgreen[indx + v1]))
 1420                                      : 0.f;
 1421           }
 1422 
 1423 
 1424         // end of standard interpolation
 1425 
 1426         // refine Nyquist areas using G curvatures
 1427         if(doNyquist)
 1428         {
 1429           for(int rr = nystartrow; rr < nyendrow; rr++)
 1430             for(int indx = rr * ts + nystartcol + (FC(rr, 2, filters) & 1); indx < rr * ts + nyendcol;
 1431                 indx += 2)
 1432             {
 1433 
 1434               if(nyquist2[indx >> 1])
 1435               {
 1436                 // local averages (over Nyquist pixels only) of G curvature squared
 1437                 float gvarh
 1438                     = epssq + (gquinc[0] * Dgrb2[indx >> 1].h
 1439                                + gquinc[1] * (Dgrb2[(indx - m1) >> 1].h + Dgrb2[(indx + p1) >> 1].h
 1440                                               + Dgrb2[(indx - p1) >> 1].h + Dgrb2[(indx + m1) >> 1].h)
 1441                                + gquinc[2] * (Dgrb2[(indx - v2) >> 1].h + Dgrb2[(indx - 2) >> 1].h
 1442                                               + Dgrb2[(indx + 2) >> 1].h + Dgrb2[(indx + v2) >> 1].h)
 1443                                + gquinc[3] * (Dgrb2[(indx - m2) >> 1].h + Dgrb2[(indx + p2) >> 1].h
 1444                                               + Dgrb2[(indx - p2) >> 1].h + Dgrb2[(indx + m2) >> 1].h));
 1445                 float gvarv
 1446                     = epssq + (gquinc[0] * Dgrb2[indx >> 1].v
 1447                                + gquinc[1] * (Dgrb2[(indx - m1) >> 1].v + Dgrb2[(indx + p1) >> 1].v
 1448                                               + Dgrb2[(indx - p1) >> 1].v + Dgrb2[(indx + m1) >> 1].v)
 1449                                + gquinc[2] * (Dgrb2[(indx - v2) >> 1].v + Dgrb2[(indx - 2) >> 1].v
 1450                                               + Dgrb2[(indx + 2) >> 1].v + Dgrb2[(indx + v2) >> 1].v)
 1451                                + gquinc[3] * (Dgrb2[(indx - m2) >> 1].v + Dgrb2[(indx + p2) >> 1].v
 1452                                               + Dgrb2[(indx - p2) >> 1].v + Dgrb2[(indx + m2) >> 1].v));
 1453                 // use the results as weights for refined G interpolation
 1454                 Dgrb[0][indx >> 1] = (hcd[indx] * gvarv + vcd[indx] * gvarh) / (gvarv + gvarh);
 1455                 rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1];
 1456               }
 1457             }
 1458         }
 1459 
 1460 
 1461 #ifdef __SSE2__
 1462 
 1463         for(int rr = 6; rr < rr1 - 6; rr++)
 1464         {
 1465           if((FC(rr, 2, filters) & 1) == 0)
 1466           {
 1467             for(int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8)
 1468             {
 1469               vfloat tempv = LC2VFU(cfa[indx + 1]);
 1470               vfloat Dgrbsq1pv
 1471                   = (SQRV(tempv - LC2VFU(cfa[indx + 1 - p1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + p1])));
 1472               STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + p1]) - LC2VFU(cfa[indx - p1])));
 1473               STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + m1]) - LC2VFU(cfa[indx - m1])));
 1474               vfloat Dgrbsq1mv
 1475                   = (SQRV(tempv - LC2VFU(cfa[indx + 1 - m1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + m1])));
 1476               STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv);
 1477               STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv);
 1478             }
 1479           }
 1480           else
 1481           {
 1482             for(int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8)
 1483             {
 1484               vfloat tempv = LC2VFU(cfa[indx]);
 1485               vfloat Dgrbsq1pv
 1486                   = (SQRV(tempv - LC2VFU(cfa[indx - p1])) + SQRV(tempv - LC2VFU(cfa[indx + p1])));
 1487               STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + p1]) - LC2VFU(cfa[indx + 1 - p1])));
 1488               STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + m1]) - LC2VFU(cfa[indx + 1 - m1])));
 1489               vfloat Dgrbsq1mv
 1490                   = (SQRV(tempv - LC2VFU(cfa[indx - m1])) + SQRV(tempv - LC2VFU(cfa[indx + m1])));
 1491               STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv);
 1492               STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv);
 1493             }
 1494           }
 1495         }
 1496 
 1497 #else
 1498 
 1499         for(int rr = 6; rr < rr1 - 6; rr++)
 1500         {
 1501           if((FC(rr, 2, filters) & 1) == 0)
 1502           {
 1503             for(int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2)
 1504             {
 1505               delp[indx >> 1] = fabsf(cfa[indx + p1] - cfa[indx - p1]);
 1506               delm[indx >> 1] = fabsf(cfa[indx + m1] - cfa[indx - m1]);
 1507               Dgrbsq1p[indx >> 1]
 1508                   = (SQR(cfa[indx + 1] - cfa[indx + 1 - p1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + p1]));
 1509               Dgrbsq1m[indx >> 1]
 1510                   = (SQR(cfa[indx + 1] - cfa[indx + 1 - m1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + m1]));
 1511             }
 1512           }
 1513           else
 1514           {
 1515             for(int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2)
 1516             {
 1517               Dgrbsq1p[indx >> 1] = (SQR(cfa[indx] - cfa[indx - p1]) + SQR(cfa[indx] - cfa[indx + p1]));
 1518               Dgrbsq1m[indx >> 1] = (SQR(cfa[indx] - cfa[indx - m1]) + SQR(cfa[indx] - cfa[indx + m1]));
 1519               delp[indx >> 1] = fabsf(cfa[indx + 1 + p1] - cfa[indx + 1 - p1]);
 1520               delm[indx >> 1] = fabsf(cfa[indx + 1 + m1] - cfa[indx + 1 - m1]);
 1521             }
 1522           }
 1523         }
 1524 
 1525 #endif
 1526 
 1527 // diagonal interpolation correction
 1528 
 1529 #ifdef __SSE2__
 1530         vfloat gausseven0v = F2V(gausseven[0]);
 1531         vfloat gausseven1v = F2V(gausseven[1]);
 1532 #endif
 1533 
 1534         for(int rr = 8; rr < rr1 - 8; rr++)
 1535         {
 1536 #ifdef __SSE2__
 1537 
 1538           for(int indx = rr * ts + 8 + (FC(rr, 2, filters) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 8;
 1539               indx += 8, indx1 += 4)
 1540           {
 1541 
 1542             // diagonal colour ratios
 1543             vfloat cfav = LC2VFU(cfa[indx]);
 1544 
 1545             vfloat temp1v = LC2VFU(cfa[indx + m1]);
 1546             vfloat temp2v = LC2VFU(cfa[indx + m2]);
 1547             vfloat rbsev = vmul2f(temp1v) / (epsv + cfav + temp2v);
 1548             rbsev = vself(vmaskf_lt(vabsf(onev - rbsev), arthreshv), cfav * rbsev,
 1549                           temp1v + zd5v * (cfav - temp2v));
 1550 
 1551             temp1v = LC2VFU(cfa[indx - m1]);
 1552             temp2v = LC2VFU(cfa[indx - m2]);
 1553             vfloat rbnwv = vmul2f(temp1v) / (epsv + cfav + temp2v);
 1554             rbnwv = vself(vmaskf_lt(vabsf(onev - rbnwv), arthreshv), cfav * rbnwv,
 1555                           temp1v + zd5v * (cfav - temp2v));
 1556 
 1557             temp1v = epsv + LVFU(delm[indx1]);
 1558             vfloat wtsev = temp1v + LVFU(delm[(indx + m1) >> 1])
 1559                            + LVFU(delm[(indx + m2) >> 1]); // same as for wtu,wtd,wtl,wtr
 1560             vfloat wtnwv = temp1v + LVFU(delm[(indx - m1) >> 1]) + LVFU(delm[(indx - m2) >> 1]);
 1561 
 1562             vfloat rbmv = (wtsev * rbnwv + wtnwv * rbsev) / (wtsev + wtnwv);
 1563 
 1564             temp1v = ULIMV(rbmv, LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1]));
 1565             vfloat wtv = vmul2f(cfav - rbmv) / (epsv + rbmv + cfav);
 1566             temp2v = vintpf(wtv, rbmv, temp1v);
 1567 
 1568             temp2v = vself(vmaskf_lt(rbmv + rbmv, cfav), temp1v, temp2v);
 1569             temp2v = vself(vmaskf_lt(rbmv, cfav), temp2v, rbmv);
 1570             STVFU(rbm[indx1], vself(vmaskf_gt(temp2v, clip_ptv),
 1571                                     ULIMV(temp2v, LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1])), temp2v));
 1572 
 1573 
 1574             temp1v = LC2VFU(cfa[indx + p1]);
 1575             temp2v = LC2VFU(cfa[indx + p2]);
 1576             vfloat rbnev = vmul2f(temp1v) / (epsv + cfav + temp2v);
 1577             rbnev = vself(vmaskf_lt(vabsf(onev - rbnev), arthreshv), cfav * rbnev,
 1578                           temp1v + zd5v * (cfav - temp2v));
 1579 
 1580             temp1v = LC2VFU(cfa[indx - p1]);
 1581             temp2v = LC2VFU(cfa[indx - p2]);
 1582             vfloat rbswv = vmul2f(temp1v) / (epsv + cfav + temp2v);
 1583             rbswv = vself(vmaskf_lt(vabsf(onev - rbswv), arthreshv), cfav * rbswv,
 1584                           temp1v + zd5v * (cfav - temp2v));
 1585 
 1586             temp1v = epsv + LVFU(delp[indx1]);
 1587             vfloat wtnev = temp1v + LVFU(delp[(indx + p1) >> 1]) + LVFU(delp[(indx + p2) >> 1]);
 1588             vfloat wtswv = temp1v + LVFU(delp[(indx - p1) >> 1]) + LVFU(delp[(indx - p2) >> 1]);
 1589 
 1590             vfloat rbpv = (wtnev * rbswv + wtswv * rbnev) / (wtnev + wtswv);
 1591 
 1592             temp1v = ULIMV(rbpv, LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1]));
 1593             wtv = vmul2f(cfav - rbpv) / (epsv + rbpv + cfav);
 1594             temp2v = vintpf(wtv, rbpv, temp1v);
 1595 
 1596             temp2v = vself(vmaskf_lt(rbpv + rbpv, cfav), temp1v, temp2v);
 1597             temp2v = vself(vmaskf_lt(rbpv, cfav), temp2v, rbpv);
 1598             STVFU(rbp[indx1], vself(vmaskf_gt(temp2v, clip_ptv),
 1599                                     ULIMV(temp2v, LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1])), temp2v));
 1600 
 1601             vfloat rbvarmv
 1602                 = epssqv
 1603                   + (gausseven0v * (LVFU(Dgrbsq1m[(indx - v1) >> 1]) + LVFU(Dgrbsq1m[(indx - 1) >> 1])
 1604                                     + LVFU(Dgrbsq1m[(indx + 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v1) >> 1]))
 1605                      + gausseven1v
 1606                            * (LVFU(Dgrbsq1m[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx - v2 + 1) >> 1])
 1607                               + LVFU(Dgrbsq1m[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 - v1) >> 1])
 1608                               + LVFU(Dgrbsq1m[(indx - 2 + v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 + v1) >> 1])
 1609                               + LVFU(Dgrbsq1m[(indx + v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v2 + 1) >> 1])));
 1610             STVFU(pmwt[indx1],
 1611                   rbvarmv / ((epssqv
 1612                               + (gausseven0v
 1613                                      * (LVFU(Dgrbsq1p[(indx - v1) >> 1]) + LVFU(Dgrbsq1p[(indx - 1) >> 1])
 1614                                         + LVFU(Dgrbsq1p[(indx + 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v1) >> 1]))
 1615                                  + gausseven1v * (LVFU(Dgrbsq1p[(indx - v2 - 1) >> 1])
 1616                                                   + LVFU(Dgrbsq1p[(indx - v2 + 1) >> 1])
 1617                                                   + LVFU(Dgrbsq1p[(indx - 2 - v1) >> 1])
 1618                                                   + LVFU(Dgrbsq1p[(indx + 2 - v1) >> 1])
 1619                                                   + LVFU(Dgrbsq1p[(indx - 2 + v1) >> 1])
 1620                                                   + LVFU(Dgrbsq1p[(indx + 2 + v1) >> 1])
 1621                                                   + LVFU(Dgrbsq1p[(indx + v2 - 1) >> 1])
 1622                                                   + LVFU(Dgrbsq1p[(indx + v2 + 1) >> 1]))))
 1623                              + rbvarmv));
 1624           }
 1625 
 1626 #else
 1627 
 1628           for(int cc = 8 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 8;
 1629               cc += 2, indx += 2, indx1++)
 1630           {
 1631 
 1632             // diagonal colour ratios
 1633             float crse = xmul2f(cfa[indx + m1]) / (eps + cfa[indx] + (cfa[indx + m2]));
 1634             float crnw = xmul2f(cfa[indx - m1]) / (eps + cfa[indx] + (cfa[indx - m2]));
 1635             float crne = xmul2f(cfa[indx + p1]) / (eps + cfa[indx] + (cfa[indx + p2]));
 1636             float crsw = xmul2f(cfa[indx - p1]) / (eps + cfa[indx] + (cfa[indx - p2]));
 1637             // colour differences in diagonal directions
 1638             float rbse, rbnw, rbne, rbsw;
 1639 
 1640             // assign B/R at R/B sites
 1641             if(fabsf(1.f - crse) < arthresh)
 1642             {
 1643               rbse = cfa[indx] * crse; // use this if more precise diag interp is necessary
 1644             }
 1645             else
 1646             {
 1647               rbse = (cfa[indx + m1]) + xdiv2f(cfa[indx] - cfa[indx + m2]);
 1648             }
 1649 
 1650             if(fabsf(1.f - crnw) < arthresh)
 1651             {
 1652               rbnw = cfa[indx] * crnw;
 1653             }
 1654             else
 1655             {
 1656               rbnw = (cfa[indx - m1]) + xdiv2f(cfa[indx] - cfa[indx - m2]);
 1657             }
 1658 
 1659             if(fabsf(1.f - crne) < arthresh)
 1660             {
 1661               rbne = cfa[indx] * crne;
 1662             }
 1663             else
 1664             {
 1665               rbne = (cfa[indx + p1]) + xdiv2f(cfa[indx] - cfa[indx + p2]);
 1666             }
 1667 
 1668             if(fabsf(1.f - crsw) < arthresh)
 1669             {
 1670               rbsw = cfa[indx] * crsw;
 1671             }
 1672             else
 1673             {
 1674               rbsw = (cfa[indx - p1]) + xdiv2f(cfa[indx] - cfa[indx - p2]);
 1675             }
 1676 
 1677             float wtse = eps + delm[indx1] + delm[(indx + m1) >> 1]
 1678                          + delm[(indx + m2) >> 1]; // same as for wtu,wtd,wtl,wtr
 1679             float wtnw = eps + delm[indx1] + delm[(indx - m1) >> 1] + delm[(indx - m2) >> 1];
 1680             float wtne = eps + delp[indx1] + delp[(indx + p1) >> 1] + delp[(indx + p2) >> 1];
 1681             float wtsw = eps + delp[indx1] + delp[(indx - p1) >> 1] + delp[(indx - p2) >> 1];
 1682 
 1683 
 1684             rbm[indx1] = (wtse * rbnw + wtnw * rbse) / (wtse + wtnw);
 1685             rbp[indx1] = (wtne * rbsw + wtsw * rbne) / (wtne + wtsw);
 1686 
 1687             // variance of R-B in plus/minus directions
 1688             float rbvarm
 1689                 = epssq
 1690                   + (gausseven[0] * (Dgrbsq1m[(indx - v1) >> 1] + Dgrbsq1m[(indx - 1) >> 1]
 1691                                      + Dgrbsq1m[(indx + 1) >> 1] + Dgrbsq1m[(indx + v1) >> 1])
 1692                      + gausseven[1] * (Dgrbsq1m[(indx - v2 - 1) >> 1] + Dgrbsq1m[(indx - v2 + 1) >> 1]
 1693                                        + Dgrbsq1m[(indx - 2 - v1) >> 1] + Dgrbsq1m[(indx + 2 - v1) >> 1]
 1694                                        + Dgrbsq1m[(indx - 2 + v1) >> 1] + Dgrbsq1m[(indx + 2 + v1) >> 1]
 1695                                        + Dgrbsq1m[(indx + v2 - 1) >> 1] + Dgrbsq1m[(indx + v2 + 1) >> 1]));
 1696             pmwt[indx1]
 1697                 = rbvarm
 1698                   / ((epssq + (gausseven[0] * (Dgrbsq1p[(indx - v1) >> 1] + Dgrbsq1p[(indx - 1) >> 1]
 1699                                                + Dgrbsq1p[(indx + 1) >> 1] + Dgrbsq1p[(indx + v1) >> 1])
 1700                                + gausseven[1]
 1701                                      * (Dgrbsq1p[(indx - v2 - 1) >> 1] + Dgrbsq1p[(indx - v2 + 1) >> 1]
 1702                                         + Dgrbsq1p[(indx - 2 - v1) >> 1] + Dgrbsq1p[(indx + 2 - v1) >> 1]
 1703                                         + Dgrbsq1p[(indx - 2 + v1) >> 1] + Dgrbsq1p[(indx + 2 + v1) >> 1]
 1704                                         + Dgrbsq1p[(indx + v2 - 1) >> 1] + Dgrbsq1p[(indx + v2 + 1) >> 1])))
 1705                      + rbvarm);
 1706 
 1707             // bound the interpolation in regions of high saturation
 1708 
 1709             if(rbp[indx1] < cfa[indx])
 1710             {
 1711               if(xmul2f(rbp[indx1]) < cfa[indx])
 1712               {
 1713                 rbp[indx1] = ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
 1714               }
 1715               else
 1716               {
 1717                 float pwt = xmul2f(cfa[indx] - rbp[indx1]) / (eps + rbp[indx1] + cfa[indx]);
 1718                 rbp[indx1]
 1719                     = pwt * rbp[indx1] + (1.f - pwt) * ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
 1720               }
 1721             }
 1722 
 1723             if(rbm[indx1] < cfa[indx])
 1724             {
 1725               if(xmul2f(rbm[indx1]) < cfa[indx])
 1726               {
 1727                 rbm[indx1] = ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
 1728               }
 1729               else
 1730               {
 1731                 float mwt = xmul2f(cfa[indx] - rbm[indx1]) / (eps + rbm[indx1] + cfa[indx]);
 1732                 rbm[indx1]
 1733                     = mwt * rbm[indx1] + (1.f - mwt) * ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
 1734               }
 1735             }
 1736 
 1737             if(rbp[indx1] > clip_pt)
 1738             {
 1739               rbp[indx1] = ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
 1740             }
 1741 
 1742             if(rbm[indx1] > clip_pt)
 1743             {
 1744               rbm[indx1] = ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
 1745             }
 1746           }
 1747 
 1748 #endif
 1749         }
 1750 
 1751 #ifdef __SSE2__
 1752         vfloat zd25v = F2V(0.25f);
 1753 #endif
 1754 
 1755         for(int rr = 10; rr < rr1 - 10; rr++)
 1756 #ifdef __SSE2__
 1757           for(int indx = rr * ts + 10 + (FC(rr, 2, filters) & 1), indx1 = indx >> 1;
 1758               indx < rr * ts + cc1 - 10; indx += 8, indx1 += 4)
 1759           {
 1760 
 1761             // first ask if one gets more directional discrimination from nearby B/R sites
 1762             vfloat pmwtaltv = zd25v * (LVFU(pmwt[(indx - m1) >> 1]) + LVFU(pmwt[(indx + p1) >> 1])
 1763                                        + LVFU(pmwt[(indx - p1) >> 1]) + LVFU(pmwt[(indx + m1) >> 1]));
 1764             vfloat tempv = LVFU(pmwt[indx1]);
 1765             tempv = vself(vmaskf_lt(vabsf(zd5v - tempv), vabsf(zd5v - pmwtaltv)), pmwtaltv, tempv);
 1766             STVFU(pmwt[indx1], tempv);
 1767             STVFU(rbint[indx1],
 1768                   zd5v * (LC2VFU(cfa[indx]) + vintpf(tempv, LVFU(rbp[indx1]), LVFU(rbm[indx1]))));
 1769           }
 1770 
 1771 #else
 1772 
 1773           for(int cc = 10 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 10;
 1774               cc += 2, indx += 2, indx1++)
 1775           {
 1776 
 1777             // first ask if one gets more directional discrimination from nearby B/R sites
 1778             float pmwtalt = xdivf(pmwt[(indx - m1) >> 1] + pmwt[(indx + p1) >> 1] + pmwt[(indx - p1) >> 1]
 1779                                       + pmwt[(indx + m1) >> 1],
 1780                                   2);
 1781 
 1782             if(fabsf(0.5f - pmwt[indx1]) < fabsf(0.5f - pmwtalt))
 1783             {
 1784               pmwt[indx1] = pmwtalt; // a better result was obtained from the neighbours
 1785             }
 1786 
 1787             rbint[indx1] = xdiv2f(cfa[indx] + rbm[indx1] * (1.f - pmwt[indx1])
 1788                                   + rbp[indx1] * pmwt[indx1]); // this is R+B, interpolated
 1789           }
 1790 
 1791 #endif
 1792 
 1793         for(int rr = 12; rr < rr1 - 12; rr++)
 1794 #ifdef __SSE2__
 1795           for(int indx = rr * ts + 12 + (FC(rr, 2, filters) & 1), indx1 = indx >> 1;
 1796               indx < rr * ts + cc1 - 12; indx += 8, indx1 += 4)
 1797           {
 1798             vmask copymask = vmaskf_ge(vabsf(zd5v - LVFU(pmwt[indx1])), vabsf(zd5v - LVFU(hvwt[indx1])));
 1799 
 1800             if(_mm_movemask_ps((vfloat)copymask))
 1801             { // if for any of the 4 pixels the condition is true, do the maths for all 4 pixels and mask the
 1802               // unused out at the end
 1803               // now interpolate G vertically/horizontally using R+B values
 1804               // unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts
 1805               // colour ratios for G interpolation
 1806               vfloat rbintv = LVFU(rbint[indx1]);
 1807 
 1808               // interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
 1809               vfloat cruv = vmul2f(LC2VFU(cfa[indx - v1])) / (epsv + rbintv + LVFU(rbint[(indx1 - v1)]));
 1810               vfloat guv = rbintv * cruv;
 1811               vfloat gu2v = LC2VFU(cfa[indx - v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - v1)]));
 1812               guv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), guv, gu2v);
 1813 
 1814               vfloat crdv = vmul2f(LC2VFU(cfa[indx + v1])) / (epsv + rbintv + LVFU(rbint[(indx1 + v1)]));
 1815               vfloat gdv = rbintv * crdv;
 1816               vfloat gd2v = LC2VFU(cfa[indx + v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + v1)]));
 1817               gdv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), gdv, gd2v);
 1818 
 1819               vfloat Gintvv = (LC2VFU(dirwts0[indx - v1]) * gdv + LC2VFU(dirwts0[indx + v1]) * guv)
 1820                               / (LC2VFU(dirwts0[indx + v1]) + LC2VFU(dirwts0[indx - v1]));
 1821               vfloat Gint1v = ULIMV(Gintvv, LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1]));
 1822               vfloat vwtv = vmul2f(rbintv - Gintvv) / (epsv + Gintvv + rbintv);
 1823               vfloat Gint2v = vintpf(vwtv, Gintvv, Gint1v);
 1824               Gint1v = vself(vmaskf_lt(vmul2f(Gintvv), rbintv), Gint1v, Gint2v);
 1825               Gintvv = vself(vmaskf_lt(Gintvv, rbintv), Gint1v, Gintvv);
 1826               Gintvv = vself(vmaskf_gt(Gintvv, clip_ptv),
 1827                              ULIMV(Gintvv, LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1])), Gintvv);
 1828 
 1829               vfloat crlv = vmul2f(LC2VFU(cfa[indx - 1])) / (epsv + rbintv + LVFU(rbint[(indx1 - 1)]));
 1830               vfloat glv = rbintv * crlv;
 1831               vfloat gl2v = LC2VFU(cfa[indx - 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - 1)]));
 1832               glv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), glv, gl2v);
 1833 
 1834               vfloat crrv = vmul2f(LC2VFU(cfa[indx + 1])) / (epsv + rbintv + LVFU(rbint[(indx1 + 1)]));
 1835               vfloat grv = rbintv * crrv;
 1836               vfloat gr2v = LC2VFU(cfa[indx + 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + 1)]));
 1837               grv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), grv, gr2v);
 1838 
 1839               vfloat Ginthv = (LC2VFU(dirwts1[indx - 1]) * grv + LC2VFU(dirwts1[indx + 1]) * glv)
 1840                               / (LC2VFU(dirwts1[indx - 1]) + LC2VFU(dirwts1[indx + 1]));
 1841               vfloat Gint1h = ULIMV(Ginthv, LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1]));
 1842               vfloat hwtv = vmul2f(rbintv - Ginthv) / (epsv + Ginthv + rbintv);
 1843               vfloat Gint2h = vintpf(hwtv, Ginthv, Gint1h);
 1844               Gint1h = vself(vmaskf_lt(vmul2f(Ginthv), rbintv), Gint1h, Gint2h);
 1845               Ginthv = vself(vmaskf_lt(Ginthv, rbintv), Gint1h, Ginthv);
 1846               Ginthv = vself(vmaskf_gt(Ginthv, clip_ptv),
 1847                              ULIMV(Ginthv, LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1])), Ginthv);
 1848 
 1849               vfloat greenv
 1850                   = vself(copymask, vintpf(LVFU(hvwt[indx1]), Gintvv, Ginthv), LC2VFU(rgbgreen[indx]));
 1851               STC2VFU(rgbgreen[indx], greenv);
 1852 
 1853               STVFU(Dgrb[0][indx1], vself(copymask, greenv - LC2VFU(cfa[indx]), LVFU(Dgrb[0][indx1])));
 1854             }
 1855           }
 1856 
 1857 #else
 1858 
 1859           for(int cc = 12 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 12;
 1860               cc += 2, indx += 2, indx1++)
 1861           {
 1862 
 1863             if(fabsf(0.5f - pmwt[indx >> 1]) < fabsf(0.5f - hvwt[indx >> 1]))
 1864             {
 1865               continue;
 1866             }
 1867 
 1868             // now interpolate G vertically/horizontally using R+B values
 1869             // unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts
 1870 
 1871             // colour ratios for G interpolation
 1872             float cru = cfa[indx - v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - v1)]);
 1873             float crd = cfa[indx + v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + v1)]);
 1874             float crl = cfa[indx - 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - 1)]);
 1875             float crr = cfa[indx + 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + 1)]);
 1876 
 1877             // interpolation of G in four directions
 1878             float gu, gd, gl, gr;
 1879 
 1880             // interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
 1881             if(fabsf(1.f - cru) < arthresh)
 1882             {
 1883               gu = rbint[indx1] * cru;
 1884             }
 1885             else
 1886             {
 1887               gu = cfa[indx - v1] + xdiv2f(rbint[indx1] - rbint[(indx1 - v1)]);
 1888             }
 1889 
 1890             if(fabsf(1.f - crd) < arthresh)
 1891             {
 1892               gd = rbint[indx1] * crd;
 1893             }
 1894             else
 1895             {
 1896               gd = cfa[indx + v1] + xdiv2f(rbint[indx1] - rbint[(indx1 + v1)]);
 1897             }
 1898 
 1899             if(fabsf(1.f - crl) < arthresh)
 1900             {
 1901               gl = rbint[indx1] * crl;
 1902             }
 1903             else
 1904             {
 1905               gl = cfa[indx - 1] + xdiv2f(rbint[indx1] - rbint[(indx1 - 1)]);
 1906             }
 1907 
 1908             if(fabsf(1.f - crr) < arthresh)
 1909             {
 1910               gr = rbint[indx1] * crr;
 1911             }
 1912             else
 1913             {
 1914               gr = cfa[indx + 1] + xdiv2f(rbint[indx1] - rbint[(indx1 + 1)]);
 1915             }
 1916 
 1917             // interpolated G via adaptive weights of cardinal evaluations
 1918             float Gintv = (dirwts0[indx - v1] * gd + dirwts0[indx + v1] * gu)
 1919                           / (dirwts0[indx + v1] + dirwts0[indx - v1]);
 1920             float Ginth
 1921                 = (dirwts1[indx - 1] * gr + dirwts1[indx + 1] * gl) / (dirwts1[indx - 1] + dirwts1[indx + 1]);
 1922 
 1923             // bound the interpolation in regions of high saturation
 1924             if(Gintv < rbint[indx1])
 1925             {
 1926               if(2 * Gintv < rbint[indx1])
 1927               {
 1928                 Gintv = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]);
 1929               }
 1930               else
 1931               {
 1932                 float vwt = 2.0 * (rbint[indx1] - Gintv) / (eps + Gintv + rbint[indx1]);
 1933                 Gintv = vwt * Gintv + (1.f - vwt) * ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]);
 1934               }
 1935             }
 1936 
 1937             if(Ginth < rbint[indx1])
 1938             {
 1939               if(2 * Ginth < rbint[indx1])
 1940               {
 1941                 Ginth = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]);
 1942               }
 1943               else
 1944               {
 1945                 float hwt = 2.0 * (rbint[indx1] - Ginth) / (eps + Ginth + rbint[indx1]);
 1946                 Ginth = hwt * Ginth + (1.f - hwt) * ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]);
 1947               }
 1948             }
 1949 
 1950             if(Ginth > clip_pt)
 1951             {
 1952               Ginth = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]);
 1953             }
 1954 
 1955             if(Gintv > clip_pt)
 1956             {
 1957               Gintv = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]);
 1958             }
 1959 
 1960             rgbgreen[indx] = Ginth * (1.f - hvwt[indx1]) + Gintv * hvwt[indx1];
 1961             Dgrb[0][indx >> 1] = rgbgreen[indx] - cfa[indx];
 1962           }
 1963 
 1964 #endif
 1965 
 1966         // end of diagonal interpolation correction
 1967 
 1968         // fancy chrominance interpolation
 1969         //(ey,ex) is location of R site
 1970         for(int rr = 13 - ey; rr < rr1 - 12; rr += 2)
 1971           for(int indx1 = (rr * ts + 13 - ex) >> 1; indx1<(rr * ts + cc1 - 12)>> 1; indx1++)
 1972           {                                  // B coset
 1973             Dgrb[1][indx1] = Dgrb[0][indx1]; // split out G-B from G-R
 1974             Dgrb[0][indx1] = 0;
 1975           }
 1976 
 1977 #ifdef __SSE2__
 1978         vfloat oned325v = F2V(1.325f);
 1979         vfloat zd175v = F2V(0.175f);
 1980         vfloat zd075v = F2V(0.075f);
 1981 #endif
 1982 
 1983         for(int rr = 14; rr < rr1 - 14; rr++)
 1984 #ifdef __SSE2__
 1985           for(int cc = 14 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, c = 1 - FC(rr, cc, filters) / 2;
 1986               cc < cc1 - 14; cc += 8, indx += 8)
 1987           {
 1988             vfloat tempv = epsv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m1) >> 1]));
 1989             vfloat temp2v = epsv + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p1) >> 1]));
 1990             vfloat wtnwv
 1991                 = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1]))
 1992                           + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])));
 1993             vfloat wtnev
 1994                 = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1]))
 1995                           + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1])));
 1996             vfloat wtswv
 1997                 = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1]))
 1998                           + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1])));
 1999             vfloat wtsev
 2000                 = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1]))
 2001                           + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1])));
 2002 
 2003             STVFU(Dgrb[c][indx >> 1], (wtnwv * (oned325v * LVFU(Dgrb[c][(indx - m1) >> 1])
 2004                                                 - zd175v * LVFU(Dgrb[c][(indx - m3) >> 1])
 2005                                                 - zd075v * (LVFU(Dgrb[c][(indx - m1 - 2) >> 1])
 2006                                                             + LVFU(Dgrb[c][(indx - m1 - v2) >> 1])))
 2007                                        + wtnev * (oned325v * LVFU(Dgrb[c][(indx + p1) >> 1])
 2008                                                   - zd175v * LVFU(Dgrb[c][(indx + p3) >> 1])
 2009                                                   - zd075v * (LVFU(Dgrb[c][(indx + p1 + 2) >> 1])
 2010                                                               + LVFU(Dgrb[c][(indx + p1 + v2) >> 1])))
 2011                                        + wtswv * (oned325v * LVFU(Dgrb[c][(indx - p1) >> 1])
 2012                                                   - zd175v * LVFU(Dgrb[c][(indx - p3) >> 1])
 2013                                                   - zd075v * (LVFU(Dgrb[c][(indx - p1 - 2) >> 1])
 2014                                                               + LVFU(Dgrb[c][(indx - p1 - v2) >> 1])))
 2015                                        + wtsev * (oned325v * LVFU(Dgrb[c][(indx + m1) >> 1])
 2016                                                   - zd175v * LVFU(Dgrb[c][(indx + m3) >> 1])
 2017                                                   - zd075v * (LVFU(Dgrb[c][(indx + m1 + 2) >> 1])
 2018                                                               + LVFU(Dgrb[c][(indx + m1 + v2) >> 1]))))
 2019                                           / (wtnwv + wtnev + wtswv + wtsev));
 2020           }
 2021 
 2022 #else
 2023 
 2024           for(int cc = 14 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, c = 1 - FC(rr, cc, filters) / 2;
 2025               cc < cc1 - 14; cc += 2, indx += 2)
 2026           {
 2027             float wtnw = 1.f / (eps + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m1) >> 1])
 2028                                 + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx - m3) >> 1])
 2029                                 + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m3) >> 1]));
 2030             float wtne = 1.f / (eps + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p1) >> 1])
 2031                                 + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx + p3) >> 1])
 2032                                 + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p3) >> 1]));
 2033             float wtsw = 1.f / (eps + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p1) >> 1])
 2034                                 + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + m3) >> 1])
 2035                                 + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p3) >> 1]));
 2036             float wtse = 1.f / (eps + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m1) >> 1])
 2037                                 + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - p3) >> 1])
 2038                                 + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m3) >> 1]));
 2039 
 2040             Dgrb[c][indx >> 1]
 2041                 = (wtnw * (1.325f * Dgrb[c][(indx - m1) >> 1] - 0.175f * Dgrb[c][(indx - m3) >> 1]
 2042                            - 0.075f * Dgrb[c][(indx - m1 - 2) >> 1] - 0.075f * Dgrb[c][(indx - m1 - v2) >> 1])
 2043                    + wtne * (1.325f * Dgrb[c][(indx + p1) >> 1] - 0.175f * Dgrb[c][(indx + p3) >> 1]
 2044                              - 0.075f * Dgrb[c][(indx + p1 + 2) >> 1]
 2045                              - 0.075f * Dgrb[c][(indx + p1 + v2) >> 1])
 2046                    + wtsw * (1.325f * Dgrb[c][(indx - p1) >> 1] - 0.175f * Dgrb[c][(indx - p3) >> 1]
 2047                              - 0.075f * Dgrb[c][(indx - p1 - 2) >> 1]
 2048                              - 0.075f * Dgrb[c][(indx - p1 - v2) >> 1])
 2049                    + wtse * (1.325f * Dgrb[c][(indx + m1) >> 1] - 0.175f * Dgrb[c][(indx + m3) >> 1]
 2050                              - 0.075f * Dgrb[c][(indx + m1 + 2) >> 1]
 2051                              - 0.075f * Dgrb[c][(indx + m1 + v2) >> 1]))
 2052                   / (wtnw + wtne + wtsw + wtse);
 2053           }
 2054 
 2055 #endif
 2056 
 2057 #ifdef __SSE2__
 2058         int offset;
 2059         vfloat twov = F2V(2.f);
 2060         vmask selmask;
 2061 
 2062         if((FC(16, 2, filters) & 1) == 1)
 2063         {
 2064           selmask = _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0);
 2065           offset = 1;
 2066         }
 2067         else
 2068         {
 2069           selmask = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
 2070           offset = 0;
 2071         }
 2072 
 2073 #endif
 2074 
 2075         for(int rr = 16; rr < rr1 - 16; rr++)
 2076         {
 2077           int row = rr + top;
 2078           int col = left + 16;
 2079           int indx = rr * ts + 16;
 2080 #ifdef __SSE2__
 2081           offset = 1 - offset;
 2082           selmask = vnotm(selmask);
 2083 
 2084           for(; indx < rr * ts + cc1 - 18 - (cc1 & 1); indx += 4, col += 4)
 2085           {
 2086             if(col < roi_out->width && row < roi_out->height)
 2087             {
 2088               vfloat greenv = LVF(rgbgreen[indx]);
 2089               vfloat temp00v = vdup(LVFU(hvwt[(indx - v1) >> 1]));
 2090               vfloat temp01v = vdup(LVFU(hvwt[(indx + v1) >> 1]));
 2091               vfloat tempv = onev / (temp00v + twov - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1]))
 2092                                      - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1])) + temp01v);
 2093 
 2094               vfloat redv1 = greenv
 2095                              - (temp00v * vdup(LVFU(Dgrb[0][(indx - v1) >> 1]))
 2096                                 + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1])))
 2097                                       * vdup(LVFU(Dgrb[0][(indx + 1 + offset) >> 1]))
 2098                                 + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1])))
 2099                                       * vdup(LVFU(Dgrb[0][(indx - 1 + offset) >> 1]))
 2100                                 + temp01v * vdup(LVFU(Dgrb[0][(indx + v1) >> 1])))
 2101                                    * tempv;
 2102               vfloat bluev1 = greenv
 2103                               - (temp00v * vdup(LVFU(Dgrb[1][(indx - v1) >> 1]))
 2104                                  + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1])))
 2105                                        * vdup(LVFU(Dgrb[1][(indx + 1 + offset) >> 1]))
 2106                                  + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1])))
 2107                                        * vdup(LVFU(Dgrb[1][(indx - 1 + offset) >> 1]))
 2108                                  + temp01v * vdup(LVFU(Dgrb[1][(indx + v1) >> 1])))
 2109                                     * tempv;
 2110               vfloat redv2 = greenv - vdup(LVFU(Dgrb[0][indx >> 1]));
 2111               vfloat bluev2 = greenv - vdup(LVFU(Dgrb[1][indx >> 1]));
 2112               __attribute__((aligned(16))) float _r[4];
 2113               __attribute__((aligned(16))) float _b[4];
 2114               STVF(*_r, vself(selmask, redv1, redv2));
 2115               STVF(*_b, vself(selmask, bluev1, bluev2));
 2116               for(int c = 0; c < 4; c++)
 2117               {
 2118                 out[(row * roi_out->width + col + c) * 4] = clampnan(_r[c], 0.0, 1.0);
 2119                 out[(row * roi_out->width + col + c) * 4 + 2] = clampnan(_b[c], 0.0, 1.0);
 2120               }
 2121             }
 2122           }
 2123 
 2124           if(offset == 0)
 2125           {
 2126             for(; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++)
 2127             {
 2128               if(col < roi_out->width && row < roi_out->height)
 2129               {
 2130                 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
 2131                                     - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
 2132                 out[(row * roi_out->width + col) * 4]
 2133                     = clampnan(rgbgreen[indx]
 2134                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
 2135                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
 2136                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
 2137                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
 2138                                          * temp,
 2139                                0.0, 1.0);
 2140                 out[(row * roi_out->width + col) * 4 + 2]
 2141                     = clampnan(rgbgreen[indx]
 2142                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
 2143                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
 2144                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
 2145                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
 2146                                          * temp,
 2147                                0.0, 1.0);
 2148               }
 2149 
 2150               indx++;
 2151               col++;
 2152               if(col < roi_out->width && row < roi_out->height)
 2153               {
 2154                 out[(row * roi_out->width + col) * 4]
 2155                     = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0, 1.0);
 2156                 out[(row * roi_out->width + col) * 4 + 2]
 2157                     = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0, 1.0);
 2158               }
 2159             }
 2160 
 2161             if(cc1 & 1)
 2162             { // width of tile is odd
 2163               if(col < roi_out->width && row < roi_out->height)
 2164               {
 2165                 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
 2166                                     - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
 2167                 out[(row * roi_out->width + col) * 4]
 2168                     = clampnan(rgbgreen[indx]
 2169                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
 2170                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
 2171                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
 2172                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
 2173                                          * temp,
 2174                                0.0, 1.0);
 2175                 out[(row * roi_out->width + col) * 4 + 2]
 2176                     = clampnan(rgbgreen[indx]
 2177                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
 2178                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
 2179                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
 2180                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
 2181                                          * temp,
 2182                                0.0, 1.0);
 2183               }
 2184             }
 2185           }
 2186           else
 2187           {
 2188             for(; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++)
 2189             {
 2190               if(col < roi_out->width && row < roi_out->height)
 2191               {
 2192                 out[(row * roi_out->width + col) * 4]
 2193                     = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0, 1.0);
 2194                 out[(row * roi_out->width + col) * 4 + 2]
 2195                     = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0, 1.0);
 2196               }
 2197 
 2198               indx++;
 2199               col++;
 2200               if(col < roi_out->width && row < roi_out->height)
 2201               {
 2202                 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
 2203                                     - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
 2204                 out[(row * roi_out->width + col) * 4]
 2205                     = clampnan(rgbgreen[indx]
 2206                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
 2207                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
 2208                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
 2209                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
 2210                                          * temp,
 2211                                0.0, 1.0);
 2212                 out[(row * roi_out->width + col) * 4 + 2]
 2213                     = clampnan(rgbgreen[indx]
 2214                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
 2215                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
 2216                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
 2217                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
 2218                                          * temp,
 2219                                0.0, 1.0);
 2220               }
 2221             }
 2222 
 2223             if(cc1 & 1)
 2224             { // width of tile is odd
 2225               if(col < roi_out->width && row < roi_out->height)
 2226               {
 2227                 out[(row * roi_out->width + col) * 4]
 2228                     = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0, 1.0);
 2229                 out[(row * roi_out->width + col) * 4 + 2]
 2230                     = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0, 1.0);
 2231               }
 2232             }
 2233           }
 2234 
 2235 #else
 2236 
 2237           if((FC(rr, 2, filters) & 1) == 1)
 2238           {
 2239             for(; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++)
 2240             {
 2241               if(col < roi_out->width && row < roi_out->height)
 2242               {
 2243                 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
 2244                                     - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
 2245                 out[(row * roi_out->width + col) * 4]
 2246                     = clampnan(rgbgreen[indx]
 2247                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
 2248                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
 2249                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
 2250                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
 2251                                          * temp,
 2252                                0.0, 1.0);
 2253                 out[(row * roi_out->width + col) * 4 + 2]
 2254                     = clampnan(rgbgreen[indx]
 2255                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
 2256                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
 2257                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
 2258                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
 2259                                          * temp,
 2260                                0.0, 1.0);
 2261               }
 2262 
 2263               indx++;
 2264               col++;
 2265               if(col < roi_out->width && row < roi_out->height)
 2266               {
 2267                 out[(row * roi_out->width + col) * 4]
 2268                     = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0, 1.0);
 2269                 out[(row * roi_out->width + col) * 4 + 2]
 2270                     = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0, 1.0);
 2271               }
 2272             }
 2273 
 2274             if(cc1 & 1)
 2275             { // width of tile is odd
 2276               if(col < roi_out->width && row < roi_out->height)
 2277               {
 2278                 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
 2279                                     - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
 2280                 out[(row * roi_out->width + col) * 4]
 2281                     = clampnan(rgbgreen[indx]
 2282                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
 2283                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
 2284                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
 2285                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
 2286                                          * temp,
 2287                                0.0, 1.0);
 2288                 out[(row * roi_out->width + col) * 4 + 2]
 2289                     = clampnan(rgbgreen[indx]
 2290                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
 2291                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
 2292                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
 2293                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
 2294                                          * temp,
 2295                                0.0, 1.0);
 2296               }
 2297             }
 2298           }
 2299           else
 2300           {
 2301             for(; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++)
 2302             {
 2303               if(col < roi_out->width && row < roi_out->height)
 2304               {
 2305                 out[(row * roi_out->width + col) * 4]
 2306                     = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0f, 1.0f);
 2307                 out[(row * roi_out->width + col) * 4 + 2]
 2308                     = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0f, 1.0f);
 2309               }
 2310 
 2311               indx++;
 2312               col++;
 2313               if(col < roi_out->width && row < roi_out->height)
 2314               {
 2315                 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
 2316                                     - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
 2317 
 2318                 out[(row * roi_out->width + col) * 4]
 2319                     = clampnan(rgbgreen[indx]
 2320                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
 2321                                       + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
 2322                                       + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
 2323                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
 2324                                          * temp,
 2325                                0.0f, 1.0f);
 2326 
 2327                 out[(row * roi_out->width + col) * 4 + 2]
 2328                     = clampnan(rgbgreen[indx]
 2329                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
 2330                                       + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
 2331                                       + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
 2332                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
 2333                                          * temp,
 2334                                0.0f, 1.0f);
 2335               }
 2336             }
 2337 
 2338             if(cc1 & 1)
 2339             { // width of tile is odd
 2340               if(col < roi_out->width && row < roi_out->height)
 2341               {
 2342                 out[(row * roi_out->width + col) * 4]
 2343                     = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0f, 1.0f);
 2344                 out[(row * roi_out->width + col) * 4 + 2]
 2345                     = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0f, 1.0f);
 2346               }
 2347             }
 2348           }
 2349 
 2350 #endif
 2351         }
 2352 
 2353         // copy smoothed results back to image matrix
 2354         for(int rr = 16; rr < rr1 - 16; rr++)
 2355         {
 2356           int row = rr + top;
 2357           int cc = 16;
 2358           // TODO (darktable): we have the pixel colors interleaved so writing them in blocks using SSE2 is
 2359           // not possible. or is it?
 2360           // #ifdef __SSE2__
 2361           //
 2362           //           for(; cc < cc1 - 19; cc += 4)
 2363           //           {
 2364           //             STVFU(out[(row * roi_out->width + (cc + left))  * 4 + 1], LVF(rgbgreen[rr * ts +
 2365           //             cc]));
 2366           //           }
 2367           //
 2368           // #endif
 2369 
 2370           for(; cc < cc1 - 16; cc++)
 2371           {
 2372             int col = cc + left;
 2373             int indx = rr * ts + cc;
 2374             if(col < roi_out->width && row < roi_out->height)
 2375               out[(row * roi_out->width + col) * 4 + 1] = clampnan(rgbgreen[indx], 0.0f, 1.0f);
 2376           }
 2377         }
 2378 
 2379         //         if(plistener)
 2380         //         {
 2381         //           progresscounter++;
 2382         //
 2383         //           if(progresscounter % 32 == 0)
 2384         //           {
 2385         // #ifdef _OPENMP
 2386         // #pragma omp critical(amazeprogress)
 2387         // #endif
 2388         //             {
 2389         //               progress += (double)32 * ((ts - 32) * (ts - 32)) / (height * width);
 2390         //               progress = progress > 1.0 ? 1.0 : progress;
 2391         //               plistener->setProgress(progress);
 2392         //             }
 2393         //           }
 2394         //         }
 2395       }
 2396     } // end of main loop
 2397 
 2398     // clean up
 2399     free(buffer);
 2400   }
 2401 
 2402   //   if(plistener)
 2403   //   {
 2404   //     plistener->setProgress(1.0);
 2405   //   }
 2406 }
 2407 // }
 2408 
 2409 /*==================================================================================
 2410  * end of raw therapee code
 2411  *==================================================================================*/
 2412 
 2413 // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.sh
 2414 // vim: shiftwidth=2 expandtab tabstop=2 cindent
 2415 // kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;