"Fossies" - the Fresh Open Source Software Archive

Member "darktable-2.6.3/data/kernels/colorreconstruction.cl" (20 Oct 2019, 9505 Bytes) of package /linux/misc/darktable-2.6.3.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Lisp source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file.

    1 /*
    2     This file is part of darktable,
    3     copyright (c) 2012 johannes hanika.
    4     copyright (c) 2015 Ulrich Pegelow.
    5 
    6     darktable is free software: you can redistribute it and/or modify
    7     it under the terms of the GNU General Public License as published by
    8     the Free Software Foundation, either version 3 of the License, or
    9     (at your option) any later version.
   10 
   11     darktable is distributed in the hope that it will be useful,
   12     but WITHOUT ANY WARRANTY; without even the implied warranty of
   13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   14     GNU General Public License for more details.
   15 
   16     You should have received a copy of the GNU General Public License
   17     along with darktable.  If not, see <http://www.gnu.org/licenses/>.
   18 */
   19 
   20 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
   21 
   22 #include "common.h"
   23 
   24 typedef enum dt_iop_colorreconstruct_precedence_t
   25 {
   26   COLORRECONSTRUCT_PRECEDENCE_NONE,
   27   COLORRECONSTRUCT_PRECEDENCE_CHROMA,
   28   COLORRECONSTRUCT_PRECEDENCE_HUE
   29 } dt_iop_colorreconstruct_precedence_t;
   30 
   31 float4
   32 image_to_grid(
   33     const float4 p,
   34     const int4   size,
   35     const float4 sigma)
   36 {
   37   return (float4)(
   38     clamp(p.x/sigma.x, 0.0f, size.x-1.0f),
   39     clamp(p.y/sigma.y, 0.0f, size.y-1.0f),
   40     clamp(p.z/sigma.z, 0.0f, size.z-1.0f), 0.0f);
   41 }
   42 
   43 float2
   44 grid_rescale(
   45     const int2 pxy,
   46     const int2 roixy,
   47     const int2 bxy,
   48     const float scale)
   49 {
   50   return convert_float2(roixy + pxy) * scale - convert_float2(bxy);
   51 }
   52 
   53 
   54 void
   55 atomic_add_f(
   56     global float *val,
   57     const  float  delta)
   58 {
   59 #ifdef NVIDIA_SM_20
   60   // buys me another 3x--10x over the `algorithmic' improvements in the splat kernel below,
   61   // depending on configuration (sigma_s and sigma_r)
   62   float res = 0;
   63   asm volatile ("atom.global.add.f32 %0, [%1], %2;" : "=f"(res) : "l"(val), "f"(delta));
   64 
   65 #else
   66   union
   67   {
   68     float f;
   69     unsigned int i;
   70   }
   71   old_val;
   72   union
   73   {
   74     float f;
   75     unsigned int i;
   76   }
   77   new_val;
   78 
   79   global volatile unsigned int *ival = (global volatile unsigned int *)val;
   80 
   81   do
   82   {
   83     // the following is equivalent to old_val.f = *val. however, as according to the opencl standard
   84     // we can not rely on global buffer val to be consistently cached (relaxed memory consistency) we 
   85     // access it via a slower but consistent atomic operation.
   86     old_val.i = atom_add(ival, 0);
   87     new_val.f = old_val.f + delta;
   88   }
   89   while (atom_cmpxchg (ival, old_val.i, new_val.i) != old_val.i);
   90 #endif
   91 }
   92 
   93 kernel void
   94 colorreconstruction_zero(
   95     global float  *grid,
   96     const  int     width,
   97     const  int     height)
   98 {
   99   const int x = get_global_id(0);
  100   const int y = get_global_id(1);
  101   if(x >= width || y >= height) return;
  102   grid[y * width + x] = 0.0f;
  103 }
  104 
  105 
  106 kernel void
  107 colorreconstruction_splat(
  108     read_only image2d_t  in,
  109     global float        *grid,
  110     const int            width,
  111     const int            height,
  112     const int            sizex,
  113     const int            sizey,
  114     const int            sizez,
  115     const float          sigma_s,
  116     const float          sigma_r,
  117     const float          threshold,
  118     const int            precedence,
  119     const float4         params,
  120     local int            *gi,
  121     local float4         *accum)
  122 {
  123   const int x = get_global_id(0);
  124   const int y = get_global_id(1);
  125   const int lszx = get_local_size(0);
  126   const int i = get_local_id(0);
  127   const int j = get_local_id(1);
  128   int li = lszx*j + i;
  129 
  130   int4   size  = (int4)(sizex, sizey, sizez, 0);
  131   float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0);
  132 
  133   const float4 pixel = read_imagef (in, samplerc, (int2)(x, y));
  134   float weight, m;
  135 
  136   switch(precedence)
  137   {
  138     case COLORRECONSTRUCT_PRECEDENCE_CHROMA:
  139       weight = sqrt(pixel.y * pixel.y + pixel.z * pixel.z);
  140       break;
  141 
  142     case COLORRECONSTRUCT_PRECEDENCE_HUE:
  143       m = atan2(pixel.z, pixel.y) - params.x;
  144       // readjust m into [-pi, +pi] interval
  145       m = m > M_PI_F ? m - 2*M_PI_F : (m < -M_PI_F ? m + 2*M_PI_F : m);
  146       weight = exp(-m*m/params.y);
  147       break;
  148       
  149     case COLORRECONSTRUCT_PRECEDENCE_NONE:
  150     default:
  151       weight = 1.0f;
  152       break;
  153   }
  154 
  155   if(x < width && y < height)
  156   {
  157     // splat into downsampled grid
  158     float4 p = (float4)(x, y, pixel.x, 0);
  159     float4 gridp = image_to_grid(p, size, sigma);
  160     
  161     // closest integer splatting:    
  162     int4 xi = clamp(convert_int4(round(gridp)), 0, size - 1);
  163    
  164     // first accumulate into local memory
  165     gi[li] = xi.x + size.x*xi.y + size.x*size.y*xi.z;
  166     accum[li] = pixel.x < threshold ? weight * (float4)(pixel.x, pixel.y, pixel.z, 1.0f) : (float4)0.0f;
  167   }
  168   else
  169   {
  170     gi[li] = -1;
  171   }
  172 
  173   barrier(CLK_LOCAL_MEM_FENCE);
  174 
  175   if(i != 0) return;
  176 
  177   // non-logarithmic reduction..
  178   // but we also need to take care of where to accumulate (only merge where gi[.] == gi[.])
  179 
  180   li = lszx*j;
  181   int oldgi = gi[li];
  182   float4 tmp = accum[li];
  183  
  184   for(int ii=1; ii < lszx && oldgi != -1; ii++)
  185   {
  186     li = lszx*j + ii;
  187     if(gi[li] != oldgi)
  188     {
  189       atomic_add_f(grid + 4 * oldgi,              tmp.x);
  190       atomic_add_f(grid + 4 * oldgi + 1,          tmp.y);      
  191       atomic_add_f(grid + 4 * oldgi + 2,          tmp.z);
  192       atomic_add_f(grid + 4 * oldgi + 3,          tmp.w);      
  193       
  194       oldgi = gi[li];
  195       tmp = accum[li];
  196     }
  197     else
  198     {
  199       tmp = accum[li];
  200     }
  201   }
  202 
  203   if(oldgi == -1) return;
  204 
  205   atomic_add_f(grid + 4 * oldgi,              tmp.x);
  206   atomic_add_f(grid + 4 * oldgi + 1,          tmp.y);      
  207   atomic_add_f(grid + 4 * oldgi + 2,          tmp.z);
  208   atomic_add_f(grid + 4 * oldgi + 3,          tmp.w);      
  209 }
  210 
  211 
  212 kernel void
  213 colorreconstruction_blur_line(
  214     global const float  *ibuf,
  215     global float        *obuf,
  216     const int           offset1,
  217     const int           offset2,
  218     const int           offset3,
  219     const int           size1,
  220     const int           size2,
  221     const int           size3)
  222 {
  223   const int k = get_global_id(0);
  224   const int j = get_global_id(1);
  225   if(k >= size1 || j >= size2) return;
  226 
  227   const float w0 = 6.0f/16.0f;
  228   const float w1 = 4.0f/16.0f;
  229   const float w2 = 1.0f/16.0f;
  230   int index = k*offset1 + j*offset2;
  231 
  232   float4 tmp1 = vload4(index, ibuf);
  233   float4 out = vload4(index, ibuf)*w0 + vload4(index + offset3, ibuf)*w1 + vload4(index + 2*offset3, ibuf)*w2;
  234   vstore4(out, index, obuf);
  235   index += offset3;
  236   float4 tmp2 = vload4(index, ibuf);
  237   out = vload4(index, ibuf)*w0 + (vload4(index + offset3, ibuf) + tmp1)*w1 + vload4(index + 2*offset3, ibuf)*w2;
  238   vstore4(out, index, obuf);  
  239   index += offset3;
  240   for(int i=2;i<size3-2;i++)
  241   {
  242     const float4 tmp3 = vload4(index, ibuf);
  243     out = vload4(index, ibuf)*w0
  244         + (vload4(index + offset3, ibuf)   + tmp2)*w1
  245         + (vload4(index + 2*offset3, ibuf) + tmp1)*w2;
  246     vstore4(out, index, obuf);        
  247     index += offset3;
  248     tmp1 = tmp2;
  249     tmp2 = tmp3;
  250   }
  251   const float4 tmp3 = vload4(index, ibuf);
  252   out = vload4(index, ibuf)*w0 + (vload4(index + offset3, ibuf) + tmp2)*w1 + tmp1*w2;
  253   vstore4(out, index, obuf);    
  254   index += offset3;
  255   out = vload4(index, ibuf)*w0 + tmp3*w1 + tmp2*w2;
  256   vstore4(out, index, obuf);
  257 }
  258 
  259 
  260 kernel void
  261 colorreconstruction_slice(
  262     read_only  image2d_t in,
  263     write_only image2d_t out,
  264     global float        *grid,
  265     const int            width,
  266     const int            height,
  267     const int            sizex,
  268     const int            sizey,
  269     const int            sizez,
  270     const float          sigma_s,
  271     const float          sigma_r,
  272     const float          threshold,
  273     const int2           bxy,
  274     const int2           roixy,
  275     const float          scale)
  276 {
  277   const int x = get_global_id(0);
  278   const int y = get_global_id(1);
  279   if(x >= width || y >= height) return;
  280 
  281   const int ox = 1;
  282   const int oy = sizex;
  283   const int oz = sizey*sizex;
  284 
  285   int4   size  = (int4)(sizex, sizey, sizez, 0);
  286   float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0);
  287 
  288   float4 pixel = read_imagef (in, samplerc, (int2)(x, y));
  289   float blend = clamp(20.0f / threshold * pixel.x - 19.0f, 0.0f, 1.0f);
  290   float2 pxy = grid_rescale((int2)(x, y), roixy, bxy, scale);
  291   float4 p = (float4)(pxy.x, pxy.y, pixel.x, 0);
  292   float4 gridp = image_to_grid(p, size, sigma);
  293   int4 gridi = min(size - 2, (int4)(gridp.x, gridp.y, gridp.z, 0));
  294   float fx = gridp.x - gridi.x;
  295   float fy = gridp.y - gridi.y;
  296   float fz = gridp.z - gridi.z;
  297 
  298   // trilinear lookup (wouldn't read/write access to 3d textures be cool)
  299   // could actually use an array of 2d textures, these only require opencl 1.2
  300   const int gi = gridi.x + sizex*(gridi.y + sizey*gridi.z);
  301   const float4 opixel =
  302         vload4(gi, grid)          * (1.0f - fx) * (1.0f - fy) * (1.0f - fz) +
  303         vload4(gi+ox, grid)       * (       fx) * (1.0f - fy) * (1.0f - fz) +
  304         vload4(gi+oy, grid)       * (1.0f - fx) * (       fy) * (1.0f - fz) +
  305         vload4(gi+ox+oy, grid)    * (       fx) * (       fy) * (1.0f - fz) +
  306         vload4(gi+oz, grid)       * (1.0f - fx) * (1.0f - fy) * (       fz) +
  307         vload4(gi+ox+oz, grid)    * (       fx) * (1.0f - fy) * (       fz) +
  308         vload4(gi+oy+oz, grid)    * (1.0f - fx) * (       fy) * (       fz) +
  309         vload4(gi+ox+oy+oz, grid) * (       fx) * (       fy) * (       fz);
  310         
  311   const float opixelx = fmax(opixel.x, 0.01f);
  312   pixel.y = (opixel.w > 0.0f) ? pixel.y * (1.0f - blend) + opixel.y * pixel.x/opixelx * blend : pixel.y;
  313   pixel.z = (opixel.w > 0.0f) ? pixel.z * (1.0f - blend) + opixel.z * pixel.x/opixelx * blend : pixel.z;  
  314 
  315   write_imagef (out, (int2)(x, y), pixel);
  316 }
  317