w32tex
About: TeX Live provides a comprehensive TeX system including all the major TeX-related programs, macro packages, and fonts that are free software; w32tex contains the corresponding Windows sources.

pixman-sse2.c
1 /*
2  * Copyright © 2008 Rodrigo Kumpera
3  * Copyright © 2008 André Tupinambá
4  *
5  * Permission to use, copy, modify, distribute, and sell this software and its
6  * documentation for any purpose is hereby granted without fee, provided that
7  * the above copyright notice appear in all copies and that both that
8  * copyright notice and this permission notice appear in supporting
9  * documentation, and that the name of Red Hat not be used in advertising or
10  * publicity pertaining to distribution of the software without specific,
11  * written prior permission. Red Hat makes no representations about the
12  * suitability of this software for any purpose. It is provided "as is"
13  * without express or implied warranty.
14  *
15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22  * SOFTWARE.
23  *
24  * Author: Rodrigo Kumpera (kumpera@gmail.com)
25  * André Tupinambá (andrelrt@gmail.com)
26  *
27  * Based on work by Owen Taylor and Søren Sandmann
28  */
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32 
33 /* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
34 #define PSHUFD_IS_FAST 0
35 
36 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
37 #include <emmintrin.h> /* for SSE2 intrinsics */
38 #include "pixman-private.h"
39 #include "pixman-combine32.h"
40 #include "pixman-inlines.h"
41 
42 static __m128i mask_0080;
43 static __m128i mask_00ff;
44 static __m128i mask_0101;
45 static __m128i mask_ffff;
46 static __m128i mask_ff000000;
47 static __m128i mask_alpha;
48 
49 static __m128i mask_565_r;
50 static __m128i mask_565_g1, mask_565_g2;
51 static __m128i mask_565_b;
52 static __m128i mask_red;
53 static __m128i mask_green;
54 static __m128i mask_blue;
55 
56 static __m128i mask_565_fix_rb;
57 static __m128i mask_565_fix_g;
58 
59 static __m128i mask_565_rb;
60 static __m128i mask_565_pack_multiplier;
61 
62 static force_inline __m128i
63 unpack_32_1x128 (uint32_t data)
64 {
65  return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
66 }
67 
68 static force_inline void
69 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
70 {
71  *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
72  *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
73 }
74 
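/* Note: unpack_565_to_8888() shifts each 5/6-bit field of a r5g6b5 pixel into
 * its 8-bit slot and then ORs the field's top bits back into its low bits
 * (mask_565_fix_rb / mask_565_fix_g), so 0x1f and 0x3f expand to a full 0xff. */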
75 static force_inline __m128i
76 unpack_565_to_8888 (__m128i lo)
77 {
78  __m128i r, g, b, rb, t;
79 
80  r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
81  g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
82  b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
83 
84  rb = _mm_or_si128 (r, b);
85  t = _mm_and_si128 (rb, mask_565_fix_rb);
86  t = _mm_srli_epi32 (t, 5);
87  rb = _mm_or_si128 (rb, t);
88 
89  t = _mm_and_si128 (g, mask_565_fix_g);
90  t = _mm_srli_epi32 (t, 6);
91  g = _mm_or_si128 (g, t);
92 
93  return _mm_or_si128 (rb, g);
94 }
95 
96 static force_inline void
97 unpack_565_128_4x128 (__m128i data,
98  __m128i* data0,
99  __m128i* data1,
100  __m128i* data2,
101  __m128i* data3)
102 {
103  __m128i lo, hi;
104 
105  lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
106  hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
107 
108  lo = unpack_565_to_8888 (lo);
109  hi = unpack_565_to_8888 (hi);
110 
111  unpack_128_2x128 (lo, data0, data1);
112  unpack_128_2x128 (hi, data2, data3);
113 }
114 
115 static force_inline uint16_t
116 pack_565_32_16 (uint32_t pixel)
117 {
118  return (uint16_t) (((pixel >> 8) & 0xf800) |
119  ((pixel >> 5) & 0x07e0) |
120  ((pixel >> 3) & 0x001f));
121 }
122 
123 static force_inline __m128i
124 pack_2x128_128 (__m128i lo, __m128i hi)
125 {
126  return _mm_packus_epi16 (lo, hi);
127 }
128 
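/* Note: pack_565_2packedx128_128() narrows eight x8r8g8b8 values (two
 * registers) to r5g6b5: _mm_madd_epi16 with mask_565_pack_multiplier
 * multiplies the masked red and blue words by per-lane constants and sums
 * adjacent lanes, dropping both fields into their 565 positions in one
 * instruction; green is ORed in, and the shift/shift/packs sequence below
 * stands in for the SSE4.1 _mm_packus_epi32, as the comment there notes. */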
129 static force_inline __m128i
130 pack_565_2packedx128_128 (__m128i lo, __m128i hi)
131 {
132  __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
133  __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
134 
135  __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
136  __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
137 
138  __m128i g0 = _mm_and_si128 (lo, mask_green);
139  __m128i g1 = _mm_and_si128 (hi, mask_green);
140 
141  t0 = _mm_or_si128 (t0, g0);
142  t1 = _mm_or_si128 (t1, g1);
143 
144  /* Simulates _mm_packus_epi32 */
145  t0 = _mm_slli_epi32 (t0, 16 - 5);
146  t1 = _mm_slli_epi32 (t1, 16 - 5);
147  t0 = _mm_srai_epi32 (t0, 16);
148  t1 = _mm_srai_epi32 (t1, 16);
149  return _mm_packs_epi32 (t0, t1);
150 }
151 
152 static force_inline __m128i
153 pack_565_2x128_128 (__m128i lo, __m128i hi)
154 {
155  __m128i data;
156  __m128i r, g1, g2, b;
157 
158  data = pack_2x128_128 (lo, hi);
159 
160  r = _mm_and_si128 (data, mask_565_r);
161  g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
162  g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
163  b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
164 
165  return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
166 }
167 
168 static force_inline __m128i
169 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
170 {
171  return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
172  pack_565_2x128_128 (*xmm2, *xmm3));
173 }
174 
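/* Note: is_opaque()/is_transparent() look only at the alpha bytes of the four
 * packed pixels: _mm_movemask_epi8 gathers one bit per byte, and the 0x8888
 * mask keeps bits 3, 7, 11 and 15, i.e. byte 3 (alpha) of each pixel. */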
175 static force_inline int
176 is_opaque (__m128i x)
177 {
178  __m128i ffs = _mm_cmpeq_epi8 (x, x);
179 
180  return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
181 }
182 
183 static force_inline int
184 is_zero (__m128i x)
185 {
186  return _mm_movemask_epi8 (
187  _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
188 }
189 
190 static force_inline int
191 is_transparent (__m128i x)
192 {
193  return (_mm_movemask_epi8 (
194  _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
195 }
196 
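/* Note: expand_pixel_32_1x128() unpacks one a8r8g8b8 pixel to 16-bit channels
 * and duplicates them into both 64-bit halves of the register;
 * expand_alpha_1x128() then broadcasts word 3 (alpha) across each half with
 * shufflelo/shufflehi. */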
197 static force_inline __m128i
198 expand_pixel_32_1x128 (uint32_t data)
199 {
200  return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
201 }
202 
203 static force_inline __m128i
204 expand_alpha_1x128 (__m128i data)
205 {
206  return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
207  _MM_SHUFFLE (3, 3, 3, 3)),
208  _MM_SHUFFLE (3, 3, 3, 3));
209 }
210 
211 static force_inline void
212 expand_alpha_2x128 (__m128i data_lo,
213  __m128i data_hi,
214  __m128i* alpha_lo,
215  __m128i* alpha_hi)
216 {
217  __m128i lo, hi;
218 
219  lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
220  hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
221 
222  *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
223  *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
224 }
225 
226 static force_inline void
227 expand_alpha_rev_2x128 (__m128i data_lo,
228  __m128i data_hi,
229  __m128i* alpha_lo,
230  __m128i* alpha_hi)
231 {
232  __m128i lo, hi;
233 
234  lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
235  hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
236  *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
237  *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
238 }
239 
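/* Note: the pix_multiply helpers compute (x * a + 0x80) and then keep the
 * high 16 bits of a multiply by 0x0101, which divides by 255 with correct
 * rounding; mask_0080 and mask_0101 hold 0x0080 and 0x0101 in every lane. */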
240 static force_inline void
241 pix_multiply_2x128 (__m128i* data_lo,
242  __m128i* data_hi,
243  __m128i* alpha_lo,
244  __m128i* alpha_hi,
245  __m128i* ret_lo,
246  __m128i* ret_hi)
247 {
248  __m128i lo, hi;
249 
250  lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
251  hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
252  lo = _mm_adds_epu16 (lo, mask_0080);
253  hi = _mm_adds_epu16 (hi, mask_0080);
254  *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
255  *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
256 }
257 
258 static force_inline void
259 pix_add_multiply_2x128 (__m128i* src_lo,
260  __m128i* src_hi,
261  __m128i* alpha_dst_lo,
262  __m128i* alpha_dst_hi,
263  __m128i* dst_lo,
264  __m128i* dst_hi,
265  __m128i* alpha_src_lo,
266  __m128i* alpha_src_hi,
267  __m128i* ret_lo,
268  __m128i* ret_hi)
269 {
270  __m128i t1_lo, t1_hi;
271  __m128i t2_lo, t2_hi;
272 
273  pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
274  pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
275 
276  *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
277  *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
278 }
279 
280 static force_inline void
281 negate_2x128 (__m128i data_lo,
282  __m128i data_hi,
283  __m128i* neg_lo,
284  __m128i* neg_hi)
285 {
286  *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
287  *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
288 }
289 
290 static force_inline void
291 invert_colors_2x128 (__m128i data_lo,
292  __m128i data_hi,
293  __m128i* inv_lo,
294  __m128i* inv_hi)
295 {
296  __m128i lo, hi;
297 
298  lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
299  hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
300  *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
301  *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
302 }
303 
304 static force_inline void
305 over_2x128 (__m128i* src_lo,
306  __m128i* src_hi,
307  __m128i* alpha_lo,
308  __m128i* alpha_hi,
309  __m128i* dst_lo,
310  __m128i* dst_hi)
311 {
312  __m128i t1, t2;
313 
314  negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
315 
316  pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
317 
318  *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
319  *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
320 }
321 
322 static force_inline void
323 over_rev_non_pre_2x128 (__m128i src_lo,
324  __m128i src_hi,
325  __m128i* dst_lo,
326  __m128i* dst_hi)
327 {
328  __m128i lo, hi;
329  __m128i alpha_lo, alpha_hi;
330 
331  expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
332 
333  lo = _mm_or_si128 (alpha_lo, mask_alpha);
334  hi = _mm_or_si128 (alpha_hi, mask_alpha);
335 
336  invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
337 
338  pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
339 
340  over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
341 }
342 
343 static force_inline void
344 in_over_2x128 (__m128i* src_lo,
345  __m128i* src_hi,
346  __m128i* alpha_lo,
347  __m128i* alpha_hi,
348  __m128i* mask_lo,
349  __m128i* mask_hi,
350  __m128i* dst_lo,
351  __m128i* dst_hi)
352 {
353  __m128i s_lo, s_hi;
354  __m128i a_lo, a_hi;
355 
356  pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
357  pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
358 
359  over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
360 }
361 
362 /* load 4 pixels from a 16-byte boundary aligned address */
363 static force_inline __m128i
364 load_128_aligned (const __m128i* src)
365 {
366  return _mm_load_si128 (src);
367 }
368 
369 /* load 4 pixels from an unaligned address */
370 static force_inline __m128i
371 load_128_unaligned (const __m128i* src)
372 {
373  return _mm_loadu_si128 (src);
374 }
375 
376 /* save 4 pixels using Write Combining memory on a 16-byte
377  * boundary aligned address
378  */
379 static force_inline void
380 save_128_write_combining (__m128i* dst,
381  __m128i data)
382 {
383  _mm_stream_si128 (dst, data);
384 }
385 
386 /* save 4 pixels on a 16-byte boundary aligned address */
387 static force_inline void
388 save_128_aligned (__m128i* dst,
389  __m128i data)
390 {
391  _mm_store_si128 (dst, data);
392 }
393 
394 /* save 4 pixels to an unaligned address */
395 static force_inline void
396 save_128_unaligned (__m128i* dst,
397  __m128i data)
398 {
399  _mm_storeu_si128 (dst, data);
400 }
401 
402 static force_inline __m128i
403 load_32_1x128 (uint32_t data)
404 {
405  return _mm_cvtsi32_si128 (data);
406 }
407 
408 static force_inline __m128i
409 expand_alpha_rev_1x128 (__m128i data)
410 {
411  return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
412 }
413 
414 static force_inline __m128i
415 expand_pixel_8_1x128 (uint8_t data)
416 {
417  return _mm_shufflelo_epi16 (
418  unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
419 }
420 
421 static force_inline __m128i
422 pix_multiply_1x128 (__m128i data,
423  __m128i alpha)
424 {
425  return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
426  mask_0080),
427  mask_0101);
428 }
429 
430 static force_inline __m128i
431 pix_add_multiply_1x128 (__m128i* src,
432  __m128i* alpha_dst,
433  __m128i* dst,
434  __m128i* alpha_src)
435 {
436  __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
437  __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
438 
439  return _mm_adds_epu8 (t1, t2);
440 }
441 
442 static force_inline __m128i
443 negate_1x128 (__m128i data)
444 {
445  return _mm_xor_si128 (data, mask_00ff);
446 }
447 
448 static force_inline __m128i
449 invert_colors_1x128 (__m128i data)
450 {
451  return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
452 }
453 
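/* Note: over_1x128() is premultiplied Porter-Duff OVER for one unpacked
 * pixel: result = src + dst * (255 - alpha), with negate_1x128() supplying
 * the (255 - alpha) factor and _mm_adds_epu8 clamping the sum. */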
454 static force_inline __m128i
455 over_1x128 (__m128i src, __m128i alpha, __m128i dst)
456 {
457  return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
458 }
459 
460 static force_inline __m128i
461 in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
462 {
463  return over_1x128 (pix_multiply_1x128 (*src, *mask),
464  pix_multiply_1x128 (*alpha, *mask),
465  *dst);
466 }
467 
468 static force_inline __m128i
469 over_rev_non_pre_1x128 (__m128i src, __m128i dst)
470 {
471  __m128i alpha = expand_alpha_1x128 (src);
472 
473  return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
474  _mm_or_si128 (alpha, mask_alpha)),
475  alpha,
476  dst);
477 }
478 
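/* Note: pack_1x128_32() saturates the eight 16-bit channels back to bytes
 * with _mm_packus_epi16 and returns the low 32 bits as one packed pixel. */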
479 static force_inline uint32_t
480 pack_1x128_32 (__m128i data)
481 {
482  return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
483 }
484 
485 static force_inline __m128i
486 expand565_16_1x128 (uint16_t pixel)
487 {
488  __m128i m = _mm_cvtsi32_si128 (pixel);
489 
490  m = unpack_565_to_8888 (m);
491 
492  return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
493 }
494 
495 static force_inline uint32_t
496 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
497 {
498  uint8_t a;
499  __m128i xmms;
500 
501  a = src >> 24;
502 
503  if (a == 0xff)
504  {
505  return src;
506  }
507  else if (src)
508  {
509  xmms = unpack_32_1x128 (src);
510  return pack_1x128_32 (
511  over_1x128 (xmms, expand_alpha_1x128 (xmms),
512  unpack_32_1x128 (dst)));
513  }
514 
515  return dst;
516 }
517 
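/* Note: combine1() and combine4() apply the mask's expanded alpha to the
 * source pixel(s) when a mask pointer is supplied; with pm == NULL the
 * source is returned unchanged, matching pixman's unified-alpha combiners. */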
518 static force_inline uint32_t
519 combine1 (const uint32_t *ps, const uint32_t *pm)
520 {
521  uint32_t s;
522  memcpy(&s, ps, sizeof(uint32_t));
523 
524  if (pm)
525  {
526  __m128i ms, mm;
527 
528  mm = unpack_32_1x128 (*pm);
529  mm = expand_alpha_1x128 (mm);
530 
531  ms = unpack_32_1x128 (s);
532  ms = pix_multiply_1x128 (ms, mm);
533 
534  s = pack_1x128_32 (ms);
535  }
536 
537  return s;
538 }
539 
540 static force_inline __m128i
541 combine4 (const __m128i *ps, const __m128i *pm)
542 {
543  __m128i xmm_src_lo, xmm_src_hi;
544  __m128i xmm_msk_lo, xmm_msk_hi;
545  __m128i s;
546 
547  if (pm)
548  {
549  xmm_msk_lo = load_128_unaligned (pm);
550 
551  if (is_transparent (xmm_msk_lo))
552  return _mm_setzero_si128 ();
553  }
554 
555  s = load_128_unaligned (ps);
556 
557  if (pm)
558  {
559  unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
560  unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
561 
562  expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
563 
564  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
565  &xmm_msk_lo, &xmm_msk_hi,
566  &xmm_src_lo, &xmm_src_hi);
567 
568  s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
569  }
570 
571  return s;
572 }
573 
574 static force_inline void
575 core_combine_over_u_sse2_mask (uint32_t * pd,
576  const uint32_t* ps,
577  const uint32_t* pm,
578  int w)
579 {
580  uint32_t s, d;
581 
582  /* Align dst on a 16-byte boundary */
583  while (w && ((uintptr_t)pd & 15))
584  {
585  d = *pd;
586  s = combine1 (ps, pm);
587 
588  if (s)
589  *pd = core_combine_over_u_pixel_sse2 (s, d);
590  pd++;
591  ps++;
592  pm++;
593  w--;
594  }
595 
596  while (w >= 4)
597  {
598  __m128i mask = load_128_unaligned ((__m128i *)pm);
599 
600  if (!is_zero (mask))
601  {
602  __m128i src;
603  __m128i src_hi, src_lo;
604  __m128i mask_hi, mask_lo;
605  __m128i alpha_hi, alpha_lo;
606 
607  src = load_128_unaligned ((__m128i *)ps);
608 
609  if (is_opaque (_mm_and_si128 (src, mask)))
610  {
611  save_128_aligned ((__m128i *)pd, src);
612  }
613  else
614  {
615  __m128i dst = load_128_aligned ((__m128i *)pd);
616  __m128i dst_hi, dst_lo;
617 
618  unpack_128_2x128 (mask, &mask_lo, &mask_hi);
619  unpack_128_2x128 (src, &src_lo, &src_hi);
620 
621  expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
622  pix_multiply_2x128 (&src_lo, &src_hi,
623  &mask_lo, &mask_hi,
624  &src_lo, &src_hi);
625 
626  unpack_128_2x128 (dst, &dst_lo, &dst_hi);
627 
628  expand_alpha_2x128 (src_lo, src_hi,
629  &alpha_lo, &alpha_hi);
630 
631  over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
632  &dst_lo, &dst_hi);
633 
634  save_128_aligned (
635  (__m128i *)pd,
636  pack_2x128_128 (dst_lo, dst_hi));
637  }
638  }
639 
640  pm += 4;
641  ps += 4;
642  pd += 4;
643  w -= 4;
644  }
645  while (w)
646  {
647  d = *pd;
648  s = combine1 (ps, pm);
649 
650  if (s)
651  *pd = core_combine_over_u_pixel_sse2 (s, d);
652  pd++;
653  ps++;
654  pm++;
655 
656  w--;
657  }
658 }
659 
660 static force_inline void
661 core_combine_over_u_sse2_no_mask (uint32_t * pd,
662  const uint32_t* ps,
663  int w)
664 {
665  uint32_t s, d;
666 
667  /* Align dst on a 16-byte boundary */
668  while (w && ((uintptr_t)pd & 15))
669  {
670  d = *pd;
671  s = *ps;
672 
673  if (s)
674  *pd = core_combine_over_u_pixel_sse2 (s, d);
675  pd++;
676  ps++;
677  w--;
678  }
679 
680  while (w >= 4)
681  {
682  __m128i src;
683  __m128i src_hi, src_lo, dst_hi, dst_lo;
684  __m128i alpha_hi, alpha_lo;
685 
686  src = load_128_unaligned ((__m128i *)ps);
687 
688  if (!is_zero (src))
689  {
690  if (is_opaque (src))
691  {
692  save_128_aligned ((__m128i *)pd, src);
693  }
694  else
695  {
696  __m128i dst = load_128_aligned ((__m128i *)pd);
697 
698  unpack_128_2x128 (src, &src_lo, &src_hi);
699  unpack_128_2x128 (dst, &dst_lo, &dst_hi);
700 
701  expand_alpha_2x128 (src_lo, src_hi,
702  &alpha_lo, &alpha_hi);
703  over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
704  &dst_lo, &dst_hi);
705 
706  save_128_aligned (
707  (__m128i *)pd,
708  pack_2x128_128 (dst_lo, dst_hi));
709  }
710  }
711 
712  ps += 4;
713  pd += 4;
714  w -= 4;
715  }
716  while (w)
717  {
718  d = *pd;
719  s = *ps;
720 
721  if (s)
722  *pd = core_combine_over_u_pixel_sse2 (s, d);
723  pd++;
724  ps++;
725 
726  w--;
727  }
728 }
729 
730 static force_inline void
731 sse2_combine_over_u (pixman_implementation_t *imp,
732  pixman_op_t op,
733  uint32_t * pd,
734  const uint32_t * ps,
735  const uint32_t * pm,
736  int w)
737 {
738  if (pm)
739  core_combine_over_u_sse2_mask (pd, ps, pm, w);
740  else
741  core_combine_over_u_sse2_no_mask (pd, ps, w);
742 }
743 
744 static void
745 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
746  pixman_op_t op,
747  uint32_t * pd,
748  const uint32_t * ps,
749  const uint32_t * pm,
750  int w)
751 {
752  uint32_t s, d;
753 
754  __m128i xmm_dst_lo, xmm_dst_hi;
755  __m128i xmm_src_lo, xmm_src_hi;
756  __m128i xmm_alpha_lo, xmm_alpha_hi;
757 
758  /* Align dst on a 16-byte boundary */
759  while (w &&
760  ((uintptr_t)pd & 15))
761  {
762  d = *pd;
763  s = combine1 (ps, pm);
764 
765  *pd++ = core_combine_over_u_pixel_sse2 (d, s);
766  w--;
767  ps++;
768  if (pm)
769  pm++;
770  }
771 
772  while (w >= 4)
773  {
774  /* I'm loading unaligned because I'm not sure
775  * about the address alignment.
776  */
777  xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
778  xmm_dst_hi = load_128_aligned ((__m128i*) pd);
779 
780  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
781  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
782 
783  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
784  &xmm_alpha_lo, &xmm_alpha_hi);
785 
786  over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
787  &xmm_alpha_lo, &xmm_alpha_hi,
788  &xmm_src_lo, &xmm_src_hi);
789 
790  /* rebuild the 4 pixel data and save */
791  save_128_aligned ((__m128i*)pd,
792  pack_2x128_128 (xmm_src_lo, xmm_src_hi));
793 
794  w -= 4;
795  ps += 4;
796  pd += 4;
797 
798  if (pm)
799  pm += 4;
800  }
801 
802  while (w)
803  {
804  d = *pd;
805  s = combine1 (ps, pm);
806 
807  *pd++ = core_combine_over_u_pixel_sse2 (d, s);
808  ps++;
809  w--;
810  if (pm)
811  pm++;
812  }
813 }
814 
815 static force_inline uint32_t
816 core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
817 {
818  uint32_t maska = src >> 24;
819 
820  if (maska == 0)
821  {
822  return 0;
823  }
824  else if (maska != 0xff)
825  {
826  return pack_1x128_32 (
827  pix_multiply_1x128 (unpack_32_1x128 (dst),
828  expand_alpha_1x128 (unpack_32_1x128 (src))));
829  }
830 
831  return dst;
832 }
833 
834 static void
835 sse2_combine_in_u (pixman_implementation_t *imp,
836  pixman_op_t op,
837  uint32_t * pd,
838  const uint32_t * ps,
839  const uint32_t * pm,
840  int w)
841 {
842  uint32_t s, d;
843 
844  __m128i xmm_src_lo, xmm_src_hi;
845  __m128i xmm_dst_lo, xmm_dst_hi;
846 
847  while (w && ((uintptr_t)pd & 15))
848  {
849  s = combine1 (ps, pm);
850  d = *pd;
851 
852  *pd++ = core_combine_in_u_pixel_sse2 (d, s);
853  w--;
854  ps++;
855  if (pm)
856  pm++;
857  }
858 
859  while (w >= 4)
860  {
861  xmm_dst_hi = load_128_aligned ((__m128i*) pd);
862  xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
863 
864  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
865  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
866 
867  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
868  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
869  &xmm_dst_lo, &xmm_dst_hi,
870  &xmm_dst_lo, &xmm_dst_hi);
871 
872  save_128_aligned ((__m128i*)pd,
873  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
874 
875  ps += 4;
876  pd += 4;
877  w -= 4;
878  if (pm)
879  pm += 4;
880  }
881 
882  while (w)
883  {
884  s = combine1 (ps, pm);
885  d = *pd;
886 
887  *pd++ = core_combine_in_u_pixel_sse2 (d, s);
888  w--;
889  ps++;
890  if (pm)
891  pm++;
892  }
893 }
894 
895 static void
896 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
897  pixman_op_t op,
898  uint32_t * pd,
899  const uint32_t * ps,
900  const uint32_t * pm,
901  int w)
902 {
903  uint32_t s, d;
904 
905  __m128i xmm_src_lo, xmm_src_hi;
906  __m128i xmm_dst_lo, xmm_dst_hi;
907 
908  while (w && ((uintptr_t)pd & 15))
909  {
910  s = combine1 (ps, pm);
911  d = *pd;
912 
913  *pd++ = core_combine_in_u_pixel_sse2 (s, d);
914  ps++;
915  w--;
916  if (pm)
917  pm++;
918  }
919 
920  while (w >= 4)
921  {
922  xmm_dst_hi = load_128_aligned ((__m128i*) pd);
923  xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
924 
925  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
926  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
927 
928  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
929  pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
930  &xmm_src_lo, &xmm_src_hi,
931  &xmm_dst_lo, &xmm_dst_hi);
932 
933  save_128_aligned (
934  (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
935 
936  ps += 4;
937  pd += 4;
938  w -= 4;
939  if (pm)
940  pm += 4;
941  }
942 
943  while (w)
944  {
945  s = combine1 (ps, pm);
946  d = *pd;
947 
948  *pd++ = core_combine_in_u_pixel_sse2 (s, d);
949  w--;
950  ps++;
951  if (pm)
952  pm++;
953  }
954 }
955 
956 static void
957 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
958  pixman_op_t op,
959  uint32_t * pd,
960  const uint32_t * ps,
961  const uint32_t * pm,
962  int w)
963 {
964  while (w && ((uintptr_t)pd & 15))
965  {
966  uint32_t s = combine1 (ps, pm);
967  uint32_t d = *pd;
968 
969  *pd++ = pack_1x128_32 (
970  pix_multiply_1x128 (
971  unpack_32_1x128 (d), negate_1x128 (
972  expand_alpha_1x128 (unpack_32_1x128 (s)))));
973 
974  if (pm)
975  pm++;
976  ps++;
977  w--;
978  }
979 
980  while (w >= 4)
981  {
982  __m128i xmm_src_lo, xmm_src_hi;
983  __m128i xmm_dst_lo, xmm_dst_hi;
984 
985  xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
986  xmm_dst_hi = load_128_aligned ((__m128i*) pd);
987 
988  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
989  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
990 
991  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
992  negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
993 
994  pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
995  &xmm_src_lo, &xmm_src_hi,
996  &xmm_dst_lo, &xmm_dst_hi);
997 
998  save_128_aligned (
999  (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1000 
1001  ps += 4;
1002  pd += 4;
1003  if (pm)
1004  pm += 4;
1005 
1006  w -= 4;
1007  }
1008 
1009  while (w)
1010  {
1011  uint32_t s = combine1 (ps, pm);
1012  uint32_t d = *pd;
1013 
1014  *pd++ = pack_1x128_32 (
1015  pix_multiply_1x128 (
1016  unpack_32_1x128 (d), negate_1x128 (
1017  expand_alpha_1x128 (unpack_32_1x128 (s)))));
1018  ps++;
1019  if (pm)
1020  pm++;
1021  w--;
1022  }
1023 }
1024 
1025 static void
1026 sse2_combine_out_u (pixman_implementation_t *imp,
1027  pixman_op_t op,
1028  uint32_t * pd,
1029  const uint32_t * ps,
1030  const uint32_t * pm,
1031  int w)
1032 {
1033  while (w && ((uintptr_t)pd & 15))
1034  {
1035  uint32_t s = combine1 (ps, pm);
1036  uint32_t d = *pd;
1037 
1038  *pd++ = pack_1x128_32 (
1039  pix_multiply_1x128 (
1040  unpack_32_1x128 (s), negate_1x128 (
1041  expand_alpha_1x128 (unpack_32_1x128 (d)))));
1042  w--;
1043  ps++;
1044  if (pm)
1045  pm++;
1046  }
1047 
1048  while (w >= 4)
1049  {
1050  __m128i xmm_src_lo, xmm_src_hi;
1051  __m128i xmm_dst_lo, xmm_dst_hi;
1052 
1053  xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1054  xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1055 
1056  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1057  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1058 
1059  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1060  negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1061 
1062  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1063  &xmm_dst_lo, &xmm_dst_hi,
1064  &xmm_dst_lo, &xmm_dst_hi);
1065 
1066  save_128_aligned (
1067  (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1068 
1069  ps += 4;
1070  pd += 4;
1071  w -= 4;
1072  if (pm)
1073  pm += 4;
1074  }
1075 
1076  while (w)
1077  {
1078  uint32_t s = combine1 (ps, pm);
1079  uint32_t d = *pd;
1080 
1081  *pd++ = pack_1x128_32 (
1082  pix_multiply_1x128 (
1083  unpack_32_1x128 (s), negate_1x128 (
1084  expand_alpha_1x128 (unpack_32_1x128 (d)))));
1085  w--;
1086  ps++;
1087  if (pm)
1088  pm++;
1089  }
1090 }
1091 
1092 static force_inline uint32_t
1093 core_combine_atop_u_pixel_sse2 (uint32_t src,
1094  uint32_t dst)
1095 {
1096  __m128i s = unpack_32_1x128 (src);
1097  __m128i d = unpack_32_1x128 (dst);
1098 
1099  __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
1100  __m128i da = expand_alpha_1x128 (d);
1101 
1102  return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1103 }
1104 
1105 static void
1106 sse2_combine_atop_u (pixman_implementation_t *imp,
1107  pixman_op_t op,
1108  uint32_t * pd,
1109  const uint32_t * ps,
1110  const uint32_t * pm,
1111  int w)
1112 {
1113  uint32_t s, d;
1114 
1115  __m128i xmm_src_lo, xmm_src_hi;
1116  __m128i xmm_dst_lo, xmm_dst_hi;
1117  __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1118  __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1119 
1120  while (w && ((uintptr_t)pd & 15))
1121  {
1122  s = combine1 (ps, pm);
1123  d = *pd;
1124 
1125  *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1126  w--;
1127  ps++;
1128  if (pm)
1129  pm++;
1130  }
1131 
1132  while (w >= 4)
1133  {
1134  xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1135  xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1136 
1137  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1138  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1139 
1140  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1141  &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1142  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1143  &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1144 
1145  negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1146  &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1147 
1148  pix_add_multiply_2x128 (
1149  &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1150  &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1151  &xmm_dst_lo, &xmm_dst_hi);
1152 
1153  save_128_aligned (
1154  (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1155 
1156  ps += 4;
1157  pd += 4;
1158  w -= 4;
1159  if (pm)
1160  pm += 4;
1161  }
1162 
1163  while (w)
1164  {
1165  s = combine1 (ps, pm);
1166  d = *pd;
1167 
1168  *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1169  w--;
1170  ps++;
1171  if (pm)
1172  pm++;
1173  }
1174 }
1175 
1176 static force_inline uint32_t
1177 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1178  uint32_t dst)
1179 {
1180  __m128i s = unpack_32_1x128 (src);
1181  __m128i d = unpack_32_1x128 (dst);
1182 
1183  __m128i sa = expand_alpha_1x128 (s);
1184  __m128i da = negate_1x128 (expand_alpha_1x128 (d));
1185 
1186  return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1187 }
1188 
1189 static void
1190 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
1191  pixman_op_t op,
1192  uint32_t * pd,
1193  const uint32_t * ps,
1194  const uint32_t * pm,
1195  int w)
1196 {
1197  uint32_t s, d;
1198 
1199  __m128i xmm_src_lo, xmm_src_hi;
1200  __m128i xmm_dst_lo, xmm_dst_hi;
1201  __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1202  __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1203 
1204  while (w && ((uintptr_t)pd & 15))
1205  {
1206  s = combine1 (ps, pm);
1207  d = *pd;
1208 
1209  *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1210  ps++;
1211  w--;
1212  if (pm)
1213  pm++;
1214  }
1215 
1216  while (w >= 4)
1217  {
1218  xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1219  xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1220 
1221  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1222  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1223 
1224  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1225  &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1226  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1227  &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1228 
1229  negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1230  &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1231 
1232  pix_add_multiply_2x128 (
1233  &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1234  &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1235  &xmm_dst_lo, &xmm_dst_hi);
1236 
1237  save_128_aligned (
1238  (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1239 
1240  ps += 4;
1241  pd += 4;
1242  w -= 4;
1243  if (pm)
1244  pm += 4;
1245  }
1246 
1247  while (w)
1248  {
1249  s = combine1 (ps, pm);
1250  d = *pd;
1251 
1252  *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1253  ps++;
1254  w--;
1255  if (pm)
1256  pm++;
1257  }
1258 }
1259 
1260 static force_inline uint32_t
1261 core_combine_xor_u_pixel_sse2 (uint32_t src,
1262  uint32_t dst)
1263 {
1264  __m128i s = unpack_32_1x128 (src);
1265  __m128i d = unpack_32_1x128 (dst);
1266 
1267  __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
1268  __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
1269 
1270  return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
1271 }
1272 
1273 static void
1274 sse2_combine_xor_u (pixman_implementation_t *imp,
1275  pixman_op_t op,
1276  uint32_t * dst,
1277  const uint32_t * src,
1278  const uint32_t * mask,
1279  int width)
1280 {
1281  int w = width;
1282  uint32_t s, d;
1283  uint32_t* pd = dst;
1284  const uint32_t* ps = src;
1285  const uint32_t* pm = mask;
1286 
1287  __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1288  __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1289  __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1290  __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1291 
1292  while (w && ((uintptr_t)pd & 15))
1293  {
1294  s = combine1 (ps, pm);
1295  d = *pd;
1296 
1297  *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1298  w--;
1299  ps++;
1300  if (pm)
1301  pm++;
1302  }
1303 
1304  while (w >= 4)
1305  {
1306  xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1307  xmm_dst = load_128_aligned ((__m128i*) pd);
1308 
1309  unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1310  unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1311 
1312  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1313  &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1314  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1315  &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1316 
1317  negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1318  &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1319  negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1320  &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1321 
1322  pix_add_multiply_2x128 (
1323  &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1324  &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1325  &xmm_dst_lo, &xmm_dst_hi);
1326 
1327  save_128_aligned (
1328  (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1329 
1330  ps += 4;
1331  pd += 4;
1332  w -= 4;
1333  if (pm)
1334  pm += 4;
1335  }
1336 
1337  while (w)
1338  {
1339  s = combine1 (ps, pm);
1340  d = *pd;
1341 
1342  *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1343  w--;
1344  ps++;
1345  if (pm)
1346  pm++;
1347  }
1348 }
1349 
1350 static force_inline void
1351 sse2_combine_add_u (pixman_implementation_t *imp,
1352  pixman_op_t op,
1353  uint32_t * dst,
1354  const uint32_t * src,
1355  const uint32_t * mask,
1356  int width)
1357 {
1358  int w = width;
1359  uint32_t s, d;
1360  uint32_t* pd = dst;
1361  const uint32_t* ps = src;
1362  const uint32_t* pm = mask;
1363 
1364  while (w && (uintptr_t)pd & 15)
1365  {
1366  s = combine1 (ps, pm);
1367  d = *pd;
1368 
1369  ps++;
1370  if (pm)
1371  pm++;
1372  *pd++ = _mm_cvtsi128_si32 (
1373  _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1374  w--;
1375  }
1376 
1377  while (w >= 4)
1378  {
1379  __m128i s;
1380 
1381  s = combine4 ((__m128i*)ps, (__m128i*)pm);
1382 
1383  save_128_aligned (
1384  (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
1385 
1386  pd += 4;
1387  ps += 4;
1388  if (pm)
1389  pm += 4;
1390  w -= 4;
1391  }
1392 
1393  while (w--)
1394  {
1395  s = combine1 (ps, pm);
1396  d = *pd;
1397 
1398  ps++;
1399  *pd++ = _mm_cvtsi128_si32 (
1400  _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1401  if (pm)
1402  pm++;
1403  }
1404 }
1405 
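/* Note: SATURATE adds the source into the destination, but when the source
 * alpha exceeds the space left in the destination (da = 255 - dst alpha) the
 * source is first scaled by da/sa via DIV_UN8, so the saturating add below
 * cannot overshoot. */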
1406 static force_inline uint32_t
1407 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1408  uint32_t dst)
1409 {
1410  __m128i ms = unpack_32_1x128 (src);
1411  __m128i md = unpack_32_1x128 (dst);
1412  uint32_t sa = src >> 24;
1413  uint32_t da = ~dst >> 24;
1414 
1415  if (sa > da)
1416  {
1417  ms = pix_multiply_1x128 (
1418  ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
1419  }
1420 
1421  return pack_1x128_32 (_mm_adds_epu16 (md, ms));
1422 }
1423 
1424 static void
1425 sse2_combine_saturate_u (pixman_implementation_t *imp,
1426  pixman_op_t op,
1427  uint32_t * pd,
1428  const uint32_t * ps,
1429  const uint32_t * pm,
1430  int w)
1431 {
1432  uint32_t s, d;
1433 
1434  uint32_t pack_cmp;
1435  __m128i xmm_src, xmm_dst;
1436 
1437  while (w && (uintptr_t)pd & 15)
1438  {
1439  s = combine1 (ps, pm);
1440  d = *pd;
1441 
1442  *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1443  w--;
1444  ps++;
1445  if (pm)
1446  pm++;
1447  }
1448 
1449  while (w >= 4)
1450  {
1451  xmm_dst = load_128_aligned ((__m128i*)pd);
1452  xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1453 
1454  pack_cmp = _mm_movemask_epi8 (
1455  _mm_cmpgt_epi32 (
1456  _mm_srli_epi32 (xmm_src, 24),
1457  _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1458 
1459  /* if some src alpha is greater than the respective ~dst alpha */
1460  if (pack_cmp)
1461  {
1462  s = combine1 (ps++, pm);
1463  d = *pd;
1464  *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1465  if (pm)
1466  pm++;
1467 
1468  s = combine1 (ps++, pm);
1469  d = *pd;
1470  *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1471  if (pm)
1472  pm++;
1473 
1474  s = combine1 (ps++, pm);
1475  d = *pd;
1476  *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1477  if (pm)
1478  pm++;
1479 
1480  s = combine1 (ps++, pm);
1481  d = *pd;
1482  *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1483  if (pm)
1484  pm++;
1485  }
1486  else
1487  {
1488  save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1489 
1490  pd += 4;
1491  ps += 4;
1492  if (pm)
1493  pm += 4;
1494  }
1495 
1496  w -= 4;
1497  }
1498 
1499  while (w--)
1500  {
1501  s = combine1 (ps, pm);
1502  d = *pd;
1503 
1504  *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1505  ps++;
1506  if (pm)
1507  pm++;
1508  }
1509 }
1510 
1511 static void
1512 sse2_combine_src_ca (pixman_implementation_t *imp,
1513  pixman_op_t op,
1514  uint32_t * pd,
1515  const uint32_t * ps,
1516  const uint32_t * pm,
1517  int w)
1518 {
1519  uint32_t s, m;
1520 
1521  __m128i xmm_src_lo, xmm_src_hi;
1522  __m128i xmm_mask_lo, xmm_mask_hi;
1523  __m128i xmm_dst_lo, xmm_dst_hi;
1524 
1525  while (w && (uintptr_t)pd & 15)
1526  {
1527  s = *ps++;
1528  m = *pm++;
1529  *pd++ = pack_1x128_32 (
1530  pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1531  w--;
1532  }
1533 
1534  while (w >= 4)
1535  {
1536  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1537  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1538 
1539  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1540  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1541 
1542  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1543  &xmm_mask_lo, &xmm_mask_hi,
1544  &xmm_dst_lo, &xmm_dst_hi);
1545 
1546  save_128_aligned (
1547  (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1548 
1549  ps += 4;
1550  pd += 4;
1551  pm += 4;
1552  w -= 4;
1553  }
1554 
1555  while (w)
1556  {
1557  s = *ps++;
1558  m = *pm++;
1559  *pd++ = pack_1x128_32 (
1560  pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1561  w--;
1562  }
1563 }
1564 
1565 static force_inline uint32_t
1566 core_combine_over_ca_pixel_sse2 (uint32_t src,
1567  uint32_t mask,
1568  uint32_t dst)
1569 {
1570  __m128i s = unpack_32_1x128 (src);
1571  __m128i expAlpha = expand_alpha_1x128 (s);
1572  __m128i unpk_mask = unpack_32_1x128 (mask);
1573  __m128i unpk_dst = unpack_32_1x128 (dst);
1574 
1575  return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1576 }
1577 
1578 static void
1579 sse2_combine_over_ca (pixman_implementation_t *imp,
1580  pixman_op_t op,
1581  uint32_t * pd,
1582  const uint32_t * ps,
1583  const uint32_t * pm,
1584  int w)
1585 {
1586  uint32_t s, m, d;
1587 
1588  __m128i xmm_alpha_lo, xmm_alpha_hi;
1589  __m128i xmm_src_lo, xmm_src_hi;
1590  __m128i xmm_dst_lo, xmm_dst_hi;
1591  __m128i xmm_mask_lo, xmm_mask_hi;
1592 
1593  while (w && (uintptr_t)pd & 15)
1594  {
1595  s = *ps++;
1596  m = *pm++;
1597  d = *pd;
1598 
1599  *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1600  w--;
1601  }
1602 
1603  while (w >= 4)
1604  {
1605  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1606  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1607  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1608 
1609  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1610  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1611  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1612 
1613  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1614  &xmm_alpha_lo, &xmm_alpha_hi);
1615 
1616  in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1617  &xmm_alpha_lo, &xmm_alpha_hi,
1618  &xmm_mask_lo, &xmm_mask_hi,
1619  &xmm_dst_lo, &xmm_dst_hi);
1620 
1621  save_128_aligned (
1622  (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1623 
1624  ps += 4;
1625  pd += 4;
1626  pm += 4;
1627  w -= 4;
1628  }
1629 
1630  while (w)
1631  {
1632  s = *ps++;
1633  m = *pm++;
1634  d = *pd;
1635 
1636  *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1637  w--;
1638  }
1639 }
1640 
1641 static force_inline uint32_t
1642 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1643  uint32_t mask,
1644  uint32_t dst)
1645 {
1646  __m128i d = unpack_32_1x128 (dst);
1647 
1648  return pack_1x128_32 (
1649  over_1x128 (d, expand_alpha_1x128 (d),
1650  pix_multiply_1x128 (unpack_32_1x128 (src),
1651  unpack_32_1x128 (mask))));
1652 }
1653 
1654 static void
1655 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
1656  pixman_op_t op,
1657  uint32_t * pd,
1658  const uint32_t * ps,
1659  const uint32_t * pm,
1660  int w)
1661 {
1662  uint32_t s, m, d;
1663 
1664  __m128i xmm_alpha_lo, xmm_alpha_hi;
1665  __m128i xmm_src_lo, xmm_src_hi;
1666  __m128i xmm_dst_lo, xmm_dst_hi;
1667  __m128i xmm_mask_lo, xmm_mask_hi;
1668 
1669  while (w && (uintptr_t)pd & 15)
1670  {
1671  s = *ps++;
1672  m = *pm++;
1673  d = *pd;
1674 
1675  *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1676  w--;
1677  }
1678 
1679  while (w >= 4)
1680  {
1681  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1682  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1683  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1684 
1685  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1686  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1687  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1688 
1689  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1690  &xmm_alpha_lo, &xmm_alpha_hi);
1691  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1692  &xmm_mask_lo, &xmm_mask_hi,
1693  &xmm_mask_lo, &xmm_mask_hi);
1694 
1695  over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1696  &xmm_alpha_lo, &xmm_alpha_hi,
1697  &xmm_mask_lo, &xmm_mask_hi);
1698 
1699  save_128_aligned (
1700  (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1701 
1702  ps += 4;
1703  pd += 4;
1704  pm += 4;
1705  w -= 4;
1706  }
1707 
1708  while (w)
1709  {
1710  s = *ps++;
1711  m = *pm++;
1712  d = *pd;
1713 
1714  *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1715  w--;
1716  }
1717 }
1718 
1719 static void
1720 sse2_combine_in_ca (pixman_implementation_t *imp,
1721  pixman_op_t op,
1722  uint32_t * pd,
1723  const uint32_t * ps,
1724  const uint32_t * pm,
1725  int w)
1726 {
1727  uint32_t s, m, d;
1728 
1729  __m128i xmm_alpha_lo, xmm_alpha_hi;
1730  __m128i xmm_src_lo, xmm_src_hi;
1731  __m128i xmm_dst_lo, xmm_dst_hi;
1732  __m128i xmm_mask_lo, xmm_mask_hi;
1733 
1734  while (w && (uintptr_t)pd & 15)
1735  {
1736  s = *ps++;
1737  m = *pm++;
1738  d = *pd;
1739 
1740  *pd++ = pack_1x128_32 (
1741  pix_multiply_1x128 (
1742  pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1743  expand_alpha_1x128 (unpack_32_1x128 (d))));
1744 
1745  w--;
1746  }
1747 
1748  while (w >= 4)
1749  {
1750  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1751  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1752  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1753 
1754  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1755  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1756  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1757 
1758  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1759  &xmm_alpha_lo, &xmm_alpha_hi);
1760 
1761  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1762  &xmm_mask_lo, &xmm_mask_hi,
1763  &xmm_dst_lo, &xmm_dst_hi);
1764 
1765  pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1766  &xmm_alpha_lo, &xmm_alpha_hi,
1767  &xmm_dst_lo, &xmm_dst_hi);
1768 
1769  save_128_aligned (
1770  (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1771 
1772  ps += 4;
1773  pd += 4;
1774  pm += 4;
1775  w -= 4;
1776  }
1777 
1778  while (w)
1779  {
1780  s = *ps++;
1781  m = *pm++;
1782  d = *pd;
1783 
1784  *pd++ = pack_1x128_32 (
1785  pix_multiply_1x128 (
1786  pix_multiply_1x128 (
1787  unpack_32_1x128 (s), unpack_32_1x128 (m)),
1788  expand_alpha_1x128 (unpack_32_1x128 (d))));
1789 
1790  w--;
1791  }
1792 }
1793 
1794 static void
1795 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1796  pixman_op_t op,
1797  uint32_t * pd,
1798  const uint32_t * ps,
1799  const uint32_t * pm,
1800  int w)
1801 {
1802  uint32_t s, m, d;
1803 
1804  __m128i xmm_alpha_lo, xmm_alpha_hi;
1805  __m128i xmm_src_lo, xmm_src_hi;
1806  __m128i xmm_dst_lo, xmm_dst_hi;
1807  __m128i xmm_mask_lo, xmm_mask_hi;
1808 
1809  while (w && (uintptr_t)pd & 15)
1810  {
1811  s = *ps++;
1812  m = *pm++;
1813  d = *pd;
1814 
1815  *pd++ = pack_1x128_32 (
1816  pix_multiply_1x128 (
1817  unpack_32_1x128 (d),
1818  pix_multiply_1x128 (unpack_32_1x128 (m),
1819  expand_alpha_1x128 (unpack_32_1x128 (s)))));
1820  w--;
1821  }
1822 
1823  while (w >= 4)
1824  {
1825  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1826  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1827  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1828 
1829  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1830  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1831  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1832 
1833  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1834  &xmm_alpha_lo, &xmm_alpha_hi);
1835  pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1836  &xmm_alpha_lo, &xmm_alpha_hi,
1837  &xmm_alpha_lo, &xmm_alpha_hi);
1838 
1839  pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1840  &xmm_alpha_lo, &xmm_alpha_hi,
1841  &xmm_dst_lo, &xmm_dst_hi);
1842 
1843  save_128_aligned (
1844  (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1845 
1846  ps += 4;
1847  pd += 4;
1848  pm += 4;
1849  w -= 4;
1850  }
1851 
1852  while (w)
1853  {
1854  s = *ps++;
1855  m = *pm++;
1856  d = *pd;
1857 
1858  *pd++ = pack_1x128_32 (
1859  pix_multiply_1x128 (
1860  unpack_32_1x128 (d),
1861  pix_multiply_1x128 (unpack_32_1x128 (m),
1862  expand_alpha_1x128 (unpack_32_1x128 (s)))));
1863  w--;
1864  }
1865 }
1866 
1867 static void
1868 sse2_combine_out_ca (pixman_implementation_t *imp,
1869  pixman_op_t op,
1870  uint32_t * pd,
1871  const uint32_t * ps,
1872  const uint32_t * pm,
1873  int w)
1874 {
1875  uint32_t s, m, d;
1876 
1877  __m128i xmm_alpha_lo, xmm_alpha_hi;
1878  __m128i xmm_src_lo, xmm_src_hi;
1879  __m128i xmm_dst_lo, xmm_dst_hi;
1880  __m128i xmm_mask_lo, xmm_mask_hi;
1881 
1882  while (w && (uintptr_t)pd & 15)
1883  {
1884  s = *ps++;
1885  m = *pm++;
1886  d = *pd;
1887 
1888  *pd++ = pack_1x128_32 (
1889  pix_multiply_1x128 (
1890  pix_multiply_1x128 (
1891  unpack_32_1x128 (s), unpack_32_1x128 (m)),
1892  negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1893  w--;
1894  }
1895 
1896  while (w >= 4)
1897  {
1898  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1899  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1900  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1901 
1902  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1903  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1904  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1905 
1906  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1907  &xmm_alpha_lo, &xmm_alpha_hi);
1908  negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1909  &xmm_alpha_lo, &xmm_alpha_hi);
1910 
1911  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1912  &xmm_mask_lo, &xmm_mask_hi,
1913  &xmm_dst_lo, &xmm_dst_hi);
1914  pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1915  &xmm_alpha_lo, &xmm_alpha_hi,
1916  &xmm_dst_lo, &xmm_dst_hi);
1917 
1918  save_128_aligned (
1919  (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1920 
1921  ps += 4;
1922  pd += 4;
1923  pm += 4;
1924  w -= 4;
1925  }
1926 
1927  while (w)
1928  {
1929  s = *ps++;
1930  m = *pm++;
1931  d = *pd;
1932 
1933  *pd++ = pack_1x128_32 (
1934  pix_multiply_1x128 (
1935  pix_multiply_1x128 (
1936  unpack_32_1x128 (s), unpack_32_1x128 (m)),
1937  negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1938 
1939  w--;
1940  }
1941 }
1942 
1943 static void
1944 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1945  pixman_op_t op,
1946  uint32_t * pd,
1947  const uint32_t * ps,
1948  const uint32_t * pm,
1949  int w)
1950 {
1951  uint32_t s, m, d;
1952 
1953  __m128i xmm_alpha_lo, xmm_alpha_hi;
1954  __m128i xmm_src_lo, xmm_src_hi;
1955  __m128i xmm_dst_lo, xmm_dst_hi;
1956  __m128i xmm_mask_lo, xmm_mask_hi;
1957 
1958  while (w && (uintptr_t)pd & 15)
1959  {
1960  s = *ps++;
1961  m = *pm++;
1962  d = *pd;
1963 
1964  *pd++ = pack_1x128_32 (
1965  pix_multiply_1x128 (
1966  unpack_32_1x128 (d),
1967  negate_1x128 (pix_multiply_1x128 (
1968  unpack_32_1x128 (m),
1969  expand_alpha_1x128 (unpack_32_1x128 (s))))));
1970  w--;
1971  }
1972 
1973  while (w >= 4)
1974  {
1975  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1976  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1977  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1978 
1979  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1980  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1981  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1982 
1983  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1984  &xmm_alpha_lo, &xmm_alpha_hi);
1985 
1986  pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1987  &xmm_alpha_lo, &xmm_alpha_hi,
1988  &xmm_mask_lo, &xmm_mask_hi);
1989 
1990  negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1991  &xmm_mask_lo, &xmm_mask_hi);
1992 
1993  pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1994  &xmm_mask_lo, &xmm_mask_hi,
1995  &xmm_dst_lo, &xmm_dst_hi);
1996 
1997  save_128_aligned (
1998  (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1999 
2000  ps += 4;
2001  pd += 4;
2002  pm += 4;
2003  w -= 4;
2004  }
2005 
2006  while (w)
2007  {
2008  s = *ps++;
2009  m = *pm++;
2010  d = *pd;
2011 
2012  *pd++ = pack_1x128_32 (
2013  pix_multiply_1x128 (
2014  unpack_32_1x128 (d),
2015  negate_1x128 (pix_multiply_1x128 (
2016  unpack_32_1x128 (m),
2017  expand_alpha_1x128 (unpack_32_1x128 (s))))));
2018  w--;
2019  }
2020 }
2021 
2022 static force_inline uint32_t
2023 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2024  uint32_t mask,
2025  uint32_t dst)
2026 {
2027  __m128i m = unpack_32_1x128 (mask);
2028  __m128i s = unpack_32_1x128 (src);
2029  __m128i d = unpack_32_1x128 (dst);
2030  __m128i sa = expand_alpha_1x128 (s);
2031  __m128i da = expand_alpha_1x128 (d);
2032 
2033  s = pix_multiply_1x128 (s, m);
2034  m = negate_1x128 (pix_multiply_1x128 (m, sa));
2035 
2036  return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2037 }
2038 
2039 static void
2040 sse2_combine_atop_ca (pixman_implementation_t *imp,
2041  pixman_op_t op,
2042  uint32_t * pd,
2043  const uint32_t * ps,
2044  const uint32_t * pm,
2045  int w)
2046 {
2047  uint32_t s, m, d;
2048 
2049  __m128i xmm_src_lo, xmm_src_hi;
2050  __m128i xmm_dst_lo, xmm_dst_hi;
2051  __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2052  __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2053  __m128i xmm_mask_lo, xmm_mask_hi;
2054 
2055  while (w && (uintptr_t)pd & 15)
2056  {
2057  s = *ps++;
2058  m = *pm++;
2059  d = *pd;
2060 
2061  *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2062  w--;
2063  }
2064 
2065  while (w >= 4)
2066  {
2067  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2068  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2069  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2070 
2071  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2072  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2073  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2074 
2075  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2076  &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2077  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2078  &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2079 
2080  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2081  &xmm_mask_lo, &xmm_mask_hi,
2082  &xmm_src_lo, &xmm_src_hi);
2083  pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2084  &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2085  &xmm_mask_lo, &xmm_mask_hi);
2086 
2087  negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2088 
2089  pix_add_multiply_2x128 (
2090  &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2091  &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2092  &xmm_dst_lo, &xmm_dst_hi);
2093 
2094  save_128_aligned (
2095  (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2096 
2097  ps += 4;
2098  pd += 4;
2099  pm += 4;
2100  w -= 4;
2101  }
2102 
2103  while (w)
2104  {
2105  s = *ps++;
2106  m = *pm++;
2107  d = *pd;
2108 
2109  *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2110  w--;
2111  }
2112 }
2113 
2114 static force_inline uint32_t
2115 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2116  uint32_t mask,
2117  uint32_t dst)
2118 {
2119  __m128i m = unpack_32_1x128 (mask);
2120  __m128i s = unpack_32_1x128 (src);
2121  __m128i d = unpack_32_1x128 (dst);
2122 
2123  __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2124  __m128i sa = expand_alpha_1x128 (s);
2125 
2126  s = pix_multiply_1x128 (s, m);
2127  m = pix_multiply_1x128 (m, sa);
2128 
2129  return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2130 }
2131 
2132 static void
2133 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2134  pixman_op_t op,
2135  uint32_t * pd,
2136  const uint32_t * ps,
2137  const uint32_t * pm,
2138  int w)
2139 {
2140  uint32_t s, m, d;
2141 
2142  __m128i xmm_src_lo, xmm_src_hi;
2143  __m128i xmm_dst_lo, xmm_dst_hi;
2144  __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2145  __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2146  __m128i xmm_mask_lo, xmm_mask_hi;
2147 
2148  while (w && (uintptr_t)pd & 15)
2149  {
2150  s = *ps++;
2151  m = *pm++;
2152  d = *pd;
2153 
2154  *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2155  w--;
2156  }
2157 
2158  while (w >= 4)
2159  {
2160  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2161  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2162  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2163 
2164  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2165  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2166  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2167 
2168  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2169  &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2170  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2171  &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2172 
2173  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2174  &xmm_mask_lo, &xmm_mask_hi,
2175  &xmm_src_lo, &xmm_src_hi);
2176  pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2177  &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2178  &xmm_mask_lo, &xmm_mask_hi);
2179 
2180  negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2181  &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2182 
2183  pix_add_multiply_2x128 (
2184  &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2185  &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2186  &xmm_dst_lo, &xmm_dst_hi);
2187 
2188  save_128_aligned (
2189  (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2190 
2191  ps += 4;
2192  pd += 4;
2193  pm += 4;
2194  w -= 4;
2195  }
2196 
2197  while (w)
2198  {
2199  s = *ps++;
2200  m = *pm++;
2201  d = *pd;
2202 
2203  *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2204  w--;
2205  }
2206 }
2207 
2208 static force_inline uint32_t
2209 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2210  uint32_t mask,
2211  uint32_t dst)
2212 {
2213  __m128i a = unpack_32_1x128 (mask);
2214  __m128i s = unpack_32_1x128 (src);
2215  __m128i d = unpack_32_1x128 (dst);
2216 
2217  __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2218  a, expand_alpha_1x128 (s)));
2219  __m128i dest = pix_multiply_1x128 (s, a);
2220  __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2221 
2222  return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2223  &alpha_dst,
2224  &dest,
2225  &alpha_src));
2226 }
2227 
2228 static void
2229 sse2_combine_xor_ca (pixman_implementation_t *imp,
2230  pixman_op_t op,
2231  uint32_t * pd,
2232  const uint32_t * ps,
2233  const uint32_t * pm,
2234  int w)
2235 {
2236  uint32_t s, m, d;
2237 
2238  __m128i xmm_src_lo, xmm_src_hi;
2239  __m128i xmm_dst_lo, xmm_dst_hi;
2240  __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2241  __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2242  __m128i xmm_mask_lo, xmm_mask_hi;
2243 
2244  while (w && (uintptr_t)pd & 15)
2245  {
2246  s = *ps++;
2247  m = *pm++;
2248  d = *pd;
2249 
2250  *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2251  w--;
2252  }
2253 
2254  while (w >= 4)
2255  {
2256  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2257  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2258  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2259 
2260  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2261  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2262  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2263 
2264  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2265  &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2266  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2267  &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2268 
2269  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2270  &xmm_mask_lo, &xmm_mask_hi,
2271  &xmm_src_lo, &xmm_src_hi);
2272  pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2273  &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2274  &xmm_mask_lo, &xmm_mask_hi);
2275 
2276  negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2277  &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2278  negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2279  &xmm_mask_lo, &xmm_mask_hi);
2280 
2281  pix_add_multiply_2x128 (
2282  &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2283  &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2284  &xmm_dst_lo, &xmm_dst_hi);
2285 
2286  save_128_aligned (
2287  (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2288 
2289  ps += 4;
2290  pd += 4;
2291  pm += 4;
2292  w -= 4;
2293  }
2294 
2295  while (w)
2296  {
2297  s = *ps++;
2298  m = *pm++;
2299  d = *pd;
2300 
2301  *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2302  w--;
2303  }
2304 }
2305 
2306 static void
2307 sse2_combine_add_ca (pixman_implementation_t *imp,
2308  pixman_op_t op,
2309  uint32_t * pd,
2310  const uint32_t * ps,
2311  const uint32_t * pm,
2312  int w)
2313 {
2314  uint32_t s, m, d;
2315 
2316  __m128i xmm_src_lo, xmm_src_hi;
2317  __m128i xmm_dst_lo, xmm_dst_hi;
2318  __m128i xmm_mask_lo, xmm_mask_hi;
2319 
2320  while (w && (uintptr_t)pd & 15)
2321  {
2322  s = *ps++;
2323  m = *pm++;
2324  d = *pd;
2325 
2326  *pd++ = pack_1x128_32 (
2327  _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2328  unpack_32_1x128 (m)),
2329  unpack_32_1x128 (d)));
2330  w--;
2331  }
2332 
2333  while (w >= 4)
2334  {
2335  xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2336  xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2337  xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2338 
2339  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2340  unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2341  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2342 
2343  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2344  &xmm_mask_lo, &xmm_mask_hi,
2345  &xmm_src_lo, &xmm_src_hi);
2346 
2347  save_128_aligned (
2348  (__m128i*)pd, pack_2x128_128 (
2349  _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2350  _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2351 
2352  ps += 4;
2353  pd += 4;
2354  pm += 4;
2355  w -= 4;
2356  }
2357 
2358  while (w)
2359  {
2360  s = *ps++;
2361  m = *pm++;
2362  d = *pd;
2363 
2364  *pd++ = pack_1x128_32 (
2365  _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2366  unpack_32_1x128 (m)),
2367  unpack_32_1x128 (d)));
2368  w--;
2369  }
2370 }
2371 
2372 static force_inline __m128i
2373 create_mask_16_128 (uint16_t mask)
2374 {
2375  return _mm_set1_epi16 (mask);
2376 }
2377 
2378 /* Work around a code generation bug in Sun Studio 12. */
2379 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2380 # define create_mask_2x32_128(mask0, mask1) \
2381  (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2382 #else
2383 static force_inline __m128i
2384 create_mask_2x32_128 (uint32_t mask0,
2385  uint32_t mask1)
2386 {
2387  return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2388 }
2389 #endif
2390 
2391 static void
2392 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2393  pixman_composite_info_t *info)
2394 {
2395  PIXMAN_COMPOSITE_ARGS (info);
2396  uint32_t src;
2397  uint32_t *dst_line, *dst, d;
2398  int32_t w;
2399  int dst_stride;
2400  __m128i xmm_src, xmm_alpha;
2401  __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2402 
2403  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2404 
2405  if (src == 0)
2406  return;
2407 
2408  PIXMAN_IMAGE_GET_LINE (
2409  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2410 
2411  xmm_src = expand_pixel_32_1x128 (src);
2412  xmm_alpha = expand_alpha_1x128 (xmm_src);
2413 
2414  while (height--)
2415  {
2416  dst = dst_line;
2417 
2418  dst_line += dst_stride;
2419  w = width;
2420 
2421  while (w && (uintptr_t)dst & 15)
2422  {
2423  d = *dst;
2424  *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2425  xmm_alpha,
2426  unpack_32_1x128 (d)));
2427  w--;
2428  }
2429 
2430  while (w >= 4)
2431  {
2432  xmm_dst = load_128_aligned ((__m128i*)dst);
2433 
2434  unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2435 
2436  over_2x128 (&xmm_src, &xmm_src,
2437  &xmm_alpha, &xmm_alpha,
2438  &xmm_dst_lo, &xmm_dst_hi);
2439 
2440  /* rebuild the 4 pixel data and save */
2441  save_128_aligned (
2442  (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2443 
2444  w -= 4;
2445  dst += 4;
2446  }
2447 
2448  while (w)
2449  {
2450  d = *dst;
2451  *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2452  xmm_alpha,
2453  unpack_32_1x128 (d)));
2454  w--;
2455  }
2456 
2457  }
2458 }
2459 
2460 static void
2461 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2462  pixman_composite_info_t *info)
2463 {
2464  PIXMAN_COMPOSITE_ARGS (info);
2465  uint32_t src;
2466  uint16_t *dst_line, *dst, d;
2467  int32_t w;
2468  int dst_stride;
2469  __m128i xmm_src, xmm_alpha;
2470  __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2471 
2472  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2473 
2474  if (src == 0)
2475  return;
2476 
2477  PIXMAN_IMAGE_GET_LINE (
2478  dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2479 
2480  xmm_src = expand_pixel_32_1x128 (src);
2481  xmm_alpha = expand_alpha_1x128 (xmm_src);
2482 
2483  while (height--)
2484  {
2485  dst = dst_line;
2486 
2487  dst_line += dst_stride;
2488  w = width;
2489 
2490  while (w && (uintptr_t)dst & 15)
2491  {
2492  d = *dst;
2493 
2494  *dst++ = pack_565_32_16 (
2495  pack_1x128_32 (over_1x128 (xmm_src,
2496  xmm_alpha,
2497  expand565_16_1x128 (d))));
2498  w--;
2499  }
2500 
2501  while (w >= 8)
2502  {
2503  xmm_dst = load_128_aligned ((__m128i*)dst);
2504 
2505  unpack_565_128_4x128 (xmm_dst,
2506  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2507 
2508  over_2x128 (&xmm_src, &xmm_src,
2509  &xmm_alpha, &xmm_alpha,
2510  &xmm_dst0, &xmm_dst1);
2511  over_2x128 (&xmm_src, &xmm_src,
2512  &xmm_alpha, &xmm_alpha,
2513  &xmm_dst2, &xmm_dst3);
2514 
2515  xmm_dst = pack_565_4x128_128 (
2516  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2517 
2518  save_128_aligned ((__m128i*)dst, xmm_dst);
2519 
2520  dst += 8;
2521  w -= 8;
2522  }
2523 
2524  while (w--)
2525  {
2526  d = *dst;
2527  *dst++ = pack_565_32_16 (
2528  pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2529  expand565_16_1x128 (d))));
2530  }
2531  }
2532 
2533 }
2534 
2535 static void
2536 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2537  pixman_composite_info_t *info)
2538 {
2539  PIXMAN_COMPOSITE_ARGS (info);
2540  uint32_t src;
2541  uint32_t *dst_line, d;
2542  uint32_t *mask_line, m;
2543  uint32_t pack_cmp;
2544  int dst_stride, mask_stride;
2545 
2546  __m128i xmm_src;
2547  __m128i xmm_dst;
2548  __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2549 
2550  __m128i mmx_src, mmx_mask, mmx_dest;
2551 
2552  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2553 
2554  if (src == 0)
2555  return;
2556 
2557  PIXMAN_IMAGE_GET_LINE (
2558  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2559  PIXMAN_IMAGE_GET_LINE (
2560  mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2561 
2562  xmm_src = _mm_unpacklo_epi8 (
2563  create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2564  mmx_src = xmm_src;
2565 
2566  while (height--)
2567  {
2568  int w = width;
2569  const uint32_t *pm = (uint32_t *)mask_line;
2570  uint32_t *pd = (uint32_t *)dst_line;
2571 
2572  dst_line += dst_stride;
2573  mask_line += mask_stride;
2574 
2575  while (w && (uintptr_t)pd & 15)
2576  {
2577  m = *pm++;
2578 
2579  if (m)
2580  {
2581  d = *pd;
2582 
2583  mmx_mask = unpack_32_1x128 (m);
2584  mmx_dest = unpack_32_1x128 (d);
2585 
2586  *pd = pack_1x128_32 (
2587  _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2588  mmx_dest));
2589  }
2590 
2591  pd++;
2592  w--;
2593  }
2594 
2595  while (w >= 4)
2596  {
2597  xmm_mask = load_128_unaligned ((__m128i*)pm);
2598 
2599  pack_cmp =
2600  _mm_movemask_epi8 (
2601  _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2602 
2603  /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2604  if (pack_cmp != 0xffff)
2605  {
2606  xmm_dst = load_128_aligned ((__m128i*)pd);
2607 
2608  unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2609 
2610  pix_multiply_2x128 (&xmm_src, &xmm_src,
2611  &xmm_mask_lo, &xmm_mask_hi,
2612  &xmm_mask_lo, &xmm_mask_hi);
2613  xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2614 
2615  save_128_aligned (
2616  (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2617  }
2618 
2619  pd += 4;
2620  pm += 4;
2621  w -= 4;
2622  }
2623 
2624  while (w)
2625  {
2626  m = *pm++;
2627 
2628  if (m)
2629  {
2630  d = *pd;
2631 
2632  mmx_mask = unpack_32_1x128 (m);
2633  mmx_dest = unpack_32_1x128 (d);
2634 
2635  *pd = pack_1x128_32 (
2636  _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2637  mmx_dest));
2638  }
2639 
2640  pd++;
2641  w--;
2642  }
2643  }
2644 
2645 }
2646 
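 /* Component-alpha OVER of a solid color through an a8r8g8b8 mask onto an
  * a8r8g8b8 destination; the vector loop uses a compare + movemask test so
  * fully transparent four-pixel groups are skipped. */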
2647 static void
2648 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2649  pixman_composite_info_t *info)
2650 {
2651  PIXMAN_COMPOSITE_ARGS (info);
2652  uint32_t src;
2653  uint32_t *dst_line, d;
2654  uint32_t *mask_line, m;
2655  uint32_t pack_cmp;
2656  int dst_stride, mask_stride;
2657 
2658  __m128i xmm_src, xmm_alpha;
2659  __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2660  __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2661 
2662  __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2663 
2664  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2665 
2666  if (src == 0)
2667  return;
2668 
2669  PIXMAN_IMAGE_GET_LINE (
2670  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2671  PIXMAN_IMAGE_GET_LINE (
2672  mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2673 
2674  xmm_src = _mm_unpacklo_epi8 (
2675  create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2676  xmm_alpha = expand_alpha_1x128 (xmm_src);
2677  mmx_src = xmm_src;
2678  mmx_alpha = xmm_alpha;
2679 
2680  while (height--)
2681  {
2682  int w = width;
2683  const uint32_t *pm = (uint32_t *)mask_line;
2684  uint32_t *pd = (uint32_t *)dst_line;
2685 
2686  dst_line += dst_stride;
2687  mask_line += mask_stride;
2688 
2689  while (w && (uintptr_t)pd & 15)
2690  {
2691  m = *pm++;
2692 
2693  if (m)
2694  {
2695  d = *pd;
2696  mmx_mask = unpack_32_1x128 (m);
2697  mmx_dest = unpack_32_1x128 (d);
2698 
2699  *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2700  &mmx_alpha,
2701  &mmx_mask,
2702  &mmx_dest));
2703  }
2704 
2705  pd++;
2706  w--;
2707  }
2708 
2709  while (w >= 4)
2710  {
2711  xmm_mask = load_128_unaligned ((__m128i*)pm);
2712 
2713  pack_cmp =
2714  _mm_movemask_epi8 (
2715  _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2716 
2717  /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2718  if (pack_cmp != 0xffff)
2719  {
2720  xmm_dst = load_128_aligned ((__m128i*)pd);
2721 
2722  unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2723  unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2724 
2725  in_over_2x128 (&xmm_src, &xmm_src,
2726  &xmm_alpha, &xmm_alpha,
2727  &xmm_mask_lo, &xmm_mask_hi,
2728  &xmm_dst_lo, &xmm_dst_hi);
2729 
2730  save_128_aligned (
2731  (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2732  }
2733 
2734  pd += 4;
2735  pm += 4;
2736  w -= 4;
2737  }
2738 
2739  while (w)
2740  {
2741  m = *pm++;
2742 
2743  if (m)
2744  {
2745  d = *pd;
2746  mmx_mask = unpack_32_1x128 (m);
2747  mmx_dest = unpack_32_1x128 (d);
2748 
2749  *pd = pack_1x128_32 (
2750  in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2751  }
2752 
2753  pd++;
2754  w--;
2755  }
2756  }
2757 
2758 }
2759 
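 /* OVER an a8r8g8b8 source onto an a8r8g8b8 destination, attenuated by the
  * single alpha value of a solid mask (mask >> 24); all-zero source groups
  * of four pixels are skipped via is_zero(). */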
2760 static void
2761 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2762  pixman_composite_info_t *info)
2763 {
2764  PIXMAN_COMPOSITE_ARGS (info);
2765  uint32_t *dst_line, *dst;
2766  uint32_t *src_line, *src;
2767  uint32_t mask;
2768  int32_t w;
2769  int dst_stride, src_stride;
2770 
2771  __m128i xmm_mask;
2772  __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2773  __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2774  __m128i xmm_alpha_lo, xmm_alpha_hi;
2775 
2776  PIXMAN_IMAGE_GET_LINE (
2777  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2778  PIXMAN_IMAGE_GET_LINE (
2779  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2780 
2781  mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2782 
2783  xmm_mask = create_mask_16_128 (mask >> 24);
2784 
2785  while (height--)
2786  {
2787  dst = dst_line;
2788  dst_line += dst_stride;
2789  src = src_line;
2790  src_line += src_stride;
2791  w = width;
2792 
2793  while (w && (uintptr_t)dst & 15)
2794  {
2795  uint32_t s = *src++;
2796 
2797  if (s)
2798  {
2799  uint32_t d = *dst;
2800 
2801  __m128i ms = unpack_32_1x128 (s);
2802  __m128i alpha = expand_alpha_1x128 (ms);
2803  __m128i dest = xmm_mask;
2804  __m128i alpha_dst = unpack_32_1x128 (d);
2805 
2806  *dst = pack_1x128_32 (
2807  in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
2808  }
2809  dst++;
2810  w--;
2811  }
2812 
2813  while (w >= 4)
2814  {
2815  xmm_src = load_128_unaligned ((__m128i*)src);
2816 
2817  if (!is_zero (xmm_src))
2818  {
2819  xmm_dst = load_128_aligned ((__m128i*)dst);
2820 
2821  unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2822  unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2823  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2824  &xmm_alpha_lo, &xmm_alpha_hi);
2825 
2826  in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2827  &xmm_alpha_lo, &xmm_alpha_hi,
2828  &xmm_mask, &xmm_mask,
2829  &xmm_dst_lo, &xmm_dst_hi);
2830 
2831  save_128_aligned (
2832  (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2833  }
2834 
2835  dst += 4;
2836  src += 4;
2837  w -= 4;
2838  }
2839 
2840  while (w)
2841  {
2842  uint32_t s = *src++;
2843 
2844  if (s)
2845  {
2846  uint32_t d = *dst;
2847 
2848  __m128i ms = unpack_32_1x128 (s);
2849  __m128i alpha = expand_alpha_1x128 (ms);
2850  __m128i mask = xmm_mask;
2851  __m128i dest = unpack_32_1x128 (d);
2852 
2853  *dst = pack_1x128_32 (
2854  in_over_1x128 (&ms, &alpha, &mask, &dest));
2855  }
2856 
2857  dst++;
2858  w--;
2859  }
2860  }
2861 
2862 }
2863 
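 /* SRC conversion from x8r8g8b8 to r5g6b5: pixels are repacked eight at a
  * time with pack_565_2packedx128_128, no blending involved. */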
2864 static void
2865 sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
2866  pixman_composite_info_t *info)
2867 {
2868  PIXMAN_COMPOSITE_ARGS (info);
2869  uint16_t *dst_line, *dst;
2870  uint32_t *src_line, *src, s;
2871  int dst_stride, src_stride;
2872  int32_t w;
2873 
2874  PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2875  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2876 
2877  while (height--)
2878  {
2879  dst = dst_line;
2880  dst_line += dst_stride;
2881  src = src_line;
2882  src_line += src_stride;
2883  w = width;
2884 
2885  while (w && (uintptr_t)dst & 15)
2886  {
2887  s = *src++;
2888  *dst = convert_8888_to_0565 (s);
2889  dst++;
2890  w--;
2891  }
2892 
2893  while (w >= 8)
2894  {
2895  __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
2896  __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
2897 
2898  save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
2899 
2900  w -= 8;
2901  src += 8;
2902  dst += 8;
2903  }
2904 
2905  while (w)
2906  {
2907  s = *src++;
2908  *dst = convert_8888_to_0565 (s);
2909  dst++;
2910  w--;
2911  }
2912  }
2913 }
2914 
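 /* SRC copy from x8r8g8b8 to a8r8g8b8: the alpha byte is forced to 0xff
  * while sixteen pixels are copied per vector iteration. */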
2915 static void
2916 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2917  pixman_composite_info_t *info)
2918 {
2919  PIXMAN_COMPOSITE_ARGS (info);
2920  uint32_t *dst_line, *dst;
2921  uint32_t *src_line, *src;
2922  int32_t w;
2923  int dst_stride, src_stride;
2924 
2925 
2926  PIXMAN_IMAGE_GET_LINE (
2927  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2928  PIXMAN_IMAGE_GET_LINE (
2929  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2930 
2931  while (height--)
2932  {
2933  dst = dst_line;
2934  dst_line += dst_stride;
2935  src = src_line;
2936  src_line += src_stride;
2937  w = width;
2938 
2939  while (w && (uintptr_t)dst & 15)
2940  {
2941  *dst++ = *src++ | 0xff000000;
2942  w--;
2943  }
2944 
2945  while (w >= 16)
2946  {
2947  __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2948 
2949  xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2950  xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2951  xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2952  xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2953 
2954  save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2955  save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2956  save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2957  save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2958 
2959  dst += 16;
2960  src += 16;
2961  w -= 16;
2962  }
2963 
2964  while (w)
2965  {
2966  *dst++ = *src++ | 0xff000000;
2967  w--;
2968  }
2969  }
2970 
2971 }
2972 
2973 static void
2974 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2975  pixman_composite_info_t *info)
2976 {
2977  PIXMAN_COMPOSITE_ARGS (info);
2978  uint32_t *dst_line, *dst;
2979  uint32_t *src_line, *src;
2980  uint32_t mask;
2981  int dst_stride, src_stride;
2982  int32_t w;
2983 
2984  __m128i xmm_mask, xmm_alpha;
2985  __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2986  __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2987 
2988  PIXMAN_IMAGE_GET_LINE (
2989  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2990  PIXMAN_IMAGE_GET_LINE (
2991  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2992 
2993  mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2994 
2995  xmm_mask = create_mask_16_128 (mask >> 24);
2996  xmm_alpha = mask_00ff;
2997 
2998  while (height--)
2999  {
3000  dst = dst_line;
3001  dst_line += dst_stride;
3002  src = src_line;
3003  src_line += src_stride;
3004  w = width;
3005 
3006  while (w && (uintptr_t)dst & 15)
3007  {
3008  uint32_t s = (*src++) | 0xff000000;
3009  uint32_t d = *dst;
3010 
3011  __m128i src = unpack_32_1x128 (s);
3012  __m128i alpha = xmm_alpha;
3013  __m128i mask = xmm_mask;
3014  __m128i dest = unpack_32_1x128 (d);
3015 
3016  *dst++ = pack_1x128_32 (
3017  in_over_1x128 (&src, &alpha, &mask, &dest));
3018 
3019  w--;
3020  }
3021 
3022  while (w >= 4)
3023  {
3024  xmm_src = _mm_or_si128 (
3025  load_128_unaligned ((__m128i*)src), mask_ff000000);
3026  xmm_dst = load_128_aligned ((__m128i*)dst);
3027 
3028  unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3029  unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3030 
3031  in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3032  &xmm_alpha, &xmm_alpha,
3033  &xmm_mask, &xmm_mask,
3034  &xmm_dst_lo, &xmm_dst_hi);
3035 
3036  save_128_aligned (
3037  (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3038 
3039  dst += 4;
3040  src += 4;
3041  w -= 4;
3042 
3043  }
3044 
3045  while (w)
3046  {
3047  uint32_t s = (*src++) | 0xff000000;
3048  uint32_t d = *dst;
3049 
3050  __m128i src = unpack_32_1x128 (s);
3051  __m128i alpha = xmm_alpha;
3052  __m128i mask = xmm_mask;
3053  __m128i dest = unpack_32_1x128 (d);
3054 
3055  *dst++ = pack_1x128_32 (
3056  in_over_1x128 (&src, &alpha, &mask, &dest));
3057 
3058  w--;
3059  }
3060  }
3061 
3062 }
3063 
3064 static void
3065 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3066  pixman_composite_info_t *info)
3067 {
3068  PIXMAN_COMPOSITE_ARGS (info);
3069  int dst_stride, src_stride;
3070  uint32_t *dst_line, *dst;
3071  uint32_t *src_line, *src;
3072 
3073  PIXMAN_IMAGE_GET_LINE (
3074  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3075  PIXMAN_IMAGE_GET_LINE (
3076  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3077 
3078  dst = dst_line;
3079  src = src_line;
3080 
3081  while (height--)
3082  {
3083  sse2_combine_over_u (imp, op, dst, src, NULL, width);
3084 
3085  dst += dst_stride;
3086  src += src_stride;
3087  }
3088 }
3089 
3090 static force_inline uint16_t
3091 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3092 {
3093  __m128i ms;
3094 
3095  ms = unpack_32_1x128 (src);
3096  return pack_565_32_16 (
3097  pack_1x128_32 (
3098  over_1x128 (
3099  ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3100 }
3101 
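 /* OVER an a8r8g8b8 source onto an r5g6b5 destination.  The vector loop
  * unpacks eight 565 pixels into four 8888 vectors, blends them in two
  * halves, and packs the result back to 565. */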
3102 static void
3103 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3104  pixman_composite_info_t *info)
3105 {
3106  PIXMAN_COMPOSITE_ARGS (info);
3107  uint16_t *dst_line, *dst, d;
3108  uint32_t *src_line, *src, s;
3109  int dst_stride, src_stride;
3110  int32_t w;
3111 
3112  __m128i xmm_alpha_lo, xmm_alpha_hi;
3113  __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3114  __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3115 
3116  PIXMAN_IMAGE_GET_LINE (
3117  dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3118  PIXMAN_IMAGE_GET_LINE (
3119  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3120 
3121  while (height--)
3122  {
3123  dst = dst_line;
3124  src = src_line;
3125 
3126  dst_line += dst_stride;
3127  src_line += src_stride;
3128  w = width;
3129 
3130  /* Align dst on a 16-byte boundary */
3131  while (w &&
3132  ((uintptr_t)dst & 15))
3133  {
3134  s = *src++;
3135  d = *dst;
3136 
3137  *dst++ = composite_over_8888_0565pixel (s, d);
3138  w--;
3139  }
3140 
3141  /* It's an 8-pixel loop */
3142  while (w >= 8)
3143  {
3144  /* I'm loading unaligned because I'm not sure
3145  * about the address alignment.
3146  */
3147  xmm_src = load_128_unaligned ((__m128i*) src);
3148  xmm_dst = load_128_aligned ((__m128i*) dst);
3149 
3150  /* Unpacking */
3151  unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3152  unpack_565_128_4x128 (xmm_dst,
3153  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3154  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3155  &xmm_alpha_lo, &xmm_alpha_hi);
3156 
3157  /* Load the next 4 pixels from memory
3158  * ahead of time to optimize the memory read.
3159  */
3160  xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3161 
3162  over_2x128 (&xmm_src_lo, &xmm_src_hi,
3163  &xmm_alpha_lo, &xmm_alpha_hi,
3164  &xmm_dst0, &xmm_dst1);
3165 
3166  /* Unpacking */
3167  unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3168  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3169  &xmm_alpha_lo, &xmm_alpha_hi);
3170 
3171  over_2x128 (&xmm_src_lo, &xmm_src_hi,
3172  &xmm_alpha_lo, &xmm_alpha_hi,
3173  &xmm_dst2, &xmm_dst3);
3174 
3175  save_128_aligned (
3176  (__m128i*)dst, pack_565_4x128_128 (
3177  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3178 
3179  w -= 8;
3180  dst += 8;
3181  src += 8;
3182  }
3183 
3184  while (w--)
3185  {
3186  s = *src++;
3187  d = *dst;
3188 
3189  *dst++ = composite_over_8888_0565pixel (s, d);
3190  }
3191  }
3192 
3193 }
3194 
3195 static void
3196 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3197  pixman_composite_info_t *info)
3198 {
3199  PIXMAN_COMPOSITE_ARGS (info);
3200  uint32_t src, srca;
3201  uint32_t *dst_line, *dst;
3202  uint8_t *mask_line, *mask;
3203  int dst_stride, mask_stride;
3204  int32_t w;
3205  uint32_t m, d;
3206 
3207  __m128i xmm_src, xmm_alpha, xmm_def;
3208  __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3209  __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3210 
3211  __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3212 
3213  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3214 
3215  srca = src >> 24;
3216  if (src == 0)
3217  return;
3218 
3219  PIXMAN_IMAGE_GET_LINE (
3220  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3221  PIXMAN_IMAGE_GET_LINE (
3222  mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3223 
3224  xmm_def = create_mask_2x32_128 (src, src);
3225  xmm_src = expand_pixel_32_1x128 (src);
3226  xmm_alpha = expand_alpha_1x128 (xmm_src);
3227  mmx_src = xmm_src;
3228  mmx_alpha = xmm_alpha;
3229 
3230  while (height--)
3231  {
3232  dst = dst_line;
3233  dst_line += dst_stride;
3234  mask = mask_line;
3235  mask_line += mask_stride;
3236  w = width;
3237 
3238  while (w && (uintptr_t)dst & 15)
3239  {
3240  uint8_t m = *mask++;
3241 
3242  if (m)
3243  {
3244  d = *dst;
3245  mmx_mask = expand_pixel_8_1x128 (m);
3246  mmx_dest = unpack_32_1x128 (d);
3247 
3248  *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3249  &mmx_alpha,
3250  &mmx_mask,
3251  &mmx_dest));
3252  }
3253 
3254  w--;
3255  dst++;
3256  }
3257 
3258  while (w >= 4)
3259  {
3260  memcpy(&m, mask, sizeof(uint32_t));
3261 
3262  if (srca == 0xff && m == 0xffffffff)
3263  {
3264  save_128_aligned ((__m128i*)dst, xmm_def);
3265  }
3266  else if (m)
3267  {
3268  xmm_dst = load_128_aligned ((__m128i*) dst);
3269  xmm_mask = unpack_32_1x128 (m);
3270  xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3271 
3272  /* Unpacking */
3273  unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3274  unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3275 
3276  expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3277  &xmm_mask_lo, &xmm_mask_hi);
3278 
3279  in_over_2x128 (&xmm_src, &xmm_src,
3280  &xmm_alpha, &xmm_alpha,
3281  &xmm_mask_lo, &xmm_mask_hi,
3282  &xmm_dst_lo, &xmm_dst_hi);
3283 
3284  save_128_aligned (
3285  (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3286  }
3287 
3288  w -= 4;
3289  dst += 4;
3290  mask += 4;
3291  }
3292 
3293  while (w)
3294  {
3295  uint8_t m = *mask++;
3296 
3297  if (m)
3298  {
3299  d = *dst;
3300  mmx_mask = expand_pixel_8_1x128 (m);
3301  mmx_dest = unpack_32_1x128 (d);
3302 
3303  *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3304  &mmx_alpha,
3305  &mmx_mask,
3306  &mmx_dest));
3307  }
3308 
3309  w--;
3310  dst++;
3311  }
3312  }
3313 
3314 }
3315 
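 /* sse2_fill(): fill a rectangle with a constant value at 8, 16 or 32 bpp.
  * The filler is replicated to 32 bits, the destination pointer is aligned
  * to 16 bytes with byte/word/dword stores, and the bulk of each row is
  * written in 128-byte runs of aligned 16-byte stores. */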
3316 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
3317 __attribute__((__force_align_arg_pointer__))
3318 #endif
3319 static pixman_bool_t
3320 sse2_fill (pixman_implementation_t *imp,
3321  uint32_t * bits,
3322  int stride,
3323  int bpp,
3324  int x,
3325  int y,
3326  int width,
3327  int height,
3328  uint32_t filler)
3329 {
3330  uint32_t byte_width;
3331  uint8_t *byte_line;
3332 
3333  __m128i xmm_def;
3334 
3335  if (bpp == 8)
3336  {
3337  uint32_t b;
3338  uint32_t w;
3339 
3340  stride = stride * (int) sizeof (uint32_t) / 1;
3341  byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3342  byte_width = width;
3343  stride *= 1;
3344 
3345  b = filler & 0xff;
3346  w = (b << 8) | b;
3347  filler = (w << 16) | w;
3348  }
3349  else if (bpp == 16)
3350  {
3351  stride = stride * (int) sizeof (uint32_t) / 2;
3352  byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3353  byte_width = 2 * width;
3354  stride *= 2;
3355 
3356  filler = (filler & 0xffff) * 0x00010001;
3357  }
3358  else if (bpp == 32)
3359  {
3360  stride = stride * (int) sizeof (uint32_t) / 4;
3361  byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3362  byte_width = 4 * width;
3363  stride *= 4;
3364  }
3365  else
3366  {
3367  return FALSE;
3368  }
3369 
3370  xmm_def = create_mask_2x32_128 (filler, filler);
3371 
3372  while (height--)
3373  {
3374  int w;
3375  uint8_t *d = byte_line;
3376  byte_line += stride;
3377  w = byte_width;
3378 
3379  if (w >= 1 && ((uintptr_t)d & 1))
3380  {
3381  *(uint8_t *)d = filler;
3382  w -= 1;
3383  d += 1;
3384  }
3385 
3386  while (w >= 2 && ((uintptr_t)d & 3))
3387  {
3388  *(uint16_t *)d = filler;
3389  w -= 2;
3390  d += 2;
3391  }
3392 
3393  while (w >= 4 && ((uintptr_t)d & 15))
3394  {
3395  *(uint32_t *)d = filler;
3396 
3397  w -= 4;
3398  d += 4;
3399  }
3400 
3401  while (w >= 128)
3402  {
3403  save_128_aligned ((__m128i*)(d), xmm_def);
3404  save_128_aligned ((__m128i*)(d + 16), xmm_def);
3405  save_128_aligned ((__m128i*)(d + 32), xmm_def);
3406  save_128_aligned ((__m128i*)(d + 48), xmm_def);
3407  save_128_aligned ((__m128i*)(d + 64), xmm_def);
3408  save_128_aligned ((__m128i*)(d + 80), xmm_def);
3409  save_128_aligned ((__m128i*)(d + 96), xmm_def);
3410  save_128_aligned ((__m128i*)(d + 112), xmm_def);
3411 
3412  d += 128;
3413  w -= 128;
3414  }
3415 
3416  if (w >= 64)
3417  {
3418  save_128_aligned ((__m128i*)(d), xmm_def);
3419  save_128_aligned ((__m128i*)(d + 16), xmm_def);
3420  save_128_aligned ((__m128i*)(d + 32), xmm_def);
3421  save_128_aligned ((__m128i*)(d + 48), xmm_def);
3422 
3423  d += 64;
3424  w -= 64;
3425  }
3426 
3427  if (w >= 32)
3428  {
3429  save_128_aligned ((__m128i*)(d), xmm_def);
3430  save_128_aligned ((__m128i*)(d + 16), xmm_def);
3431 
3432  d += 32;
3433  w -= 32;
3434  }
3435 
3436  if (w >= 16)
3437  {
3438  save_128_aligned ((__m128i*)(d), xmm_def);
3439 
3440  d += 16;
3441  w -= 16;
3442  }
3443 
3444  while (w >= 4)
3445  {
3446  *(uint32_t *)d = filler;
3447 
3448  w -= 4;
3449  d += 4;
3450  }
3451 
3452  if (w >= 2)
3453  {
3454  *(uint16_t *)d = filler;
3455  w -= 2;
3456  d += 2;
3457  }
3458 
3459  if (w >= 1)
3460  {
3461  *(uint8_t *)d = filler;
3462  w -= 1;
3463  d += 1;
3464  }
3465  }
3466 
3467  return TRUE;
3468 }
3469 
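 /* SRC of a solid color through an a8 mask into an a8r8g8b8 destination:
  * dst = src * mask, with zero stored where the mask is zero and the
  * unmodified solid value stored where both srca and the mask are 0xff. */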
3470 static void
3471 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3472  pixman_composite_info_t *info)
3473 {
3474  PIXMAN_COMPOSITE_ARGS (info);
3475  uint32_t src, srca;
3476  uint32_t *dst_line, *dst;
3477  uint8_t *mask_line, *mask;
3478  int dst_stride, mask_stride;
3479  int32_t w;
3480  uint32_t m;
3481 
3482  __m128i xmm_src, xmm_def;
3483  __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3484 
3485  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3486 
3487  srca = src >> 24;
3488  if (src == 0)
3489  {
3490  sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
3491  PIXMAN_FORMAT_BPP (dest_image->bits.format),
3492  dest_x, dest_y, width, height, 0);
3493  return;
3494  }
3495 
3496  PIXMAN_IMAGE_GET_LINE (
3497  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3498  PIXMAN_IMAGE_GET_LINE (
3499  mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3500 
3501  xmm_def = create_mask_2x32_128 (src, src);
3502  xmm_src = expand_pixel_32_1x128 (src);
3503 
3504  while (height--)
3505  {
3506  dst = dst_line;
3507  dst_line += dst_stride;
3508  mask = mask_line;
3509  mask_line += mask_stride;
3510  w = width;
3511 
3512  while (w && (uintptr_t)dst & 15)
3513  {
3514  uint8_t m = *mask++;
3515 
3516  if (m)
3517  {
3518  *dst = pack_1x128_32 (
3519  pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3520  }
3521  else
3522  {
3523  *dst = 0;
3524  }
3525 
3526  w--;
3527  dst++;
3528  }
3529 
3530  while (w >= 4)
3531  {
3532  memcpy(&m, mask, sizeof(uint32_t));
3533 
3534  if (srca == 0xff && m == 0xffffffff)
3535  {
3536  save_128_aligned ((__m128i*)dst, xmm_def);
3537  }
3538  else if (m)
3539  {
3540  xmm_mask = unpack_32_1x128 (m);
3541  xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3542 
3543  /* Unpacking */
3544  unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3545 
3546  expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3547  &xmm_mask_lo, &xmm_mask_hi);
3548 
3549  pix_multiply_2x128 (&xmm_src, &xmm_src,
3550  &xmm_mask_lo, &xmm_mask_hi,
3551  &xmm_mask_lo, &xmm_mask_hi);
3552 
3553  save_128_aligned (
3554  (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3555  }
3556  else
3557  {
3558  save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3559  }
3560 
3561  w -= 4;
3562  dst += 4;
3563  mask += 4;
3564  }
3565 
3566  while (w)
3567  {
3568  uint8_t m = *mask++;
3569 
3570  if (m)
3571  {
3572  *dst = pack_1x128_32 (
3573  pix_multiply_1x128 (
3574  xmm_src, expand_pixel_8_1x128 (m)));
3575  }
3576  else
3577  {
3578  *dst = 0;
3579  }
3580 
3581  w--;
3582  dst++;
3583  }
3584  }
3585 
3586 }
3587 
3588 static void
3589 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3590  pixman_composite_info_t *info)
3591 {
3592  PIXMAN_COMPOSITE_ARGS (info);
3593  uint32_t src;
3594  uint16_t *dst_line, *dst, d;
3595  uint8_t *mask_line, *mask;
3596  int dst_stride, mask_stride;
3597  int32_t w;
3598  uint32_t m;
3599  __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3600 
3601  __m128i xmm_src, xmm_alpha;
3602  __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3603  __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3604 
3605  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3606 
3607  if (src == 0)
3608  return;
3609 
3610  PIXMAN_IMAGE_GET_LINE (
3611  dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3612  PIXMAN_IMAGE_GET_LINE (
3613  mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3614 
3615  xmm_src = expand_pixel_32_1x128 (src);
3616  xmm_alpha = expand_alpha_1x128 (xmm_src);
3617  mmx_src = xmm_src;
3618  mmx_alpha = xmm_alpha;
3619 
3620  while (height--)
3621  {
3622  dst = dst_line;
3623  dst_line += dst_stride;
3624  mask = mask_line;
3625  mask_line += mask_stride;
3626  w = width;
3627 
3628  while (w && (uintptr_t)dst & 15)
3629  {
3630  m = *mask++;
3631 
3632  if (m)
3633  {
3634  d = *dst;
3635  mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3636  mmx_dest = expand565_16_1x128 (d);
3637 
3638  *dst = pack_565_32_16 (
3639  pack_1x128_32 (
3640  in_over_1x128 (
3641  &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3642  }
3643 
3644  w--;
3645  dst++;
3646  }
3647 
3648  while (w >= 8)
3649  {
3650  xmm_dst = load_128_aligned ((__m128i*) dst);
3651  unpack_565_128_4x128 (xmm_dst,
3652  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3653 
3654  memcpy(&m, mask, sizeof(uint32_t));
3655  mask += 4;
3656 
3657  if (m)
3658  {
3659  xmm_mask = unpack_32_1x128 (m);
3660  xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3661 
3662  /* Unpacking */
3663  unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3664 
3665  expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3666  &xmm_mask_lo, &xmm_mask_hi);
3667 
3668  in_over_2x128 (&xmm_src, &xmm_src,
3669  &xmm_alpha, &xmm_alpha,
3670  &xmm_mask_lo, &xmm_mask_hi,
3671  &xmm_dst0, &xmm_dst1);
3672  }
3673 
3674  memcpy(&m, mask, sizeof(uint32_t));
3675  mask += 4;
3676 
3677  if (m)
3678  {
3679  xmm_mask = unpack_32_1x128 (m);
3680  xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3681 
3682  /* Unpacking */
3683  unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3684 
3685  expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3686  &xmm_mask_lo, &xmm_mask_hi);
3687  in_over_2x128 (&xmm_src, &xmm_src,
3688  &xmm_alpha, &xmm_alpha,
3689  &xmm_mask_lo, &xmm_mask_hi,
3690  &xmm_dst2, &xmm_dst3);
3691  }
3692 
3693  save_128_aligned (
3694  (__m128i*)dst, pack_565_4x128_128 (
3695  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3696 
3697  w -= 8;
3698  dst += 8;
3699  }
3700 
3701  while (w)
3702  {
3703  m = *mask++;
3704 
3705  if (m)
3706  {
3707  d = *dst;
3708  mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3709  mmx_dest = expand565_16_1x128 (d);
3710 
3711  *dst = pack_565_32_16 (
3712  pack_1x128_32 (
3713  in_over_1x128 (
3714  &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3715  }
3716 
3717  w--;
3718  dst++;
3719  }
3720  }
3721 
3722 }
3723 
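 /* OVER a non-premultiplied (pixbuf) source onto an r5g6b5 destination:
  * opaque source pixels are converted directly with invert_colors,
  * transparent ones are skipped, everything else is blended with
  * over_rev_non_pre. */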
3724 static void
3725 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3726  pixman_composite_info_t *info)
3727 {
3728  PIXMAN_COMPOSITE_ARGS (info);
3729  uint16_t *dst_line, *dst, d;
3730  uint32_t *src_line, *src, s;
3731  int dst_stride, src_stride;
3732  int32_t w;
3733  uint32_t opaque, zero;
3734 
3735  __m128i ms;
3736  __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3737  __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3738 
3739  PIXMAN_IMAGE_GET_LINE (
3740  dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3741  PIXMAN_IMAGE_GET_LINE (
3742  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3743 
3744  while (height--)
3745  {
3746  dst = dst_line;
3747  dst_line += dst_stride;
3748  src = src_line;
3749  src_line += src_stride;
3750  w = width;
3751 
3752  while (w && (uintptr_t)dst & 15)
3753  {
3754  s = *src++;
3755  d = *dst;
3756 
3757  ms = unpack_32_1x128 (s);
3758 
3759  *dst++ = pack_565_32_16 (
3760  pack_1x128_32 (
3761  over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3762  w--;
3763  }
3764 
3765  while (w >= 8)
3766  {
3767  /* First round */
3768  xmm_src = load_128_unaligned ((__m128i*)src);
3769  xmm_dst = load_128_aligned ((__m128i*)dst);
3770 
3771  opaque = is_opaque (xmm_src);
3772  zero = is_zero (xmm_src);
3773 
3774  unpack_565_128_4x128 (xmm_dst,
3775  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3776  unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3777 
3778  /* preload next round */
3779  xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3780 
3781  if (opaque)
3782  {
3783  invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3784  &xmm_dst0, &xmm_dst1);
3785  }
3786  else if (!zero)
3787  {
3788  over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3789  &xmm_dst0, &xmm_dst1);
3790  }
3791 
3792  /* Second round */
3793  opaque = is_opaque (xmm_src);
3794  zero = is_zero (xmm_src);
3795 
3796  unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3797 
3798  if (opaque)
3799  {
3800  invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3801  &xmm_dst2, &xmm_dst3);
3802  }
3803  else if (!zero)
3804  {
3805  over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3806  &xmm_dst2, &xmm_dst3);
3807  }
3808 
3809  save_128_aligned (
3810  (__m128i*)dst, pack_565_4x128_128 (
3811  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3812 
3813  w -= 8;
3814  src += 8;
3815  dst += 8;
3816  }
3817 
3818  while (w)
3819  {
3820  s = *src++;
3821  d = *dst;
3822 
3823  ms = unpack_32_1x128 (s);
3824 
3825  *dst++ = pack_565_32_16 (
3826  pack_1x128_32 (
3827  over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3828  w--;
3829  }
3830  }
3831 
3832 }
3833 
3834 static void
3835 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3836  pixman_composite_info_t *info)
3837 {
3838  PIXMAN_COMPOSITE_ARGS (info);
3839  uint32_t *dst_line, *dst, d;
3840  uint32_t *src_line, *src, s;
3841  int dst_stride, src_stride;
3842  int32_t w;
3843  uint32_t opaque, zero;
3844 
3845  __m128i xmm_src_lo, xmm_src_hi;
3846  __m128i xmm_dst_lo, xmm_dst_hi;
3847 
3848  PIXMAN_IMAGE_GET_LINE (
3849  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3850  PIXMAN_IMAGE_GET_LINE (
3851  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3852 
3853  while (height--)
3854  {
3855  dst = dst_line;
3856  dst_line += dst_stride;
3857  src = src_line;
3858  src_line += src_stride;
3859  w = width;
3860 
3861  while (w && (uintptr_t)dst & 15)
3862  {
3863  s = *src++;
3864  d = *dst;
3865 
3866  *dst++ = pack_1x128_32 (
3867  over_rev_non_pre_1x128 (
3868  unpack_32_1x128 (s), unpack_32_1x128 (d)));
3869 
3870  w--;
3871  }
3872 
3873  while (w >= 4)
3874  {
3875  xmm_src_hi = load_128_unaligned ((__m128i*)src);
3876 
3877  opaque = is_opaque (xmm_src_hi);
3878  zero = is_zero (xmm_src_hi);
3879 
3880  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3881 
3882  if (opaque)
3883  {
3884  invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3885  &xmm_dst_lo, &xmm_dst_hi);
3886 
3887  save_128_aligned (
3888  (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3889  }
3890  else if (!zero)
3891  {
3892  xmm_dst_hi = load_128_aligned ((__m128i*)dst);
3893 
3894  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3895 
3896  over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3897  &xmm_dst_lo, &xmm_dst_hi);
3898 
3899  save_128_aligned (
3900  (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3901  }
3902 
3903  w -= 4;
3904  dst += 4;
3905  src += 4;
3906  }
3907 
3908  while (w)
3909  {
3910  s = *src++;
3911  d = *dst;
3912 
3913  *dst++ = pack_1x128_32 (
3914  over_rev_non_pre_1x128 (
3915  unpack_32_1x128 (s), unpack_32_1x128 (d)));
3916 
3917  w--;
3918  }
3919  }
3920 
3921 }
3922 
3923 static void
3924 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3925  pixman_composite_info_t *info)
3926 {
3927  PIXMAN_COMPOSITE_ARGS (info);
3928  uint32_t src;
3929  uint16_t *dst_line, *dst, d;
3930  uint32_t *mask_line, *mask, m;
3931  int dst_stride, mask_stride;
3932  int w;
3933  uint32_t pack_cmp;
3934 
3935  __m128i xmm_src, xmm_alpha;
3936  __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3937  __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3938 
3939  __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3940 
3941  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3942 
3943  if (src == 0)
3944  return;
3945 
3946  PIXMAN_IMAGE_GET_LINE (
3947  dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3948  PIXMAN_IMAGE_GET_LINE (
3949  mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3950 
3951  xmm_src = expand_pixel_32_1x128 (src);
3952  xmm_alpha = expand_alpha_1x128 (xmm_src);
3953  mmx_src = xmm_src;
3954  mmx_alpha = xmm_alpha;
3955 
3956  while (height--)
3957  {
3958  w = width;
3959  mask = mask_line;
3960  dst = dst_line;
3961  mask_line += mask_stride;
3962  dst_line += dst_stride;
3963 
3964  while (w && ((uintptr_t)dst & 15))
3965  {
3966  m = *(uint32_t *) mask;
3967 
3968  if (m)
3969  {
3970  d = *dst;
3971  mmx_mask = unpack_32_1x128 (m);
3972  mmx_dest = expand565_16_1x128 (d);
3973 
3974  *dst = pack_565_32_16 (
3975  pack_1x128_32 (
3976  in_over_1x128 (
3977  &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3978  }
3979 
3980  w--;
3981  dst++;
3982  mask++;
3983  }
3984 
3985  while (w >= 8)
3986  {
3987  /* First round */
3988  xmm_mask = load_128_unaligned ((__m128i*)mask);
3989  xmm_dst = load_128_aligned ((__m128i*)dst);
3990 
3991  pack_cmp = _mm_movemask_epi8 (
3992  _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3993 
3994  unpack_565_128_4x128 (xmm_dst,
3995  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3996  unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3997 
3998  /* preload next round */
3999  xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4000 
4001  /* blend the first four pixels unless their mask is all zero */
4002  if (pack_cmp != 0xffff)
4003  {
4004  in_over_2x128 (&xmm_src, &xmm_src,
4005  &xmm_alpha, &xmm_alpha,
4006  &xmm_mask_lo, &xmm_mask_hi,
4007  &xmm_dst0, &xmm_dst1);
4008  }
4009 
4010  /* Second round */
4011  pack_cmp = _mm_movemask_epi8 (
4012  _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4013 
4014  unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4015 
4016  if (pack_cmp != 0xffff)
4017  {
4018  in_over_2x128 (&xmm_src, &xmm_src,
4019  &xmm_alpha, &xmm_alpha,
4020  &xmm_mask_lo, &xmm_mask_hi,
4021  &xmm_dst2, &xmm_dst3);
4022  }
4023 
4024  save_128_aligned (
4025  (__m128i*)dst, pack_565_4x128_128 (
4026  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4027 
4028  w -= 8;
4029  dst += 8;
4030  mask += 8;
4031  }
4032 
4033  while (w)
4034  {
4035  m = *(uint32_t *) mask;
4036 
4037  if (m)
4038  {
4039  d = *dst;
4040  mmx_mask = unpack_32_1x128 (m);
4041  mmx_dest = expand565_16_1x128 (d);
4042 
4043  *dst = pack_565_32_16 (
4044  pack_1x128_32 (
4045  in_over_1x128 (
4046  &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4047  }
4048 
4049  w--;
4050  dst++;
4051  mask++;
4052  }
4053  }
4054 
4055 }
4056 
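 /* IN of a solid color's alpha through an a8 mask into an a8 destination:
  * dst = dst * (srca * mask), sixteen 8-bit pixels per vector iteration. */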
4057 static void
4058 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4059  pixman_composite_info_t *info)
4060 {
4061  PIXMAN_COMPOSITE_ARGS (info);
4062  uint8_t *dst_line, *dst;
4063  uint8_t *mask_line, *mask;
4064  int dst_stride, mask_stride;
4065  uint32_t d, m;
4066  uint32_t src;
4067  int32_t w;
4068 
4069  __m128i xmm_alpha;
4070  __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4071  __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4072 
4073  PIXMAN_IMAGE_GET_LINE (
4074  dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4075  PIXMAN_IMAGE_GET_LINE (
4076  mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4077 
4078  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4079 
4080  xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4081 
4082  while (height--)
4083  {
4084  dst = dst_line;
4085  dst_line += dst_stride;
4086  mask = mask_line;
4087  mask_line += mask_stride;
4088  w = width;
4089 
4090  while (w && ((uintptr_t)dst & 15))
4091  {
4092  m = (uint32_t) *mask++;
4093  d = (uint32_t) *dst;
4094 
4095  *dst++ = (uint8_t) pack_1x128_32 (
4096  pix_multiply_1x128 (
4097  pix_multiply_1x128 (xmm_alpha,
4098  unpack_32_1x128 (m)),
4099  unpack_32_1x128 (d)));
4100  w--;
4101  }
4102 
4103  while (w >= 16)
4104  {
4105  xmm_mask = load_128_unaligned ((__m128i*)mask);
4106  xmm_dst = load_128_aligned ((__m128i*)dst);
4107 
4108  unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4109  unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4110 
4111  pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4112  &xmm_mask_lo, &xmm_mask_hi,
4113  &xmm_mask_lo, &xmm_mask_hi);
4114 
4115  pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4116  &xmm_dst_lo, &xmm_dst_hi,
4117  &xmm_dst_lo, &xmm_dst_hi);
4118 
4119  save_128_aligned (
4120  (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4121 
4122  mask += 16;
4123  dst += 16;
4124  w -= 16;
4125  }
4126 
4127  while (w)
4128  {
4129  m = (uint32_t) *mask++;
4130  d = (uint32_t) *dst;
4131 
4132  *dst++ = (uint8_t) pack_1x128_32 (
4133  pix_multiply_1x128 (
4134  pix_multiply_1x128 (
4135  xmm_alpha, unpack_32_1x128 (m)),
4136  unpack_32_1x128 (d)));
4137  w--;
4138  }
4139  }
4140 
4141 }
4142 
4143 static void
4144 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4145  pixman_composite_info_t *info)
4146 {
4147  PIXMAN_COMPOSITE_ARGS (info);
4148  uint8_t *dst_line, *dst;
4149  int dst_stride;
4150  uint32_t d;
4151  uint32_t src;
4152  int32_t w;
4153 
4154  __m128i xmm_alpha;
4155  __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4156 
4157  PIXMAN_IMAGE_GET_LINE (
4158  dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4159 
4160  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4161 
4162  xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4163 
4164  src = src >> 24;
4165 
4166  if (src == 0xff)
4167  return;
4168 
4169  if (src == 0x00)
4170  {
4171  pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4172  8, dest_x, dest_y, width, height, src);
4173 
4174  return;
4175  }
4176 
4177  while (height--)
4178  {
4179  dst = dst_line;
4180  dst_line += dst_stride;
4181  w = width;
4182 
4183  while (w && ((uintptr_t)dst & 15))
4184  {
4185  d = (uint32_t) *dst;
4186 
4187  *dst++ = (uint8_t) pack_1x128_32 (
4188  pix_multiply_1x128 (
4189  xmm_alpha,
4190  unpack_32_1x128 (d)));
4191  w--;
4192  }
4193 
4194  while (w >= 16)
4195  {
4196  xmm_dst = load_128_aligned ((__m128i*)dst);
4197 
4198  unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4199 
4200  pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4201  &xmm_dst_lo, &xmm_dst_hi,
4202  &xmm_dst_lo, &xmm_dst_hi);
4203 
4204  save_128_aligned (
4205  (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4206 
4207  dst += 16;
4208  w -= 16;
4209  }
4210 
4211  while (w)
4212  {
4213  d = (uint32_t) *dst;
4214 
4215  *dst++ = (uint8_t) pack_1x128_32 (
4216  pix_multiply_1x128 (
4217  xmm_alpha,
4218  unpack_32_1x128 (d)));
4219  w--;
4220  }
4221  }
4222 
4223 }
4224 
4225 static void
4226 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4227  pixman_composite_info_t *info)
4228 {
4229  PIXMAN_COMPOSITE_ARGS (info);
4230  uint8_t *dst_line, *dst;
4231  uint8_t *src_line, *src;
4232  int src_stride, dst_stride;
4233  int32_t w;
4234  uint32_t s, d;
4235 
4236  __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4237  __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4238 
4239  PIXMAN_IMAGE_GET_LINE (
4240  dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4241  PIXMAN_IMAGE_GET_LINE (
4242  src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4243 
4244  while (height--)
4245  {
4246  dst = dst_line;
4247  dst_line += dst_stride;
4248  src = src_line;
4249  src_line += src_stride;
4250  w = width;
4251 
4252  while (w && ((uintptr_t)dst & 15))
4253  {
4254  s = (uint32_t) *src++;
4255  d = (uint32_t) *dst;
4256 
4257  *dst++ = (uint8_t) pack_1x128_32 (
4258  pix_multiply_1x128 (
4259  unpack_32_1x128 (s), unpack_32_1x128 (d)));
4260  w--;
4261  }
4262 
4263  while (w >= 16)
4264  {
4265  xmm_src = load_128_unaligned ((__m128i*)src);
4266  xmm_dst = load_128_aligned ((__m128i*)dst);
4267 
4268  unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4269  unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4270 
4271  pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4272  &xmm_dst_lo, &xmm_dst_hi,
4273  &xmm_dst_lo, &xmm_dst_hi);
4274 
4275  save_128_aligned (
4276  (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4277 
4278  src += 16;
4279  dst += 16;
4280  w -= 16;
4281  }
4282 
4283  while (w)
4284  {
4285  s = (uint32_t) *src++;
4286  d = (uint32_t) *dst;
4287 
4288  *dst++ = (uint8_t) pack_1x128_32 (
4289  pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4290  w--;
4291  }
4292  }
4293 
4294 }
4295 
4296 static void
4297 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4298  pixman_composite_info_t *info)
4299 {
4300  PIXMAN_COMPOSITE_ARGS (info);
4301  uint8_t *dst_line, *dst;
4302  uint8_t *mask_line, *mask;
4303  int dst_stride, mask_stride;
4304  int32_t w;
4305  uint32_t src;
4306  uint32_t m, d;
4307 
4308  __m128i xmm_alpha;
4309  __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4310  __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4311 
4312  PIXMAN_IMAGE_GET_LINE (
4313  dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4314  PIXMAN_IMAGE_GET_LINE (
4315  mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4316 
4317  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4318 
4319  xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4320 
4321  while (height--)
4322  {
4323  dst = dst_line;
4324  dst_line += dst_stride;
4325  mask = mask_line;
4326  mask_line += mask_stride;
4327  w = width;
4328 
4329  while (w && ((uintptr_t)dst & 15))
4330  {
4331  m = (uint32_t) *mask++;
4332  d = (uint32_t) *dst;
4333 
4334  *dst++ = (uint8_t) pack_1x128_32 (
4335  _mm_adds_epu16 (
4336  pix_multiply_1x128 (
4337  xmm_alpha, unpack_32_1x128 (m)),
4338  unpack_32_1x128 (d)));
4339  w--;
4340  }
4341 
4342  while (w >= 16)
4343  {
4344  xmm_mask = load_128_unaligned ((__m128i*)mask);
4345  xmm_dst = load_128_aligned ((__m128i*)dst);
4346 
4347  unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4348  unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4349 
4350  pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4351  &xmm_mask_lo, &xmm_mask_hi,
4352  &xmm_mask_lo, &xmm_mask_hi);
4353 
4354  xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4355  xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4356 
4357  save_128_aligned (
4358  (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4359 
4360  mask += 16;
4361  dst += 16;
4362  w -= 16;
4363  }
4364 
4365  while (w)
4366  {
4367  m = (uint32_t) *mask++;
4368  d = (uint32_t) *dst;
4369 
4370  *dst++ = (uint8_t) pack_1x128_32 (
4371  _mm_adds_epu16 (
4372  pix_multiply_1x128 (
4373  xmm_alpha, unpack_32_1x128 (m)),
4374  unpack_32_1x128 (d)));
4375 
4376  w--;
4377  }
4378  }
4379 
4380 }
4381 
4382 static void
4383 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4384  pixman_composite_info_t *info)
4385 {
4386  PIXMAN_COMPOSITE_ARGS (info);
4387  uint8_t *dst_line, *dst;
4388  int dst_stride;
4389  int32_t w;
4390  uint32_t src;
4391 
4392  __m128i xmm_src;
4393 
4394  PIXMAN_IMAGE_GET_LINE (
4395  dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4396 
4397  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4398 
4399  src >>= 24;
4400 
4401  if (src == 0x00)
4402  return;
4403 
4404  if (src == 0xff)
4405  {
4406  pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4407  8, dest_x, dest_y, width, height, 0xff);
4408 
4409  return;
4410  }
4411 
4412  src = (src << 24) | (src << 16) | (src << 8) | src;
4413  xmm_src = _mm_set_epi32 (src, src, src, src);
4414 
4415  while (height--)
4416  {
4417  dst = dst_line;
4418  dst_line += dst_stride;
4419  w = width;
4420 
4421  while (w && ((uintptr_t)dst & 15))
4422  {
4423  *dst = (uint8_t)_mm_cvtsi128_si32 (
4424  _mm_adds_epu8 (
4425  xmm_src,
4426  _mm_cvtsi32_si128 (*dst)));
4427 
4428  w--;
4429  dst++;
4430  }
4431 
4432  while (w >= 16)
4433  {
4434  save_128_aligned (
4435  (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4436 
4437  dst += 16;
4438  w -= 16;
4439  }
4440 
4441  while (w)
4442  {
4443  *dst = (uint8_t)_mm_cvtsi128_si32 (
4444  _mm_adds_epu8 (
4445  xmm_src,
4446  _mm_cvtsi32_si128 (*dst)));
4447 
4448  w--;
4449  dst++;
4450  }
4451  }
4452 
4453 }
4454 
4455 static void
4456 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4457  pixman_composite_info_t *info)
4458 {
4459  PIXMAN_COMPOSITE_ARGS (info);
4460  uint8_t *dst_line, *dst;
4461  uint8_t *src_line, *src;
4462  int dst_stride, src_stride;
4463  int32_t w;
4464  uint16_t t;
4465 
4466  PIXMAN_IMAGE_GET_LINE (
4467  src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4468  PIXMAN_IMAGE_GET_LINE (
4469  dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4470 
4471  while (height--)
4472  {
4473  dst = dst_line;
4474  src = src_line;
4475 
4476  dst_line += dst_stride;
4477  src_line += src_stride;
4478  w = width;
4479 
4480  /* Small head */
4481  while (w && (uintptr_t)dst & 3)
4482  {
4483  t = (*dst) + (*src++);
4484  *dst++ = t | (0 - (t >> 8));
4485  w--;
4486  }
4487 
4488  sse2_combine_add_u (imp, op,
4489  (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4490 
4491  /* Small tail */
4492  dst += w & 0xfffc;
4493  src += w & 0xfffc;
4494 
4495  w &= 3;
4496 
4497  while (w)
4498  {
4499  t = (*dst) + (*src++);
4500  *dst++ = t | (0 - (t >> 8));
4501  w--;
4502  }
4503  }
4504 
4505 }
4506 
4507 static void
4508 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4509  pixman_composite_info_t *info)
4510 {
4511  PIXMAN_COMPOSITE_ARGS (info);
4512  uint32_t *dst_line, *dst;
4513  uint32_t *src_line, *src;
4514  int dst_stride, src_stride;
4515 
4516  PIXMAN_IMAGE_GET_LINE (
4517  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4518  PIXMAN_IMAGE_GET_LINE (
4519  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4520 
4521  while (height--)
4522  {
4523  dst = dst_line;
4524  dst_line += dst_stride;
4525  src = src_line;
4526  src_line += src_stride;
4527 
4528  sse2_combine_add_u (imp, op, dst, src, NULL, width);
4529  }
4530 }
4531 
4532 static void
4533 sse2_composite_add_n_8888 (pixman_implementation_t *imp,
4534  pixman_composite_info_t *info)
4535 {
4536  PIXMAN_COMPOSITE_ARGS (info);
4537  uint32_t *dst_line, *dst, src;
4538  int dst_stride;
4539 
4540  __m128i xmm_src;
4541 
4542  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4543 
4544  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4545  if (src == 0)
4546  return;
4547 
4548  if (src == ~0)
4549  {
4550  pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
4551  dest_x, dest_y, width, height, ~0);
4552 
4553  return;
4554  }
4555 
4556  xmm_src = _mm_set_epi32 (src, src, src, src);
4557  while (height--)
4558  {
4559  int w = width;
4560  uint32_t d;
4561 
4562  dst = dst_line;
4563  dst_line += dst_stride;
4564 
4565  while (w && (uintptr_t)dst & 15)
4566  {
4567  d = *dst;
4568  *dst++ =
4569  _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
4570  w--;
4571  }
4572 
4573  while (w >= 4)
4574  {
4575  save_128_aligned
4576  ((__m128i*)dst,
4577  _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4578 
4579  dst += 4;
4580  w -= 4;
4581  }
4582 
4583  while (w--)
4584  {
4585  d = *dst;
4586  *dst++ =
4587  _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
4588  _mm_cvtsi32_si128 (d)));
4589  }
4590  }
4591 }
4592 
4593 static void
4594 sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
4595  pixman_composite_info_t *info)
4596 {
4597  PIXMAN_COMPOSITE_ARGS (info);
4598  uint32_t *dst_line, *dst;
4599  uint8_t *mask_line, *mask;
4600  int dst_stride, mask_stride;
4601  int32_t w;
4602  uint32_t src;
4603 
4604  __m128i xmm_src;
4605 
4606  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4607  if (src == 0)
4608  return;
4609  xmm_src = expand_pixel_32_1x128 (src);
4610 
4611  PIXMAN_IMAGE_GET_LINE (
4612  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4613  PIXMAN_IMAGE_GET_LINE (
4614  mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4615 
4616  while (height--)
4617  {
4618  dst = dst_line;
4619  dst_line += dst_stride;
4620  mask = mask_line;
4621  mask_line += mask_stride;
4622  w = width;
4623 
4624  while (w && ((uintptr_t)dst & 15))
4625  {
4626  uint8_t m = *mask++;
4627  if (m)
4628  {
4629  *dst = pack_1x128_32
4630  (_mm_adds_epu16
4631  (pix_multiply_1x128 (expand_pixel_8_1x128 (m), xmm_src),
4632  unpack_32_1x128 (*dst)));
4633  }
4634  dst++;
4635  w--;
4636  }
4637 
4638  while (w >= 4)
4639  {
4640  uint32_t m;
4641  memcpy(&m, mask, sizeof(uint32_t));
4642 
4643  if (m)
4644  {
4645  __m128i xmm_mask_lo, xmm_mask_hi;
4646  __m128i xmm_dst_lo, xmm_dst_hi;
4647 
4648  __m128i xmm_dst = load_128_aligned ((__m128i*)dst);
4649  __m128i xmm_mask =
4650  _mm_unpacklo_epi8 (unpack_32_1x128(m),
4651  _mm_setzero_si128 ());
4652 
4653  unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4654  unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4655 
4656  expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4657  &xmm_mask_lo, &xmm_mask_hi);
4658 
4659  pix_multiply_2x128 (&xmm_src, &xmm_src,
4660  &xmm_mask_lo, &xmm_mask_hi,
4661  &xmm_mask_lo, &xmm_mask_hi);
4662 
4663  xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4664  xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4665 
4666  save_128_aligned (
4667  (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4668  }
4669 
4670  w -= 4;
4671  dst += 4;
4672  mask += 4;
4673  }
4674 
4675  while (w)
4676  {
4677  uint8_t m = *mask++;
4678  if (m)
4679  {
4680  *dst = pack_1x128_32
4681  (_mm_adds_epu16
4682  (pix_multiply_1x128 (expand_pixel_8_1x128 (m), xmm_src),
4683  unpack_32_1x128 (*dst)));
4684  }
4685  dst++;
4686  w--;
4687  }
4688  }
4689 }
4690 
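 /* sse2_blt(): plain rectangle copy for equal source and destination bpp
  * (16 or 32).  Rows are copied with unaligned loads and aligned stores,
  * 64 bytes at a time, with small scalar head and tail fix-ups. */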
4691 static pixman_bool_t
4692 sse2_blt (pixman_implementation_t *imp,
4693  uint32_t * src_bits,
4694  uint32_t * dst_bits,
4695  int src_stride,
4696  int dst_stride,
4697  int src_bpp,
4698  int dst_bpp,
4699  int src_x,
4700  int src_y,
4701  int dest_x,
4702  int dest_y,
4703  int width,
4704  int height)
4705 {
4706  uint8_t * src_bytes;
4707  uint8_t * dst_bytes;
4708  int byte_width;
4709 
4710  if (src_bpp != dst_bpp)
4711  return FALSE;
4712 
4713  if (src_bpp == 16)
4714  {
4715  src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4716  dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4717  src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4718  dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4719  byte_width = 2 * width;
4720  src_stride *= 2;
4721  dst_stride *= 2;
4722  }
4723  else if (src_bpp == 32)
4724  {
4725  src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4726  dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4727  src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4728  dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4729  byte_width = 4 * width;
4730  src_stride *= 4;
4731  dst_stride *= 4;
4732  }
4733  else
4734  {
4735  return FALSE;
4736  }
4737 
4738  while (height--)
4739  {
4740  int w;
4741  uint8_t *s = src_bytes;
4742  uint8_t *d = dst_bytes;
4743  src_bytes += src_stride;
4744  dst_bytes += dst_stride;
4745  w = byte_width;
4746 
4747  while (w >= 2 && ((uintptr_t)d & 3))
4748  {
4749  memmove(d, s, 2);
4750  w -= 2;
4751  s += 2;
4752  d += 2;
4753  }
4754 
4755  while (w >= 4 && ((uintptr_t)d & 15))
4756  {
4757  memmove(d, s, 4);
4758 
4759  w -= 4;
4760  s += 4;
4761  d += 4;
4762  }
4763 
4764  while (w >= 64)
4765  {
4766  __m128i xmm0, xmm1, xmm2, xmm3;
4767 
4768  xmm0 = load_128_unaligned ((__m128i*)(s));
4769  xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4770  xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4771  xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4772 
4773  save_128_aligned ((__m128i*)(d), xmm0);
4774  save_128_aligned ((__m128i*)(d + 16), xmm1);
4775  save_128_aligned ((__m128i*)(d + 32), xmm2);
4776  save_128_aligned ((__m128i*)(d + 48), xmm3);
4777 
4778  s += 64;
4779  d += 64;
4780  w -= 64;
4781  }
4782 
4783  while (w >= 16)
4784  {
4785  save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4786 
4787  w -= 16;
4788  d += 16;
4789  s += 16;
4790  }
4791 
4792  while (w >= 4)
4793  {
4794  memmove(d, s, 4);
4795 
4796  w -= 4;
4797  s += 4;
4798  d += 4;
4799  }
4800 
4801  if (w >= 2)
4802  {
4803  memmove(d, s, 2);
4804  w -= 2;
4805  s += 2;
4806  d += 2;
4807  }
4808  }
4809 
4810  return TRUE;
4811 }
4812 
4813 static void
4814 sse2_composite_copy_area (pixman_implementation_t *imp,
4815  pixman_composite_info_t *info)
4816 {
4817  PIXMAN_COMPOSITE_ARGS (info);
4818  sse2_blt (imp, src_image->bits.bits,
4819  dest_image->bits.bits,
4820  src_image->bits.rowstride,
4821  dest_image->bits.rowstride,
4822  PIXMAN_FORMAT_BPP (src_image->bits.format),
4823  PIXMAN_FORMAT_BPP (dest_image->bits.format),
4824  src_x, src_y, dest_x, dest_y, width, height);
4825 }
4826 
4827 static void
4828 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4829  pixman_composite_info_t *info)
4830 {
4831  PIXMAN_COMPOSITE_ARGS (info);
4832  uint32_t *src, *src_line, s;
4833  uint32_t *dst, *dst_line, d;
4834  uint8_t *mask, *mask_line;
4835  uint32_t m;
4836  int src_stride, mask_stride, dst_stride;
4837  int32_t w;
4838  __m128i ms;
4839 
4840  __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4841  __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4842  __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4843 
4844  PIXMAN_IMAGE_GET_LINE (
4845  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4846  PIXMAN_IMAGE_GET_LINE (
4847  mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4848  PIXMAN_IMAGE_GET_LINE (
4849  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4850 
4851  while (height--)
4852  {
4853  src = src_line;
4854  src_line += src_stride;
4855  dst = dst_line;
4856  dst_line += dst_stride;
4857  mask = mask_line;
4858  mask_line += mask_stride;
4859 
4860  w = width;
4861 
4862  while (w && (uintptr_t)dst & 15)
4863  {
4864  s = 0xff000000 | *src++;
4865  memcpy(&m, mask++, sizeof(uint32_t));
4866  d = *dst;
4867  ms = unpack_32_1x128 (s);
4868 
4869  if (m != 0xff)
4870  {
4871  __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4872  __m128i md = unpack_32_1x128 (d);
4873 
4874  ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4875  }
4876 
4877  *dst++ = pack_1x128_32 (ms);
4878  w--;
4879  }
4880 
4881  while (w >= 4)
4882  {
4883  memcpy(&m, mask, sizeof(uint32_t));
4884  xmm_src = _mm_or_si128 (
4885  load_128_unaligned ((__m128i*)src), mask_ff000000);
4886 
4887  if (m == 0xffffffff)
4888  {
4889  save_128_aligned ((__m128i*)dst, xmm_src);
4890  }
4891  else
4892  {
4893  xmm_dst = load_128_aligned ((__m128i*)dst);
4894 
4895  xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4896 
4897  unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4898  unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4899  unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4900 
4901  expand_alpha_rev_2x128 (
4902  xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4903 
4904  in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
4905  &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
4906  &xmm_dst_lo, &xmm_dst_hi);
4907 
4908  save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4909  }
4910 
4911  src += 4;
4912  dst += 4;
4913  mask += 4;
4914  w -= 4;
4915  }
4916 
4917  while (w)
4918  {
4919  memcpy(&m, mask++, sizeof(uint32_t));
4920 
4921  if (m)
4922  {
4923  s = 0xff000000 | *src;
4924 
4925  if (m == 0xff)
4926  {
4927  *dst = s;
4928  }
4929  else
4930  {
4931  __m128i ma, md, ms;
4932 
4933  d = *dst;
4934 
4935  ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4936  md = unpack_32_1x128 (d);
4937  ms = unpack_32_1x128 (s);
4938 
4939  *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4940  }
4941 
4942  }
4943 
4944  src++;
4945  dst++;
4946  w--;
4947  }
4948  }
4949 
4950 }
4951 
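 /* OVER an a8r8g8b8 source onto an a8r8g8b8 destination through an a8 mask.
  * Four-pixel groups take a fast path when the mask word is zero (skip) or
  * when the mask is 0xffffffff and the source is opaque (straight copy). */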
4952 static void
4953 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4954  pixman_composite_info_t *info)
4955 {
4956  PIXMAN_COMPOSITE_ARGS (info);
4957  uint32_t *src, *src_line, s;
4958  uint32_t *dst, *dst_line, d;
4959  uint8_t *mask, *mask_line;
4960  uint32_t m;
4961  int src_stride, mask_stride, dst_stride;
4962  int32_t w;
4963 
4964  __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4965  __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4966  __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4967 
4968  PIXMAN_IMAGE_GET_LINE (
4969  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4970  PIXMAN_IMAGE_GET_LINE (
4971  mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4972  PIXMAN_IMAGE_GET_LINE (
4973  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4974 
4975  while (height--)
4976  {
4977  src = src_line;
4978  src_line += src_stride;
4979  dst = dst_line;
4980  dst_line += dst_stride;
4981  mask = mask_line;
4982  mask_line += mask_stride;
4983 
4984  w = width;
4985 
4986  while (w && (uintptr_t)dst & 15)
4987  {
4988  uint32_t sa;
4989 
4990  s = *src++;
4991  m = (uint32_t) *mask++;
4992  d = *dst;
4993 
4994  sa = s >> 24;
4995 
4996  if (m)
4997  {
4998  if (sa == 0xff && m == 0xff)
4999  {
5000  *dst = s;
5001  }
5002  else
5003  {
5004  __m128i ms, md, ma, msa;
5005 
5006  ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5007  ms = unpack_32_1x128 (s);
5008  md = unpack_32_1x128 (d);
5009 
5010  msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5011 
5012  *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5013  }
5014  }
5015 
5016  dst++;
5017  w--;
5018  }
5019 
5020  while (w >= 4)
5021  {
5022  memcpy(&m, mask, sizeof(uint32_t));
5023 
5024  if (m)
5025  {
5026  xmm_src = load_128_unaligned ((__m128i*)src);
5027 
5028  if (m == 0xffffffff && is_opaque (xmm_src))
5029  {
5030  save_128_aligned ((__m128i *)dst, xmm_src);
5031  }
5032  else
5033  {
5034  xmm_dst = load_128_aligned ((__m128i *)dst);
5035 
5036  xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5037 
5038  unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5039  unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5040  unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5041 
5042  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5043  expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5044 
5045  in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5046  &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5047 
5048  save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5049  }
5050  }
5051 
5052  src += 4;
5053  dst += 4;
5054  mask += 4;
5055  w -= 4;
5056  }
5057 
5058  while (w)
5059  {
5060  uint32_t sa;
5061 
5062  s = *src++;
5063  m = (uint32_t) *mask++;
5064  d = *dst;
5065 
5066  sa = s >> 24;
5067 
5068  if (m)
5069  {
5070  if (sa == 0xff && m == 0xff)
5071  {
5072  *dst = s;
5073  }
5074  else
5075  {
5076  __m128i ms, md, ma, msa;
5077 
5078  ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5079  ms = unpack_32_1x128 (s);
5080  md = unpack_32_1x128 (d);
5081 
5082  msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5083 
5084  *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5085  }
5086  }
5087 
5088  dst++;
5089  w--;
5090  }
5091  }
5092 
5093 }
5094 
5095 static void
5096 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5097  pixman_composite_info_t *info)
5098 {
5099  PIXMAN_COMPOSITE_ARGS (info);
5100  uint32_t src;
5101  uint32_t *dst_line, *dst;
5102  __m128i xmm_src;
5103  __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5104  __m128i xmm_dsta_hi, xmm_dsta_lo;
5105  int dst_stride;
5106  int32_t w;
5107 
5108  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
5109 
5110  if (src == 0)
5111  return;
5112 
5113  PIXMAN_IMAGE_GET_LINE (
5114  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5115 
5116  xmm_src = expand_pixel_32_1x128 (src);
5117 
5118  while (height--)
5119  {
5120  dst = dst_line;
5121 
5122  dst_line += dst_stride;
5123  w = width;
5124 
5125  while (w && (uintptr_t)dst & 15)
5126  {
5127  __m128i vd;
5128 
5129  vd = unpack_32_1x128 (*dst);
5130 
5131  *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5132  xmm_src));
5133  w--;
5134  dst++;
5135  }
5136 
5137  while (w >= 4)
5138  {
5139  __m128i tmp_lo, tmp_hi;
5140 
5141  xmm_dst = load_128_aligned ((__m128i*)dst);
5142 
5143  unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5144  expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5145 
5146  tmp_lo = xmm_src;
5147  tmp_hi = xmm_src;
5148 
5149  over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5150  &xmm_dsta_lo, &xmm_dsta_hi,
5151  &tmp_lo, &tmp_hi);
5152 
5153  save_128_aligned (
5154  (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5155 
5156  w -= 4;
5157  dst += 4;
5158  }
5159 
5160  while (w)
5161  {
5162  __m128i vd;
5163 
5164  vd = unpack_32_1x128 (*dst);
5165 
5166  *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5167  xmm_src));
5168  w--;
5169  dst++;
5170  }
5171 
5172  }
5173 
5174 }
5175 
5176 static void
5177 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5178  pixman_composite_info_t *info)
5179 {
5180  PIXMAN_COMPOSITE_ARGS (info);
5181  uint32_t *src, *src_line, s;
5182  uint32_t *dst, *dst_line, d;
5183  uint32_t *mask, *mask_line;
5184  uint32_t m;
5185  int src_stride, mask_stride, dst_stride;
5186  int32_t w;
5187 
5188  __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5189  __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5190  __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5191 
5192  PIXMAN_IMAGE_GET_LINE (
5193  dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5194  PIXMAN_IMAGE_GET_LINE (
5195  mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5196  PIXMAN_IMAGE_GET_LINE (
5197  src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5198 
5199  while (height--)
5200  {
5201  src = src_line;
5202  src_line += src_stride;
5203  dst = dst_line;
5204  dst_line += dst_stride;
5205  mask = mask_line;
5206  mask_line += mask_stride;
5207 
5208  w = width;
5209 
5210  while (w && (uintptr_t)dst & 15)
5211  {
5212  uint32_t sa;
5213 
5214  s = *src++;
5215  m = (*mask++) >> 24;
5216  d = *dst;
5217 
5218  sa = s >> 24;
5219 
5220  if (m)
5221  {
5222  if (sa == 0xff && m == 0xff)
5223  {
5224  *dst = s;
5225  }
5226  else
5227  {
5228  __m128i ms, md, ma, msa;
5229 
5230  ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5231  ms = unpack_32_1x128 (s);
5232  md = unpack_32_1x128 (d);
5233 
5234  msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5235 
5236  *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5237  }
5238  }
5239 
5240  dst++;
5241  w--;
5242  }
5243 
5244  while (w >= 4)
5245  {
5246  xmm_mask = load_128_unaligned ((__m128i*)mask);
5247 
5248  if (!is_transparent (xmm_mask))
5249  {
5250  xmm_src = load_128_unaligned ((__m128i*)src);
5251 
5252  if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5253  {
5254  save_128_aligned ((__m128i *)dst, xmm_src);
5255  }
5256  else
5257  {
5258  xmm_dst = load_128_aligned ((__m128i *)dst);
5259 
5260  unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5261  unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5262  unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5263 
5264  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5265  expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5266 
5267  in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5268  &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5269 
5270  save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5271  }
5272  }
5273 
5274  src += 4;
5275  dst += 4;
5276  mask += 4;
5277  w -= 4;
5278  }
5279 
5280  while (w)
5281  {
5282  uint32_t sa;
5283 
5284  s = *src++;
5285  m = (*mask++) >> 24;
5286  d = *dst;
5287 
5288  sa = s >> 24;
5289 
5290  if (m)
5291  {
5292  if (sa == 0xff && m == 0xff)
5293  {
5294  *dst = s;
5295  }
5296  else
5297  {
5298  __m128i ms, md, ma, msa;
5299 
5300  ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5301  ms = unpack_32_1x128 (s);
5302  md = unpack_32_1x128 (d);
5303 
5304  msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5305 
5306  *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5307  }
5308  }
5309 
5310  dst++;
5311  w--;
5312  }
5313  }
5314 
5315 }
5316 
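/*
 * Illustrative sketch (not part of this file): how the per-4-pixel
 * shortcuts above can be tested with SSE2.  The real is_opaque / is_zero /
 * is_transparent helpers are defined earlier in this file and may differ
 * in detail; the idea is that _mm_movemask_epi8 exposes one bit per byte,
 * and bits 3/7/11/15 (mask 0x8888) correspond to the alpha bytes of four
 * packed ARGB pixels.
 */
static inline int
all_alpha_ff_sketch (__m128i x)          /* every alpha byte == 0xff? */
{
    __m128i ff = _mm_cmpeq_epi8 (x, x);  /* all-ones constant */
    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ff)) & 0x8888) == 0x8888;
}

static inline int
all_alpha_00_sketch (__m128i x)          /* every alpha byte == 0x00? */
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}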
5317 /* A variant of 'sse2_combine_over_u' with minor tweaks */
5318 static force_inline void
5319 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
5320  const uint32_t* ps,
5321  int32_t w,
5322  pixman_fixed_t vx,
5323  pixman_fixed_t unit_x,
5324  pixman_fixed_t src_width_fixed,
5325  pixman_bool_t fully_transparent_src)
5326 {
5327  uint32_t s, d;
5328  const uint32_t* pm = NULL;
5329 
5330  __m128i xmm_dst_lo, xmm_dst_hi;
5331  __m128i xmm_src_lo, xmm_src_hi;
5332  __m128i xmm_alpha_lo, xmm_alpha_hi;
5333 
5334  if (fully_transparent_src)
5335  return;
5336 
5337  /* Align dst on a 16-byte boundary */
5338  while (w && ((uintptr_t)pd & 15))
5339  {
5340  d = *pd;
5341  s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5342  vx += unit_x;
5343  while (vx >= 0)
5344  vx -= src_width_fixed;
5345 
5346  *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5347  if (pm)
5348  pm++;
5349  w--;
5350  }
5351 
5352  while (w >= 4)
5353  {
5354  __m128i tmp;
5355  uint32_t tmp1, tmp2, tmp3, tmp4;
5356 
5357  tmp1 = *(ps + pixman_fixed_to_int (vx));
5358  vx += unit_x;
5359  while (vx >= 0)
5360  vx -= src_width_fixed;
5361  tmp2 = *(ps + pixman_fixed_to_int (vx));
5362  vx += unit_x;
5363  while (vx >= 0)
5364  vx -= src_width_fixed;
5365  tmp3 = *(ps + pixman_fixed_to_int (vx));
5366  vx += unit_x;
5367  while (vx >= 0)
5368  vx -= src_width_fixed;
5369  tmp4 = *(ps + pixman_fixed_to_int (vx));
5370  vx += unit_x;
5371  while (vx >= 0)
5372  vx -= src_width_fixed;
5373 
5374  tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5375 
5376  xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5377 
5378  if (is_opaque (xmm_src_hi))
5379  {
5380  save_128_aligned ((__m128i*)pd, xmm_src_hi);
5381  }
5382  else if (!is_zero (xmm_src_hi))
5383  {
5384  xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5385 
5386  unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5387  unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5388 
5389  expand_alpha_2x128 (
5390  xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5391 
5392  over_2x128 (&xmm_src_lo, &xmm_src_hi,
5393  &xmm_alpha_lo, &xmm_alpha_hi,
5394  &xmm_dst_lo, &xmm_dst_hi);
5395 
5396  /* rebuild the 4 pixels and save */
5397  save_128_aligned ((__m128i*)pd,
5398  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5399  }
5400 
5401  w -= 4;
5402  pd += 4;
5403  if (pm)
5404  pm += 4;
5405  }
5406 
5407  while (w)
5408  {
5409  d = *pd;
5410  s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5411  vx += unit_x;
5412  while (vx >= 0)
5413  vx -= src_width_fixed;
5414 
5415  *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5416  if (pm)
5417  pm++;
5418 
5419  w--;
5420  }
5421 }
5422 
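/*
 * Illustrative sketch (not part of this file): the 16.16 fixed-point
 * source stepping used by the nearest-neighbour scanline above.
 * pixman_fixed_t is 16.16 fixed point and pixman_fixed_to_int() is
 * essentially an arithmetic shift right by 16.  nearest_columns_sketch is
 * a hypothetical helper covering only the simple in-bounds case; the
 * "while (vx >= 0) vx -= src_width_fixed" loops above instead keep vx
 * wrapped for the NORMAL repeat variant.
 */
static void
nearest_columns_sketch (int32_t        w,
                        pixman_fixed_t vx,      /* source x, 16.16 */
                        pixman_fixed_t unit_x,  /* step per dest pixel */
                        int32_t *      out)
{
    while (w--)
    {
        *out++ = pixman_fixed_to_int (vx);  /* column sampled for this pixel */
        vx += unit_x;
    }
}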
5423 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5424  scaled_nearest_scanline_sse2_8888_8888_OVER,
5425  uint32_t, uint32_t, COVER)
5426 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5427  scaled_nearest_scanline_sse2_8888_8888_OVER,
5428  uint32_t, uint32_t, NONE)
5429 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5430  scaled_nearest_scanline_sse2_8888_8888_OVER,
5431  uint32_t, uint32_t, PAD)
5432 FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
5433  scaled_nearest_scanline_sse2_8888_8888_OVER,
5434  uint32_t, uint32_t, NORMAL)
5435 
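/*
 * Illustrative sketch (not part of this file): the kind of public-API
 * composite that the fast paths generated above are meant to accelerate,
 * i.e. an a8r8g8b8 source with a scale transform, nearest filtering and
 * NORMAL repeat, composited OVER an a8r8g8b8 destination.  Only public
 * pixman.h entry points are used; error handling is omitted.
 */
static void
nearest_scaled_over_example (uint32_t *src_bits,  int src_w,  int src_h,
                             uint32_t *dest_bits, int dest_w, int dest_h)
{
    pixman_image_t *src, *dest;
    pixman_transform_t scale;

    src  = pixman_image_create_bits (PIXMAN_a8r8g8b8, src_w,  src_h,
                                     src_bits,  src_w  * 4);
    dest = pixman_image_create_bits (PIXMAN_a8r8g8b8, dest_w, dest_h,
                                     dest_bits, dest_w * 4);

    /* The transform maps destination coordinates to source coordinates. */
    pixman_transform_init_scale (&scale,
                                 pixman_int_to_fixed (2),
                                 pixman_int_to_fixed (2));
    pixman_image_set_transform (src, &scale);
    pixman_image_set_filter (src, PIXMAN_FILTER_NEAREST, NULL, 0);
    pixman_image_set_repeat (src, PIXMAN_REPEAT_NORMAL);

    pixman_image_composite32 (PIXMAN_OP_OVER, src, NULL, dest,
                              0, 0, 0, 0, 0, 0, dest_w, dest_h);

    pixman_image_unref (src);
    pixman_image_unref (dest);
}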
5436 static force_inline void
5437 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5438  uint32_t * dst,
5439  const uint32_t * src,
5440  int32_t w,
5441  pixman_fixed_t vx,
5442  pixman_fixed_t unit_x,
5443  pixman_fixed_t src_width_fixed,
5444  pixman_bool_t zero_src)
5445 {
5446  __m128i xmm_mask;
5447  __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5448  __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5449  __m128i xmm_alpha_lo, xmm_alpha_hi;
5450 
5451  if (zero_src || (*mask >> 24) == 0)
5452  return;
5453 
5454  xmm_mask = create_mask_16_128 (*mask >> 24);
5455 
5456  while (w && (uintptr_t)dst & 15)
5457  {
5458  uint32_t s = *(src + pixman_fixed_to_int (vx));
5459  vx += unit_x;
5460  while (vx >= 0)
5461  vx -= src_width_fixed;
5462 
5463  if (s)
5464  {
5465  uint32_t d = *dst;
5466 
5467  __m128i ms = unpack_32_1x128 (s);
5468  __m128i alpha = expand_alpha_1x128 (ms);
5469  __m128i dest = xmm_mask;
5470  __m128i alpha_dst = unpack_32_1x128 (d);
5471 
5472  *dst = pack_1x128_32 (
5473  in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
5474  }
5475  dst++;
5476  w--;
5477  }
5478 
5479  while (w >= 4)
5480  {
5481  uint32_t tmp1, tmp2, tmp3, tmp4;
5482 
5483  tmp1 = *(src + pixman_fixed_to_int (vx));
5484  vx += unit_x;
5485  while (vx >= 0)
5486  vx -= src_width_fixed;
5487  tmp2 = *(src + pixman_fixed_to_int (vx));
5488  vx += unit_x;
5489  while (vx >= 0)
5490  vx -= src_width_fixed;
5491  tmp3 = *(src + pixman_fixed_to_int (vx));
5492  vx += unit_x;
5493  while (vx >= 0)
5494  vx -= src_width_fixed;
5495  tmp4 = *(src + pixman_fixed_to_int (vx));
5496  vx += unit_x;
5497  while (vx >= 0)
5498  vx -= src_width_fixed;
5499 
5500  xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5501 
5502  if (!is_zero (xmm_src))
5503  {
5504  xmm_dst = load_128_aligned ((__m128i*)dst);
5505 
5506  unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5507  unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5508  expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5509  &xmm_alpha_lo, &xmm_alpha_hi);
5510 
5511  in_over_2x128 (&xmm_src_lo, &xmm_src_hi,