pixman-mmx.c
1 /*
2  * Copyright © 2004, 2005 Red Hat, Inc.
3  * Copyright © 2004 Nicholas Miell
4  * Copyright © 2005 Trolltech AS
5  *
6  * Permission to use, copy, modify, distribute, and sell this software and its
7  * documentation for any purpose is hereby granted without fee, provided that
8  * the above copyright notice appear in all copies and that both that
9  * copyright notice and this permission notice appear in supporting
10  * documentation, and that the name of Red Hat not be used in advertising or
11  * publicity pertaining to distribution of the software without specific,
12  * written prior permission. Red Hat makes no representations about the
13  * suitability of this software for any purpose. It is provided "as is"
14  * without express or implied warranty.
15  *
16  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
17  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
18  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
19  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
21  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
22  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23  * SOFTWARE.
24  *
25  * Author: Søren Sandmann (sandmann@redhat.com)
26  * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
27  * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
28  *
29  * Based on work by Owen Taylor
30  */
31 
32 #ifdef HAVE_CONFIG_H
33 #include <config.h>
34 #endif
35 
36 #if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
37 
38 #ifdef USE_LOONGSON_MMI
39 #include <loongson-mmintrin.h>
40 #else
41 #include <mmintrin.h>
42 #endif
43 #include "pixman-private.h"
44 #include "pixman-combine32.h"
45 #include "pixman-inlines.h"
46 
47 #ifdef VERBOSE
48 #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
49 #else
50 #define CHECKPOINT()
51 #endif
52 
53 #if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
54 /* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */
55 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
56 _mm_empty (void)
57 {
58 
59 }
60 #endif
61 
62 #ifdef USE_X86_MMX
63 # if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
64 # include <xmmintrin.h>
65 # else
66 /* We have to compile with -msse to use xmmintrin.h, but that causes SSE
67  * instructions to be generated that we don't want. Just duplicate the
68  * functions we want to use. */
69 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
70 _mm_movemask_pi8 (__m64 __A)
71 {
72  int ret;
73 
74  asm ("pmovmskb %1, %0\n\t"
75  : "=r" (ret)
76  : "y" (__A)
77  );
78 
79  return ret;
80 }
81 
82 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
84 {
85  asm ("pmulhuw %1, %0\n\t"
86  : "+y" (__A)
87  : "y" (__B)
88  );
89  return __A;
90 }
91 
92 # define _mm_shuffle_pi16(A, N) \
93  ({ \
94  __m64 ret; \
95  \
96  asm ("pshufw %2, %1, %0\n\t" \
97  : "=y" (ret) \
98  : "y" (A), "K" ((const int8_t)N) \
99  ); \
100  \
101  ret; \
102  })
103 # endif
104 #endif
105 
106 #ifndef _MSC_VER
107 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
108  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
109 #endif
110 
111 /* Notes about writing mmx code
112  *
113  * give memory operands as the second operand. If you give it as the
114  * first, gcc will first load it into a register, then use that
115  * register
116  *
117  * ie. use
118  *
119  * _mm_mullo_pi16 (x, mmx_constant);
120  *
121  * not
122  *
123  * _mm_mullo_pi16 (mmx_constant, x);
124  *
125  * Also try to minimize dependencies. i.e. when you need a value, try
126  * to calculate it from a value that was calculated as early as
127  * possible.
128  */
129 
130 /* --------------- MMX primitives ------------------------------------- */
131 
132 /* If __m64 is defined as a struct or union, then define M64_MEMBER to be
133  * the name of the member used to access the data.
134  * If __m64 requires using mm_cvt* intrinsics functions to convert between
135  * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
136  * If __m64 and uint64_t values can just be cast to each other directly,
137  * then define USE_M64_CASTS.
138  * If __m64 is a double datatype, then define USE_M64_DOUBLE.
139  */
140 #ifdef _MSC_VER
141 # define M64_MEMBER m64_u64
142 #elif defined(__ICC)
143 # define USE_CVT_INTRINSICS
144 #elif defined(USE_LOONGSON_MMI)
145 # define USE_M64_DOUBLE
146 #elif defined(__GNUC__)
147 # define USE_M64_CASTS
148 #elif defined(__SUNPRO_C)
149 # if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
150 /* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
151  * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
152  * is defined. If it is used, then the mm_cvt* intrinsics must be used.
153  */
154 # define USE_CVT_INTRINSICS
155 # else
156 /* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
157  * disabled, __m64 is defined as a struct containing "unsigned long long l_".
158  */
159 # define M64_MEMBER l_
160 # endif
161 #endif
162 
163 #if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
164 typedef uint64_t mmxdatafield;
165 #else
166 typedef __m64 mmxdatafield;
167 #endif
168 
169 typedef struct
170 {
171  mmxdatafield mmx_4x00ff;
172  mmxdatafield mmx_4x0080;
173  mmxdatafield mmx_565_rgb;
174  mmxdatafield mmx_565_unpack_multiplier;
175  mmxdatafield mmx_565_pack_multiplier;
176  mmxdatafield mmx_565_r;
177  mmxdatafield mmx_565_g;
178  mmxdatafield mmx_565_b;
179  mmxdatafield mmx_packed_565_rb;
180  mmxdatafield mmx_packed_565_g;
181  mmxdatafield mmx_expand_565_g;
182  mmxdatafield mmx_expand_565_b;
183  mmxdatafield mmx_expand_565_r;
184 #ifndef USE_LOONGSON_MMI
185  mmxdatafield mmx_mask_0;
186  mmxdatafield mmx_mask_1;
187  mmxdatafield mmx_mask_2;
188  mmxdatafield mmx_mask_3;
189 #endif
190  mmxdatafield mmx_full_alpha;
191  mmxdatafield mmx_4x0101;
192  mmxdatafield mmx_ff000000;
193 } mmx_data_t;
194 
195 #if defined(_MSC_VER)
196 # define MMXDATA_INIT(field, val) { val ## UI64 }
197 #elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */
198 # define MMXDATA_INIT(field, val) field = { val ## ULL }
199 #else /* mmxdatafield is an integral type */
200 # define MMXDATA_INIT(field, val) field = val ## ULL
201 #endif
202 
203 static const mmx_data_t c =
204 {
205  MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff),
206  MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080),
207  MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f),
208  MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840),
209  MMXDATA_INIT (.mmx_565_pack_multiplier, 0x2000000420000004),
210  MMXDATA_INIT (.mmx_565_r, 0x000000f800000000),
211  MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000),
212  MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8),
213  MMXDATA_INIT (.mmx_packed_565_rb, 0x00f800f800f800f8),
214  MMXDATA_INIT (.mmx_packed_565_g, 0x0000fc000000fc00),
215  MMXDATA_INIT (.mmx_expand_565_g, 0x07e007e007e007e0),
216  MMXDATA_INIT (.mmx_expand_565_b, 0x001f001f001f001f),
217  MMXDATA_INIT (.mmx_expand_565_r, 0xf800f800f800f800),
218 #ifndef USE_LOONGSON_MMI
219  MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000),
220  MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff),
221  MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff),
222  MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff),
223 #endif
224  MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000),
225  MMXDATA_INIT (.mmx_4x0101, 0x0101010101010101),
226  MMXDATA_INIT (.mmx_ff000000, 0xff000000ff000000),
227 };
228 
229 #ifdef USE_CVT_INTRINSICS
230 # define MC(x) to_m64 (c.mmx_ ## x)
231 #elif defined(USE_M64_CASTS)
232 # define MC(x) ((__m64)c.mmx_ ## x)
233 #elif defined(USE_M64_DOUBLE)
234 # define MC(x) (*(__m64 *)&c.mmx_ ## x)
235 #else
236 # define MC(x) c.mmx_ ## x
237 #endif
238 
239 static force_inline __m64
240 to_m64 (uint64_t x)
241 {
242 #ifdef USE_CVT_INTRINSICS
243  return _mm_cvtsi64_m64 (x);
244 #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */
245  __m64 res;
246 
247  res.M64_MEMBER = x;
248  return res;
249 #elif defined USE_M64_DOUBLE
250  return *(__m64 *)&x;
251 #else /* USE_M64_CASTS */
252  return (__m64)x;
253 #endif
254 }
255 
256 static force_inline uint64_t
257 to_uint64 (__m64 x)
258 {
259 #ifdef USE_CVT_INTRINSICS
260  return _mm_cvtm64_si64 (x);
261 #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */
262  uint64_t res = x.M64_MEMBER;
263  return res;
264 #elif defined USE_M64_DOUBLE
265  return *(uint64_t *)&x;
266 #else /* USE_M64_CASTS */
267  return (uint64_t)x;
268 #endif
269 }
270 
271 static force_inline __m64
272 shift (__m64 v,
273  int s)
274 {
275  if (s > 0)
276  return _mm_slli_si64 (v, s);
277  else if (s < 0)
278  return _mm_srli_si64 (v, -s);
279  else
280  return v;
281 }
282 
283 static force_inline __m64
284 negate (__m64 mask)
285 {
286  return _mm_xor_si64 (mask, MC (4x00ff));
287 }
288 
289 /* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1
290  * and maps its result to the same range.
291  *
292  * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner:
293  * Notation, Notation, Notation", the first of which is
294  *
295  * prod(a, b) = (a * b + 128) / 255.
296  *
297  * By approximating the division by 255 as 257/65536 it can be replaced by a
298  * multiply and a right shift. This is the implementation that we use in
299  * pix_multiply(), but we use _mm_mulhi_pu16() by 257 (part of SSE1 or Extended
300  * 3DNow!, and unavailable at the time of the book's publication) to perform
301  * the multiplication and right shift in a single operation.
302  *
303  * prod(a, b) = ((a * b + 128) * 257) >> 16.
304  *
305  * A third way (how pix_multiply() was implemented prior to 14208344) exists
306  * also that performs the multiplication by 257 with adds and shifts.
307  *
308  * Where temp = a * b + 128
309  *
310  * prod(a, b) = (temp + (temp >> 8)) >> 8.
311  */
312 static force_inline __m64
313 pix_multiply (__m64 a, __m64 b)
314 {
315  __m64 res;
316 
317  res = _mm_mullo_pi16 (a, b);
318  res = _mm_adds_pu16 (res, MC (4x0080));
319  res = _mm_mulhi_pu16 (res, MC (4x0101));
320 
321  return res;
322 }
323 
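/* Editorial sketch (not part of pixman): a stand-alone brute-force check that
 * the two fast forms used above agree with exact rounding of a*b/255 for all
 * 8-bit inputs.  Compile separately, e.g. gcc -O2 check_div255.c.
 */
#include <assert.h>
#include <stdio.h>

int
main (void)
{
    for (unsigned a = 0; a < 256; a++)
    {
        for (unsigned b = 0; b < 256; b++)
        {
            unsigned t      = a * b + 128;
            unsigned exact  = (a * b + 127) / 255;  /* round to nearest; no ties since 255 is odd */
            unsigned mulhi  = (t * 257) >> 16;      /* what _mm_mulhi_pu16 by 0x0101 computes */
            unsigned shifts = (t + (t >> 8)) >> 8;  /* the pre-14208344 add/shift form */

            assert (exact == mulhi && exact == shifts);
        }
    }
    printf ("all 65536 cases agree\n");
    return 0;
}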
324 static force_inline __m64
325 pix_add (__m64 a, __m64 b)
326 {
327  return _mm_adds_pu8 (a, b);
328 }
329 
330 static force_inline __m64
331 expand_alpha (__m64 pixel)
332 {
333  return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
334 }
335 
336 static force_inline __m64
337 expand_alpha_rev (__m64 pixel)
338 {
339  return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
340 }
341 
342 static force_inline __m64
343 invert_colors (__m64 pixel)
344 {
345  return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
346 }
347 
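/* Editorial sketch (not part of pixman): _MM_SHUFFLE packs four 2-bit lane
 * selectors, so _MM_SHUFFLE (3, 3, 3, 3) == 0xff and expand_alpha() above
 * broadcasts 16-bit lane 3 (the alpha lane of an expanded 00AA00RR00GG00BB
 * pixel) into all four lanes, while invert_colors() swaps the R and B lanes.
 * A stand-alone check of the selector encoding:
 */
#include <assert.h>

int
main (void)
{
    unsigned sel = (3 << 6) | (3 << 4) | (3 << 2) | 3;  /* same value as _MM_SHUFFLE (3, 3, 3, 3) */
    assert (sel == 0xff);
    return 0;
}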
348 static force_inline __m64
349 over (__m64 src,
350  __m64 srca,
351  __m64 dest)
352 {
353  return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
354 }
355 
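/* Editorial sketch (not part of pixman): per 8-bit premultiplied channel, the
 * OVER operator above computes dst' = src + dst * (255 - srca) / 255 with
 * rounding; the final _mm_adds_pu8 is a saturating add.
 */
#include <stdint.h>

static uint8_t
over_channel (uint8_t s, uint8_t sa, uint8_t d)
{
    unsigned t = d * (255u - sa) + 128;        /* pix_multiply (dest, negate (srca)) */
    unsigned v = s + ((t + (t >> 8)) >> 8);    /* rounded divide by 255, then add */
    return v > 255 ? 255 : (uint8_t) v;        /* saturate like _mm_adds_pu8 */
}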
356 static force_inline __m64
357 over_rev_non_pre (__m64 src, __m64 dest)
358 {
359  __m64 srca = expand_alpha (src);
360  __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
361 
362  return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
363 }
364 
365 static force_inline __m64
366 in (__m64 src, __m64 mask)
367 {
368  return pix_multiply (src, mask);
369 }
370 
371 #ifndef _MSC_VER
372 static force_inline __m64
373 in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
374 {
375  return over (in (src, mask), pix_multiply (srca, mask), dest);
376 }
377 
378 #else
379 
380 #define in_over(src, srca, mask, dest) \
381  over (in (src, mask), pix_multiply (srca, mask), dest)
382 
383 #endif
384 
385 /* Elemental unaligned loads */
386 
387 static force_inline __m64 ldq_u(__m64 *p)
388 {
389 #ifdef USE_X86_MMX
390  /* x86's alignment restrictions are very relaxed, but that's no excuse */
391  __m64 r;
392  memcpy(&r, p, sizeof(__m64));
393  return r;
394 #elif defined USE_ARM_IWMMXT
395  int align = (uintptr_t)p & 7;
396  __m64 *aligned_p;
397  if (align == 0)
398  return *p;
399  aligned_p = (__m64 *)((uintptr_t)p & ~7);
400  return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
401 #else
402  struct __una_u64 { __m64 x __attribute__((packed)); };
403  const struct __una_u64 *ptr = (const struct __una_u64 *) p;
404  return (__m64) ptr->x;
405 #endif
406 }
407 
408 static force_inline uint32_t ldl_u(const uint32_t *p)
409 {
410 #ifdef USE_X86_MMX
411  /* x86's alignment restrictions are very relaxed. */
412  uint32_t r;
413  memcpy(&r, p, sizeof(uint32_t));
414  return r;
415 #else
416  struct __una_u32 { uint32_t x __attribute__((packed)); };
417  const struct __una_u32 *ptr = (const struct __una_u32 *) p;
418  return ptr->x;
419 #endif
420 }
421 
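/* Editorial sketch (not part of pixman): the same memcpy idiom in isolation.
 * Copying through a local lets the compiler emit a single unaligned load
 * instead of dereferencing a misaligned pointer, which is undefined
 * behaviour in C.
 */
#include <stdint.h>
#include <string.h>

static uint32_t
load32_unaligned (const void *p)
{
    uint32_t v;
    memcpy (&v, p, sizeof v);
    return v;
}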
422 static force_inline __m64
423 load (const uint32_t *v)
424 {
425 #ifdef USE_LOONGSON_MMI
426  __m64 ret;
427  asm ("lwc1 %0, %1\n\t"
428  : "=f" (ret)
429  : "m" (*v)
430  );
431  return ret;
432 #else
433  return _mm_cvtsi32_si64 (*v);
434 #endif
435 }
436 
437 static force_inline __m64
438 load8888 (const uint32_t *v)
439 {
440 #ifdef USE_LOONGSON_MMI
441  return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
442 #else
443  return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
444 #endif
445 }
446 
447 static force_inline __m64
448 load8888u (const uint32_t *v)
449 {
450  uint32_t l = ldl_u (v);
451  return load8888 (&l);
452 }
453 
454 static force_inline __m64
455 pack8888 (__m64 lo, __m64 hi)
456 {
457  return _mm_packs_pu16 (lo, hi);
458 }
459 
460 static force_inline void
461 store (uint32_t *dest, __m64 v)
462 {
463 #ifdef USE_LOONGSON_MMI
464  asm ("swc1 %1, %0\n\t"
465  : "=m" (*dest)
466  : "f" (v)
467  : "memory"
468  );
469 #else
470  *dest = _mm_cvtsi64_si32 (v);
471 #endif
472 }
473 
474 static force_inline void
475 store8888 (uint32_t *dest, __m64 v)
476 {
477  v = pack8888 (v, _mm_setzero_si64 ());
478  store (dest, v);
479 }
480 
481 static force_inline pixman_bool_t
482 is_equal (__m64 a, __m64 b)
483 {
484 #ifdef USE_LOONGSON_MMI
485  /* __m64 is double, we can compare directly. */
486  return a == b;
487 #else
488  return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
489 #endif
490 }
491 
492 static force_inline pixman_bool_t
493 is_opaque (__m64 v)
494 {
495 #ifdef USE_LOONGSON_MMI
496  return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
497 #else
498  __m64 ffs = _mm_cmpeq_pi8 (v, v);
499  return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
500 #endif
501 }
502 
503 static force_inline pixman_bool_t
504 is_zero (__m64 v)
505 {
506  return is_equal (v, _mm_setzero_si64 ());
507 }
508 
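/* Editorial sketch (not part of pixman): in an expanded pixel the 16-bit
 * lanes run B, G, R, A from low to high, so the alpha value sits in byte 6
 * of the __m64.  That is why is_opaque() tests movemask bit 0x40: it is the
 * per-byte compare result for exactly that byte.  Scalar equivalent:
 */
#include <stdint.h>

static int
is_opaque_scalar (uint64_t expanded)   /* expanded = 0x00AA00RR00GG00BB */
{
    return ((expanded >> 48) & 0xff) == 0xff;
}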
509 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
510  *
511  * 00RR00GG00BB
512  *
513  * --- Expanding 565 in the low word ---
514  *
515  * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
516  * m = m & (01f0003f001f);
517  * m = m * (008404100840);
518  * m = m >> 8;
519  *
520  * Note the trick here - the top word is shifted by another nibble to
521  * avoid it bumping into the middle word
522  */
523 static force_inline __m64
524 expand565 (__m64 pixel, int pos)
525 {
526  __m64 p = pixel;
527  __m64 t1, t2;
528 
529  /* move pixel to low 16 bit and zero the rest */
530 #ifdef USE_LOONGSON_MMI
531  p = loongson_extract_pi16 (p, pos);
532 #else
533  p = shift (shift (p, (3 - pos) * 16), -48);
534 #endif
535 
536  t1 = shift (p, 36 - 11);
537  t2 = shift (p, 16 - 5);
538 
539  p = _mm_or_si64 (t1, p);
540  p = _mm_or_si64 (t2, p);
541  p = _mm_and_si64 (p, MC (565_rgb));
542 
543  pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
544  return _mm_srli_pi16 (pixel, 8);
545 }
546 
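/* Editorial sketch (not part of pixman): the same 565 -> 888 channel
 * expansion as plain scalar code.  Replicating each field's high bits into
 * its low bits maps 0 -> 0x00 and the field maximum -> 0xff, which is what
 * the multiply-and-shift above computes per 16-bit lane; the MMX version
 * keeps each channel in its own 16-bit lane, this sketch packs them into
 * ordinary bytes.
 */
#include <stdint.h>

static uint32_t
expand565_scalar (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5)  & 0x3f;
    uint32_t b =  p        & 0x1f;

    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return (r << 16) | (g << 8) | b;   /* 0x00RRGGBB */
}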
547 /* Expand 4 16 bit pixels in an mmx register into two mmx registers of
548  *
549  * AARRGGBBRRGGBB
550  */
551 static force_inline void
552 expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
553 {
554  __m64 t0, t1, alpha = _mm_setzero_si64 ();
555  __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
556  __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
557  __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
558  if (full_alpha)
559  alpha = _mm_cmpeq_pi32 (alpha, alpha);
560 
561  /* Replicate high bits into empty low bits. */
562  r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
563  g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
564  b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
565 
566  r = _mm_packs_pu16 (r, _mm_setzero_si64 ()); /* 00 00 00 00 R3 R2 R1 R0 */
567  g = _mm_packs_pu16 (g, _mm_setzero_si64 ()); /* 00 00 00 00 G3 G2 G1 G0 */
568  b = _mm_packs_pu16 (b, _mm_setzero_si64 ()); /* 00 00 00 00 B3 B2 B1 B0 */
569 
570  t1 = _mm_unpacklo_pi8 (r, alpha); /* A3 R3 A2 R2 A1 R1 A0 R0 */
571  t0 = _mm_unpacklo_pi8 (b, g); /* G3 B3 G2 B2 G1 B1 G0 B0 */
572 
573  *vout0 = _mm_unpacklo_pi16 (t0, t1); /* A1 R1 G1 B1 A0 R0 G0 B0 */
574  *vout1 = _mm_unpackhi_pi16 (t0, t1); /* A3 R3 G3 B3 A2 R2 G2 B2 */
575 }
576 
577 static force_inline __m64
578 expand8888 (__m64 in, int pos)
579 {
580  if (pos == 0)
581  return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
582  else
583  return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
584 }
585 
586 static force_inline __m64
587 expandx888 (__m64 in, int pos)
588 {
589  return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
590 }
591 
592 static force_inline void
593 expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
594 {
595  __m64 v0, v1;
596  expand_4xpacked565 (vin, &v0, &v1, full_alpha);
597  *vout0 = expand8888 (v0, 0);
598  *vout1 = expand8888 (v0, 1);
599  *vout2 = expand8888 (v1, 0);
600  *vout3 = expand8888 (v1, 1);
601 }
602 
603 static force_inline __m64
604 pack_565 (__m64 pixel, __m64 target, int pos)
605 {
606  __m64 p = pixel;
607  __m64 t = target;
608  __m64 r, g, b;
609 
610  r = _mm_and_si64 (p, MC (565_r));
611  g = _mm_and_si64 (p, MC (565_g));
612  b = _mm_and_si64 (p, MC (565_b));
613 
614 #ifdef USE_LOONGSON_MMI
615  r = shift (r, -(32 - 8));
616  g = shift (g, -(16 - 3));
617  b = shift (b, -(0 + 3));
618 
619  p = _mm_or_si64 (r, g);
620  p = _mm_or_si64 (p, b);
621  return loongson_insert_pi16 (t, p, pos);
622 #else
623  r = shift (r, -(32 - 8) + pos * 16);
624  g = shift (g, -(16 - 3) + pos * 16);
625  b = shift (b, -(0 + 3) + pos * 16);
626 
627  if (pos == 0)
628  t = _mm_and_si64 (t, MC (mask_0));
629  else if (pos == 1)
630  t = _mm_and_si64 (t, MC (mask_1));
631  else if (pos == 2)
632  t = _mm_and_si64 (t, MC (mask_2));
633  else if (pos == 3)
634  t = _mm_and_si64 (t, MC (mask_3));
635 
636  p = _mm_or_si64 (r, t);
637  p = _mm_or_si64 (g, p);
638 
639  return _mm_or_si64 (b, p);
640 #endif
641 }
642 
643 static force_inline __m64
644 pack_4xpacked565 (__m64 a, __m64 b)
645 {
646  __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
647  __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
648 
649  __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
650  __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
651 
652  __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
653  __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
654 
655  t0 = _mm_or_si64 (t0, g0);
656  t1 = _mm_or_si64 (t1, g1);
657 
658  t0 = shift(t0, -5);
659 #ifdef USE_ARM_IWMMXT
660  t1 = shift(t1, -5);
661  return _mm_packs_pu32 (t0, t1);
662 #else
663  t1 = shift(t1, -5 + 16);
664  return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
665 #endif
666 }
667 
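/* Editorial sketch (not part of pixman): the corresponding scalar
 * 8888 -> 565 pack, for comparison with the routine above, which packs four
 * pixels per call by using _mm_madd_pi16 to move the R and B fields into
 * place in a single multiply-add.
 */
#include <stdint.h>

static uint16_t
pack565_scalar (uint32_t argb)
{
    return (uint16_t) (((argb >> 8) & 0xf800) |   /* top 5 bits of R */
                       ((argb >> 5) & 0x07e0) |   /* top 6 bits of G */
                       ((argb >> 3) & 0x001f));   /* top 5 bits of B */
}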
668 #ifndef _MSC_VER
669 
670 static force_inline __m64
671 pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
672 {
673  return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
674 }
675 
676 static force_inline __m64
677 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
678 {
679  x = pix_multiply (x, a);
680  y = pix_multiply (y, b);
681 
682  return pix_add (x, y);
683 }
684 
685 #else
686 
687 /* MSVC only handles a "pass by register" of up to three SSE intrinsics */
688 
689 #define pack_4x565(v0, v1, v2, v3) \
690  pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))
691 
692 #define pix_add_mul(x, a, y, b) \
693  ( x = pix_multiply (x, a), \
694  y = pix_multiply (y, b), \
695  pix_add (x, y) )
696 
697 #endif
698 
699 /* --------------- MMX code patch for fbcompose.c --------------------- */
700 
701 static force_inline __m64
702 combine (const uint32_t *src, const uint32_t *mask)
703 {
704  __m64 vsrc = load8888 (src);
705 
706  if (mask)
707  {
708  __m64 m = load8888 (mask);
709 
710  m = expand_alpha (m);
711  vsrc = pix_multiply (vsrc, m);
712  }
713 
714  return vsrc;
715 }
716 
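/* Editorial sketch (not part of pixman): what combine() does to one pixel
 * when a mask is present, namely scaling every source channel by the mask's
 * alpha before the combiner proper runs.  mul_div255() is a hypothetical
 * helper that matches pix_multiply()'s rounding.
 */
#include <stdint.h>

static unsigned
mul_div255 (unsigned x, unsigned y)
{
    unsigned t = x * y + 128;
    return (t + (t >> 8)) >> 8;
}

static uint32_t
combine_scalar (uint32_t s, const uint32_t *mask)
{
    if (mask)
    {
        unsigned ma = *mask >> 24;

        s = (mul_div255 ((s >> 24) & 0xff, ma) << 24) |
            (mul_div255 ((s >> 16) & 0xff, ma) << 16) |
            (mul_div255 ((s >>  8) & 0xff, ma) <<  8) |
             mul_div255 ( s        & 0xff, ma);
    }
    return s;
}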
717 static force_inline __m64
718 core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
719 {
720  vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
721 
722  if (is_opaque (vsrc))
723  {
724  return vsrc;
725  }
726  else if (!is_zero (vsrc))
727  {
728  return over (vsrc, expand_alpha (vsrc),
729  _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
730  }
731 
732  return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
733 }
734 
735 static void
736 mmx_combine_over_u (pixman_implementation_t *imp,
737  pixman_op_t op,
738  uint32_t * dest,
739  const uint32_t * src,
740  const uint32_t * mask,
741  int width)
742 {
743  const uint32_t *end = dest + width;
744 
745  while (dest < end)
746  {
747  __m64 vsrc = combine (src, mask);
748 
749  if (is_opaque (vsrc))
750  {
751  store8888 (dest, vsrc);
752  }
753  else if (!is_zero (vsrc))
754  {
755  __m64 sa = expand_alpha (vsrc);
756  store8888 (dest, over (vsrc, sa, load8888 (dest)));
757  }
758 
759  ++dest;
760  ++src;
761  if (mask)
762  ++mask;
763  }
764  _mm_empty ();
765 }
766 
767 static void
768 mmx_combine_over_reverse_u (pixman_implementation_t *imp,
769  pixman_op_t op,
770  uint32_t * dest,
771  const uint32_t * src,
772  const uint32_t * mask,
773  int width)
774 {
775  const uint32_t *end = dest + width;
776 
777  while (dest < end)
778  {
779  __m64 d, da;
780  __m64 s = combine (src, mask);
781 
782  d = load8888 (dest);
783  da = expand_alpha (d);
784  store8888 (dest, over (d, da, s));
785 
786  ++dest;
787  ++src;
788  if (mask)
789  mask++;
790  }
791  _mm_empty ();
792 }
793 
794 static void
795 mmx_combine_in_u (pixman_implementation_t *imp,
796  pixman_op_t op,
797  uint32_t * dest,
798  const uint32_t * src,
799  const uint32_t * mask,
800  int width)
801 {
802  const uint32_t *end = dest + width;
803 
804  while (dest < end)
805  {
806  __m64 a;
807  __m64 x = combine (src, mask);
808 
809  a = load8888 (dest);
810  a = expand_alpha (a);
811  x = pix_multiply (x, a);
812 
813  store8888 (dest, x);
814 
815  ++dest;
816  ++src;
817  if (mask)
818  mask++;
819  }
820  _mm_empty ();
821 }
822 
823 static void
824 mmx_combine_in_reverse_u (pixman_implementation_t *imp,
825  pixman_op_t op,
826  uint32_t * dest,
827  const uint32_t * src,
828  const uint32_t * mask,
829  int width)
830 {
831  const uint32_t *end = dest + width;
832 
833  while (dest < end)
834  {
835  __m64 a = combine (src, mask);
836  __m64 x;
837 
838  x = load8888 (dest);
839  a = expand_alpha (a);
840  x = pix_multiply (x, a);
841  store8888 (dest, x);
842 
843  ++dest;
844  ++src;
845  if (mask)
846  mask++;
847  }
848  _mm_empty ();
849 }
850 
851 static void
852 mmx_combine_out_u (pixman_implementation_t *imp,
853  pixman_op_t op,
854  uint32_t * dest,
855  const uint32_t * src,
856  const uint32_t * mask,
857  int width)
858 {
859  const uint32_t *end = dest + width;
860 
861  while (dest < end)
862  {
863  __m64 a;
864  __m64 x = combine (src, mask);
865 
866  a = load8888 (dest);
867  a = expand_alpha (a);
868  a = negate (a);
869  x = pix_multiply (x, a);
870  store8888 (dest, x);
871 
872  ++dest;
873  ++src;
874  if (mask)
875  mask++;
876  }
877  _mm_empty ();
878 }
879 
880 static void
881 mmx_combine_out_reverse_u (pixman_implementation_t *imp,
882  pixman_op_t op,
883  uint32_t * dest,
884  const uint32_t * src,
885  const uint32_t * mask,
886  int width)
887 {
888  const uint32_t *end = dest + width;
889 
890  while (dest < end)
891  {
892  __m64 a = combine (src, mask);
893  __m64 x;
894 
895  x = load8888 (dest);
896  a = expand_alpha (a);
897  a = negate (a);
898  x = pix_multiply (x, a);
899 
900  store8888 (dest, x);
901 
902  ++dest;
903  ++src;
904  if (mask)
905  mask++;
906  }
907  _mm_empty ();
908 }
909 
910 static void
911 mmx_combine_atop_u (pixman_implementation_t *imp,
912  pixman_op_t op,
913  uint32_t * dest,
914  const uint32_t * src,
915  const uint32_t * mask,
916  int width)
917 {
918  const uint32_t *end = dest + width;
919 
920  while (dest < end)
921  {
922  __m64 da, d, sia;
923  __m64 s = combine (src, mask);
924 
925  d = load8888 (dest);
926  sia = expand_alpha (s);
927  sia = negate (sia);
928  da = expand_alpha (d);
929  s = pix_add_mul (s, da, d, sia);
930  store8888 (dest, s);
931 
932  ++dest;
933  ++src;
934  if (mask)
935  mask++;
936  }
937  _mm_empty ();
938 }
939 
940 static void
941 mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
942  pixman_op_t op,
943  uint32_t * dest,
944  const uint32_t * src,
945  const uint32_t * mask,
946  int width)
947 {
948  const uint32_t *end;
949 
950  end = dest + width;
951 
952  while (dest < end)
953  {
954  __m64 dia, d, sa;
955  __m64 s = combine (src, mask);
956 
957  d = load8888 (dest);
958  sa = expand_alpha (s);
959  dia = expand_alpha (d);
960  dia = negate (dia);
961  s = pix_add_mul (s, dia, d, sa);
962  store8888 (dest, s);
963 
964  ++dest;
965  ++src;
966  if (mask)
967  mask++;
968  }
969  _mm_empty ();
970 }
971 
972 static void
973 mmx_combine_xor_u (pixman_implementation_t *imp,
974  pixman_op_t op,
975  uint32_t * dest,
976  const uint32_t * src,
977  const uint32_t * mask,
978  int width)
979 {
980  const uint32_t *end = dest + width;
981 
982  while (dest < end)
983  {
984  __m64 dia, d, sia;
985  __m64 s = combine (src, mask);
986 
987  d = load8888 (dest);
988  sia = expand_alpha (s);
989  dia = expand_alpha (d);
990  sia = negate (sia);
991  dia = negate (dia);
992  s = pix_add_mul (s, dia, d, sia);
993  store8888 (dest, s);
994 
995  ++dest;
996  ++src;
997  if (mask)
998  mask++;
999  }
1000  _mm_empty ();
1001 }
1002 
1003 static void
1004 mmx_combine_add_u (pixman_implementation_t *imp,
1005  pixman_op_t op,
1006  uint32_t * dest,
1007  const uint32_t * src,
1008  const uint32_t * mask,
1009  int width)
1010 {
1011  const uint32_t *end = dest + width;
1012 
1013  while (dest < end)
1014  {
1015  __m64 d;
1016  __m64 s = combine (src, mask);
1017 
1018  d = load8888 (dest);
1019  s = pix_add (s, d);
1020  store8888 (dest, s);
1021 
1022  ++dest;
1023  ++src;
1024  if (mask)
1025  mask++;
1026  }
1027  _mm_empty ();
1028 }
1029 
1030 static void
1031 mmx_combine_saturate_u (pixman_implementation_t *imp,
1032  pixman_op_t op,
1033  uint32_t * dest,
1034  const uint32_t * src,
1035  const uint32_t * mask,
1036  int width)
1037 {
1038  const uint32_t *end = dest + width;
1039 
1040  while (dest < end)
1041  {
1042  uint32_t s, sa, da;
1043  uint32_t d = *dest;
1044  __m64 ms = combine (src, mask);
1045  __m64 md = load8888 (dest);
1046 
1047  store8888(&s, ms);
1048  da = ~d >> 24;
1049  sa = s >> 24;
1050 
1051  if (sa > da)
1052  {
1053  uint32_t quot = DIV_UN8 (da, sa) << 24;
1054  __m64 msa = load8888 (&quot);
1055  msa = expand_alpha (msa);
1056  ms = pix_multiply (ms, msa);
1057  }
1058 
1059  md = pix_add (md, ms);
1060  store8888 (dest, md);
1061 
1062  ++src;
1063  ++dest;
1064  if (mask)
1065  mask++;
1066  }
1067  _mm_empty ();
1068 }
1069 
1070 static void
1071 mmx_combine_src_ca (pixman_implementation_t *imp,
1072  pixman_op_t op,
1073  uint32_t * dest,
1074  const uint32_t * src,
1075  const uint32_t * mask,
1076  int width)
1077 {
1078  const uint32_t *end = src + width;
1079 
1080  while (src < end)
1081  {
1082  __m64 a = load8888 (mask);
1083  __m64 s = load8888 (src);
1084 
1085  s = pix_multiply (s, a);
1086  store8888 (dest, s);
1087 
1088  ++src;
1089  ++mask;
1090  ++dest;
1091  }
1092  _mm_empty ();
1093 }
1094 
1095 static void
1096 mmx_combine_over_ca (pixman_implementation_t *imp,
1097  pixman_op_t op,
1098  uint32_t * dest,
1099  const uint32_t * src,
1100  const uint32_t * mask,
1101  int width)
1102 {
1103  const uint32_t *end = src + width;
1104 
1105  while (src < end)
1106  {
1107  __m64 a = load8888 (mask);
1108  __m64 s = load8888 (src);
1109  __m64 d = load8888 (dest);
1110  __m64 sa = expand_alpha (s);
1111 
1112  store8888 (dest, in_over (s, sa, a, d));
1113 
1114  ++src;
1115  ++dest;
1116  ++mask;
1117  }
1118  _mm_empty ();
1119 }
1120 
1121 static void
1122 mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
1123  pixman_op_t op,
1124  uint32_t * dest,
1125  const uint32_t * src,
1126  const uint32_t * mask,
1127  int width)
1128 {
1129  const uint32_t *end = src + width;
1130 
1131  while (src < end)
1132  {
1133  __m64 a = load8888 (mask);
1134  __m64 s = load8888 (src);
1135  __m64 d = load8888 (dest);
1136  __m64 da = expand_alpha (d);
1137 
1138  store8888 (dest, over (d, da, in (s, a)));
1139 
1140  ++src;
1141  ++dest;
1142  ++mask;
1143  }
1144  _mm_empty ();
1145 }
1146 
1147 static void
1148 mmx_combine_in_ca (pixman_implementation_t *imp,
1149  pixman_op_t op,
1150  uint32_t * dest,
1151  const uint32_t * src,
1152  const uint32_t * mask,
1153  int width)
1154 {
1155  const uint32_t *end = src + width;
1156 
1157  while (src < end)
1158  {
1159  __m64 a = load8888 (mask);
1160  __m64 s = load8888 (src);
1161  __m64 d = load8888 (dest);
1162  __m64 da = expand_alpha (d);
1163 
1164  s = pix_multiply (s, a);
1165  s = pix_multiply (s, da);
1166  store8888 (dest, s);
1167 
1168  ++src;
1169  ++dest;
1170  ++mask;
1171  }
1172  _mm_empty ();
1173 }
1174 
1175 static void
1176 mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
1177  pixman_op_t op,
1178  uint32_t * dest,
1179  const uint32_t * src,
1180  const uint32_t * mask,
1181  int width)
1182 {
1183  const uint32_t *end = src + width;
1184 
1185  while (src < end)
1186  {
1187  __m64 a = load8888 (mask);
1188  __m64 s = load8888 (src);
1189  __m64 d = load8888 (dest);
1190  __m64 sa = expand_alpha (s);
1191 
1192  a = pix_multiply (a, sa);
1193  d = pix_multiply (d, a);
1194  store8888 (dest, d);
1195 
1196  ++src;
1197  ++dest;
1198  ++mask;
1199  }
1200  _mm_empty ();
1201 }
1202 
1203 static void
1204 mmx_combine_out_ca (pixman_implementation_t *imp,
1205  pixman_op_t op,
1206  uint32_t * dest,
1207  const uint32_t * src,
1208  const uint32_t * mask,
1209  int width)
1210 {
1211  const uint32_t *end = src + width;
1212 
1213  while (src < end)
1214  {
1215  __m64 a = load8888 (mask);
1216  __m64 s = load8888 (src);
1217  __m64 d = load8888 (dest);
1218  __m64 da = expand_alpha (d);
1219 
1220  da = negate (da);
1221  s = pix_multiply (s, a);
1222  s = pix_multiply (s, da);
1223  store8888 (dest, s);
1224 
1225  ++src;
1226  ++dest;
1227  ++mask;
1228  }
1229  _mm_empty ();
1230 }
1231 
1232 static void
1233 mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
1234  pixman_op_t op,
1235  uint32_t * dest,
1236  const uint32_t * src,
1237  const uint32_t * mask,
1238  int width)
1239 {
1240  const uint32_t *end = src + width;
1241 
1242  while (src < end)
1243  {
1244  __m64 a = load8888 (mask);
1245  __m64 s = load8888 (src);
1246  __m64 d = load8888 (dest);
1247  __m64 sa = expand_alpha (s);
1248 
1249  a = pix_multiply (a, sa);
1250  a = negate (a);
1251  d = pix_multiply (d, a);
1252  store8888 (dest, d);
1253 
1254  ++src;
1255  ++dest;
1256  ++mask;
1257  }
1258  _mm_empty ();
1259 }
1260 
1261 static void
1262 mmx_combine_atop_ca (pixman_implementation_t *imp,
1263  pixman_op_t op,
1264  uint32_t * dest,
1265  const uint32_t * src,
1266  const uint32_t * mask,
1267  int width)
1268 {
1269  const uint32_t *end = src + width;
1270 
1271  while (src < end)
1272  {
1273  __m64 a = load8888 (mask);
1274  __m64 s = load8888 (src);
1275  __m64 d = load8888 (dest);
1276  __m64 da = expand_alpha (d);
1277  __m64 sa = expand_alpha (s);
1278 
1279  s = pix_multiply (s, a);
1280  a = pix_multiply (a, sa);
1281  a = negate (a);
1282  d = pix_add_mul (d, a, s, da);
1283  store8888 (dest, d);
1284 
1285  ++src;
1286  ++dest;
1287  ++mask;
1288  }
1289  _mm_empty ();
1290 }
1291 
1292 static void
1293 mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
1294  pixman_op_t op,
1295  uint32_t * dest,
1296  const uint32_t * src,
1297  const uint32_t * mask,
1298  int width)
1299 {
1300  const uint32_t *end = src + width;
1301 
1302  while (src < end)
1303  {
1304  __m64 a = load8888 (mask);
1305  __m64 s = load8888 (src);
1306  __m64 d = load8888 (dest);
1307  __m64 da = expand_alpha (d);
1308  __m64 sa = expand_alpha (s);
1309 
1310  s = pix_multiply (s, a);
1311  a = pix_multiply (a, sa);
1312  da = negate (da);
1313  d = pix_add_mul (d, a, s, da);
1314  store8888 (dest, d);
1315 
1316  ++src;
1317  ++dest;
1318  ++mask;
1319  }
1320  _mm_empty ();
1321 }
1322 
1323 static void
1324 mmx_combine_xor_ca (pixman_implementation_t *imp,
1325  pixman_op_t op,
1326  uint32_t * dest,
1327  const uint32_t * src,
1328  const uint32_t * mask,
1329  int width)
1330 {
1331  const uint32_t *end = src + width;
1332 
1333  while (src < end)
1334  {
1335  __m64 a = load8888 (mask);
1336  __m64 s = load8888 (src);
1337  __m64 d = load8888 (dest);
1338  __m64 da = expand_alpha (d);
1339  __m64 sa = expand_alpha (s);
1340 
1341  s = pix_multiply (s, a);
1342  a = pix_multiply (a, sa);
1343  da = negate (da);
1344  a = negate (a);
1345  d = pix_add_mul (d, a, s, da);
1346  store8888 (dest, d);
1347 
1348  ++src;
1349  ++dest;
1350  ++mask;
1351  }
1352  _mm_empty ();
1353 }
1354 
1355 static void
1356 mmx_combine_add_ca (pixman_implementation_t *imp,
1357  pixman_op_t op,
1358  uint32_t * dest,
1359  const uint32_t * src,
1360  const uint32_t * mask,
1361  int width)
1362 {
1363  const uint32_t *end = src + width;
1364 
1365  while (src < end)
1366  {
1367  __m64 a = load8888 (mask);
1368  __m64 s = load8888 (src);
1369  __m64 d = load8888 (dest);
1370 
1371  s = pix_multiply (s, a);
1372  d = pix_add (s, d);
1373  store8888 (dest, d);
1374 
1375  ++src;
1376  ++dest;
1377  ++mask;
1378  }
1379  _mm_empty ();
1380 }
1381 
1382 /* ------------- MMX code paths called from fbpict.c -------------------- */
1383 
1384 static void
1385 mmx_composite_over_n_8888 (pixman_implementation_t *imp,
1386  pixman_composite_info_t *info)
1387 {
1388  PIXMAN_COMPOSITE_ARGS (info);
1389  uint32_t src;
1390  uint32_t *dst_line, *dst;
1391  int32_t w;
1392  int dst_stride;
1393  __m64 vsrc, vsrca;
1394 
1395  CHECKPOINT ();
1396 
1397  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1398 
1399  if (src == 0)
1400  return;
1401 
1402  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1403 
1404  vsrc = load8888 (&src);
1405  vsrca = expand_alpha (vsrc);
1406 
1407  while (height--)
1408  {
1409  dst = dst_line;
1410  dst_line += dst_stride;
1411  w = width;
1412 
1413  CHECKPOINT ();
1414 
1415  while (w && (uintptr_t)dst & 7)
1416  {
1417  store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1418 
1419  w--;
1420  dst++;
1421  }
1422 
1423  while (w >= 2)
1424  {
1425  __m64 vdest;
1426  __m64 dest0, dest1;
1427 
1428  vdest = *(__m64 *)dst;
1429 
1430  dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
1431  dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
1432 
1433  *(__m64 *)dst = pack8888 (dest0, dest1);
1434 
1435  dst += 2;
1436  w -= 2;
1437  }
1438 
1439  CHECKPOINT ();
1440 
1441  if (w)
1442  {
1443  store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1444  }
1445  }
1446 
1447  _mm_empty ();
1448 }
1449 
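/* Editorial note (not part of pixman): every composite routine in this file
 * follows the same span shape: peel single pixels until dst reaches 8-byte
 * alignment, run a wide loop that stores whole __m64 (or larger) blocks,
 * then finish the remainder one pixel at a time.  Skeleton of that pattern:
 */
#include <stdint.h>

static void
span_shape (uint32_t *dst, int w)
{
    while (w && ((uintptr_t) dst & 7))   /* head: reach 8-byte alignment */
    {
        /* one-pixel operation on *dst */
        dst++;
        w--;
    }

    while (w >= 2)                       /* body: two pixels per aligned store */
    {
        /* two-pixel operation on dst[0..1] */
        dst += 2;
        w -= 2;
    }

    if (w)                               /* tail: at most one pixel left */
    {
        /* one-pixel operation on *dst */
    }
}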
1450 static void
1451 mmx_composite_over_n_0565 (pixman_implementation_t *imp,
1452  pixman_composite_info_t *info)
1453 {
1454  PIXMAN_COMPOSITE_ARGS (info);
1455  uint32_t src;
1456  uint16_t *dst_line, *dst;
1457  int32_t w;
1458  int dst_stride;
1459  __m64 vsrc, vsrca;
1460 
1461  CHECKPOINT ();
1462 
1463  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1464 
1465  if (src == 0)
1466  return;
1467 
1468  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1469 
1470  vsrc = load8888 (&src);
1471  vsrca = expand_alpha (vsrc);
1472 
1473  while (height--)
1474  {
1475  dst = dst_line;
1476  dst_line += dst_stride;
1477  w = width;
1478 
1479  CHECKPOINT ();
1480 
1481  while (w && (uintptr_t)dst & 7)
1482  {
1483  uint64_t d = *dst;
1484  __m64 vdest = expand565 (to_m64 (d), 0);
1485 
1486  vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1487  *dst = to_uint64 (vdest);
1488 
1489  w--;
1490  dst++;
1491  }
1492 
1493  while (w >= 4)
1494  {
1495  __m64 vdest = *(__m64 *)dst;
1496  __m64 v0, v1, v2, v3;
1497 
1498  expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1499 
1500  v0 = over (vsrc, vsrca, v0);
1501  v1 = over (vsrc, vsrca, v1);
1502  v2 = over (vsrc, vsrca, v2);
1503  v3 = over (vsrc, vsrca, v3);
1504 
1505  *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1506 
1507  dst += 4;
1508  w -= 4;
1509  }
1510 
1511  CHECKPOINT ();
1512 
1513  while (w)
1514  {
1515  uint64_t d = *dst;
1516  __m64 vdest = expand565 (to_m64 (d), 0);
1517 
1518  vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1519  *dst = to_uint64 (vdest);
1520 
1521  w--;
1522  dst++;
1523  }
1524  }
1525 
1526  _mm_empty ();
1527 }
1528 
1529 static void
1530 mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
1531  pixman_composite_info_t *info)
1532 {
1533  PIXMAN_COMPOSITE_ARGS (info);
1534  uint32_t src;
1535  uint32_t *dst_line;
1536  uint32_t *mask_line;
1537  int dst_stride, mask_stride;
1538  __m64 vsrc, vsrca;
1539 
1540  CHECKPOINT ();
1541 
1542  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1543 
1544  if (src == 0)
1545  return;
1546 
1547  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1548  PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
1549 
1550  vsrc = load8888 (&src);
1551  vsrca = expand_alpha (vsrc);
1552 
1553  while (height--)
1554  {
1555  int twidth = width;
1556  uint32_t *p = (uint32_t *)mask_line;
1557  uint32_t *q = (uint32_t *)dst_line;
1558 
1559  while (twidth && (uintptr_t)q & 7)
1560  {
1561  uint32_t m = *(uint32_t *)p;
1562 
1563  if (m)
1564  {
1565  __m64 vdest = load8888 (q);
1566  vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1567  store8888 (q, vdest);
1568  }
1569 
1570  twidth--;
1571  p++;
1572  q++;
1573  }
1574 
1575  while (twidth >= 2)
1576  {
1577  uint32_t m0, m1;
1578  m0 = *p;
1579  m1 = *(p + 1);
1580 
1581  if (m0 | m1)
1582  {
1583  __m64 dest0, dest1;
1584  __m64 vdest = *(__m64 *)q;
1585 
1586  dest0 = in_over (vsrc, vsrca, load8888 (&m0),
1587  expand8888 (vdest, 0));
1588  dest1 = in_over (vsrc, vsrca, load8888 (&m1),
1589  expand8888 (vdest, 1));
1590 
1591  *(__m64 *)q = pack8888 (dest0, dest1);
1592  }
1593 
1594  p += 2;
1595  q += 2;
1596  twidth -= 2;
1597  }
1598 
1599  if (twidth)
1600  {
1601  uint32_t m = *(uint32_t *)p;
1602 
1603  if (m)
1604  {
1605  __m64 vdest = load8888 (q);
1606  vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1607  store8888 (q, vdest);
1608  }
1609 
1610  twidth--;
1611  p++;
1612  q++;
1613  }
1614 
1615  dst_line += dst_stride;
1616  mask_line += mask_stride;
1617  }
1618 
1619  _mm_empty ();
1620 }
1621 
1622 static void
1623 mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1624  pixman_composite_info_t *info)
1625 {
1626  PIXMAN_COMPOSITE_ARGS (info);
1627  uint32_t *dst_line, *dst;
1628  uint32_t *src_line, *src;
1629  uint32_t mask;
1630  __m64 vmask;
1631  int dst_stride, src_stride;
1632  int32_t w;
1633 
1634  CHECKPOINT ();
1635 
1636  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1637  PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1638 
1639  mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1640  vmask = expand_alpha (load8888 (&mask));
1641 
1642  while (height--)
1643  {
1644  dst = dst_line;
1645  dst_line += dst_stride;
1646  src = src_line;
1647  src_line += src_stride;
1648  w = width;
1649 
1650  while (w && (uintptr_t)dst & 7)
1651  {
1652  __m64 s = load8888 (src);
1653  __m64 d = load8888 (dst);
1654 
1655  store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1656 
1657  w--;
1658  dst++;
1659  src++;
1660  }
1661 
1662  while (w >= 2)
1663  {
1664  __m64 vs = ldq_u ((__m64 *)src);
1665  __m64 vd = *(__m64 *)dst;
1666  __m64 vsrc0 = expand8888 (vs, 0);
1667  __m64 vsrc1 = expand8888 (vs, 1);
1668 
1669  *(__m64 *)dst = pack8888 (
1670  in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1671  in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1672 
1673  w -= 2;
1674  dst += 2;
1675  src += 2;
1676  }
1677 
1678  if (w)
1679  {
1680  __m64 s = load8888 (src);
1681  __m64 d = load8888 (dst);
1682 
1683  store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1684  }
1685  }
1686 
1687  _mm_empty ();
1688 }
1689 
1690 static void
1691 mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
1692  pixman_composite_info_t *info)
1693 {
1694  PIXMAN_COMPOSITE_ARGS (info);
1695  uint32_t *dst_line, *dst;
1696  uint32_t *src_line, *src;
1697  uint32_t mask;
1698  __m64 vmask;
1699  int dst_stride, src_stride;
1700  int32_t w;
1701  __m64 srca;
1702 
1703  CHECKPOINT ();
1704 
1705  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1706  PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1707  mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1708 
1709  vmask = expand_alpha (load8888 (&mask));
1710  srca = MC (4x00ff);
1711 
1712  while (height--)
1713  {
1714  dst = dst_line;
1715  dst_line += dst_stride;
1716  src = src_line;
1717  src_line += src_stride;
1718  w = width;
1719 
1720  while (w && (uintptr_t)dst & 7)
1721  {
1722  uint32_t ssrc = *src | 0xff000000;
1723  __m64 s = load8888 (&ssrc);
1724  __m64 d = load8888 (dst);
1725 
1726  store8888 (dst, in_over (s, srca, vmask, d));
1727 
1728  w--;
1729  dst++;
1730  src++;
1731  }
1732 
1733  while (w >= 16)
1734  {
1735  __m64 vd0 = *(__m64 *)(dst + 0);
1736  __m64 vd1 = *(__m64 *)(dst + 2);
1737  __m64 vd2 = *(__m64 *)(dst + 4);
1738  __m64 vd3 = *(__m64 *)(dst + 6);
1739  __m64 vd4 = *(__m64 *)(dst + 8);
1740  __m64 vd5 = *(__m64 *)(dst + 10);
1741  __m64 vd6 = *(__m64 *)(dst + 12);
1742  __m64 vd7 = *(__m64 *)(dst + 14);
1743 
1744  __m64 vs0 = ldq_u ((__m64 *)(src + 0));
1745  __m64 vs1 = ldq_u ((__m64 *)(src + 2));
1746  __m64 vs2 = ldq_u ((__m64 *)(src + 4));
1747  __m64 vs3 = ldq_u ((__m64 *)(src + 6));
1748  __m64 vs4 = ldq_u ((__m64 *)(src + 8));
1749  __m64 vs5 = ldq_u ((__m64 *)(src + 10));
1750  __m64 vs6 = ldq_u ((__m64 *)(src + 12));
1751  __m64 vs7 = ldq_u ((__m64 *)(src + 14));
1752 
1753  vd0 = pack8888 (
1754  in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1755  in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1756 
1757  vd1 = pack8888 (
1758  in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1759  in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1760 
1761  vd2 = pack8888 (
1762  in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1763  in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1764 
1765  vd3 = pack8888 (
1766  in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1767  in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1768 
1769  vd4 = pack8888 (
1770  in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1771  in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1772 
1773  vd5 = pack8888 (
1774  in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1775  in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1776 
1777  vd6 = pack8888 (
1778  in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1779  in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1780 
1781  vd7 = pack8888 (
1782  in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1783  in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1784 
1785  *(__m64 *)(dst + 0) = vd0;
1786  *(__m64 *)(dst + 2) = vd1;
1787  *(__m64 *)(dst + 4) = vd2;
1788  *(__m64 *)(dst + 6) = vd3;
1789  *(__m64 *)(dst + 8) = vd4;
1790  *(__m64 *)(dst + 10) = vd5;
1791  *(__m64 *)(dst + 12) = vd6;
1792  *(__m64 *)(dst + 14) = vd7;
1793 
1794  w -= 16;
1795  dst += 16;
1796  src += 16;
1797  }
1798 
1799  while (w)
1800  {
1801  uint32_t ssrc = *src | 0xff000000;
1802  __m64 s = load8888 (&ssrc);
1803  __m64 d = load8888 (dst);
1804 
1805  store8888 (dst, in_over (s, srca, vmask, d));
1806 
1807  w--;
1808  dst++;
1809  src++;
1810  }
1811  }
1812 
1813  _mm_empty ();
1814 }
1815 
1816 static void
1817 mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1818  pixman_composite_info_t *info)
1819 {
1820  PIXMAN_COMPOSITE_ARGS (info);
1821  uint32_t *dst_line, *dst;
1822  uint32_t *src_line, *src;
1823  uint32_t s;
1824  int dst_stride, src_stride;
1825  uint8_t a;
1826  int32_t w;
1827 
1828  CHECKPOINT ();
1829 
1830  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1831  PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1832 
1833  while (height--)
1834  {
1835  dst = dst_line;
1836  dst_line += dst_stride;
1837  src = src_line;
1838  src_line += src_stride;
1839  w = width;
1840 
1841  while (w--)
1842  {
1843  s = *src++;
1844  a = s >> 24;
1845 
1846  if (a == 0xff)
1847  {
1848  *dst = s;
1849  }
1850  else if (s)
1851  {
1852  __m64 ms, sa;
1853  ms = load8888 (&s);
1854  sa = expand_alpha (ms);
1855  store8888 (dst, over (ms, sa, load8888 (dst)));
1856  }
1857 
1858  dst++;
1859  }
1860  }
1861  _mm_empty ();
1862 }
1863 
1864 static void
1865 mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1866  pixman_composite_info_t *info)
1867 {
1868  PIXMAN_COMPOSITE_ARGS (info);
1869  uint16_t *dst_line, *dst;
1870  uint32_t *src_line, *src;
1871  int dst_stride, src_stride;
1872  int32_t w;
1873 
1874  CHECKPOINT ();
1875 
1876  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1877  PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1878 
1879 #if 0
1880  /* FIXME */
1881  assert (src_image->drawable == mask_image->drawable);
1882 #endif
1883 
1884  while (height--)
1885  {
1886  dst = dst_line;
1887  dst_line += dst_stride;
1888  src = src_line;
1889  src_line += src_stride;
1890  w = width;
1891 
1892  CHECKPOINT ();
1893 
1894  while (w && (uintptr_t)dst & 7)
1895  {
1896  __m64 vsrc = load8888 (src);
1897  uint64_t d = *dst;
1898  __m64 vdest = expand565 (to_m64 (d), 0);
1899 
1900  vdest = pack_565 (
1901  over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1902 
1903  *dst = to_uint64 (vdest);
1904 
1905  w--;
1906  dst++;
1907  src++;
1908  }
1909 
1910  CHECKPOINT ();
1911 
1912  while (w >= 4)
1913  {
1914  __m64 vdest = *(__m64 *)dst;
1915  __m64 v0, v1, v2, v3;
1916  __m64 vsrc0, vsrc1, vsrc2, vsrc3;
1917 
1918  expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1919 
1920  vsrc0 = load8888 ((src + 0));
1921  vsrc1 = load8888 ((src + 1));
1922  vsrc2 = load8888 ((src + 2));
1923  vsrc3 = load8888 ((src + 3));
1924 
1925  v0 = over (vsrc0, expand_alpha (vsrc0), v0);
1926  v1 = over (vsrc1, expand_alpha (vsrc1), v1);
1927  v2 = over (vsrc2, expand_alpha (vsrc2), v2);
1928  v3 = over (vsrc3, expand_alpha (vsrc3), v3);
1929 
1930  *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1931 
1932  w -= 4;
1933  dst += 4;
1934  src += 4;
1935  }
1936 
1937  CHECKPOINT ();
1938 
1939  while (w)
1940  {
1941  __m64 vsrc = load8888 (src);
1942  uint64_t d = *dst;
1943  __m64 vdest = expand565 (to_m64 (d), 0);
1944 
1945  vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1946 
1947  *dst = to_uint64 (vdest);
1948 
1949  w--;
1950  dst++;
1951  src++;
1952  }
1953  }
1954 
1955  _mm_empty ();
1956 }
1957 
1958 static void
1959 mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1960  pixman_composite_info_t *info)
1961 {
1962  PIXMAN_COMPOSITE_ARGS (info);
1963  uint32_t src, srca;
1964  uint32_t *dst_line, *dst;
1965  uint8_t *mask_line, *mask;
1966  int dst_stride, mask_stride;
1967  int32_t w;
1968  __m64 vsrc, vsrca;
1969  uint64_t srcsrc;
1970 
1971  CHECKPOINT ();
1972 
1973  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1974 
1975  srca = src >> 24;
1976  if (src == 0)
1977  return;
1978 
1979  srcsrc = (uint64_t)src << 32 | src;
1980 
1981  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1982  PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1983 
1984  vsrc = load8888 (&src);
1985  vsrca = expand_alpha (vsrc);
1986 
1987  while (height--)
1988  {
1989  dst = dst_line;
1990  dst_line += dst_stride;
1991  mask = mask_line;
1992  mask_line += mask_stride;
1993  w = width;
1994 
1995  CHECKPOINT ();
1996 
1997  while (w && (uintptr_t)dst & 7)
1998  {
1999  uint64_t m = *mask;
2000 
2001  if (m)
2002  {
2003  __m64 vdest = in_over (vsrc, vsrca,
2004  expand_alpha_rev (to_m64 (m)),
2005  load8888 (dst));
2006 
2007  store8888 (dst, vdest);
2008  }
2009 
2010  w--;
2011  mask++;
2012  dst++;
2013  }
2014 
2015  CHECKPOINT ();
2016 
2017  while (w >= 2)
2018  {
2019  uint64_t m0, m1;
2020 
2021  m0 = *mask;
2022  m1 = *(mask + 1);
2023 
2024  if (srca == 0xff && (m0 & m1) == 0xff)
2025  {
2026  *(uint64_t *)dst = srcsrc;
2027  }
2028  else if (m0 | m1)
2029  {
2030  __m64 vdest;
2031  __m64 dest0, dest1;
2032 
2033  vdest = *(__m64 *)dst;
2034 
2035  dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
2036  expand8888 (vdest, 0));
2037  dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
2038  expand8888 (vdest, 1));
2039 
2040  *(__m64 *)dst = pack8888 (dest0, dest1);
2041  }
2042 
2043  mask += 2;
2044  dst += 2;
2045  w -= 2;
2046  }
2047 
2048  CHECKPOINT ();
2049 
2050  if (w)
2051  {
2052  uint64_t m = *mask;
2053 
2054  if (m)
2055  {
2056  __m64 vdest = load8888 (dst);
2057 
2058  vdest = in_over (
2059  vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
2060  store8888 (dst, vdest);
2061  }
2062  }
2063  }
2064 
2065  _mm_empty ();
2066 }
2067 
2068 static pixman_bool_t
2069 mmx_fill (pixman_implementation_t *imp,
2070  uint32_t * bits,
2071  int stride,
2072  int bpp,
2073  int x,
2074  int y,
2075  int width,
2076  int height,
2077  uint32_t filler)
2078 {
2079  uint64_t fill;
2080  __m64 vfill;
2081  uint32_t byte_width;
2082  uint8_t *byte_line;
2083 
2084 #if defined __GNUC__ && defined USE_X86_MMX
2085  __m64 v1, v2, v3, v4, v5, v6, v7;
2086 #endif
2087 
2088  if (bpp != 16 && bpp != 32 && bpp != 8)
2089  return FALSE;
2090 
2091  if (bpp == 8)
2092  {
2093  stride = stride * (int) sizeof (uint32_t) / 1;
2094  byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
2095  byte_width = width;
2096  stride *= 1;
2097  filler = (filler & 0xff) * 0x01010101;
2098  }
2099  else if (bpp == 16)
2100  {
2101  stride = stride * (int) sizeof (uint32_t) / 2;
2102  byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
2103  byte_width = 2 * width;
2104  stride *= 2;
2105  filler = (filler & 0xffff) * 0x00010001;
2106  }
2107  else
2108  {
2109  stride = stride * (int) sizeof (uint32_t) / 4;
2110  byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
2111  byte_width = 4 * width;
2112  stride *= 4;
2113  }
2114 
2115  fill = ((uint64_t)filler << 32) | filler;
2116  vfill = to_m64 (fill);
2117 
2118 #if defined __GNUC__ && defined USE_X86_MMX
2119  __asm__ (
2120  "movq %7, %0\n"
2121  "movq %7, %1\n"
2122  "movq %7, %2\n"
2123  "movq %7, %3\n"
2124  "movq %7, %4\n"
2125  "movq %7, %5\n"
2126  "movq %7, %6\n"
2127  : "=&y" (v1), "=&y" (v2), "=&y" (v3),
2128  "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
2129  : "y" (vfill));
2130 #endif
2131 
2132  while (height--)
2133  {
2134  int w;
2135  uint8_t *d = byte_line;
2136 
2137  byte_line += stride;
2138  w = byte_width;
2139 
2140  if (w >= 1 && ((uintptr_t)d & 1))
2141  {
2142  *(uint8_t *)d = (filler & 0xff);
2143  w--;
2144  d++;
2145  }
2146 
2147  if (w >= 2 && ((uintptr_t)d & 3))
2148  {
2149  *(uint16_t *)d = filler;
2150  w -= 2;
2151  d += 2;
2152  }
2153 
2154  while (w >= 4 && ((uintptr_t)d & 7))
2155  {
2156  *(uint32_t *)d = filler;
2157 
2158  w -= 4;
2159  d += 4;
2160  }
2161 
2162  while (w >= 64)
2163  {
2164 #if defined __GNUC__ && defined USE_X86_MMX
2165  __asm__ (
2166  "movq %1, (%0)\n"
2167  "movq %2, 8(%0)\n"
2168  "movq %3, 16(%0)\n"
2169  "movq %4, 24(%0)\n"
2170  "movq %5, 32(%0)\n"
2171  "movq %6, 40(%0)\n"
2172  "movq %7, 48(%0)\n"
2173  "movq %8, 56(%0)\n"
2174  :
2175  : "r" (d),
2176  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
2177  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
2178  : "memory");
2179 #else
2180  *(__m64*) (d + 0) = vfill;
2181  *(__m64*) (d + 8) = vfill;
2182  *(__m64*) (d + 16) = vfill;
2183  *(__m64*) (d + 24) = vfill;
2184  *(__m64*) (d + 32) = vfill;
2185  *(__m64*) (d + 40) = vfill;
2186  *(__m64*) (d + 48) = vfill;
2187  *(__m64*) (d + 56) = vfill;
2188 #endif
2189  w -= 64;
2190  d += 64;
2191  }
2192 
2193  while (w >= 4)
2194  {
2195  *(uint32_t *)d = filler;
2196 
2197  w -= 4;
2198  d += 4;
2199  }
2200  if (w >= 2)
2201  {
2202  *(uint16_t *)d = filler;
2203  w -= 2;
2204  d += 2;
2205  }
2206  if (w >= 1)
2207  {
2208  *(uint8_t *)d = (filler & 0xff);
2209  w--;
2210  d++;
2211  }
2212 
2213  }
2214 
2215  _mm_empty ();
2216  return TRUE;
2217 }
2218 
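/* Editorial sketch (not part of pixman): the filler replication used above,
 * checked in isolation.  After replication, the same 32-bit (and then
 * 64-bit) store pattern works for 8, 16 and 32 bpp alike.
 */
#include <assert.h>
#include <stdint.h>

int
main (void)
{
    uint32_t f8  = (0xabu   & 0xff)   * 0x01010101u;   /* 8 bpp  */
    uint32_t f16 = (0x1234u & 0xffff) * 0x00010001u;   /* 16 bpp */

    assert (f8  == 0xabababab);
    assert (f16 == 0x12341234);
    return 0;
}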
2219 static void
2220 mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
2221  pixman_composite_info_t *info)
2222 {
2223  PIXMAN_COMPOSITE_ARGS (info);
2224  uint16_t *dst_line, *dst;
2225  uint32_t *src_line, *src, s;
2226  int dst_stride, src_stride;
2227  int32_t w;
2228 
2229  PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2230  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2231 
2232  while (height--)
2233  {
2234  dst = dst_line;
2235  dst_line += dst_stride;
2236  src = src_line;
2237  src_line += src_stride;
2238  w = width;
2239 
2240  while (w && (uintptr_t)dst & 7)
2241  {
2242  s = *src++;
2243  *dst = convert_8888_to_0565 (s);
2244  dst++;
2245  w--;
2246  }
2247 
2248  while (w >= 4)
2249  {
2250  __m64 vdest;
2251  __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
2252  __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
2253 
2254  vdest = pack_4xpacked565 (vsrc0, vsrc1);
2255 
2256  *(__m64 *)dst = vdest;
2257 
2258  w -= 4;
2259  src += 4;
2260  dst += 4;
2261  }
2262 
2263  while (w)
2264  {
2265  s = *src++;
2266  *dst = convert_8888_to_0565 (s);
2267  dst++;
2268  w--;
2269  }
2270  }
2271 
2272  _mm_empty ();
2273 }
2274 
2275 static void
2276 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
2277  pixman_composite_info_t *info)
2278 {
2279  PIXMAN_COMPOSITE_ARGS (info);
2280  uint32_t src, srca;
2281  uint32_t *dst_line, *dst;
2282  uint8_t *mask_line, *mask;
2283  int dst_stride, mask_stride;
2284  int32_t w;
2285  __m64 vsrc;
2286  uint64_t srcsrc;
2287 
2288  CHECKPOINT ();
2289 
2290  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2291 
2292  srca = src >> 24;
2293  if (src == 0)
2294  {
2295  mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
2296  PIXMAN_FORMAT_BPP (dest_image->bits.format),
2297  dest_x, dest_y, width, height, 0);
2298  return;
2299  }
2300 
2301  srcsrc = (uint64_t)src << 32 | src;
2302 
2303  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2304  PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2305 
2306  vsrc = load8888 (&src);
2307 
2308  while (height--)
2309  {
2310  dst = dst_line;
2311  dst_line += dst_stride;
2312  mask = mask_line;
2313  mask_line += mask_stride;
2314  w = width;
2315 
2316  CHECKPOINT ();
2317 
2318  while (w && (uintptr_t)dst & 7)
2319  {
2320  uint64_t m = *mask;
2321 
2322  if (m)
2323  {
2324  __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2325 
2326  store8888 (dst, vdest);
2327  }
2328  else
2329  {
2330  *dst = 0;
2331  }
2332 
2333  w--;
2334  mask++;
2335  dst++;
2336  }
2337 
2338  CHECKPOINT ();
2339 
2340  while (w >= 2)
2341  {
2342  uint64_t m0, m1;
2343  m0 = *mask;
2344  m1 = *(mask + 1);
2345 
2346  if (srca == 0xff && (m0 & m1) == 0xff)
2347  {
2348  *(uint64_t *)dst = srcsrc;
2349  }
2350  else if (m0 | m1)
2351  {
2352  __m64 dest0, dest1;
2353 
2354  dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2355  dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2356 
2357  *(__m64 *)dst = pack8888 (dest0, dest1);
2358  }
2359  else
2360  {
2361  *(uint64_t *)dst = 0;
2362  }
2363 
2364  mask += 2;
2365  dst += 2;
2366  w -= 2;
2367  }
2368 
2369  CHECKPOINT ();
2370 
2371  if (w)
2372  {
2373  uint64_t m = *mask;
2374 
2375  if (m)
2376  {
2377  __m64 vdest = load8888 (dst);
2378 
2379  vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2380  store8888 (dst, vdest);
2381  }
2382  else
2383  {
2384  *dst = 0;
2385  }
2386  }
2387  }
2388 
2389  _mm_empty ();
2390 }
2391 
2392 static void
2393 mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2394  pixman_composite_info_t *info)
2395 {
2396  PIXMAN_COMPOSITE_ARGS (info);
2397  uint32_t src, srca;
2398  uint16_t *dst_line, *dst;
2399  uint8_t *mask_line, *mask;
2400  int dst_stride, mask_stride;
2401  int32_t w;
2402  __m64 vsrc, vsrca, tmp;
2403  __m64 srcsrcsrcsrc;
2404 
2405  CHECKPOINT ();
2406 
2407  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2408 
2409  srca = src >> 24;
2410  if (src == 0)
2411  return;
2412 
2413  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2414  PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2415 
2416  vsrc = load8888 (&src);
2417  vsrca = expand_alpha (vsrc);
2418 
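 /* Pre-pack the solid colour to r5g6b5 and replicate it across the
    register so fully covered spans can be stored without blending. */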
2419  tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2420  srcsrcsrcsrc = expand_alpha_rev (tmp);
2421 
2422  while (height--)
2423  {
2424  dst = dst_line;
2425  dst_line += dst_stride;
2426  mask = mask_line;
2427  mask_line += mask_stride;
2428  w = width;
2429 
2430  CHECKPOINT ();
2431 
2432  while (w && (uintptr_t)dst & 7)
2433  {
2434  uint64_t m = *mask;
2435 
2436  if (m)
2437  {
2438  uint64_t d = *dst;
2439  __m64 vd = to_m64 (d);
2440  __m64 vdest = in_over (
2441  vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2442 
2443  vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2444  *dst = to_uint64 (vd);
2445  }
2446 
2447  w--;
2448  mask++;
2449  dst++;
2450  }
2451 
2452  CHECKPOINT ();
2453 
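 /* Four r5g6b5 pixels at a time: expand the destination to 8888, apply
    in_over() with each mask byte's alpha, then repack; an opaque source
    under a fully set mask takes the replicated solid value instead. */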
2454  while (w >= 4)
2455  {
2456  uint64_t m0, m1, m2, m3;
2457  m0 = *mask;
2458  m1 = *(mask + 1);
2459  m2 = *(mask + 2);
2460  m3 = *(mask + 3);
2461 
2462  if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2463  {
2464  *(__m64 *)dst = srcsrcsrcsrc;
2465  }
2466  else if (m0 | m1 | m2 | m3)
2467  {
2468  __m64 vdest = *(__m64 *)dst;
2469  __m64 v0, v1, v2, v3;
2470  __m64 vm0, vm1, vm2, vm3;
2471 
2472  expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2473 
2474  vm0 = to_m64 (m0);
2475  v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
2476 
2477  vm1 = to_m64 (m1);
2478  v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
2479 
2480  vm2 = to_m64 (m2);
2481  v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
2482 
2483  vm3 = to_m64 (m3);
2484  v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
2485 
2486  *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2487  }
2488 
2489  w -= 4;
2490  mask += 4;
2491  dst += 4;
2492  }
2493 
2494  CHECKPOINT ();
2495 
2496  while (w)
2497  {
2498  uint64_t m = *mask;
2499 
2500  if (m)
2501  {
2502  uint64_t d = *dst;
2503  __m64 vd = to_m64 (d);
2504  __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2505  expand565 (vd, 0));
2506  vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2507  *dst = to_uint64 (vd);
2508  }
2509 
2510  w--;
2511  mask++;
2512  dst++;
2513  }
2514  }
2515 
2516  _mm_empty ();
2517 }
2518 
2519 static void
2520 mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
2521  pixman_composite_info_t *info)
2522 {
2523  PIXMAN_COMPOSITE_ARGS (info);
2524  uint16_t *dst_line, *dst;
2525  uint32_t *src_line, *src;
2526  int dst_stride, src_stride;
2527  int32_t w;
2528 
2529  CHECKPOINT ();
2530 
2531  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2532  PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2533 
2534 #if 0
2535  /* FIXME */
2536  assert (src_image->drawable == mask_image->drawable);
2537 #endif
2538 
2539  while (height--)
2540  {
2541  dst = dst_line;
2542  dst_line += dst_stride;
2543  src = src_line;
2544  src_line += src_stride;
2545  w = width;
2546 
2547  CHECKPOINT ();
2548 
2549  while (w && (uintptr_t)dst & 7)
2550  {
2551  __m64 vsrc = load8888 (src);
2552  uint64_t d = *dst;
2553  __m64 vdest = expand565 (to_m64 (d), 0);
2554 
2555  vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2556 
2557  *dst = to_uint64 (vdest);
2558 
2559  w--;
2560  dst++;
2561  src++;
2562  }
2563 
2564  CHECKPOINT ();
2565 
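 /* The pixbuf source is not premultiplied: fully opaque pixels only need
    invert_colors() before packing to 565; anything else goes through
    over_rev_non_pre() against the expanded destination. */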
2566  while (w >= 4)
2567  {
2568  uint32_t s0, s1, s2, s3;
2569  unsigned char a0, a1, a2, a3;
2570 
2571  s0 = *src;
2572  s1 = *(src + 1);
2573  s2 = *(src + 2);
2574  s3 = *(src + 3);
2575 
2576  a0 = (s0 >> 24);
2577  a1 = (s1 >> 24);
2578  a2 = (s2 >> 24);
2579  a3 = (s3 >> 24);
2580 
2581  if ((a0 & a1 & a2 & a3) == 0xFF)
2582  {
2583  __m64 v0 = invert_colors (load8888 (&s0));
2584  __m64 v1 = invert_colors (load8888 (&s1));
2585  __m64 v2 = invert_colors (load8888 (&s2));
2586  __m64 v3 = invert_colors (load8888 (&s3));
2587 
2588  *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2589  }
2590  else if (s0 | s1 | s2 | s3)
2591  {
2592  __m64 vdest = *(__m64 *)dst;
2593  __m64 v0, v1, v2, v3;
2594 
2595  __m64 vsrc0 = load8888 (&s0);
2596  __m64 vsrc1 = load8888 (&s1);
2597  __m64 vsrc2 = load8888 (&s2);
2598  __m64 vsrc3 = load8888 (&s3);
2599 
2600  expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2601 
2602  v0 = over_rev_non_pre (vsrc0, v0);
2603  v1 = over_rev_non_pre (vsrc1, v1);
2604  v2 = over_rev_non_pre (vsrc2, v2);
2605  v3 = over_rev_non_pre (vsrc3, v3);
2606 
2607  *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2608  }
2609 
2610  w -= 4;
2611  dst += 4;
2612  src += 4;
2613  }
2614 
2615  CHECKPOINT ();
2616 
2617  while (w)
2618  {
2619  __m64 vsrc = load8888 (src);
2620  uint64_t d = *dst;
2621  __m64 vdest = expand565 (to_m64 (d), 0);
2622 
2623  vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2624 
2625  *dst = to_uint64 (vdest);
2626 
2627  w--;
2628  dst++;
2629  src++;
2630  }
2631  }
2632 
2633  _mm_empty ();
2634 }
2635 
2636 static void
2637 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2638  pixman_composite_info_t *info)
2639 {
2640  PIXMAN_COMPOSITE_ARGS (info);
2641  uint32_t *dst_line, *dst;
2642  uint32_t *src_line, *src;
2643  int dst_stride, src_stride;
2644  int32_t w;
2645 
2646  CHECKPOINT ();
2647 
2648  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2649  PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2650 
2651 #if 0
2652  /* FIXME */
2653  assert (src_image->drawable == mask_image->drawable);
2654 #endif
2655 
2656  while (height--)
2657  {
2658  dst = dst_line;
2659  dst_line += dst_stride;
2660  src = src_line;
2661  src_line += src_stride;
2662  w = width;
2663 
2664  while (w && (uintptr_t)dst & 7)
2665  {
2666  __m64 s = load8888 (src);
2667  __m64 d = load8888 (dst);
2668 
2669  store8888 (dst, over_rev_non_pre (s, d));
2670 
2671  w--;
2672  dst++;
2673  src++;
2674  }
2675 
2676  while (w >= 2)
2677  {
2678  uint32_t s0, s1;
2679  unsigned char a0, a1;
2680  __m64 d0, d1;
2681 
2682  s0 = *src;
2683  s1 = *(src + 1);
2684 
2685  a0 = (s0 >> 24);
2686  a1 = (s1 >> 24);
2687 
2688  if ((a0 & a1) == 0xFF)
2689  {
2690  d0 = invert_colors (load8888 (&s0));
2691  d1 = invert_colors (load8888 (&s1));
2692 
2693  *(__m64 *)dst = pack8888 (d0, d1);
2694  }
2695  else if (s0 | s1)
2696  {
2697  __m64 vdest = *(__m64 *)dst;
2698 
2699  d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
2700  d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
2701 
2702  *(__m64 *)dst = pack8888 (d0, d1);
2703  }
2704 
2705  w -= 2;
2706  dst += 2;
2707  src += 2;
2708  }
2709 
2710  if (w)
2711  {
2712  __m64 s = load8888 (src);
2713  __m64 d = load8888 (dst);
2714 
2715  store8888 (dst, over_rev_non_pre (s, d));
2716  }
2717  }
2718 
2719  _mm_empty ();
2720 }
2721 
2722 static void
2723 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2724  pixman_composite_info_t *info)
2725 {
2726  PIXMAN_COMPOSITE_ARGS (info);
2727  uint32_t src;
2728  uint16_t *dst_line;
2729  uint32_t *mask_line;
2730  int dst_stride, mask_stride;
2731  __m64 vsrc, vsrca;
2732 
2733  CHECKPOINT ();
2734 
2735  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2736 
2737  if (src == 0)
2738  return;
2739 
2740  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2741  PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2742 
2743  vsrc = load8888 (&src);
2744  vsrca = expand_alpha (vsrc);
2745 
2746  while (height--)
2747  {
2748  int twidth = width;
2749  uint32_t *p = (uint32_t *)mask_line;
2750  uint16_t *q = (uint16_t *)dst_line;
2751 
2752  while (twidth && ((uintptr_t)q & 7))
2753  {
2754  uint32_t m = *(uint32_t *)p;
2755 
2756  if (m)
2757  {
2758  uint64_t d = *q;
2759  __m64 vdest = expand565 (to_m64 (d), 0);
2760  vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2761  *q = to_uint64 (vdest);
2762  }
2763 
2764  twidth--;
2765  p++;
2766  q++;
2767  }
2768 
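 /* Component-alpha mask: every 32-bit mask value carries per-channel
    coverage, so it is loaded with load8888() and passed to in_over()
    unchanged rather than being expanded from a single alpha byte. */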
2769  while (twidth >= 4)
2770  {
2771  uint32_t m0, m1, m2, m3;
2772 
2773  m0 = *p;
2774  m1 = *(p + 1);
2775  m2 = *(p + 2);
2776  m3 = *(p + 3);
2777 
2778  if ((m0 | m1 | m2 | m3))
2779  {
2780  __m64 vdest = *(__m64 *)q;
2781  __m64 v0, v1, v2, v3;
2782 
2783  expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2784 
2785  v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
2786  v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
2787  v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
2788  v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
2789 
2790  *(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
2791  }
2792  twidth -= 4;
2793  p += 4;
2794  q += 4;
2795  }
2796 
2797  while (twidth)
2798  {
2799  uint32_t m;
2800 
2801  m = *(uint32_t *)p;
2802  if (m)
2803  {
2804  uint64_t d = *q;
2805  __m64 vdest = expand565 (to_m64 (d), 0);
2806  vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2807  *q = to_uint64 (vdest);
2808  }
2809 
2810  twidth--;
2811  p++;
2812  q++;
2813  }
2814 
2815  mask_line += mask_stride;
2816  dst_line += dst_stride;
2817  }
2818 
2819  _mm_empty ();
2820 }
2821 
2822 static void
2823 mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2824  pixman_composite_info_t *info)
2825 {
2826  PIXMAN_COMPOSITE_ARGS (info);
2827  uint8_t *dst_line, *dst;
2828  uint8_t *mask_line, *mask;
2829  int dst_stride, mask_stride;
2830  int32_t w;
2831  uint32_t src;
2832  uint8_t sa;
2833  __m64 vsrc, vsrca;
2834 
2835  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2836  PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2837 
2838  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2839 
2840  sa = src >> 24;
2841 
2842  vsrc = load8888 (&src);
2843  vsrca = expand_alpha (vsrc);
2844 
2845  while (height--)
2846  {
2847  dst = dst_line;
2848  dst_line += dst_stride;
2849  mask = mask_line;
2850  mask_line += mask_stride;
2851  w = width;
2852 
2853  while (w && (uintptr_t)dst & 7)
2854  {
2855  uint16_t tmp;
2856  uint8_t a;
2857  uint32_t m, d;
2858 
2859  a = *mask++;
2860  d = *dst;
2861 
2862  m = MUL_UN8 (sa, a, tmp);
2863  d = MUL_UN8 (m, d, tmp);
2864 
2865  *dst++ = d;
2866  w--;
2867  }
2868 
2869  while (w >= 4)
2870  {
2871  __m64 vmask;
2872  __m64 vdest;
2873 
2874  vmask = load8888u ((uint32_t *)mask);
2875  vdest = load8888 ((uint32_t *)dst);
2876 
2877  store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
2878 
2879  dst += 4;
2880  mask += 4;
2881  w -= 4;
2882  }
2883 
2884  while (w--)
2885  {
2886  uint16_t tmp;
2887  uint8_t a;
2888  uint32_t m, d;
2889 
2890  a = *mask++;
2891  d = *dst;
2892 
2893  m = MUL_UN8 (sa, a, tmp);
2894  d = MUL_UN8 (m, d, tmp);
2895 
2896  *dst++ = d;
2897  }
2898  }
2899 
2900  _mm_empty ();
2901 }
2902 
2903 static void
2904 mmx_composite_in_8_8 (pixman_implementation_t *imp,
2905  pixman_composite_info_t *info)
2906 {
2907  PIXMAN_COMPOSITE_ARGS (info);
2908  uint8_t *dst_line, *dst;
2909  uint8_t *src_line, *src;
2910  int src_stride, dst_stride;
2911  int32_t w;
2912 
2913  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2914  PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2915 
2916  while (height--)
2917  {
2918  dst = dst_line;
2919  dst_line += dst_stride;
2920  src = src_line;
2921  src_line += src_stride;
2922  w = width;
2923 
2924  while (w && (uintptr_t)dst & 3)
2925  {
2926  uint8_t s, d;
2927  uint16_t tmp;
2928 
2929  s = *src;
2930  d = *dst;
2931 
2932  *dst = MUL_UN8 (s, d, tmp);
2933 
2934  src++;
2935  dst++;
2936  w--;
2937  }
2938 
2939  while (w >= 4)
2940  {
2941  uint32_t *s = (uint32_t *)src;
2942  uint32_t *d = (uint32_t *)dst;
2943 
2944  store8888 (d, in (load8888u (s), load8888 (d)));
2945 
2946  w -= 4;
2947  dst += 4;
2948  src += 4;
2949  }
2950 
2951  while (w--)
2952  {
2953  uint8_t s, d;
2954  uint16_t tmp;
2955 
2956  s = *src;
2957  d = *dst;
2958 
2959  *dst = MUL_UN8 (s, d, tmp);
2960 
2961  src++;
2962  dst++;
2963  }
2964  }
2965 
2966  _mm_empty ();
2967 }
2968 
2969 static void
2970 mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2971  pixman_composite_info_t *info)
2972 {
2973  PIXMAN_COMPOSITE_ARGS (info);
2974  uint8_t *dst_line, *dst;
2975  uint8_t *mask_line, *mask;
2976  int dst_stride, mask_stride;
2977  int32_t w;
2978  uint32_t src;
2979  uint8_t sa;
2980  __m64 vsrc, vsrca;
2981 
2982  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2983  PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2984 
2985  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2986 
2987  sa = src >> 24;
2988 
2989  if (src == 0)
2990  return;
2991 
2992  vsrc = load8888 (&src);
2993  vsrca = expand_alpha (vsrc);
2994 
2995  while (height--)
2996  {
2997  dst = dst_line;
2998  dst_line += dst_stride;
2999  mask = mask_line;
3000  mask_line += mask_stride;
3001  w = width;
3002 
3003  while (w && (uintptr_t)dst & 3)
3004  {
3005  uint16_t tmp;
3006  uint16_t a;
3007  uint32_t m, d;
3008  uint32_t r;
3009 
3010  a = *mask++;
3011  d = *dst;
3012 
3013  m = MUL_UN8 (sa, a, tmp);
3014  r = ADD_UN8 (m, d, tmp);
3015 
3016  *dst++ = r;
3017  w--;
3018  }
3019 
3020  while (w >= 4)
3021  {
3022  __m64 vmask;
3023  __m64 vdest;
3024 
3025  vmask = load8888u ((uint32_t *)mask);
3026  vdest = load8888 ((uint32_t *)dst);
3027 
3028  store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
3029 
3030  dst += 4;
3031  mask += 4;
3032  w -= 4;
3033  }
3034 
3035  while (w--)
3036  {
3037  uint16_t tmp;
3038  uint16_t a;
3039  uint32_t m, d;
3040  uint32_t r;
3041 
3042  a = *mask++;
3043  d = *dst;
3044 
3045  m = MUL_UN8 (sa, a, tmp);
3046  r = ADD_UN8 (m, d, tmp);
3047 
3048  *dst++ = r;
3049  }
3050  }
3051 
3052  _mm_empty ();
3053 }
3054 
3055 static void
3056 mmx_composite_add_8_8 (pixman_implementation_t *imp,
3057  pixman_composite_info_t *info)
3058 {
3059  PIXMAN_COMPOSITE_ARGS (info);
3060  uint8_t *dst_line, *dst;
3061  uint8_t *src_line, *src;
3062  int dst_stride, src_stride;
3063  int32_t w;
3064  uint8_t s, d;
3065  uint16_t t;
3066 
3067  CHECKPOINT ();
3068 
3069  PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
3070  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
3071 
3072  while (height--)
3073  {
3074  dst = dst_line;
3075  dst_line += dst_stride;
3076  src = src_line;
3077  src_line += src_stride;
3078  w = width;
3079 
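 /* Scalar saturating add used for the unaligned head and tail:
    t | (0 - (t >> 8)) clamps the 16-bit sum to 0xff whenever it
    overflows eight bits. */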
3080  while (w && (uintptr_t)dst & 7)
3081  {
3082  s = *src;
3083  d = *dst;
3084  t = d + s;
3085  s = t | (0 - (t >> 8));
3086  *dst = s;
3087 
3088  dst++;
3089  src++;
3090  w--;
3091  }
3092 
3093  while (w >= 8)
3094  {
3095  *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3096  dst += 8;
3097  src += 8;
3098  w -= 8;
3099  }
3100 
3101  while (w)
3102  {
3103  s = *src;
3104  d = *dst;
3105  t = d + s;
3106  s = t | (0 - (t >> 8));
3107  *dst = s;
3108 
3109  dst++;
3110  src++;
3111  w--;
3112  }
3113  }
3114 
3115  _mm_empty ();
3116 }
3117 
3118 static void
3119 mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
3120  pixman_composite_info_t *info)
3121 {
3122  PIXMAN_COMPOSITE_ARGS (info);
3123  uint16_t *dst_line, *dst;
3124  uint32_t d;
3125  uint16_t *src_line, *src;
3126  uint32_t s;
3127  int dst_stride, src_stride;
3128  int32_t w;
3129 
3130  CHECKPOINT ();
3131 
3132  PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
3133  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3134 
3135  while (height--)
3136  {
3137  dst = dst_line;
3138  dst_line += dst_stride;
3139  src = src_line;
3140  src_line += src_stride;
3141  w = width;
3142 
3143  while (w && (uintptr_t)dst & 7)
3144  {
3145  s = *src++;
3146  if (s)
3147  {
3148  d = *dst;
3149  s = convert_0565_to_8888 (s);
3150  if (d)
3151  {
3152  d = convert_0565_to_8888 (d);
3153  UN8x4_ADD_UN8x4 (s, d);
3154  }
3155  *dst = convert_8888_to_0565 (s);
3156  }
3157  dst++;
3158  w--;
3159  }
3160 
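 /* Widen both operands from 565 to 8888, add with unsigned byte
    saturation, and repack; adding packed 565 values directly would
    carry between the colour fields. */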
3161  while (w >= 4)
3162  {
3163  __m64 vdest = *(__m64 *)dst;
3164  __m64 vsrc = ldq_u ((__m64 *)src);
3165  __m64 vd0, vd1;
3166  __m64 vs0, vs1;
3167 
3168  expand_4xpacked565 (vdest, &vd0, &vd1, 0);
3169  expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
3170 
3171  vd0 = _mm_adds_pu8 (vd0, vs0);
3172  vd1 = _mm_adds_pu8 (vd1, vs1);
3173 
3174  *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
3175 
3176  dst += 4;
3177  src += 4;
3178  w -= 4;
3179  }
3180 
3181  while (w--)
3182  {
3183  s = *src++;
3184  if (s)
3185  {
3186  d = *dst;
3187  s = convert_0565_to_8888 (s);
3188  if (d)
3189  {
3190  d = convert_0565_to_8888 (d);
3191  UN8x4_ADD_UN8x4 (s, d);
3192  }
3193  *dst = convert_8888_to_0565 (s);
3194  }
3195  dst++;
3196  }
3197  }
3198 
3199  _mm_empty ();
3200 }
3201 
3202 static void
3203 mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
3204  pixman_composite_info_t *info)
3205 {
3206  PIXMAN_COMPOSITE_ARGS (info);
3207  uint32_t *dst_line, *dst;
3208  uint32_t *src_line, *src;
3209  int dst_stride, src_stride;
3210  int32_t w;
3211 
3212  CHECKPOINT ();
3213 
3214  PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3215  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3216 
3217  while (height--)
3218  {
3219  dst = dst_line;
3220  dst_line += dst_stride;
3221  src = src_line;
3222  src_line += src_stride;
3223  w = width;
3224 
3225  while (w && (uintptr_t)dst & 7)
3226  {
3227  store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3228  load ((const uint32_t *)dst)));
3229  dst++;
3230  src++;
3231  w--;
3232  }
3233 
3234  while (w >= 2)
3235  {
3236  *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3237  dst += 2;
3238  src += 2;
3239  w -= 2;
3240  }
3241 
3242  if (w)
3243  {
3244  store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3245  load ((const uint32_t *)dst)));
3246 
3247  }
3248  }
3249 
3250  _mm_empty ();
3251 }
3252 
3253 static pixman_bool_t
3254 mmx_blt (pixman_implementation_t *imp,
3255  uint32_t * src_bits,
3256  uint32_t * dst_bits,
3257  int src_stride,
3258  int dst_stride,
3259  int src_bpp,
3260  int dst_bpp,
3261  int src_x,
3262  int src_y,
3263  int dest_x,
3264  int dest_y,
3265  int width,
3266  int height)
3267 {
3268  uint8_t * src_bytes;
3269  uint8_t * dst_bytes;
3270  int byte_width;
3271 
3272  if (src_bpp != dst_bpp)
3273  return FALSE;
3274 
3275  if (src_bpp == 16)
3276  {
3277  src_stride = src_stride * (int) sizeof (uint32_t) / 2;
3278  dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
3279  src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
3280  dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3281  byte_width = 2 * width;
3282  src_stride *= 2;
3283  dst_stride *= 2;
3284  }
3285  else if (src_bpp == 32)
3286  {
3287  src_stride = src_stride * (int) sizeof (uint32_t) / 4;
3288  dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
3289  src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
3290  dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3291  byte_width = 4 * width;
3292  src_stride *= 4;
3293  dst_stride *= 4;
3294  }
3295  else
3296  {
3297  return FALSE;
3298  }
3299 
3300  while (height--)
3301  {
3302  int w;
3303  uint8_t *s = src_bytes;
3304  uint8_t *d = dst_bytes;
3305  src_bytes += src_stride;
3306  dst_bytes += dst_stride;
3307  w = byte_width;
3308 
3309  if (w >= 1 && ((uintptr_t)d & 1))
3310  {
3311  *(uint8_t *)d = *(uint8_t *)s;
3312  w -= 1;
3313  s += 1;
3314  d += 1;
3315  }
3316 
3317  if (w >= 2 && ((uintptr_t)d & 3))
3318  {
3319  *(uint16_t *)d = *(uint16_t *)s;
3320  w -= 2;
3321  s += 2;
3322  d += 2;
3323  }
3324 
3325  while (w >= 4 && ((uintptr_t)d & 7))
3326  {
3327  *(uint32_t *)d = ldl_u ((uint32_t *)s);
3328 
3329  w -= 4;
3330  s += 4;
3331  d += 4;
3332  }
3333 
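 /* Copy 64 bytes per iteration through the eight MMX registers, using
    inline assembly where available and unaligned-load intrinsics
    otherwise. */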
3334  while (w >= 64)
3335  {
3336 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
3337  __asm__ (
3338  "movq (%1), %%mm0\n"
3339  "movq 8(%1), %%mm1\n"
3340  "movq 16(%1), %%mm2\n"
3341  "movq 24(%1), %%mm3\n"
3342  "movq 32(%1), %%mm4\n"
3343  "movq 40(%1), %%mm5\n"
3344  "movq 48(%1), %%mm6\n"
3345  "movq 56(%1), %%mm7\n"
3346 
3347  "movq %%mm0, (%0)\n"
3348  "movq %%mm1, 8(%0)\n"
3349  "movq %%mm2, 16(%0)\n"
3350  "movq %%mm3, 24(%0)\n"
3351  "movq %%mm4, 32(%0)\n"
3352  "movq %%mm5, 40(%0)\n"
3353  "movq %%mm6, 48(%0)\n"
3354  "movq %%mm7, 56(%0)\n"
3355  :
3356  : "r" (d), "r" (s)
3357  : "memory",
3358  "%mm0", "%mm1", "%mm2", "%mm3",
3359  "%mm4", "%mm5", "%mm6", "%mm7");
3360 #else
3361  __m64 v0 = ldq_u ((__m64 *)(s + 0));
3362  __m64 v1 = ldq_u ((__m64 *)(s + 8));
3363  __m64 v2 = ldq_u ((__m64 *)(s + 16));
3364  __m64 v3 = ldq_u ((__m64 *)(s + 24));
3365  __m64 v4 = ldq_u ((__m64 *)(s + 32));
3366  __m64 v5 = ldq_u ((__m64 *)(s + 40));
3367  __m64 v6 = ldq_u ((__m64 *)(s + 48));
3368  __m64 v7 = ldq_u ((__m64 *)(s + 56));
3369  *(__m64 *)(d + 0) = v0;
3370  *(__m64 *)(d + 8) = v1;
3371  *(__m64 *)(d + 16) = v2;
3372  *(__m64 *)(d + 24) = v3;
3373  *(__m64 *)(d + 32) = v4;
3374  *(__m64 *)(d + 40) = v5;
3375  *(__m64 *)(d + 48) = v6;
3376  *(__m64 *)(d + 56) = v7;
3377 #endif
3378 
3379  w -= 64;
3380  s += 64;
3381  d += 64;
3382  }
3383  while (w >= 4)
3384  {
3385  *(uint32_t *)d = ldl_u ((uint32_t *)s);
3386 
3387  w -= 4;
3388  s += 4;
3389  d += 4;
3390  }
3391  if (w >= 2)
3392  {
3393  *(uint16_t *)d = *(uint16_t *)s;
3394  w -= 2;
3395  s += 2;
3396  d += 2;
3397  }
3398  }
3399 
3400  _mm_empty ();
3401 
3402  return TRUE;
3403 }
3404 
3405 static void
3406 mmx_composite_copy_area (pixman_implementation_t *imp,
3407  pixman_composite_info_t *info)
3408 {
3409  PIXMAN_COMPOSITE_ARGS (info);
3410 
3411  mmx_blt (imp, src_image->bits.bits,
3412  dest_image->bits.bits,
3413  src_image->bits.rowstride,
3414  dest_image->bits.rowstride,
3415  PIXMAN_FORMAT_BPP (src_image->bits.format),
3416  PIXMAN_FORMAT_BPP (dest_image->bits.format),
3417  src_x, src_y, dest_x, dest_y, width, height);
3418 }
3419 
3420 static void
3421 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
3422  pixman_composite_info_t *info)
3423 {
3424  PIXMAN_COMPOSITE_ARGS (info);
3425  uint32_t *src, *src_line;
3426  uint32_t *dst, *dst_line;
3427  uint8_t *mask, *mask_line;
3428  int src_stride, mask_stride, dst_stride;
3429  int32_t w;
3430 
3431  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3432  PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3433  PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3434 
3435  while (height--)
3436  {
3437  src = src_line;
3438  src_line += src_stride;
3439  dst = dst_line;
3440  dst_line += dst_stride;
3441  mask = mask_line;
3442  mask_line += mask_stride;
3443 
3444  w = width;
3445 
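 /* The x888 source is forced opaque; a 0xff mask byte then reduces to
    a plain copy, while partial coverage goes through in_over(). */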
3446  while (w--)
3447  {
3448  uint64_t m = *mask;
3449 
3450  if (m)
3451  {
3452  uint32_t ssrc = *src | 0xff000000;
3453  __m64 s = load8888 (&ssrc);
3454 
3455  if (m == 0xff)
3456  {
3457  store8888 (dst, s);
3458  }
3459  else
3460  {
3461  __m64 sa = expand_alpha (s);
3462  __m64 vm = expand_alpha_rev (to_m64 (m));
3463  __m64 vdest = in_over (s, sa, vm, load8888 (dst));
3464 
3465  store8888 (dst, vdest);
3466  }
3467  }
3468 
3469  mask++;
3470  dst++;
3471  src++;
3472  }
3473  }
3474 
3475  _mm_empty ();
3476 }
3477 
3478 static void
3479 mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
3480  pixman_composite_info_t *info)
3481 {
3482  PIXMAN_COMPOSITE_ARGS (info);
3483  uint32_t src;
3484  uint32_t *dst_line, *dst;
3485  int32_t w;
3486  int dst_stride;
3487  __m64 vsrc;
3488 
3489  CHECKPOINT ();
3490 
3491  src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3492 
3493  if (src == 0)
3494  return;
3495 
3496  PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3497 
3498  vsrc = load8888 (&src);
3499 
3500  while (height--)
3501  {
3502  dst = dst_line;
3503  dst_line += dst_stride;
3504  w = width;
3505 
3506  CHECKPOINT ();
3507 
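 /* OVER_REVERSE keeps the destination on top: the solid source is
    composited underneath using the destination's own alpha. */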
3508  while (w && (uintptr_t)dst & 7)
3509  {
3510  __m64 vdest = load8888 (dst);
3511 
3512  store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3513 
3514  w--;
3515  dst++;
3516  }
3517 
3518  while (w >= 2)
3519  {
3520  __m64 vdest = *(__m64 *)dst;
3521  __m64 dest0 = expand8888 (vdest, 0);
3522  __m64 dest1 = expand8888 (vdest, 1);
3523 
3524 
3525  dest0 = over (dest0, expand_alpha (dest0), vsrc);
3526  dest1 = over (dest1, expand_alpha (dest1), vsrc);
3527 
3528  *(__m64 *)dst = pack8888 (dest0, dest1);
3529 
3530  dst += 2;
3531  w -= 2;
3532  }
3533 
3534  CHECKPOINT ();
3535 
3536  if (w)
3537  {
3538  __m64 vdest = load8888 (dst);
3539 
3540  store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3541  }
3542  }
3543 
3544  _mm_empty ();
3545 }
3546 
3547 static force_inline void
3548 scaled_nearest_scanline_mmx_8888_8888_OVER (uint32_t* pd,
3549  const uint32_t* ps,
3550  int32_t w,
3551  pixman_fixed_t vx,
3552  pixman_fixed_t unit_x,
3553  pixman_fixed_t src_width_fixed,
3554  pixman_bool_t fully_transparent_src)
3555 {
3556  if (fully_transparent_src)
3557  return;
3558 
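 /* Nearest-neighbour fetch: vx indexes the source pixel in fixed point
    and is stepped by unit_x per destination pixel, wrapping by the
    source width for repeat handling. */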
3559  while (w)
3560  {
3561  __m64 d = load (pd);
3562  __m64 s = load (ps + pixman_fixed_to_int (vx));
3563  vx += unit_x;
3564  while (vx >= 0)
3565  vx -= src_width_fixed;
3566 
3567  store8888 (pd, core_combine_over_u_pixel_mmx (s, d));
3568  pd++;
3569 
3570  w--;
3571  }
3572 
3573  _mm_empty ();
3574 }
3575 
3576 FAST_NEAREST_MAINLOOP (mmx_8888_8888_cover_OVER,
3577  scaled_nearest_scanline_mmx_8888_8888_OVER,
3578  uint32_t, uint32_t, COVER)
3579 FAST_NEAREST_MAINLOOP (mmx_8888_8888_none_OVER,
3580  scaled_nearest_scanline_mmx_8888_8888_OVER,
3581  uint32_t, uint32_t, NONE)
3582 FAST_NEAREST_MAINLOOP (mmx_8888_8888_pad_OVER,
3583  scaled_nearest_scanline_mmx_8888_8888_OVER,
3584  uint32_t, uint32_t, PAD)
3585 FAST_NEAREST_MAINLOOP (mmx_8888_8888_normal_OVER,
3586  scaled_nearest_scanline_mmx_8888_8888_OVER,
3587  uint32_t, uint32_t, NORMAL)
3588 
3589 static force_inline void
3590 scaled_nearest_scanline_mmx_8888_n_8888_OVER (const uint32_t * mask,
3591  uint32_t * dst,
3592  const uint32_t * src,
3593  int32_t w,
3594  pixman_fixed_t vx,
3595  pixman_fixed_t unit_x,
3596  pixman_fixed_t src_width_fixed,
3597  pixman_bool_t zero_src)
3598 {
3599  __m64 mm_mask;
3600 
3601  if (zero_src || (*mask >> 24) == 0)
3602  {
3603  /* A workaround for https://gcc.gnu.org/PR47759 */
3604  _mm_empty ();
3605  return;
3606  }
3607 
3608  mm_mask = expand_alpha (load8888 (mask));
3609 
3610  while (w)
3611  {
3612  uint32_t s = *(src + pixman_fixed_to_int (vx));
3613  vx += unit_x;
3614  while (vx >= 0)
3615  vx -= src_width_fixed;
3616 
3617  if (s)
3618  {
3619  __m64 ms = load8888 (&s);
3620  __m64 alpha = expand_alpha (ms);
3621  __m64 dest = load8888 (dst);
3622 
3623  store8888 (dst, (in_over (ms, alpha, mm_mask, dest)));
3624  }
3625 
3626  dst++;
3627  w--;
3628  }
3629 
3630  _mm_empty ();
3631 }
3632 
3633 FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_cover_OVER,
3634  scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3635  uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
3636 FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_pad_OVER,
3637  scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3638  uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
3639 FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_none_OVER,
3640  scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3641  uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
3642 FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_normal_OVER,
3643  scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3644  uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
3645 
3646 #define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
3647 #define BMSK (BSHIFT - 1)
3648 
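 /* Bilinear helpers: wt/wb are the per-scanline vertical weights, and
    mm_x holds the horizontal fraction for the current pixel, advanced
    by unit_x after each interpolation. */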
3649 #define BILINEAR_DECLARE_VARIABLES \
3650  const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt); \
3651  const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb); \
3652  const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1); \
3653  const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK); \
3654  const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x); \
3655  const __m64 mm_zero = _mm_setzero_si64 (); \
3656  __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
3657 
3658 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
3659 do { \
3660  /* fetch 2x2 pixel block into 2 mmx registers */ \
3661  __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]); \
3662  __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]); \
3663  /* vertical interpolation */ \
3664  __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \
3665  __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \
3666  __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb); \
3667  __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \
3668  __m64 hi = _mm_add_pi16 (t_hi, b_hi); \
3669  __m64 lo = _mm_add_pi16 (t_lo, b_lo); \
3670  /* calculate horizontal weights */ \
3671  __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7, \
3672  _mm_srli_pi16 (mm_x, \
3673  16 - BILINEAR_INTERPOLATION_BITS))); \
3674  /* horizontal interpolation */ \
3675  __m64 p = _mm_unpacklo_pi16 (lo, hi); \
3676  __m64 q = _mm_unpackhi_pi16 (lo, hi); \
3677  vx += unit_x; \
3678  lo = _mm_madd_pi16 (p, mm_wh); \
3679  hi = _mm_madd_pi16 (q, mm_wh); \
3680  mm_x = _mm_add_pi16 (mm_x, mm_ux); \
3681  /* shift and pack the result */ \
3682  hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2); \
3683  lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2); \
3684  lo = _mm_packs_pi32 (lo, hi); \
3685  lo = _mm_packs_pu16 (lo, lo); \
3686  pix = lo; \
3687 } while (0)
3688 
3689 #define BILINEAR_SKIP_ONE_PIXEL() \
3690 do { \
3691  vx += unit_x; \
3692  mm_x = _mm_add_pi16 (mm_x, mm_ux); \
3693 } while(0)
3694 
3695 static force_inline void
3696 scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t * dst,
3697  const uint32_t * mask,
3698  const uint32_t * src_top,
3699  const uint32_t * src_bottom,
3700  int32_t w,
3701  int wt,
3702  int wb,
3703  pixman_fixed_t vx,
3704  pixman_fixed_t unit_x,
3705  pixman_fixed_t max_vx,
3706  pixman_bool_t zero_src)
3707 {
3708  BILINEAR_DECLARE_VARIABLES;
3709  __m64 pix;
3710 
3711  while (w--)
3712  {
3713  BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
3714  store (dst, pix);
3715  dst++;
3716  }
3717 
3718  _mm_empty ();
3719 }
3720 
3721 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
3722  scaled_bilinear_scanline_mmx_8888_8888_SRC,
3723  uint32_t, uint32_t, uint32_t,
3724  COVER, FLAG_NONE)
3725 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
3726  scaled_bilinear_scanline_mmx_8888_8888_SRC,
3727  uint32_t, uint32_t, uint32_t,
3728  PAD, FLAG_NONE)
3729 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
3730  scaled_bilinear_scanline_mmx_8888_8888_SRC,
3731  uint32_t, uint32_t, uint32_t,
3732  NONE, FLAG_NONE)
3733 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
3734  scaled_bilinear_scanline_mmx_8888_8888_SRC,
3735  uint32_t, uint32_t, uint32_t,
3736  NORMAL, FLAG_NONE)
3737 
3738 static force_inline void
3739 scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t * dst,
3740  const uint32_t * mask,
3741  const uint32_t * src_top,
3742  const uint32_t * src_bottom,
3743  int32_t w,
3744  int wt,
3745  int wb,
3746  pixman_fixed_t vx,
3747  pixman_fixed_t unit_x,
3748  pixman_fixed_t max_vx,
3749  pixman_bool_t zero_src)
3750 {
3751  BILINEAR_DECLARE_VARIABLES;
3752  __m64 pix1, pix2;
3753 
3754  while (w)
3755  {
3756  BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3757 
3758  if (!is_zero (pix1))
3759  {
3760  pix2 = load (dst);
3761  store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
3762  }
3763 
3764  w--;
3765  dst++;
3766  }
3767 
3768  _mm_empty ();
3769 }
3770 
3771 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
3772  scaled_bilinear_scanline_mmx_8888_8888_OVER,
3773  uint32_t, uint32_t, uint32_t,
3774  COVER, FLAG_NONE)
3775 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
3776  scaled_bilinear_scanline_mmx_8888_8888_OVER,
3777  uint32_t, uint32_t, uint32_t,
3778  PAD, FLAG_NONE)
3779 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
3780  scaled_bilinear_scanline_mmx_8888_8888_OVER,
3781  uint32_t, uint32_t, uint32_t,
3782  NONE, FLAG_NONE)
3783 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
3784  scaled_bilinear_scanline_mmx_8888_8888_OVER,
3785  uint32_t, uint32_t, uint32_t,
3786  NORMAL, FLAG_NONE)
3787 
3788 static force_inline void
3789 scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t * dst,
3790  const uint8_t * mask,
3791  const uint32_t * src_top,
3792  const uint32_t * src_bottom,
3793  int32_t w,
3794  int wt,
3795  int wb,
3796  pixman_fixed_t vx,
3797  pixman_fixed_t unit_x,
3798  pixman_fixed_t max_vx,
3799  pixman_bool_t zero_src)
3800 {
3801  BILINEAR_DECLARE_VARIABLES;
3802  __m64 pix1, pix2;
3803  uint32_t m;
3804 
3805  while (w)
3806  {
3807  m = (uint32_t) *mask++;
3808 
3809  if (m)
3810  {
3811  BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3812 
3813  if (m == 0xff && is_opaque (pix1))
3814  {
3815  store (dst, pix1);
3816  }
3817  else
3818  {
3819  __m64 ms, md, ma, msa;
3820 
3821  pix2 = load (dst);
3822  ma = expand_alpha_rev (to_m64 (m));
3823  ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
3824  md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
3825 
3826  msa = expand_alpha (ms);
3827 
3828  store8888 (dst, (in_over (ms, msa, ma, md)));
3829  }
3830  }
3831  else
3832  {
3833  BILINEAR_SKIP_ONE_PIXEL ();
3834 
3835 
3836  w--;
3837  dst++;
3838  }
3839 
3840  _mm_empty ();
3841 }
3842 
3843 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
3844  scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3845  uint32_t, uint8_t, uint32_t,
3846  COVER, FLAG_HAVE_NON_SOLID_MASK)
3847 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
3848  scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3849  uint32_t, uint8_t, uint32_t,
3850  PAD, FLAG_HAVE_NON_SOLID_MASK)
3851 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
3852  scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3853  uint32_t, uint8_t, uint32_t,
3854  NONE, FLAG_HAVE_NON_SOLID_MASK)
3855 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
3856  scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3857  uint32_t, uint8_t, uint32_t,
3858  NORMAL, FLAG_HAVE_NON_SOLID_MASK)
3859 
3860 static uint32_t *
3861 mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
3862 {
3863  int w = iter->width;
3864  uint32_t *dst = iter->buffer;
3865  uint32_t *src = (uint32_t *)iter->bits;
3866 
3867  iter->bits += iter->stride;
3868 
3869  while (w && ((uintptr_t)dst) & 7)
3870  {
3871  *dst++ = (*src++) | 0xff000000;
3872  w--;
3873  }
3874 
3875  while (w >= 8)
3876  {
3877  __m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
3878  __m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
3879  __m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
3880  __m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
3881 
3882  *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
3883  *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
3884  *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
3885  *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
3886 
3887  dst += 8;
3888  src += 8;
3889  w -= 8;
3890  }
3891 
3892  while (w)
3893  {
3894  *dst++ = (*src++) | 0xff000000;
3895  w--;
3896  }
3897 
3898  _mm_empty ();
3899  return iter->buffer;
3900 }
3901 
3902 static uint32_t *
3903 mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
3904 {
3905  int w = iter->width;
3906  uint32_t *dst = iter->buffer;
3907  uint16_t *src = (uint16_t *)iter->bits;
3908 
3909  iter->bits += iter->stride;
3910 
3911  while (w && ((uintptr_t)dst) & 0x0f)
3912  {
3913  uint16_t s = *src++;
3914 
3915  *dst++ = convert_0565_to_8888 (s);
3916  w--;
3917  }
3918 
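 /* Convert four packed r5g6b5 pixels to 8888 per iteration; the scalar
    loops around it use convert_0565_to_8888() one pixel at a time. */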
3919  while (w >= 4)
3920  {
3921  __m64 vsrc = ldq_u ((__m64 *)src);
3922  __m64 mm0, mm1;
3923 
3924  expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
3925 
3926  *(__m64 *)(dst + 0) = mm0;
3927  *(__m64 *)(dst + 2) = mm1;
3928 
3929  dst += 4;
3930  src += 4;
3931  w -= 4;
3932  }
3933 
3934  while (w)
3935  {
3936  uint16_t s = *src++;
3937 
3938  *dst++ = convert_0565_to_8888 (s);
3939  w--;
3940  }
3941 
3942  _mm_empty ();
3943  return iter->buffer;
3944 }
3945 
3946 static uint32_t *
3947 mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
3948 {
3949  int w = iter->width;
3950  uint32_t *dst = iter->buffer;
3951  uint8_t *src = iter->bits;
3952 
3953  iter->bits += iter->stride;
3954 
3955  while (w && (((uintptr_t)dst) & 15))
3956  {
3957  *dst++ = (uint32_t)*(src++) << 24;
3958  w--;
3959  }
3960 
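 /* Expand eight a8 values per iteration: interleaving with zero twice
    moves each source byte into the top byte of a 32-bit pixel, giving
    alpha-only a8r8g8b8 values. */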
3961  while (w >= 8)
3962  {
3963  __m64 mm0 = ldq_u ((__m64 *)src);
3964 
3965  __m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64(), mm0);
3966  __m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64(), mm0);
3967  __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
3968  __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
3969  __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
3970  __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);
3971 
3972  *(__m64 *)(dst + 0) = mm3;
3973  *(__m64 *)(dst + 2) = mm4;
3974  *(__m64 *)(dst + 4) = mm5;
3975  *(__m64 *)(dst + 6) = mm6;
3976 
3977  dst += 8;
3978  src += 8;
3979  w -= 8;
3980  }
3981 
3982  while (w)
3983  {
3984  *dst++ = (uint32_t)*(src++) << 24;
3985  w--;
3986  }
3987 
3988  _mm_empty ();
3989  return iter->buffer;
3990 }
3991 
3992 #define IMAGE_FLAGS \
3993  (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
3994  FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
3995 
3996 static const pixman_iter_info_t mmx_iters[] =
3997 {
3998  { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
3999  _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL
4000  },
4001  { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
4002  _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL
4003  },
4004  { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
4005  _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL
4006  },
4007  { PIXMAN_null },
4008 };
4009 
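 /* Fast-path table: each entry maps an (operator, source, mask,
    destination) combination to one of the specialised routines above;
    combinations not listed here fall through to the fallback
    implementation. */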
4010 static const pixman_fast_path_t mmx_fast_paths[] =
4011 {
4012  PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ),
4013  PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ),
4014  PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ),
4015  PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ),
4016  PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ),
4017  PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ),
4018  PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
4019  PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
4020  PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ),
4021  PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
4022  PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
4023  PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ),
4024  PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ),
4025  PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ),
4026  PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ),
4027  PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ),
4028  PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ),
4029  PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ),
4030  PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ),
4031  PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ),
4032  PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ),
4033  PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ),
4034  PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ),
4035  PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ),
4036  PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ),
4037  PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ),
4038  PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ),
4039  PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ),
4040  PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ),
4041  PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, mmx_composite_over_x888_8_8888 ),
4042  PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ),
4043  PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ),
4044  PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ),
4045  PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, mmx_composite_over_n_0565 ),
4046  PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
4047  PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
4048 
4049  PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ),
4050  PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ),
4051  PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ),
4052  PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ),
4053  PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ),
4054  PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ),
4055 
4056  PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
4057  PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),
4058 
4059  PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, mmx_composite_add_0565_0565 ),
4060  PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, mmx_composite_add_0565_0565 ),
4061  PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ),
4062  PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ),
4063  PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ),
4064  PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ),
4065 
4066  PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ),
4067  PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ),
4068  PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ),
4069  PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ),
4070  PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ),
4071  PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ),
4072  PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ),
4073  PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ),
4074  PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ),
4075  PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ),
4076  PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
4077  PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
4078  PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
4079  PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
4080  PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ),
4081  PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ),
4082 
4083  PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ),
4084  PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ),
4085 
4086  SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
4087  SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
4088  SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
4089  SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
4090 
4091  SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_n_8888 ),
4092  SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_n_8888 ),
4093  SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_n_8888 ),
4094  SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_n_8888 ),
4095 
4096  SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
4097  SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
4098  SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
4099  SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
4100  SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
4101  SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
4102 
4103  SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
4104  SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
4105  SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
4106  SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
4107 
4108  SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888 ),
4109  SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888 ),
4110  SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888 ),
4111  SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888 ),
4112 
4113  { PIXMAN_OP_NONE },
4114 };
4115 
4116 pixman_implementation_t *
4117 _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
4118 {
4119  pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
4120 
4121  imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
4122  imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
4123  imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
4124  imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
4125  imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
4126  imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
4127  imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
4128  imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
4129  imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
4130  imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
4131  imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
4132 
4133  imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
4134  imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
4135  imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
4136  imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
4137  imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
4138  imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
4139  imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
4140  imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
4141  imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
4142  imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
4143  imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
4144 
4145  imp->blt = mmx_blt;
4146  imp->fill = mmx_fill;
4147 
4148  imp->iter_info = mmx_iters;
4149 
4150  return imp;
4151 }
4152 
4153 #endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */
q
Definition: afm2pl.c:2287
#define negate(x)
Definition: aptex-macros.h:51
#define fill
Definition: aptex-macros.h:292
#define width(a)
Definition: aptex-macros.h:198
#define height(a)
Definition: aptex-macros.h:200
#define is_zero(w)
static struct brw_reg stride(struct brw_reg reg, uint32_t vstride, uint32_t width, uint32_t hstride)
#define b
Definition: jpegint.h:372
@ FALSE
Definition: dd.h:101
@ TRUE
Definition: dd.h:102
int w
Definition: dviconv.c:26
int v
Definition: dviconv.c:10
#define info
Definition: dviinfo.c:42
#define shift
Definition: exp3.c:154
#define a0
#define a1
#define v0
#define vm1
#define a2
#define v1
#define v2
#define a3
#define s
Definition: afcover.h:80
#define t
Definition: afcover.h:96
#define vm2
#define v3
#define c(n)
Definition: gpos-common.c:150
#define a(n)
Definition: gpos-common.c:148
#define d(n)
Definition: gpos-common.c:151
#define memcpy(d, s, n)
Definition: gsftopk.c:64
pix
Definition: in_pcx.cpp:383
assert(pcxLoadImage24((char *)((void *) 0), fp, pinfo, hdr))
#define NULL
Definition: ftobjs.h:61
small capitals from c petite p
Definition: afcover.h:72
#define bits
Definition: infblock.c:15
kerning y
Definition: ttdriver.c:212
unsigned short uint16_t
Definition: stdint.h:79
unsigned int uint32_t
Definition: stdint.h:80
unsigned int uintptr_t
Definition: stdint.h:119
signed int int32_t
Definition: stdint.h:77
unsigned char uint8_t
Definition: stdint.h:78
unsigned __int64 uint64_t
Definition: stdint.h:90
static int ret
Definition: convert.c:72
#define __attribute__(A)
Definition: synctex.c:338
#define dest
__inline __m64 _mm_adds_pu8(__m64 __m1, __m64 __m2)
__inline __m64 _mm_srli_si64(__m64 __m, int64_t __count)
__inline __m64 loongson_extract_pi16(__m64 __m, int64_t __pos)
__inline __m64 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
__inline __m64 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
__inline __m64 _mm_xor_si64(__m64 __m1, __m64 __m2)
__inline __m64 _mm_packs_pu16(__m64 __m1, __m64 __m2)
__inline __m64 _mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
__inline __m64 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
__inline __m64 _mm_madd_pi16(__m64 __m1, __m64 __m2)
__inline __m64 _mm_mullo_pi16(__m64 __m1, __m64 __m2)
double __m64
__inline __m64 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
__inline __m64 _mm_srli_pi16(__m64 __m, int64_t __count)
__inline __m64 _mm_slli_si64(__m64 __m, int64_t __count)
__inline __m64 _mm_slli_pi16(__m64 __m, int64_t __count)
__inline void _mm_empty(void)
__inline __m64 _mm_mulhi_pu16(__m64 __m1, __m64 __m2)
__inline __m64 _mm_or_si64(__m64 __m1, __m64 __m2)
float __m32
__inline __m64 _mm_setzero_si64(void)
__inline __m64 _mm_and_si64(__m64 __m1, __m64 __m2)
__inline __m64 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
__inline __m64 _mm_shuffle_pi16(__m64 __m, int64_t __n)
__inline __m64 loongson_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
__inline __m64 _mm_adds_pu16(__m64 __m1, __m64 __m2)
const int * pos
Definition: combiners.h:905
#define target(code, i)
Definition: lpeg.c:1165
float x
Definition: cordic.py:15
#define align(x, k)
Definition: obcommon.h:49
static struct glyph _g[190]
Definition: libpbm5.c:187
static bool ps
Definition: pdftocairo.cc:91
#define res(length)
Definition: picttoppm.c:287
set set set set set set set macro pixldst1 abits if abits op else op endif endm macro pixldst2 abits if abits op else op endif endm macro pixldst4 abits if abits op else op endif endm macro pixldst0 abits op endm macro pixldst3 mem_operand op endm macro pixldst30 mem_operand op endm macro pixldst abits if abits elseif abits elseif abits elseif abits elseif abits pixldst0 abits else pixldst0 abits pixldst0 abits pixldst0 abits pixldst0 abits endif elseif abits else pixldst0 abits pixldst0 abits endif elseif abits else error unsupported bpp *numpix else pixst endif endm macro pixld1_s mem_operand if asr adds SRC_WIDTH_FIXED bpl add asl mov asr adds SRC_WIDTH_FIXED bpl add asl mov asr adds SRC_WIDTH_FIXED bpl add asl mov asr adds SRC_WIDTH_FIXED bpl add asl elseif asr adds SRC_WIDTH_FIXED bpl add asl mov asr adds SRC_WIDTH_FIXED bpl add asl else error unsupported endif endm macro pixld2_s mem_operand if mov asr add asl add asl mov asr sub UNIT_X add asl mov asr add asl add asl mov asr add UNIT_X add asl else pixld1_s mem_operand pixld1_s mem_operand endif endm macro pixld0_s mem_operand if asr adds SRC_WIDTH_FIXED bpl add asl elseif asr adds SRC_WIDTH_FIXED bpl add asl endif endm macro pixld_s_internal mem_operand if mem_operand pixld2_s mem_operand pixdeinterleave basereg elseif mem_operand elseif mem_operand elseif mem_operand elseif mem_operand pixld0_s mem_operand else pixld0_s mem_operand pixld0_s mem_operand pixld0_s mem_operand pixld0_s mem_operand endif elseif mem_operand else pixld0_s mem_operand pixld0_s mem_operand endif elseif mem_operand else error unsupported mem_operand if bpp mem_operand endif endm macro vuzp8 reg2 vuzp d d &reg2 endm macro vzip8 reg2 vzip d d &reg2 endm macro pixdeinterleave basereg basereg basereg basereg basereg endif endm macro pixinterleave basereg basereg basereg basereg basereg endif endm macro PF boost_increment endif if endif PF tst PF addne PF subne PF cmp ORIG_W if endif if endif if endif PF subge ORIG_W PF subges if endif if endif if endif endif endm macro cache_preload_simple endif if dst_r_bpp pld[DST_R, #(PREFETCH_DISTANCE_SIMPLE *dst_r_bpp/8)] endif if mask_bpp pld src_bpp[MASK, #(PREFETCH_DISTANCE_SIMPLE *mask_bpp/8)] endif endif endm macro fetch_mask_pixblock pixld mask_basereg pixblock_size MASK endm macro ensure_destination_ptr_alignment process_pixblock_tail_head if beq irp skip1 beq endif
set set set set set set set macro pixldst1 abits if abits op else op endif endm macro pixldst2 abits if abits op else op endif endm macro pixldst4 abits if abits op else op endif endm macro pixldst0 abits op endm macro pixldst3 mem_operand op endm macro pixldst30 mem_operand op endm macro pixldst abits if abits elseif abits elseif abits elseif abits elseif abits pixldst0 abits else pixldst0 abits pixldst0 abits pixldst0 abits pixldst0 abits endif elseif abits else pixldst0 abits pixldst0 abits endif elseif abits else error unsupported bpp *numpix else pixst endif endm macro pixld1_s mem_operand if asr adds SRC_WIDTH_FIXED bpl add asl mov asr adds SRC_WIDTH_FIXED bpl add asl mov asr adds SRC_WIDTH_FIXED bpl add asl mov asr adds SRC_WIDTH_FIXED bpl add asl elseif asr adds SRC_WIDTH_FIXED bpl add asl mov asr adds SRC_WIDTH_FIXED bpl add asl else error unsupported endif endm macro pixld2_s mem_operand if mov asr add asl add asl mov asr sub UNIT_X add asl mov asr add asl add asl mov asr add UNIT_X add asl else pixld1_s mem_operand pixld1_s mem_operand endif endm macro pixld0_s mem_operand if asr adds SRC_WIDTH_FIXED bpl add asl elseif asr adds SRC_WIDTH_FIXED bpl add asl endif endm macro pixld_s_internal mem_operand if mem_operand pixld2_s mem_operand pixdeinterleave basereg elseif mem_operand elseif mem_operand elseif mem_operand elseif mem_operand pixld0_s mem_operand else pixld0_s mem_operand pixld0_s mem_operand pixld0_s mem_operand pixld0_s mem_operand endif elseif mem_operand else pixld0_s mem_operand pixld0_s mem_operand endif elseif mem_operand else error unsupported mem_operand if bpp mem_operand endif endm macro vuzp8 reg2 vuzp d d &reg2 endm macro vzip8 reg2 vzip d d &reg2 endm macro pixdeinterleave basereg basereg basereg basereg basereg endif endm macro pixinterleave basereg basereg basereg basereg basereg endif endm macro PF boost_increment endif if endif PF tst PF addne PF subne PF cmp ORIG_W if endif if endif if endif PF subge ORIG_W PF subges if endif if endif if endif endif endm macro cache_preload_simple endif if dst_r_bpp pld[DST_R, #(PREFETCH_DISTANCE_SIMPLE *dst_r_bpp/8)] endif if mask_bpp pld SRC[MASK, #(PREFETCH_DISTANCE_SIMPLE *mask_bpp/8)] endif endif endm macro fetch_mask_pixblock pixld mask_basereg pixblock_size MASK endm macro ensure_destination_ptr_alignment process_pixblock_tail_head if beq irp skip1 beq endif SRC MASK if dst_r_bpp DST_R else add endif PF add sub src_basereg pixdeinterleave mask_basereg pixdeinterleave dst_r_basereg process_pixblock_head pixblock_size cache_preload_simple process_pixblock_tail pixinterleave dst_w_basereg irp beq endif process_pixblock_tail_head tst beq irp if pixblock_size chunk_size tst beq pixld_src SRC pixld MASK if DST_R else pixld DST_R endif if src_basereg pixdeinterleave mask_basereg pixdeinterleave dst_r_basereg process_pixblock_head if pixblock_size cache_preload_simple endif process_pixblock_tail pixinterleave dst_w_basereg irp if pixblock_size chunk_size tst beq if DST_W else pixst DST_W else mov ORIG_W endif add lsl if
set set set set set set set macro pixldst1 abits if abits op else op endif endm macro pixldst2 abits if abits op else op endif endm macro pixldst4 abits if abits op else op endif endm macro pixldst0 abits op endm macro pixldst3 mem_operand op endm macro pixldst30 mem_operand op endm macro pixldst abits if abits elseif abits elseif abits elseif abits elseif abits pixldst0 abits else pixldst0 abits pixldst0 abits pixldst0 abits pixldst0 abits endif elseif abits else pixldst0 abits pixldst0 abits endif elseif abits else error unsupported bpp
set set set set set set set set set set set set set set set set set set set set *set set set macro pixldst op &r &cond WK op &r &cond WK op &r &cond WK else op &m &cond &ia op &r &cond WK else op &m &cond &ia elseif elseif else error unsupported base if elseif elseif else error unsupported unaligned pixldst unaligned endm macro pixst base base else pixldst base endif endm macro PF ptr
#define ADD_UN8(x, y, t)
#define MUL_UN8(a, b, t)
#define DIV_UN8(a, b)
#define UN8x4_ADD_UN8x4(x, y)
#define force_inline
static uint32_t over(uint32_t src, uint32_t dest)
#define IMAGE_FLAGS
uint32_t _pixman_image_get_solid(pixman_implementation_t *imp, pixman_image_t *image, pixman_format_code_t format)
Definition: pixman-image.c:946
pixman_implementation_t * _pixman_implementation_create(pixman_implementation_t *fallback, const pixman_fast_path_t *fast_paths)
#define SIMPLE_BILINEAR_FAST_PATH(op, s, d, func)
#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t, dst_type_t, repeat_mode, flags)
#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t, repeat_mode)
#define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t, dst_type_t, repeat_mode, have_mask, mask_is_solid)
#define SIMPLE_NEAREST_FAST_PATH(op, s, d, func)
#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH(op, s, d, func)
#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op, s, d, func)
#define FLAG_HAVE_NON_SOLID_MASK
#define FLAG_NONE
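The fast-path macros above expand into entries of a pixman_fast_path_t table, which a backend hands to _pixman_implementation_create together with its per-operator hooks. A hedged sketch of that registration pattern follows; everything prefixed example_ is a placeholder, and the exact set of paths registered by this file differs.

/* A hedged sketch, assuming pixman's internal headers; names prefixed
 * "example_" are placeholders and do not exist in this file. */
#include "pixman-private.h"

static const pixman_fast_path_t example_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, example_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (ADD,  a8,       null, a8,     example_composite_add_8_8),
    { PIXMAN_OP_NONE },                     /* sentinel ending the table */
};

pixman_implementation_t *
example_implementation_create (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp =
        _pixman_implementation_create (fallback, example_fast_paths);

    /* Per-operator combiners plus block-transfer and fill hooks. */
    imp->combine_32[PIXMAN_OP_OVER] = example_combine_over_u;
    imp->blt  = example_blt;
    imp->fill = example_fill;

    return imp;
}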
#define t0
#define t1
#define t2
void _pixman_iter_init_bits_stride(pixman_iter_t *iter, const pixman_iter_info_t *info)
Definition: pixman-utils.c:227
#define PIXMAN_STD_FAST_PATH_CA(op, src, mask, dest, func)
static uint32_t convert_0565_to_8888(uint16_t s)
@ ITER_NARROW
static uint16_t convert_8888_to_0565(uint32_t s)
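The two converters translate between 16-bit r5g6b5 and 32-bit 8888 pixels. Below is a scalar sketch of the standard conversion, replicating the high bits on expansion so that full-scale channel values stay full scale; the exact pixman expressions may be written differently.

#include <stdint.h>

static uint32_t convert_0565_to_8888_sketch (uint16_t s)
{
    uint32_t r = (s >> 11) & 0x1f;
    uint32_t g = (s >>  5) & 0x3f;
    uint32_t b =  s        & 0x1f;

    r = (r << 3) | (r >> 2);    /* replicate the high bits into the low bits */
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return 0xff000000 | (r << 16) | (g << 8) | b;   /* opaque alpha */
}

static uint16_t convert_8888_to_0565_sketch (uint32_t s)
{
    return (uint16_t) (((s >> 8) & 0xf800) |    /* top 5 bits of red   */
                       ((s >> 5) & 0x07e0) |    /* top 6 bits of green */
                       ((s >> 3) & 0x001f));    /* top 5 bits of blue  */
}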
#define PIXMAN_STD_FAST_PATH(op, src, mask, dest, func)
#define PIXMAN_null
#define PIXMAN_IMAGE_GET_LINE(image, x, y, type, out_stride, line, mul)
#define PIXMAN_COMPOSITE_ARGS(info)
#define BILINEAR_DECLARE_VARIABLES
Definition: pixman-sse2.c:5615
#define BILINEAR_SKIP_ONE_PIXEL()
Definition: pixman-sse2.c:5677
#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)
Definition: pixman-sse2.c:5656
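BILINEAR_INTERPOLATE_ONE_PIXEL blends the four source pixels around a fractional sample position. The scalar sketch below shows the per-channel weighting it performs; the weight precision is an assumption (pixman parametrises it as BILINEAR_INTERPOLATION_BITS).

#include <stdint.h>

#define BILINEAR_BITS 7                      /* assumed weight precision */
#define BILINEAR_ONE  (1 << BILINEAR_BITS)

/* Weighted average of the four neighbours tl, tr, bl, br for one pixel.
 * wx and wy are the fractional x/y distances in 0 .. BILINEAR_ONE. */
static uint32_t bilinear_one_pixel (uint32_t tl, uint32_t tr,
                                    uint32_t bl, uint32_t br,
                                    int wx, int wy)
{
    uint32_t res = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint32_t t = ((tl >> shift) & 0xff) * (BILINEAR_ONE - wx)
                   + ((tr >> shift) & 0xff) * wx;
        uint32_t b = ((bl >> shift) & 0xff) * (BILINEAR_ONE - wx)
                   + ((br >> shift) & 0xff) * wx;
        uint32_t c = (t * (BILINEAR_ONE - wy) + b * wy)
                     >> (2 * BILINEAR_BITS);
        res |= c << shift;
    }
    return res;
}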
static int is_opaque(__m128i x)
Definition: pixman-sse2.c:176
static vector unsigned int pix_add_mul(vector unsigned int x, vector unsigned int a, vector unsigned int y, vector unsigned int b)
Definition: pixman-vmx.c:140
#define in_over(src, srca, mask, dest)
Definition: pixman-vmx.c:173
static vector unsigned int pix_multiply(vector unsigned int p, vector unsigned int a)
Definition: pixman-vmx.c:71
static vector unsigned int pix_add(vector unsigned int a, vector unsigned int b)
Definition: pixman-vmx.c:133
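pix_multiply, pix_add and in_over implement the masked OVER step used by many composite routines: multiply the source and its expanded alpha by the mask, then composite onto the destination with a saturating add. The scalar sketch below mirrors that structure; the function names with a _sketch suffix are illustrative.

#include <stdint.h>

/* Per-channel rounded multiply of two packed ARGB32 values
 * (scalar counterpart of pix_multiply). */
static uint32_t pix_multiply_sketch (uint32_t p, uint32_t a)
{
    uint32_t res = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint16_t t = ((p >> shift) & 0xff) * ((a >> shift) & 0xff) + 0x80;
        res |= (uint32_t) ((uint8_t) ((t + (t >> 8)) >> 8)) << shift;
    }
    return res;
}

/* in_over: mask the source and its expanded alpha, then OVER onto dest.
 * Assumed structure, mirroring the vector helpers listed above. */
static uint32_t in_over_sketch (uint32_t src, uint32_t srca,
                                uint32_t mask, uint32_t dest)
{
    uint32_t s   = pix_multiply_sketch (src,  mask);
    uint32_t sa  = pix_multiply_sketch (srca, mask);
    uint32_t d   = pix_multiply_sketch (dest, ~sa);
    uint32_t res = 0;
    int shift;

    /* Saturating per-channel add, like pix_add. */
    for (shift = 0; shift < 32; shift += 8)
    {
        uint32_t v = ((s >> shift) & 0xff) + ((d >> shift) & 0xff);
        res |= (v > 0xff ? 0xffu : v) << shift;
    }
    return res;
}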
#define PIXMAN_FORMAT_BPP(f)
Definition: pixman.h:837
pixman_fixed_16_16_t pixman_fixed_t
Definition: pixman.h:123
int pixman_bool_t
Definition: pixman.h:113
#define pixman_fixed_to_int(f)
Definition: pixman.h:129
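pixman_fixed_t is a 16.16 fixed-point value used for source coordinates and per-pixel increments in the scaled fast paths. A minimal sketch of the conversions, assuming the usual 16.16 layout:

#include <stdint.h>

typedef int32_t fixed_16_16_t;          /* 16 integer bits, 16 fractional bits */

#define FIXED_ONE        (1 << 16)
#define int_to_fixed(i)  ((fixed_16_16_t) ((uint32_t) (i) << 16))
#define fixed_to_int(f)  ((int) ((f) >> 16))
#define fixed_frac(f)    ((f) & (FIXED_ONE - 1))

/* Example: walking a source row with a per-pixel increment unit_x. */
static int source_index (fixed_16_16_t vx, fixed_16_16_t unit_x, int i)
{
    return fixed_to_int (vx + i * unit_x);
}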
pixman_op_t
Definition: pixman.h:389
@ PIXMAN_OP_OVER_REVERSE
Definition: pixman.h:394
@ PIXMAN_OP_OUT_REVERSE
Definition: pixman.h:398
@ PIXMAN_OP_SATURATE
Definition: pixman.h:403
@ PIXMAN_OP_ATOP
Definition: pixman.h:399
@ PIXMAN_OP_SRC
Definition: pixman.h:391
@ PIXMAN_OP_OVER
Definition: pixman.h:393
@ PIXMAN_OP_IN_REVERSE
Definition: pixman.h:396
@ PIXMAN_OP_OUT
Definition: pixman.h:397
@ PIXMAN_OP_IN
Definition: pixman.h:395
@ PIXMAN_OP_XOR
Definition: pixman.h:401
@ PIXMAN_OP_ATOP_REVERSE
Definition: pixman.h:400
@ PIXMAN_OP_ADD
Definition: pixman.h:402
@ PIXMAN_x8r8g8b8
Definition: pixman.h:879
@ PIXMAN_a8
Definition: pixman.h:913
@ PIXMAN_r5g6b5
Definition: pixman.h:900
const pixman_iter_info_t * iter_info
pixman_combine_32_func_t combine_32_ca[PIXMAN_N_OPERATORS]
pixman_combine_32_func_t combine_32[PIXMAN_N_OPERATORS]
pixman_blt_func_t blt
pixman_fill_func_t fill
uint8_t * bits
uint32_t * buffer
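iter_info, combine_32, blt and fill are the hooks a backend installs on its pixman_implementation_t, while bits and buffer belong to the scanline iterators set up by _pixman_iter_init_bits_stride. The table below is a hedged sketch of the kind of pixman_iter_info_t array iter_info points at; the field order is assumed from pixman-private.h, IMAGE_FLAGS refers to the file-local define listed earlier, and the fetch routine name is a placeholder.

/* A hedged sketch, assuming pixman's internal headers; example_fetch_x8r8g8b8
 * is a placeholder scanline fetcher. */
#include "pixman-private.h"

static const pixman_iter_info_t example_iters[] =
{
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride,   /* points iter->bits at the image scanline */
      example_fetch_x8r8g8b8,          /* converts one scanline into iter->buffer */
      NULL },
    { PIXMAN_null },                   /* sentinel */
};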