w32tex
About: TeX Live provides a comprehensive TeX system, including all the major TeX-related programs, macro packages, and fonts that are free software. These are the Windows sources.
  Fossies Dox: w32tex-src.tar.xz  (unofficial and experimental Doxygen-generated source code documentation)

loongson-mmintrin.h
/* The gcc-provided Loongson intrinsic functions are far too broken to be
 * of any use, otherwise I'd use them.
 *
 * - The hardware instructions are very similar to MMX or iwMMXt. Certainly
 *   close enough that they could have implemented the _mm_*-style intrinsic
 *   interface and had a ton of optimized code available to them. Instead
 *   they implemented something much, much worse.
 *
 * - pshuf takes a dead first argument, causing extra instructions to be
 *   generated.
 *
 * - There are no 64-bit shift or logical intrinsics, which means you have
 *   to implement them with inline assembly, but this is a nightmare because
 *   gcc doesn't understand that the integer vector datatypes are actually in
 *   floating-point registers, so you end up with braindead code like
 *
 *       punpcklwd $f9,$f9,$f5
 *       dmtc1     v0,$f8
 *       punpcklwd $f19,$f19,$f5
 *       dmfc1     t9,$f9
 *       dmtc1     v0,$f9
 *       dmtc1     t9,$f20
 *       dmfc1     s0,$f19
 *       punpcklbh $f20,$f20,$f2
 *
 * where values just get copied back and forth between the integer and
 * floating-point register files ad nauseam.
 *
 * Instead of trying to work around the problems of these broken intrinsics,
 * I just implement the _mm_* intrinsics needed for pixman-mmx.c using
 * inline assembly.
 */
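
/* A hedged usage sketch, not part of the original header: written against
 * this MMX-style interface, pixel arithmetic compiles to single Loongson
 * MMI instructions. The helper name below is invented; a saturating add of
 * eight packed bytes becomes one paddusb:
 *
 *     static __m64 add_saturate (__m64 src, __m64 dst)
 *     {
 *         return _mm_adds_pu8 (src, dst);
 *     }
 */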

#include <stdint.h>

/* vectors are stored in 64-bit floating-point registers */
typedef double __m64;
/* having a 32-bit datatype allows us to use 32-bit loads in places like load8888 */
typedef float __m32;
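
/* Throughout this file, bit patterns are moved between the integer and
 * vector views by type punning through a pointer cast. An illustrative
 * sketch of the idiom (variable names invented):
 *
 *     uint64_t bits = 0x00ff00ff00ff00ffULL;
 *     __m64 mask = *(__m64 *)&bits;
 *
 * This is how the functions below build shuffle immediates and shift counts.
 */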

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
    return 0.0;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("paddh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("paddw %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("paddush %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("paddusb %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("and %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("pcmpeqw %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
    /* no-op: there is no EMMS-style state to clear on Loongson */
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("pmaddhw %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("pmulhuh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("pmullh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("or %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("packushb %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("packsswh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
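
/* For example, _MM_SHUFFLE (0, 1, 2, 3) evaluates to 0x1b and, used as a
 * pshufh immediate, reverses the four halfwords, while _MM_SHUFFLE (0, 0, 0, 0)
 * evaluates to 0 and replicates halfword 0 into all four lanes. */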
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0)
{
    if (__builtin_constant_p (__w3) &&
        __builtin_constant_p (__w2) &&
        __builtin_constant_p (__w1) &&
        __builtin_constant_p (__w0))
    {
        uint64_t val = ((uint64_t)__w3 << 48)
                     | ((uint64_t)__w2 << 32)
                     | ((uint64_t)__w1 << 16)
                     | ((uint64_t)__w0 <<  0);
        return *(__m64 *)&val;
    }
    else if (__w3 == __w2 && __w2 == __w1 && __w1 == __w0)
    {
        /* TODO: handle other cases */
        uint64_t val = __w3;
        uint64_t imm = _MM_SHUFFLE (0, 0, 0, 0);
        __m64 ret;
        asm("pshufh %0, %1, %2\n\t"
           : "=f" (ret)
           : "f" (*(__m64 *)&val), "f" (*(__m64 *)&imm)
        );
        return ret;
    } else {
        uint64_t val = ((uint64_t)__w3 << 48)
                     | ((uint64_t)__w2 << 32)
                     | ((uint64_t)__w1 << 16)
                     | ((uint64_t)__w0 <<  0);
        return *(__m64 *)&val;
    }
}
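
/* A note on the three branches above: fully constant arguments fold to a
 * literal bit pattern at compile time; equal but non-constant arguments are
 * splatted with a single pshufh; anything else is assembled in an integer
 * register and type-punned across. A hedged example (mask name invented):
 *
 *     __m64 alpha_mask = _mm_set_pi16 (0x00ff, 0x00ff, 0x00ff, 0x00ff);
 */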

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (unsigned __i1, unsigned __i0)
{
    if (__builtin_constant_p (__i1) &&
        __builtin_constant_p (__i0))
    {
        uint64_t val = ((uint64_t)__i1 << 32)
                     | ((uint64_t)__i0 <<  0);
        return *(__m64 *)&val;
    }
    else if (__i1 == __i0)
    {
        uint64_t imm = _MM_SHUFFLE (1, 0, 1, 0);
        __m64 ret;
        asm("pshufh %0, %1, %2\n\t"
           : "=f" (ret)
           : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
        );
        return ret;
    } else {
        uint64_t val = ((uint64_t)__i1 << 32)
                     | ((uint64_t)__i0 <<  0);
        return *(__m64 *)&val;
    }
}
#undef _MM_SHUFFLE

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __m, int64_t __n)
{
    __m64 ret;
    asm("pshufh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__n)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("psllh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("dsll %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("psrlh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("psrlw %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("dsrl %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}
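
/* The dsll/dsrl wrappers above provide the whole-register 64-bit shifts that,
 * per the rationale at the top of this file, gcc's Loongson builtins lack.
 * A hedged sketch (helper name invented) that shifts the high 32 bits of a
 * vector down into the low word:
 *
 *     static __m64 high_word (__m64 v)
 *     {
 *         return _mm_srli_si64 (v, 32);
 *     }
 */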

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("psubh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpckhbh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpckhhw %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpcklbh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* Since punpcklbh doesn't care about the high 32 bits, we use the __m32
 * datatype, which allows load8888 to use 32-bit loads */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpcklbh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}
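
/* A hedged illustration of the comment above (pixman-mmx.c's real load8888
 * may differ in detail): because _mm_unpacklo_pi8_f takes an __m32 first
 * argument, expanding one packed a8r8g8b8 pixel into four 16-bit channels
 * needs only a 32-bit load:
 *
 *     static __m64 load8888 (const uint32_t *p)
 *     {
 *         return _mm_unpacklo_pi8_f (*(__m32 *)p, _mm_setzero_si64 ());
 *     }
 */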

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpcklhw %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("xor %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
loongson_extract_pi16 (__m64 __m, int64_t __pos)
{
    __m64 ret;
    asm("pextrh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__pos)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
loongson_insert_pi16 (__m64 __m1, __m64 __m2, int64_t __pos)
{
    __m64 ret;
    asm("pinsrh_%3 %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2), "i" (__pos)
    );
    return ret;
}
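
/* Putting the pieces together, a hedged end-to-end sketch (not part of the
 * original header; the function name is invented): the typical pixman-style
 * pattern expands bytes to halfwords, does 16-bit arithmetic, and packs the
 * result back with unsigned saturation.
 *
 *     static __m64 brighten_x2 (__m64 pixels)
 *     {
 *         __m64 zero = _mm_setzero_si64 ();
 *         __m64 lo   = _mm_unpacklo_pi8 (pixels, zero);
 *         __m64 hi   = _mm_unpackhi_pi8 (pixels, zero);
 *         lo = _mm_add_pi16 (lo, lo);   // double each low-half channel
 *         hi = _mm_add_pi16 (hi, hi);   // double each high-half channel
 *         return _mm_packs_pu16 (lo, hi);
 *     }
 */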