"Fossies" - the Fresh Open Source Software Archive 
Member "qt-everywhere-src-6.3.1/qtwebengine/src/3rdparty/chromium/third_party/zlib/patches/0001-simd.patch" (8 Jun 2022, 39766 Bytes) of package /linux/misc/qt-everywhere-src-6.3.1.tar.xz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Diff source code syntax highlighting (style:
standard) with prefixed line numbers.
Alternatively, you can view or download the uninterpreted source code file here.
1 diff --git a/crc32.c b/crc32.c
2 index 9580440c0e6b..9162429cc7b4 100644
3 --- a/crc32.c
4 +++ b/crc32.c
5 @@ -28,6 +28,8 @@
6 # endif /* !DYNAMIC_CRC_TABLE */
7 #endif /* MAKECRCH */
8
9 +#include "deflate.h"
10 +#include "x86.h"
11 #include "zutil.h" /* for STDC and FAR definitions */
12
13 /* Definitions for doing the crc four data bytes at a time. */
14 @@ -440,3 +442,28 @@ uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
15 {
16 return crc32_combine_(crc1, crc2, len2);
17 }
18 +
19 +ZLIB_INTERNAL void crc_reset(deflate_state *const s)
20 +{
21 + if (x86_cpu_enable_simd) {
22 + crc_fold_init(s);
23 + return;
24 + }
25 + s->strm->adler = crc32(0L, Z_NULL, 0);
26 +}
27 +
28 +ZLIB_INTERNAL void crc_finalize(deflate_state *const s)
29 +{
30 + if (x86_cpu_enable_simd)
31 + s->strm->adler = crc_fold_512to32(s);
32 +}
33 +
34 +ZLIB_INTERNAL void copy_with_crc(z_streamp strm, Bytef *dst, long size)
35 +{
36 + if (x86_cpu_enable_simd) {
37 + crc_fold_copy(strm->state, dst, strm->next_in, size);
38 + return;
39 + }
40 + zmemcpy(dst, strm->next_in, size);
41 + strm->adler = crc32(strm->adler, dst, size);
42 +}
43 diff --git a/crc_folding.c b/crc_folding.c
44 new file mode 100644
45 index 000000000000..48d77744aaf4
46 --- /dev/null
47 +++ b/crc_folding.c
48 @@ -0,0 +1,493 @@
49 +/*
50 + * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
51 + * instruction.
52 + *
53 + * A white paper describing this algorithm can be found at:
54 + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
55 + *
56 + * Copyright (C) 2013 Intel Corporation. All rights reserved.
57 + * Authors:
58 + * Wajdi Feghali <wajdi.k.feghali@intel.com>
59 + * Jim Guilford <james.guilford@intel.com>
60 + * Vinodh Gopal <vinodh.gopal@intel.com>
61 + * Erdinc Ozturk <erdinc.ozturk@intel.com>
62 + * Jim Kukunas <james.t.kukunas@linux.intel.com>
63 + *
64 + * For conditions of distribution and use, see copyright notice in zlib.h
65 + */
66 +
67 +#include "deflate.h"
68 +
69 +#include <inttypes.h>
70 +#include <emmintrin.h>
71 +#include <immintrin.h>
72 +#include <wmmintrin.h>
73 +
74 +#define CRC_LOAD(s) \
75 + do { \
76 + __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);\
77 + __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);\
78 + __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);\
79 + __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);\
80 + __m128i xmm_crc_part = _mm_loadu_si128((__m128i *)s->crc0 + 4);
81 +
82 +#define CRC_SAVE(s) \
83 + _mm_storeu_si128((__m128i *)s->crc0 + 0, xmm_crc0);\
84 + _mm_storeu_si128((__m128i *)s->crc0 + 1, xmm_crc1);\
85 + _mm_storeu_si128((__m128i *)s->crc0 + 2, xmm_crc2);\
86 + _mm_storeu_si128((__m128i *)s->crc0 + 3, xmm_crc3);\
87 + _mm_storeu_si128((__m128i *)s->crc0 + 4, xmm_crc_part);\
88 + } while (0);
89 +
90 +ZLIB_INTERNAL void crc_fold_init(deflate_state *const s)
91 +{
92 + CRC_LOAD(s)
93 +
94 + xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
95 + xmm_crc1 = _mm_setzero_si128();
96 + xmm_crc2 = _mm_setzero_si128();
97 + xmm_crc3 = _mm_setzero_si128();
98 +
99 + CRC_SAVE(s)
100 +
101 + s->strm->adler = 0;
102 +}
103 +
104 +local void fold_1(deflate_state *const s,
105 + __m128i *xmm_crc0, __m128i *xmm_crc1,
106 + __m128i *xmm_crc2, __m128i *xmm_crc3)
107 +{
108 + const __m128i xmm_fold4 = _mm_set_epi32(
109 + 0x00000001, 0x54442bd4,
110 + 0x00000001, 0xc6e41596);
111 +
112 + __m128i x_tmp3;
113 + __m128 ps_crc0, ps_crc3, ps_res;
114 +
115 + x_tmp3 = *xmm_crc3;
116 +
117 + *xmm_crc3 = *xmm_crc0;
118 + *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
119 + *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
120 + ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
121 + ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
122 + ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
123 +
124 + *xmm_crc0 = *xmm_crc1;
125 + *xmm_crc1 = *xmm_crc2;
126 + *xmm_crc2 = x_tmp3;
127 + *xmm_crc3 = _mm_castps_si128(ps_res);
128 +}
129 +
130 +local void fold_2(deflate_state *const s,
131 + __m128i *xmm_crc0, __m128i *xmm_crc1,
132 + __m128i *xmm_crc2, __m128i *xmm_crc3)
133 +{
134 + const __m128i xmm_fold4 = _mm_set_epi32(
135 + 0x00000001, 0x54442bd4,
136 + 0x00000001, 0xc6e41596);
137 +
138 + __m128i x_tmp3, x_tmp2;
139 + __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
140 +
141 + x_tmp3 = *xmm_crc3;
142 + x_tmp2 = *xmm_crc2;
143 +
144 + *xmm_crc3 = *xmm_crc1;
145 + *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
146 + *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
147 + ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
148 + ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
149 + ps_res31= _mm_xor_ps(ps_crc3, ps_crc1);
150 +
151 + *xmm_crc2 = *xmm_crc0;
152 + *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
153 + *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
154 + ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
155 + ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
156 + ps_res20= _mm_xor_ps(ps_crc0, ps_crc2);
157 +
158 + *xmm_crc0 = x_tmp2;
159 + *xmm_crc1 = x_tmp3;
160 + *xmm_crc2 = _mm_castps_si128(ps_res20);
161 + *xmm_crc3 = _mm_castps_si128(ps_res31);
162 +}
163 +
164 +local void fold_3(deflate_state *const s,
165 + __m128i *xmm_crc0, __m128i *xmm_crc1,
166 + __m128i *xmm_crc2, __m128i *xmm_crc3)
167 +{
168 + const __m128i xmm_fold4 = _mm_set_epi32(
169 + 0x00000001, 0x54442bd4,
170 + 0x00000001, 0xc6e41596);
171 +
172 + __m128i x_tmp3;
173 + __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
174 +
175 + x_tmp3 = *xmm_crc3;
176 +
177 + *xmm_crc3 = *xmm_crc2;
178 + *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
179 + *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
180 + ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
181 + ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
182 + ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
183 +
184 + *xmm_crc2 = *xmm_crc1;
185 + *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
186 + *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
187 + ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
188 + ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
189 + ps_res21= _mm_xor_ps(ps_crc1, ps_crc2);
190 +
191 + *xmm_crc1 = *xmm_crc0;
192 + *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
193 + *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
194 + ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
195 + ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
196 + ps_res10= _mm_xor_ps(ps_crc0, ps_crc1);
197 +
198 + *xmm_crc0 = x_tmp3;
199 + *xmm_crc1 = _mm_castps_si128(ps_res10);
200 + *xmm_crc2 = _mm_castps_si128(ps_res21);
201 + *xmm_crc3 = _mm_castps_si128(ps_res32);
202 +}
203 +
204 +local void fold_4(deflate_state *const s,
205 + __m128i *xmm_crc0, __m128i *xmm_crc1,
206 + __m128i *xmm_crc2, __m128i *xmm_crc3)
207 +{
208 + const __m128i xmm_fold4 = _mm_set_epi32(
209 + 0x00000001, 0x54442bd4,
210 + 0x00000001, 0xc6e41596);
211 +
212 + __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
213 + __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
214 + __m128 ps_t0, ps_t1, ps_t2, ps_t3;
215 + __m128 ps_res0, ps_res1, ps_res2, ps_res3;
216 +
217 + x_tmp0 = *xmm_crc0;
218 + x_tmp1 = *xmm_crc1;
219 + x_tmp2 = *xmm_crc2;
220 + x_tmp3 = *xmm_crc3;
221 +
222 + *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
223 + x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
224 + ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
225 + ps_t0 = _mm_castsi128_ps(x_tmp0);
226 + ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
227 +
228 + *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
229 + x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
230 + ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
231 + ps_t1 = _mm_castsi128_ps(x_tmp1);
232 + ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
233 +
234 + *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
235 + x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
236 + ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
237 + ps_t2 = _mm_castsi128_ps(x_tmp2);
238 + ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
239 +
240 + *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
241 + x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
242 + ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
243 + ps_t3 = _mm_castsi128_ps(x_tmp3);
244 + ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
245 +
246 + *xmm_crc0 = _mm_castps_si128(ps_res0);
247 + *xmm_crc1 = _mm_castps_si128(ps_res1);
248 + *xmm_crc2 = _mm_castps_si128(ps_res2);
249 + *xmm_crc3 = _mm_castps_si128(ps_res3);
250 +}
251 +
252 +local const unsigned zalign(32) pshufb_shf_table[60] = {
253 + 0x84838281,0x88878685,0x8c8b8a89,0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
254 + 0x85848382,0x89888786,0x8d8c8b8a,0x01008f8e, /* shl 14 (16 - 3)/shr2 */
255 + 0x86858483,0x8a898887,0x8e8d8c8b,0x0201008f, /* shl 13 (16 - 4)/shr3 */
256 + 0x87868584,0x8b8a8988,0x8f8e8d8c,0x03020100, /* shl 12 (16 - 4)/shr4 */
257 + 0x88878685,0x8c8b8a89,0x008f8e8d,0x04030201, /* shl 11 (16 - 5)/shr5 */
258 + 0x89888786,0x8d8c8b8a,0x01008f8e,0x05040302, /* shl 10 (16 - 6)/shr6 */
259 + 0x8a898887,0x8e8d8c8b,0x0201008f,0x06050403, /* shl 9 (16 - 7)/shr7 */
260 + 0x8b8a8988,0x8f8e8d8c,0x03020100,0x07060504, /* shl 8 (16 - 8)/shr8 */
261 + 0x8c8b8a89,0x008f8e8d,0x04030201,0x08070605, /* shl 7 (16 - 9)/shr9 */
262 + 0x8d8c8b8a,0x01008f8e,0x05040302,0x09080706, /* shl 6 (16 -10)/shr10*/
263 + 0x8e8d8c8b,0x0201008f,0x06050403,0x0a090807, /* shl 5 (16 -11)/shr11*/
264 + 0x8f8e8d8c,0x03020100,0x07060504,0x0b0a0908, /* shl 4 (16 -12)/shr12*/
265 + 0x008f8e8d,0x04030201,0x08070605,0x0c0b0a09, /* shl 3 (16 -13)/shr13*/
266 + 0x01008f8e,0x05040302,0x09080706,0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/
267 + 0x0201008f,0x06050403,0x0a090807,0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
268 +};
269 +
270 +local void partial_fold(deflate_state *const s, const size_t len,
271 + __m128i *xmm_crc0, __m128i *xmm_crc1,
272 + __m128i *xmm_crc2, __m128i *xmm_crc3,
273 + __m128i *xmm_crc_part)
274 +{
275 +
276 + const __m128i xmm_fold4 = _mm_set_epi32(
277 + 0x00000001, 0x54442bd4,
278 + 0x00000001, 0xc6e41596);
279 + const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
280 +
281 + __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
282 + __m128i xmm_a0_0, xmm_a0_1;
283 + __m128 ps_crc3, psa0_0, psa0_1, ps_res;
284 +
285 + xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
286 + xmm_shr = xmm_shl;
287 + xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
288 +
289 + xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
290 +
291 + *xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
292 + xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
293 + *xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
294 +
295 + *xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
296 + xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
297 + *xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
298 +
299 + *xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
300 + xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
301 + *xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
302 +
303 + *xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
304 + *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
305 + *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
306 +
307 + xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
308 + xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
309 +
310 + ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
311 + psa0_0 = _mm_castsi128_ps(xmm_a0_0);
312 + psa0_1 = _mm_castsi128_ps(xmm_a0_1);
313 +
314 + ps_res = _mm_xor_ps(ps_crc3, psa0_0);
315 + ps_res = _mm_xor_ps(ps_res, psa0_1);
316 +
317 + *xmm_crc3 = _mm_castps_si128(ps_res);
318 +}
319 +
320 +ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s,
321 + unsigned char *dst, const unsigned char *src, long len)
322 +{
323 + unsigned long algn_diff;
324 + __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
325 +
326 + CRC_LOAD(s)
327 +
328 + if (len < 16) {
329 + if (len == 0)
330 + return;
331 + goto partial;
332 + }
333 +
334 + algn_diff = 0 - (uintptr_t)src & 0xF;
335 + if (algn_diff) {
336 + xmm_crc_part = _mm_loadu_si128((__m128i *)src);
337 + _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
338 +
339 + dst += algn_diff;
340 + src += algn_diff;
341 + len -= algn_diff;
342 +
343 + partial_fold(s, algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
344 + &xmm_crc_part);
345 + }
346 +
347 + while ((len -= 64) >= 0) {
348 + xmm_t0 = _mm_load_si128((__m128i *)src);
349 + xmm_t1 = _mm_load_si128((__m128i *)src + 1);
350 + xmm_t2 = _mm_load_si128((__m128i *)src + 2);
351 + xmm_t3 = _mm_load_si128((__m128i *)src + 3);
352 +
353 + fold_4(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
354 +
355 + _mm_storeu_si128((__m128i *)dst, xmm_t0);
356 + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
357 + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
358 + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
359 +
360 + xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
361 + xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
362 + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
363 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
364 +
365 + src += 64;
366 + dst += 64;
367 + }
368 +
369 + /*
370 + * len = num bytes left - 64
371 + */
372 + if (len + 16 >= 0) {
373 + len += 16;
374 +
375 + xmm_t0 = _mm_load_si128((__m128i *)src);
376 + xmm_t1 = _mm_load_si128((__m128i *)src + 1);
377 + xmm_t2 = _mm_load_si128((__m128i *)src + 2);
378 +
379 + fold_3(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
380 +
381 + _mm_storeu_si128((__m128i *)dst, xmm_t0);
382 + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
383 + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
384 +
385 + xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
386 + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
387 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
388 +
389 + if (len == 0)
390 + goto done;
391 +
392 + dst += 48;
393 + src += 48;
394 + } else if (len + 32 >= 0) {
395 + len += 32;
396 +
397 + xmm_t0 = _mm_load_si128((__m128i *)src);
398 + xmm_t1 = _mm_load_si128((__m128i *)src + 1);
399 +
400 + fold_2(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
401 +
402 + _mm_storeu_si128((__m128i *)dst, xmm_t0);
403 + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
404 +
405 + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
406 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
407 +
408 + if (len == 0)
409 + goto done;
410 +
411 + dst += 32;
412 + src += 32;
413 + } else if (len + 48 >= 0) {
414 + len += 48;
415 +
416 + xmm_t0 = _mm_load_si128((__m128i *)src);
417 +
418 + fold_1(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
419 +
420 + _mm_storeu_si128((__m128i *)dst, xmm_t0);
421 +
422 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
423 +
424 + if (len == 0)
425 + goto done;
426 +
427 + dst += 16;
428 + src += 16;
429 + } else {
430 + len += 64;
431 + if (len == 0)
432 + goto done;
433 + }
434 +
435 +partial:
436 +
437 +#if defined(_MSC_VER)
438 + /* VS does not permit the use of _mm_set_epi64x in 32-bit builds */
439 + {
440 + int32_t parts[4] = {0, 0, 0, 0};
441 + memcpy(&parts, src, len);
442 + xmm_crc_part = _mm_set_epi32(parts[3], parts[2], parts[1], parts[0]);
443 + }
444 +#else
445 + {
446 + int64_t parts[2] = {0, 0};
447 + memcpy(&parts, src, len);
448 + xmm_crc_part = _mm_set_epi64x(parts[1], parts[0]);
449 + }
450 +#endif
451 +
452 + _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
453 + partial_fold(s, len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
454 + &xmm_crc_part);
455 +done:
456 + CRC_SAVE(s)
457 +}
458 +
459 +local const unsigned zalign(16) crc_k[] = {
460 + 0xccaa009e, 0x00000000, /* rk1 */
461 + 0x751997d0, 0x00000001, /* rk2 */
462 + 0xccaa009e, 0x00000000, /* rk5 */
463 + 0x63cd6124, 0x00000001, /* rk6 */
464 + 0xf7011640, 0x00000001, /* rk7 */
465 + 0xdb710640, 0x00000001 /* rk8 */
466 +};
467 +
468 +local const unsigned zalign(16) crc_mask[4] = {
469 + 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
470 +};
471 +
472 +local const unsigned zalign(16) crc_mask2[4] = {
473 + 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
474 +};
475 +
476 +unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s)
477 +{
478 + const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
479 + const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
480 +
481 + unsigned crc;
482 + __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
483 +
484 + CRC_LOAD(s)
485 +
486 + /*
487 + * k1
488 + */
489 + crc_fold = _mm_load_si128((__m128i *)crc_k);
490 +
491 + x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
492 + xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
493 + xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
494 + xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
495 +
496 + x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
497 + xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
498 + xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
499 + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
500 +
501 + x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
502 + xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
503 + xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
504 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
505 +
506 + /*
507 + * k5
508 + */
509 + crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
510 +
511 + xmm_crc0 = xmm_crc3;
512 + xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
513 + xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
514 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
515 +
516 + xmm_crc0 = xmm_crc3;
517 + xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
518 + xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
519 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
520 + xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
521 +
522 + /*
523 + * k7
524 + */
525 + xmm_crc1 = xmm_crc3;
526 + xmm_crc2 = xmm_crc3;
527 + crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
528 +
529 + xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
530 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
531 + xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
532 +
533 + xmm_crc2 = xmm_crc3;
534 + xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
535 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
536 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
537 +
538 + crc = _mm_extract_epi32(xmm_crc3, 2);
539 + return ~crc;
540 + CRC_SAVE(s)
541 +}
542 diff --git a/deflate.c b/deflate.c
543 index 1ec761448de9..aa0c9c67a6dc 100644
544 --- a/deflate.c
545 +++ b/deflate.c
546 @@ -48,8 +48,9 @@
547 */
548
549 /* @(#) $Id$ */
550 -
551 +#include <assert.h>
552 #include "deflate.h"
553 +#include "x86.h"
554
555 const char deflate_copyright[] =
556 " deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler ";
557 @@ -86,7 +87,7 @@ local block_state deflate_huff OF((deflate_state *s, int flush));
558 local void lm_init OF((deflate_state *s));
559 local void putShortMSB OF((deflate_state *s, uInt b));
560 local void flush_pending OF((z_streamp strm));
561 -local unsigned read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
562 +unsigned ZLIB_INTERNAL deflate_read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
563 #ifdef ASMV
564 # pragma message("Assembler code may have bugs -- use at your own risk")
565 void match_init OF((void)); /* asm code initialization */
566 @@ -100,6 +101,20 @@ local void check_match OF((deflate_state *s, IPos start, IPos match,
567 int length));
568 #endif
569
570 +/* From crc32.c */
571 +extern void ZLIB_INTERNAL crc_reset(deflate_state *const s);
572 +extern void ZLIB_INTERNAL crc_finalize(deflate_state *const s);
573 +extern void ZLIB_INTERNAL copy_with_crc(z_streamp strm, Bytef *dst, long size);
574 +
575 +#ifdef _MSC_VER
576 +#define INLINE __inline
577 +#else
578 +#define INLINE inline
579 +#endif
580 +
581 +/* Inline optimisation */
582 +local INLINE Pos insert_string_sse(deflate_state *const s, const Pos str);
583 +
584 /* ===========================================================================
585 * Local data
586 */
587 @@ -162,7 +177,6 @@ local const config configuration_table[10] = {
588 */
589 #define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask)
590
591 -
592 /* ===========================================================================
593 * Insert string str in the dictionary and set match_head to the previous head
594 * of the hash chain (the most recent string with same hash key). Return
595 @@ -173,17 +187,28 @@ local const config configuration_table[10] = {
596 * characters and the first MIN_MATCH bytes of str are valid (except for
597 * the last MIN_MATCH-1 bytes of the input file).
598 */
599 +local INLINE Pos insert_string_c(deflate_state *const s, const Pos str)
600 +{
601 + Pos ret;
602 +
603 + UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]);
604 #ifdef FASTEST
605 -#define INSERT_STRING(s, str, match_head) \
606 - (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
607 - match_head = s->head[s->ins_h], \
608 - s->head[s->ins_h] = (Pos)(str))
609 + ret = s->head[s->ins_h];
610 #else
611 -#define INSERT_STRING(s, str, match_head) \
612 - (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
613 - match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h], \
614 - s->head[s->ins_h] = (Pos)(str))
615 + ret = s->prev[str & s->w_mask] = s->head[s->ins_h];
616 #endif
617 + s->head[s->ins_h] = str;
618 +
619 + return ret;
620 +}
621 +
622 +local INLINE Pos insert_string(deflate_state *const s, const Pos str)
623 +{
624 + if (x86_cpu_enable_simd)
625 + return insert_string_sse(s, str);
626 + return insert_string_c(s, str);
627 +}
628 +
629
630 /* ===========================================================================
631 * Initialize the hash table (avoiding 64K overflow for 16 bit systems).
632 @@ -248,6 +273,7 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
633 const char *version;
634 int stream_size;
635 {
636 + unsigned window_padding = 8;
637 deflate_state *s;
638 int wrap = 1;
639 static const char my_version[] = ZLIB_VERSION;
640 @@ -257,6 +283,8 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
641 * output size for (length,distance) codes is <= 24 bits.
642 */
643
644 + x86_check_features();
645 +
646 if (version == Z_NULL || version[0] != my_version[0] ||
647 stream_size != sizeof(z_stream)) {
648 return Z_VERSION_ERROR;
649 @@ -313,12 +341,19 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
650 s->w_size = 1 << s->w_bits;
651 s->w_mask = s->w_size - 1;
652
653 - s->hash_bits = (uInt)memLevel + 7;
654 + if (x86_cpu_enable_simd) {
655 + s->hash_bits = 15;
656 + } else {
657 + s->hash_bits = memLevel + 7;
658 + }
659 +
660 s->hash_size = 1 << s->hash_bits;
661 s->hash_mask = s->hash_size - 1;
662 s->hash_shift = ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH);
663
664 - s->window = (Bytef *) ZALLOC(strm, s->w_size, 2*sizeof(Byte));
665 + s->window = (Bytef *) ZALLOC(strm,
666 + s->w_size + window_padding,
667 + 2*sizeof(Byte));
668 s->prev = (Posf *) ZALLOC(strm, s->w_size, sizeof(Pos));
669 s->head = (Posf *) ZALLOC(strm, s->hash_size, sizeof(Pos));
670
671 @@ -418,11 +453,7 @@ int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength)
672 str = s->strstart;
673 n = s->lookahead - (MIN_MATCH-1);
674 do {
675 - UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]);
676 -#ifndef FASTEST
677 - s->prev[str & s->w_mask] = s->head[s->ins_h];
678 -#endif
679 - s->head[s->ins_h] = (Pos)str;
680 + insert_string(s, str);
681 str++;
682 } while (--n);
683 s->strstart = str;
684 @@ -848,7 +879,7 @@ int ZEXPORT deflate (strm, flush)
685 #ifdef GZIP
686 if (s->status == GZIP_STATE) {
687 /* gzip header */
688 - strm->adler = crc32(0L, Z_NULL, 0);
689 + crc_reset(s);
690 put_byte(s, 31);
691 put_byte(s, 139);
692 put_byte(s, 8);
693 @@ -1049,6 +1080,7 @@ int ZEXPORT deflate (strm, flush)
694 /* Write the trailer */
695 #ifdef GZIP
696 if (s->wrap == 2) {
697 + crc_finalize(s);
698 put_byte(s, (Byte)(strm->adler & 0xff));
699 put_byte(s, (Byte)((strm->adler >> 8) & 0xff));
700 put_byte(s, (Byte)((strm->adler >> 16) & 0xff));
701 @@ -1161,7 +1193,7 @@ int ZEXPORT deflateCopy (dest, source)
702 * allocating a large strm->next_in buffer and copying from it.
703 * (See also flush_pending()).
704 */
705 -local unsigned read_buf(strm, buf, size)
706 +ZLIB_INTERNAL unsigned deflate_read_buf(strm, buf, size)
707 z_streamp strm;
708 Bytef *buf;
709 unsigned size;
710 @@ -1173,15 +1205,16 @@ local unsigned read_buf(strm, buf, size)
711
712 strm->avail_in -= len;
713
714 - zmemcpy(buf, strm->next_in, len);
715 - if (strm->state->wrap == 1) {
716 - strm->adler = adler32(strm->adler, buf, len);
717 - }
718 #ifdef GZIP
719 - else if (strm->state->wrap == 2) {
720 - strm->adler = crc32(strm->adler, buf, len);
721 - }
722 + if (strm->state->wrap == 2)
723 + copy_with_crc(strm, buf, len);
724 + else
725 #endif
726 + {
727 + zmemcpy(buf, strm->next_in, len);
728 + if (strm->state->wrap == 1)
729 + strm->adler = adler32(strm->adler, buf, len);
730 + }
731 strm->next_in += len;
732 strm->total_in += len;
733
734 @@ -1479,7 +1512,19 @@ local void check_match(s, start, match, length)
735 * performed for at least two bytes (required for the zip translate_eol
736 * option -- not supported here).
737 */
738 -local void fill_window(s)
739 +local void fill_window_c(deflate_state *s);
740 +
741 +local void fill_window(deflate_state *s)
742 +{
743 + if (x86_cpu_enable_simd) {
744 + fill_window_sse(s);
745 + return;
746 + }
747 +
748 + fill_window_c(s);
749 +}
750 +
751 +local void fill_window_c(s)
752 deflate_state *s;
753 {
754 unsigned n;
755 @@ -1847,7 +1892,7 @@ local block_state deflate_fast(s, flush)
756 */
757 hash_head = NIL;
758 if (s->lookahead >= MIN_MATCH) {
759 - INSERT_STRING(s, s->strstart, hash_head);
760 + hash_head = insert_string(s, s->strstart);
761 }
762
763 /* Find the longest match, discarding those <= prev_length.
764 @@ -1878,7 +1923,7 @@ local block_state deflate_fast(s, flush)
765 s->match_length--; /* string at strstart already in table */
766 do {
767 s->strstart++;
768 - INSERT_STRING(s, s->strstart, hash_head);
769 + hash_head = insert_string(s, s->strstart);
770 /* strstart never exceeds WSIZE-MAX_MATCH, so there are
771 * always MIN_MATCH bytes ahead.
772 */
773 @@ -1950,7 +1995,7 @@ local block_state deflate_slow(s, flush)
774 */
775 hash_head = NIL;
776 if (s->lookahead >= MIN_MATCH) {
777 - INSERT_STRING(s, s->strstart, hash_head);
778 + hash_head = insert_string(s, s->strstart);
779 }
780
781 /* Find the longest match, discarding those <= prev_length.
782 @@ -2001,7 +2046,7 @@ local block_state deflate_slow(s, flush)
783 s->prev_length -= 2;
784 do {
785 if (++s->strstart <= max_insert) {
786 - INSERT_STRING(s, s->strstart, hash_head);
787 + hash_head = insert_string(s, s->strstart);
788 }
789 } while (--s->prev_length != 0);
790 s->match_available = 0;
791 @@ -2161,3 +2206,37 @@ local block_state deflate_huff(s, flush)
792 FLUSH_BLOCK(s, 0);
793 return block_done;
794 }
795 +
796 +/* Safe to inline this as GCC/clang will use inline asm and Visual Studio will
797 + * use intrinsic without extra params
798 + */
799 +local INLINE Pos insert_string_sse(deflate_state *const s, const Pos str)
800 +{
801 + Pos ret;
802 + unsigned *ip, val, h = 0;
803 +
804 + ip = (unsigned *)&s->window[str];
805 + val = *ip;
806 +
807 + if (s->level >= 6)
808 + val &= 0xFFFFFF;
809 +
810 +/* Windows clang should use inline asm */
811 +#if defined(_MSC_VER) && !defined(__clang__)
812 + h = _mm_crc32_u32(h, val);
813 +#elif defined(__i386__) || defined(__amd64__)
814 + __asm__ __volatile__ (
815 + "crc32 %1,%0\n\t"
816 + : "+r" (h)
817 + : "r" (val)
818 + );
819 +#else
820 + /* This should never happen */
821 + assert(0);
822 +#endif
823 +
824 + ret = s->head[h & s->hash_mask];
825 + s->head[h & s->hash_mask] = str;
826 + s->prev[str & s->w_mask] = ret;
827 + return ret;
828 +}
829 diff --git a/deflate.h b/deflate.h
830 index 23ecdd312bc0..ab56df7663b6 100644
831 --- a/deflate.h
832 +++ b/deflate.h
833 @@ -109,7 +109,7 @@ typedef struct internal_state {
834 ulg gzindex; /* where in extra, name, or comment */
835 Byte method; /* can only be DEFLATED */
836 int last_flush; /* value of flush param for previous deflate call */
837 -
838 + unsigned zalign(16) crc0[4 * 5];
839 /* used by deflate.c: */
840
841 uInt w_size; /* LZ77 window size (32K by default) */
842 @@ -346,4 +346,14 @@ void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf,
843 flush = _tr_tally(s, distance, length)
844 #endif
845
846 +/* Functions that are SIMD optimised on x86 */
847 +void ZLIB_INTERNAL crc_fold_init(deflate_state* const s);
848 +void ZLIB_INTERNAL crc_fold_copy(deflate_state* const s,
849 + unsigned char* dst,
850 + const unsigned char* src,
851 + long len);
852 +unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state* const s);
853 +
854 +void ZLIB_INTERNAL fill_window_sse(deflate_state* s);
855 +
856 #endif /* DEFLATE_H */
857 diff --git a/fill_window_sse.c b/fill_window_sse.c
858 new file mode 100644
859 index 000000000000..949ccce1ba9c
860 --- /dev/null
861 +++ b/fill_window_sse.c
862 @@ -0,0 +1,177 @@
863 +/*
864 + * Fill Window with SSE2-optimized hash shifting
865 + *
866 + * Copyright (C) 2013 Intel Corporation
867 + * Authors:
868 + * Arjan van de Ven <arjan@linux.intel.com>
869 + * Jim Kukunas <james.t.kukunas@linux.intel.com>
870 + *
871 + * For conditions of distribution and use, see copyright notice in zlib.h
872 + */
873 +
874 +#include <immintrin.h>
875 +#include "deflate.h"
876 +
877 +#define UPDATE_HASH(s,h,i) \
878 + {\
879 + if (s->level < 6) { \
880 + h = (3483 * (s->window[i]) +\
881 + 23081* (s->window[i+1]) +\
882 + 6954 * (s->window[i+2]) +\
883 + 20947* (s->window[i+3])) & s->hash_mask;\
884 + } else {\
885 + h = (25881* (s->window[i]) +\
886 + 24674* (s->window[i+1]) +\
887 + 25811* (s->window[i+2])) & s->hash_mask;\
888 + }\
889 + }\
890 +
891 +extern int deflate_read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
892 +
893 +void fill_window_sse(deflate_state *s)
894 +{
895 + const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
896 +
897 + register unsigned n;
898 + register Posf *p;
899 + unsigned more; /* Amount of free space at the end of the window. */
900 + uInt wsize = s->w_size;
901 +
902 + Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
903 +
904 + do {
905 + more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);
906 +
907 + /* Deal with !@#$% 64K limit: */
908 + if (sizeof(int) <= 2) {
909 + if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
910 + more = wsize;
911 +
912 + } else if (more == (unsigned)(-1)) {
913 + /* Very unlikely, but possible on 16 bit machine if
914 + * strstart == 0 && lookahead == 1 (input done a byte at time)
915 + */
916 + more--;
917 + }
918 + }
919 +
920 + /* If the window is almost full and there is insufficient lookahead,
921 + * move the upper half to the lower one to make room in the upper half.
922 + */
923 + if (s->strstart >= wsize+MAX_DIST(s)) {
924 +
925 + zmemcpy(s->window, s->window+wsize, (unsigned)wsize);
926 + s->match_start -= wsize;
927 + s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
928 + s->block_start -= (long) wsize;
929 +
930 + /* Slide the hash table (could be avoided with 32 bit values
931 + at the expense of memory usage). We slide even when level == 0
932 + to keep the hash table consistent if we switch back to level > 0
933 + later. (Using level 0 permanently is not an optimal usage of
934 + zlib, so we don't care about this pathological case.)
935 + */
936 + n = s->hash_size;
937 + p = &s->head[n];
938 + p -= 8;
939 + do {
940 + __m128i value, result;
941 +
942 + value = _mm_loadu_si128((__m128i *)p);
943 + result = _mm_subs_epu16(value, xmm_wsize);
944 + _mm_storeu_si128((__m128i *)p, result);
945 +
946 + p -= 8;
947 + n -= 8;
948 + } while (n > 0);
949 +
950 + n = wsize;
951 +#ifndef FASTEST
952 + p = &s->prev[n];
953 + p -= 8;
954 + do {
955 + __m128i value, result;
956 +
957 + value = _mm_loadu_si128((__m128i *)p);
958 + result = _mm_subs_epu16(value, xmm_wsize);
959 + _mm_storeu_si128((__m128i *)p, result);
960 +
961 + p -= 8;
962 + n -= 8;
963 + } while (n > 0);
964 +#endif
965 + more += wsize;
966 + }
967 + if (s->strm->avail_in == 0) break;
968 +
969 + /* If there was no sliding:
970 + * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
971 + * more == window_size - lookahead - strstart
972 + * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
973 + * => more >= window_size - 2*WSIZE + 2
974 + * In the BIG_MEM or MMAP case (not yet supported),
975 + * window_size == input_size + MIN_LOOKAHEAD &&
976 + * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
977 + * Otherwise, window_size == 2*WSIZE so more >= 2.
978 + * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
979 + */
980 + Assert(more >= 2, "more < 2");
981 +
982 + n = deflate_read_buf(s->strm,
983 + s->window + s->strstart + s->lookahead,
984 + more);
985 + s->lookahead += n;
986 +
987 + /* Initialize the hash value now that we have some input: */
988 + if (s->lookahead >= MIN_MATCH) {
989 + uInt str = s->strstart;
990 + s->ins_h = s->window[str];
991 + if (str >= 1)
992 + UPDATE_HASH(s, s->ins_h, str + 1 - (MIN_MATCH-1));
993 +#if MIN_MATCH != 3
994 + Call UPDATE_HASH() MIN_MATCH-3 more times
995 +#endif
996 + }
997 + /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
998 + * but this is not important since only literal bytes will be emitted.
999 + */
1000 +
1001 + } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
1002 +
1003 + /* If the WIN_INIT bytes after the end of the current data have never been
1004 + * written, then zero those bytes in order to avoid memory check reports of
1005 + * the use of uninitialized (or uninitialised as Julian writes) bytes by
1006 + * the longest match routines. Update the high water mark for the next
1007 + * time through here. WIN_INIT is set to MAX_MATCH since the longest match
1008 + * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
1009 + */
1010 + if (s->high_water < s->window_size) {
1011 + ulg curr = s->strstart + (ulg)(s->lookahead);
1012 + ulg init;
1013 +
1014 + if (s->high_water < curr) {
1015 + /* Previous high water mark below current data -- zero WIN_INIT
1016 + * bytes or up to end of window, whichever is less.
1017 + */
1018 + init = s->window_size - curr;
1019 + if (init > WIN_INIT)
1020 + init = WIN_INIT;
1021 + zmemzero(s->window + curr, (unsigned)init);
1022 + s->high_water = curr + init;
1023 + }
1024 + else if (s->high_water < (ulg)curr + WIN_INIT) {
1025 + /* High water mark at or above current data, but below current data
1026 + * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
1027 + * to end of window, whichever is less.
1028 + */
1029 + init = (ulg)curr + WIN_INIT - s->high_water;
1030 + if (init > s->window_size - s->high_water)
1031 + init = s->window_size - s->high_water;
1032 + zmemzero(s->window + s->high_water, (unsigned)init);
1033 + s->high_water += init;
1034 + }
1035 + }
1036 +
1037 + Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
1038 + "not enough room for search");
1039 +}
1040 diff --git a/simd_stub.c b/simd_stub.c
1041 new file mode 100644
1042 index 000000000000..c6d46051498f
1043 --- /dev/null
1044 +++ b/simd_stub.c
1045 @@ -0,0 +1,35 @@
1046 +/* simd_stub.c -- stub implementations
1047 +* Copyright (C) 2014 Intel Corporation
1048 +* For conditions of distribution and use, see copyright notice in zlib.h
1049 +*/
1050 +#include <assert.h>
1051 +
1052 +#include "deflate.h"
1053 +#include "x86.h"
1054 +
1055 +int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
1056 +
1057 +void ZLIB_INTERNAL crc_fold_init(deflate_state *const s) {
1058 + assert(0);
1059 +}
1060 +
1061 +void ZLIB_INTERNAL crc_fold_copy(deflate_state *const s,
1062 + unsigned char *dst,
1063 + const unsigned char *src,
1064 + long len) {
1065 + assert(0);
1066 +}
1067 +
1068 +unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
1069 + assert(0);
1070 + return 0;
1071 +}
1072 +
1073 +void ZLIB_INTERNAL fill_window_sse(deflate_state *s)
1074 +{
1075 + assert(0);
1076 +}
1077 +
1078 +void x86_check_features(void)
1079 +{
1080 +}
1081 diff --git a/x86.c b/x86.c
1082 new file mode 100644
1083 index 000000000000..e56fe8b85a39
1084 --- /dev/null
1085 +++ b/x86.c
1086 @@ -0,0 +1,92 @@
1087 +/*
1088 + * x86 feature check
1089 + *
1090 + * Copyright (C) 2013 Intel Corporation. All rights reserved.
1091 + * Author:
1092 + * Jim Kukunas
1093 + *
1094 + * For conditions of distribution and use, see copyright notice in zlib.h
1095 + */
1096 +
1097 +#include "x86.h"
1098 +#include "zutil.h"
1099 +
1100 +int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
1101 +
1102 +#ifndef _MSC_VER
1103 +#include <pthread.h>
1104 +
1105 +pthread_once_t cpu_check_inited_once = PTHREAD_ONCE_INIT;
1106 +static void _x86_check_features(void);
1107 +
1108 +void x86_check_features(void)
1109 +{
1110 + pthread_once(&cpu_check_inited_once, _x86_check_features);
1111 +}
1112 +
1113 +static void _x86_check_features(void)
1114 +{
1115 + int x86_cpu_has_sse2;
1116 + int x86_cpu_has_sse42;
1117 + int x86_cpu_has_pclmulqdq;
1118 + unsigned eax, ebx, ecx, edx;
1119 +
1120 + eax = 1;
1121 +#ifdef __i386__
1122 + __asm__ __volatile__ (
1123 + "xchg %%ebx, %1\n\t"
1124 + "cpuid\n\t"
1125 + "xchg %1, %%ebx\n\t"
1126 + : "+a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx)
1127 + );
1128 +#else
1129 + __asm__ __volatile__ (
1130 + "cpuid\n\t"
1131 + : "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
1132 + );
1133 +#endif /* (__i386__) */
1134 +
1135 + x86_cpu_has_sse2 = edx & 0x4000000;
1136 + x86_cpu_has_sse42 = ecx & 0x100000;
1137 + x86_cpu_has_pclmulqdq = ecx & 0x2;
1138 +
1139 + x86_cpu_enable_simd = x86_cpu_has_sse2 &&
1140 + x86_cpu_has_sse42 &&
1141 + x86_cpu_has_pclmulqdq;
1142 +}
1143 +#else
1144 +#include <intrin.h>
1145 +#include <windows.h>
1146 +
1147 +static BOOL CALLBACK _x86_check_features(PINIT_ONCE once,
1148 + PVOID param,
1149 + PVOID *context);
1150 +static INIT_ONCE cpu_check_inited_once = INIT_ONCE_STATIC_INIT;
1151 +
1152 +void x86_check_features(void)
1153 +{
1154 + InitOnceExecuteOnce(&cpu_check_inited_once, _x86_check_features,
1155 + NULL, NULL);
1156 +}
1157 +
1158 +static BOOL CALLBACK _x86_check_features(PINIT_ONCE once,
1159 + PVOID param,
1160 + PVOID *context)
1161 +{
1162 + int x86_cpu_has_sse2;
1163 + int x86_cpu_has_sse42;
1164 + int x86_cpu_has_pclmulqdq;
1165 + int regs[4];
1166 +
1167 + __cpuid(regs, 1);
1168 +
1169 + x86_cpu_has_sse2 = regs[3] & 0x4000000;
1170 + x86_cpu_has_sse42= regs[2] & 0x100000;
1171 + x86_cpu_has_pclmulqdq = regs[2] & 0x2;
1172 +
1173 + x86_cpu_enable_simd = x86_cpu_has_sse2 &&
1174 + x86_cpu_has_sse42 &&
1175 + x86_cpu_has_pclmulqdq;
1176 + return TRUE;
1177 +}
1178 +#endif /* _MSC_VER */
1179 diff --git a/x86.h b/x86.h
1180 new file mode 100644
1181 index 000000000000..ebcf10ab09d2
1182 --- /dev/null
1183 +++ b/x86.h
1184 @@ -0,0 +1,15 @@
1185 +/* x86.h -- check for x86 CPU features
1186 +* Copyright (C) 2013 Intel Corporation Jim Kukunas
1187 +* For conditions of distribution and use, see copyright notice in zlib.h
1188 +*/
1189 +
1190 +#ifndef X86_H
1191 +#define X86_H
1192 +
1193 +#include "zlib.h"
1194 +
1195 +extern int x86_cpu_enable_simd;
1196 +
1197 +void x86_check_features(void);
1198 +
1199 +#endif /* X86_H */
1200 diff --git a/zutil.h b/zutil.h
1201 index 80375b8b6109..4425bcf75eb3 100644
1202 --- a/zutil.h
1203 +++ b/zutil.h
1204 @@ -283,4 +283,10 @@ extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
1205 #define ZSWAP32(q) ((((q) >> 24) & 0xff) + (((q) >> 8) & 0xff00) + \
1206 (((q) & 0xff00) << 8) + (((q) & 0xff) << 24))
1207
1208 +#ifdef _MSC_VER
1209 +#define zalign(x) __declspec(align(x))
1210 +#else
1211 +#define zalign(x) __attribute__((aligned((x))))
1212 +#endif
1213 +
1214 #endif /* ZUTIL_H */