"Fossies" - the Fresh Open Source Software Archive

Member "qt-everywhere-src-6.3.1/qtwebengine/src/3rdparty/chromium/third_party/zlib/patches/0001-simd.patch" (8 Jun 2022, 39766 Bytes) of package /linux/misc/qt-everywhere-src-6.3.1.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Diff source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file.

    1 diff --git a/crc32.c b/crc32.c
    2 index 9580440c0e6b..9162429cc7b4 100644
    3 --- a/crc32.c
    4 +++ b/crc32.c
    5 @@ -28,6 +28,8 @@
    6  #  endif /* !DYNAMIC_CRC_TABLE */
    7  #endif /* MAKECRCH */
    8  
    9 +#include "deflate.h"
   10 +#include "x86.h"
   11  #include "zutil.h"      /* for STDC and FAR definitions */
   12  
   13  /* Definitions for doing the crc four data bytes at a time. */
   14 @@ -440,3 +442,28 @@ uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
   15  {
   16      return crc32_combine_(crc1, crc2, len2);
   17  }
   18 +
   19 +ZLIB_INTERNAL void crc_reset(deflate_state *const s)
   20 +{
   21 +    if (x86_cpu_enable_simd) {
   22 +        crc_fold_init(s);
   23 +        return;
   24 +    }
   25 +    s->strm->adler = crc32(0L, Z_NULL, 0);
   26 +}
   27 +
   28 +ZLIB_INTERNAL void crc_finalize(deflate_state *const s)
   29 +{
   30 +    if (x86_cpu_enable_simd)
   31 +        s->strm->adler = crc_fold_512to32(s);
   32 +}
   33 +
   34 +ZLIB_INTERNAL void copy_with_crc(z_streamp strm, Bytef *dst, long size)
   35 +{
   36 +    if (x86_cpu_enable_simd) {
   37 +        crc_fold_copy(strm->state, dst, strm->next_in, size);
   38 +        return;
   39 +    }
   40 +    zmemcpy(dst, strm->next_in, size);
   41 +    strm->adler = crc32(strm->adler, dst, size);
   42 +}
   43 diff --git a/crc_folding.c b/crc_folding.c
   44 new file mode 100644
   45 index 000000000000..48d77744aaf4
   46 --- /dev/null
   47 +++ b/crc_folding.c
   48 @@ -0,0 +1,493 @@
   49 +/*
   50 + * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
   51 + * instruction.
   52 + *
   53 + * A white paper describing this algorithm can be found at:
   54 + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
   55 + *
   56 + * Copyright (C) 2013 Intel Corporation. All rights reserved.
   57 + * Authors:
   58 + *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
   59 + *     Jim Guilford    <james.guilford@intel.com>
   60 + *     Vinodh Gopal    <vinodh.gopal@intel.com>
   61 + *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
   62 + *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
   63 + *
   64 + * For conditions of distribution and use, see copyright notice in zlib.h
   65 + */
   66 +
   67 +#include "deflate.h"
   68 +
   69 +#include <inttypes.h>
   70 +#include <emmintrin.h>
   71 +#include <immintrin.h>
   72 +#include <wmmintrin.h>
   73 +
   74 +#define CRC_LOAD(s) \
   75 +    do { \
   76 +        __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);\
   77 +        __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);\
   78 +        __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);\
   79 +        __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);\
   80 +        __m128i xmm_crc_part = _mm_loadu_si128((__m128i *)s->crc0 + 4);
   81 +
   82 +#define CRC_SAVE(s) \
   83 +        _mm_storeu_si128((__m128i *)s->crc0 + 0, xmm_crc0);\
   84 +        _mm_storeu_si128((__m128i *)s->crc0 + 1, xmm_crc1);\
   85 +        _mm_storeu_si128((__m128i *)s->crc0 + 2, xmm_crc2);\
   86 +        _mm_storeu_si128((__m128i *)s->crc0 + 3, xmm_crc3);\
   87 +        _mm_storeu_si128((__m128i *)s->crc0 + 4, xmm_crc_part);\
   88 +    } while (0);
   89 +
   90 +ZLIB_INTERNAL void crc_fold_init(deflate_state *const s)
   91 +{
   92 +    CRC_LOAD(s)
   93 +
   94 +    xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
   95 +    xmm_crc1 = _mm_setzero_si128();
   96 +    xmm_crc2 = _mm_setzero_si128();
   97 +    xmm_crc3 = _mm_setzero_si128();
   98 +
   99 +    CRC_SAVE(s)
  100 +
  101 +    s->strm->adler = 0;
  102 +}
  103 +
  104 +local void fold_1(deflate_state *const s,
  105 +        __m128i *xmm_crc0, __m128i *xmm_crc1,
  106 +        __m128i *xmm_crc2, __m128i *xmm_crc3)
  107 +{
  108 +    const __m128i xmm_fold4 = _mm_set_epi32(
  109 +            0x00000001, 0x54442bd4,
  110 +            0x00000001, 0xc6e41596);
  111 +
  112 +    __m128i x_tmp3;
  113 +    __m128 ps_crc0, ps_crc3, ps_res;
  114 +
  115 +    x_tmp3 = *xmm_crc3;
  116 +
  117 +    *xmm_crc3 = *xmm_crc0;
  118 +    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
  119 +    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
  120 +    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
  121 +    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
  122 +    ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
  123 +
  124 +    *xmm_crc0 = *xmm_crc1;
  125 +    *xmm_crc1 = *xmm_crc2;
  126 +    *xmm_crc2 = x_tmp3;
  127 +    *xmm_crc3 = _mm_castps_si128(ps_res);
  128 +}
  129 +
  130 +local void fold_2(deflate_state *const s,
  131 +        __m128i *xmm_crc0, __m128i *xmm_crc1,
  132 +        __m128i *xmm_crc2, __m128i *xmm_crc3)
  133 +{
  134 +    const __m128i xmm_fold4 = _mm_set_epi32(
  135 +            0x00000001, 0x54442bd4,
  136 +            0x00000001, 0xc6e41596);
  137 +
  138 +    __m128i x_tmp3, x_tmp2;
  139 +    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
  140 +
  141 +    x_tmp3 = *xmm_crc3;
  142 +    x_tmp2 = *xmm_crc2;
  143 +
  144 +    *xmm_crc3 = *xmm_crc1;
  145 +    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
  146 +    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
  147 +    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
  148 +    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
  149 +    ps_res31= _mm_xor_ps(ps_crc3, ps_crc1);
  150 +
  151 +    *xmm_crc2 = *xmm_crc0;
  152 +    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
  153 +    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
  154 +    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
  155 +    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
  156 +    ps_res20= _mm_xor_ps(ps_crc0, ps_crc2);
  157 +
  158 +    *xmm_crc0 = x_tmp2;
  159 +    *xmm_crc1 = x_tmp3;
  160 +    *xmm_crc2 = _mm_castps_si128(ps_res20);
  161 +    *xmm_crc3 = _mm_castps_si128(ps_res31);
  162 +}
  163 +
  164 +local void fold_3(deflate_state *const s,
  165 +        __m128i *xmm_crc0, __m128i *xmm_crc1,
  166 +        __m128i *xmm_crc2, __m128i *xmm_crc3)
  167 +{
  168 +    const __m128i xmm_fold4 = _mm_set_epi32(
  169 +            0x00000001, 0x54442bd4,
  170 +            0x00000001, 0xc6e41596);
  171 +
  172 +    __m128i x_tmp3;
  173 +    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
  174 +
  175 +    x_tmp3 = *xmm_crc3;
  176 +
  177 +    *xmm_crc3 = *xmm_crc2;
  178 +    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
  179 +    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
  180 +    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
  181 +    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
  182 +    ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
  183 +
  184 +    *xmm_crc2 = *xmm_crc1;
  185 +    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
  186 +    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
  187 +    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
  188 +    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
  189 +    ps_res21= _mm_xor_ps(ps_crc1, ps_crc2);
  190 +
  191 +    *xmm_crc1 = *xmm_crc0;
  192 +    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
  193 +    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
  194 +    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
  195 +    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
  196 +    ps_res10= _mm_xor_ps(ps_crc0, ps_crc1);
  197 +
  198 +    *xmm_crc0 = x_tmp3;
  199 +    *xmm_crc1 = _mm_castps_si128(ps_res10);
  200 +    *xmm_crc2 = _mm_castps_si128(ps_res21);
  201 +    *xmm_crc3 = _mm_castps_si128(ps_res32);
  202 +}
  203 +
  204 +local void fold_4(deflate_state *const s,
  205 +        __m128i *xmm_crc0, __m128i *xmm_crc1,
  206 +        __m128i *xmm_crc2, __m128i *xmm_crc3)
  207 +{
  208 +    const __m128i xmm_fold4 = _mm_set_epi32(
  209 +            0x00000001, 0x54442bd4,
  210 +            0x00000001, 0xc6e41596);
  211 +
  212 +    __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
  213 +    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
  214 +    __m128 ps_t0, ps_t1, ps_t2, ps_t3;
  215 +    __m128 ps_res0, ps_res1, ps_res2, ps_res3;
  216 +
  217 +    x_tmp0 = *xmm_crc0;
  218 +    x_tmp1 = *xmm_crc1;
  219 +    x_tmp2 = *xmm_crc2;
  220 +    x_tmp3 = *xmm_crc3;
  221 +
  222 +    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
  223 +    x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
  224 +    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
  225 +    ps_t0 = _mm_castsi128_ps(x_tmp0);
  226 +    ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
  227 +
  228 +    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
  229 +    x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
  230 +    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
  231 +    ps_t1 = _mm_castsi128_ps(x_tmp1);
  232 +    ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
  233 +
  234 +    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
  235 +    x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
  236 +    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
  237 +    ps_t2 = _mm_castsi128_ps(x_tmp2);
  238 +    ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
  239 +
  240 +    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
  241 +    x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
  242 +    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
  243 +    ps_t3 = _mm_castsi128_ps(x_tmp3);
  244 +    ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
  245 +
  246 +    *xmm_crc0 = _mm_castps_si128(ps_res0);
  247 +    *xmm_crc1 = _mm_castps_si128(ps_res1);
  248 +    *xmm_crc2 = _mm_castps_si128(ps_res2);
  249 +    *xmm_crc3 = _mm_castps_si128(ps_res3);
  250 +}
  251 +
  252 +local const unsigned zalign(32) pshufb_shf_table[60] = {
  253 +   0x84838281,0x88878685,0x8c8b8a89,0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
  254 +   0x85848382,0x89888786,0x8d8c8b8a,0x01008f8e, /* shl 14 (16 - 3)/shr2 */
  255 +   0x86858483,0x8a898887,0x8e8d8c8b,0x0201008f, /* shl 13 (16 - 4)/shr3 */
  256 +   0x87868584,0x8b8a8988,0x8f8e8d8c,0x03020100, /* shl 12 (16 - 4)/shr4 */
  257 +   0x88878685,0x8c8b8a89,0x008f8e8d,0x04030201, /* shl 11 (16 - 5)/shr5 */
  258 +   0x89888786,0x8d8c8b8a,0x01008f8e,0x05040302, /* shl 10 (16 - 6)/shr6 */
  259 +   0x8a898887,0x8e8d8c8b,0x0201008f,0x06050403, /* shl  9 (16 - 7)/shr7 */
  260 +   0x8b8a8988,0x8f8e8d8c,0x03020100,0x07060504, /* shl  8 (16 - 8)/shr8 */
  261 +   0x8c8b8a89,0x008f8e8d,0x04030201,0x08070605, /* shl  7 (16 - 9)/shr9 */
  262 +   0x8d8c8b8a,0x01008f8e,0x05040302,0x09080706, /* shl  6 (16 -10)/shr10*/
  263 +   0x8e8d8c8b,0x0201008f,0x06050403,0x0a090807, /* shl  5 (16 -11)/shr11*/
  264 +   0x8f8e8d8c,0x03020100,0x07060504,0x0b0a0908, /* shl  4 (16 -12)/shr12*/
  265 +   0x008f8e8d,0x04030201,0x08070605,0x0c0b0a09, /* shl  3 (16 -13)/shr13*/
  266 +   0x01008f8e,0x05040302,0x09080706,0x0d0c0b0a, /* shl  2 (16 -14)/shr14*/
  267 +   0x0201008f,0x06050403,0x0a090807,0x0e0d0c0b  /* shl  1 (16 -15)/shr15*/
  268 +};
  269 +
  270 +local void partial_fold(deflate_state *const s, const size_t len,
  271 +        __m128i *xmm_crc0, __m128i *xmm_crc1,
  272 +        __m128i *xmm_crc2, __m128i *xmm_crc3,
  273 +        __m128i *xmm_crc_part)
  274 +{
  275 +
  276 +    const __m128i xmm_fold4 = _mm_set_epi32(
  277 +            0x00000001, 0x54442bd4,
  278 +            0x00000001, 0xc6e41596);
  279 +    const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
  280 +
  281 +    __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
  282 +    __m128i xmm_a0_0, xmm_a0_1;
  283 +    __m128 ps_crc3, psa0_0, psa0_1, ps_res;
  284 +
  285 +    xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
  286 +    xmm_shr = xmm_shl;
  287 +    xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
  288 +
  289 +    xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
  290 +
  291 +    *xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
  292 +    xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
  293 +    *xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
  294 +
  295 +    *xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
  296 +    xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
  297 +    *xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
  298 +
  299 +    *xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
  300 +    xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
  301 +    *xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
  302 +
  303 +    *xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
  304 +    *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
  305 +    *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
  306 +
  307 +    xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
  308 +    xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
  309 +
  310 +    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
  311 +    psa0_0 = _mm_castsi128_ps(xmm_a0_0);
  312 +    psa0_1 = _mm_castsi128_ps(xmm_a0_1);
  313 +
  314 +    ps_res = _mm_xor_ps(ps_crc3, psa0_0);
  315 +    ps_res = _mm_xor_ps(ps_res, psa0_1);
  316 +
  317 +    *xmm_crc3 = _mm_castps_si128(ps_res);
  318 +}
  319 +
  320 +ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s,
  321 +        unsigned char *dst, const unsigned char *src, long len)
  322 +{
  323 +    unsigned long algn_diff;
  324 +    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
  325 +
  326 +    CRC_LOAD(s)
  327 +
  328 +    if (len < 16) {
  329 +        if (len == 0)
  330 +            return;
  331 +        goto partial;
  332 +    }
  333 +
  334 +    algn_diff = 0 - (uintptr_t)src & 0xF;
  335 +    if (algn_diff) {
  336 +        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
  337 +        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
  338 +
  339 +        dst += algn_diff;
  340 +        src += algn_diff;
  341 +        len -= algn_diff;
  342 +
  343 +        partial_fold(s, algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
  344 +            &xmm_crc_part);
  345 +    }
  346 +
  347 +    while ((len -= 64) >= 0) {
  348 +        xmm_t0 = _mm_load_si128((__m128i *)src);
  349 +        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
  350 +        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
  351 +        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
  352 +
  353 +        fold_4(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
  354 +
  355 +        _mm_storeu_si128((__m128i *)dst, xmm_t0);
  356 +        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
  357 +        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
  358 +        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
  359 +
  360 +        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
  361 +        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
  362 +        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
  363 +        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
  364 +
  365 +        src += 64;
  366 +        dst += 64;
  367 +    }
  368 +
  369 +    /*
  370 +     * len = num bytes left - 64
  371 +     */
  372 +    if (len + 16 >= 0) {
  373 +        len += 16;
  374 +
  375 +        xmm_t0 = _mm_load_si128((__m128i *)src);
  376 +        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
  377 +        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
  378 +
  379 +        fold_3(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
  380 +
  381 +        _mm_storeu_si128((__m128i *)dst, xmm_t0);
  382 +        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
  383 +        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
  384 +
  385 +        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
  386 +        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
  387 +        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
  388 +
  389 +        if (len == 0)
  390 +            goto done;
  391 +
  392 +        dst += 48;
  393 +        src += 48;
  394 +    } else if (len + 32 >= 0) {
  395 +        len += 32;
  396 +
  397 +        xmm_t0 = _mm_load_si128((__m128i *)src);
  398 +        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
  399 +
  400 +        fold_2(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
  401 +
  402 +        _mm_storeu_si128((__m128i *)dst, xmm_t0);
  403 +        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
  404 +
  405 +        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
  406 +        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
  407 +
  408 +        if (len == 0)
  409 +            goto done;
  410 +
  411 +        dst += 32;
  412 +        src += 32;
  413 +    } else if (len + 48 >= 0) {
  414 +        len += 48;
  415 +
  416 +        xmm_t0 = _mm_load_si128((__m128i *)src);
  417 +
  418 +        fold_1(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
  419 +
  420 +        _mm_storeu_si128((__m128i *)dst, xmm_t0);
  421 +
  422 +        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
  423 +
  424 +        if (len == 0)
  425 +            goto done;
  426 +
  427 +        dst += 16;
  428 +        src += 16;
  429 +    } else {
  430 +        len += 64;
  431 +        if (len == 0)
  432 +            goto done;
  433 +    }
  434 +
  435 +partial:
  436 +
  437 +#if defined(_MSC_VER)
  438 +    /* VS does not permit the use of _mm_set_epi64x in 32-bit builds */
  439 +    {
  440 +        int32_t parts[4] = {0, 0, 0, 0};
  441 +        memcpy(&parts, src, len);
  442 +        xmm_crc_part = _mm_set_epi32(parts[3], parts[2], parts[1], parts[0]);
  443 +    }
  444 +#else
  445 +    {
  446 +        int64_t parts[2] = {0, 0};
  447 +        memcpy(&parts, src, len);
  448 +        xmm_crc_part = _mm_set_epi64x(parts[1], parts[0]);
  449 +    }
  450 +#endif
  451 +
  452 +    _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
  453 +    partial_fold(s, len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
  454 +        &xmm_crc_part);
  455 +done:
  456 +    CRC_SAVE(s)
  457 +}
  458 +
  459 +local const unsigned zalign(16) crc_k[] = {
  460 +    0xccaa009e, 0x00000000, /* rk1 */
  461 +    0x751997d0, 0x00000001, /* rk2 */
  462 +    0xccaa009e, 0x00000000, /* rk5 */
  463 +    0x63cd6124, 0x00000001, /* rk6 */
  464 +    0xf7011640, 0x00000001, /* rk7 */
  465 +    0xdb710640, 0x00000001  /* rk8 */
  466 +};
  467 +
  468 +local const unsigned zalign(16) crc_mask[4] = {
  469 +    0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
  470 +};
  471 +
  472 +local const unsigned zalign(16) crc_mask2[4] = {
  473 +    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
  474 +};
  475 +
  476 +unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s)
  477 +{
  478 +    const __m128i xmm_mask  = _mm_load_si128((__m128i *)crc_mask);
  479 +    const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
  480 +
  481 +    unsigned crc;
  482 +    __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
  483 +
  484 +    CRC_LOAD(s)
  485 +
  486 +    /*
  487 +     * k1
  488 +     */
  489 +    crc_fold = _mm_load_si128((__m128i *)crc_k);
  490 +
  491 +    x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
  492 +    xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
  493 +    xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
  494 +    xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
  495 +
  496 +    x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
  497 +    xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
  498 +    xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
  499 +    xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
  500 +
  501 +    x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
  502 +    xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
  503 +    xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
  504 +    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
  505 +
  506 +    /*
  507 +     * k5
  508 +     */
  509 +    crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
  510 +
  511 +    xmm_crc0 = xmm_crc3;
  512 +    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
  513 +    xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
  514 +    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
  515 +
  516 +    xmm_crc0 = xmm_crc3;
  517 +    xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
  518 +    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
  519 +    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
  520 +    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
  521 +
  522 +    /*
  523 +     * k7
  524 +     */
  525 +    xmm_crc1 = xmm_crc3;
  526 +    xmm_crc2 = xmm_crc3;
  527 +    crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
  528 +
  529 +    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
  530 +    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
  531 +    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
  532 +
  533 +    xmm_crc2 = xmm_crc3;
  534 +    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
  535 +    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
  536 +    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
  537 +
  538 +    crc = _mm_extract_epi32(xmm_crc3, 2);
  539 +    return ~crc;
  540 +    CRC_SAVE(s)
  541 +}
  542 diff --git a/deflate.c b/deflate.c
  543 index 1ec761448de9..aa0c9c67a6dc 100644
  544 --- a/deflate.c
  545 +++ b/deflate.c
  546 @@ -48,8 +48,9 @@
  547   */
  548  
  549  /* @(#) $Id$ */
  550 -
  551 +#include <assert.h>
  552  #include "deflate.h"
  553 +#include "x86.h"
  554  
  555  const char deflate_copyright[] =
  556     " deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler ";
  557 @@ -86,7 +87,7 @@ local block_state deflate_huff   OF((deflate_state *s, int flush));
  558  local void lm_init        OF((deflate_state *s));
  559  local void putShortMSB    OF((deflate_state *s, uInt b));
  560  local void flush_pending  OF((z_streamp strm));
  561 -local unsigned read_buf   OF((z_streamp strm, Bytef *buf, unsigned size));
  562 +unsigned ZLIB_INTERNAL deflate_read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
  563  #ifdef ASMV
  564  #  pragma message("Assembler code may have bugs -- use at your own risk")
  565        void match_init OF((void)); /* asm code initialization */
  566 @@ -100,6 +101,20 @@ local  void check_match OF((deflate_state *s, IPos start, IPos match,
  567                              int length));
  568  #endif
  569  
  570 +/* From crc32.c */
  571 +extern void ZLIB_INTERNAL crc_reset(deflate_state *const s);
  572 +extern void ZLIB_INTERNAL crc_finalize(deflate_state *const s);
  573 +extern void ZLIB_INTERNAL copy_with_crc(z_streamp strm, Bytef *dst, long size);
  574 +
  575 +#ifdef _MSC_VER
  576 +#define INLINE __inline
  577 +#else
  578 +#define INLINE inline
  579 +#endif
  580 +
  581 +/* Inline optimisation */
  582 +local INLINE Pos insert_string_sse(deflate_state *const s, const Pos str);
  583 +
  584  /* ===========================================================================
  585   * Local data
  586   */
  587 @@ -162,7 +177,6 @@ local const config configuration_table[10] = {
  588   */
  589  #define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask)
  590  
  591 -
  592  /* ===========================================================================
  593   * Insert string str in the dictionary and set match_head to the previous head
  594   * of the hash chain (the most recent string with same hash key). Return
  595 @@ -173,17 +187,28 @@ local const config configuration_table[10] = {
  596   *    characters and the first MIN_MATCH bytes of str are valid (except for
  597   *    the last MIN_MATCH-1 bytes of the input file).
  598   */
  599 +local INLINE Pos insert_string_c(deflate_state *const s, const Pos str)
  600 +{
  601 +    Pos ret;
  602 +
  603 +    UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]);
  604  #ifdef FASTEST
  605 -#define INSERT_STRING(s, str, match_head) \
  606 -   (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
  607 -    match_head = s->head[s->ins_h], \
  608 -    s->head[s->ins_h] = (Pos)(str))
  609 +    ret = s->head[s->ins_h];
  610  #else
  611 -#define INSERT_STRING(s, str, match_head) \
  612 -   (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
  613 -    match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h], \
  614 -    s->head[s->ins_h] = (Pos)(str))
  615 +    ret = s->prev[str & s->w_mask] = s->head[s->ins_h];
  616  #endif
  617 +    s->head[s->ins_h] = str;
  618 +
  619 +    return ret;
  620 +}
  621 +
  622 +local INLINE Pos insert_string(deflate_state *const s, const Pos str)
  623 +{
  624 +    if (x86_cpu_enable_simd)
  625 +        return insert_string_sse(s, str);
  626 +    return insert_string_c(s, str);
  627 +}
  628 +
  629  
  630  /* ===========================================================================
  631   * Initialize the hash table (avoiding 64K overflow for 16 bit systems).
  632 @@ -248,6 +273,7 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
  633      const char *version;
  634      int stream_size;
  635  {
  636 +    unsigned window_padding = 8;
  637      deflate_state *s;
  638      int wrap = 1;
  639      static const char my_version[] = ZLIB_VERSION;
  640 @@ -257,6 +283,8 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
  641       * output size for (length,distance) codes is <= 24 bits.
  642       */
  643  
  644 +    x86_check_features();
  645 +
  646      if (version == Z_NULL || version[0] != my_version[0] ||
  647          stream_size != sizeof(z_stream)) {
  648          return Z_VERSION_ERROR;
  649 @@ -313,12 +341,19 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
  650      s->w_size = 1 << s->w_bits;
  651      s->w_mask = s->w_size - 1;
  652  
  653 -    s->hash_bits = (uInt)memLevel + 7;
  654 +    if (x86_cpu_enable_simd) {
  655 +        s->hash_bits = 15;
  656 +    } else {
  657 +        s->hash_bits = memLevel + 7;
  658 +    }
  659 +
  660      s->hash_size = 1 << s->hash_bits;
  661      s->hash_mask = s->hash_size - 1;
  662      s->hash_shift =  ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH);
  663  
  664 -    s->window = (Bytef *) ZALLOC(strm, s->w_size, 2*sizeof(Byte));
  665 +    s->window = (Bytef *) ZALLOC(strm,
  666 +                                 s->w_size + window_padding,
  667 +                                 2*sizeof(Byte));
  668      s->prev   = (Posf *)  ZALLOC(strm, s->w_size, sizeof(Pos));
  669      s->head   = (Posf *)  ZALLOC(strm, s->hash_size, sizeof(Pos));
  670  
  671 @@ -418,11 +453,7 @@ int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength)
  672          str = s->strstart;
  673          n = s->lookahead - (MIN_MATCH-1);
  674          do {
  675 -            UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]);
  676 -#ifndef FASTEST
  677 -            s->prev[str & s->w_mask] = s->head[s->ins_h];
  678 -#endif
  679 -            s->head[s->ins_h] = (Pos)str;
  680 +            insert_string(s, str);
  681              str++;
  682          } while (--n);
  683          s->strstart = str;
  684 @@ -848,7 +879,7 @@ int ZEXPORT deflate (strm, flush)
  685  #ifdef GZIP
  686      if (s->status == GZIP_STATE) {
  687          /* gzip header */
  688 -        strm->adler = crc32(0L, Z_NULL, 0);
  689 +        crc_reset(s);
  690          put_byte(s, 31);
  691          put_byte(s, 139);
  692          put_byte(s, 8);
  693 @@ -1049,6 +1080,7 @@ int ZEXPORT deflate (strm, flush)
  694      /* Write the trailer */
  695  #ifdef GZIP
  696      if (s->wrap == 2) {
  697 +        crc_finalize(s);
  698          put_byte(s, (Byte)(strm->adler & 0xff));
  699          put_byte(s, (Byte)((strm->adler >> 8) & 0xff));
  700          put_byte(s, (Byte)((strm->adler >> 16) & 0xff));
  701 @@ -1161,7 +1193,7 @@ int ZEXPORT deflateCopy (dest, source)
  702   * allocating a large strm->next_in buffer and copying from it.
  703   * (See also flush_pending()).
  704   */
  705 -local unsigned read_buf(strm, buf, size)
  706 +ZLIB_INTERNAL unsigned deflate_read_buf(strm, buf, size)
  707      z_streamp strm;
  708      Bytef *buf;
  709      unsigned size;
  710 @@ -1173,15 +1205,16 @@ local unsigned read_buf(strm, buf, size)
  711  
  712      strm->avail_in  -= len;
  713  
  714 -    zmemcpy(buf, strm->next_in, len);
  715 -    if (strm->state->wrap == 1) {
  716 -        strm->adler = adler32(strm->adler, buf, len);
  717 -    }
  718  #ifdef GZIP
  719 -    else if (strm->state->wrap == 2) {
  720 -        strm->adler = crc32(strm->adler, buf, len);
  721 -    }
  722 +    if (strm->state->wrap == 2)
  723 +        copy_with_crc(strm, buf, len);
  724 +    else 
  725  #endif
  726 +    {
  727 +        zmemcpy(buf, strm->next_in, len);
  728 +        if (strm->state->wrap == 1)
  729 +            strm->adler = adler32(strm->adler, buf, len);
  730 +    }
  731      strm->next_in  += len;
  732      strm->total_in += len;
  733  
  734 @@ -1479,7 +1512,19 @@ local void check_match(s, start, match, length)
  735   *    performed for at least two bytes (required for the zip translate_eol
  736   *    option -- not supported here).
  737   */
  738 -local void fill_window(s)
  739 +local void fill_window_c(deflate_state *s);
  740 +
  741 +local void fill_window(deflate_state *s)
  742 +{
  743 +    if (x86_cpu_enable_simd) {
  744 +        fill_window_sse(s);
  745 +        return;
  746 +    }
  747 +
  748 +    fill_window_c(s);
  749 +}
  750 +
  751 +local void fill_window_c(s)
  752      deflate_state *s;
  753  {
  754      unsigned n;
  755 @@ -1847,7 +1892,7 @@ local block_state deflate_fast(s, flush)
  756           */
  757          hash_head = NIL;
  758          if (s->lookahead >= MIN_MATCH) {
  759 -            INSERT_STRING(s, s->strstart, hash_head);
  760 +            hash_head = insert_string(s, s->strstart);
  761          }
  762  
  763          /* Find the longest match, discarding those <= prev_length.
  764 @@ -1878,7 +1923,7 @@ local block_state deflate_fast(s, flush)
  765                  s->match_length--; /* string at strstart already in table */
  766                  do {
  767                      s->strstart++;
  768 -                    INSERT_STRING(s, s->strstart, hash_head);
  769 +                    hash_head = insert_string(s, s->strstart);
  770                      /* strstart never exceeds WSIZE-MAX_MATCH, so there are
  771                       * always MIN_MATCH bytes ahead.
  772                       */
  773 @@ -1950,7 +1995,7 @@ local block_state deflate_slow(s, flush)
  774           */
  775          hash_head = NIL;
  776          if (s->lookahead >= MIN_MATCH) {
  777 -            INSERT_STRING(s, s->strstart, hash_head);
  778 +            hash_head = insert_string(s, s->strstart);
  779          }
  780  
  781          /* Find the longest match, discarding those <= prev_length.
  782 @@ -2001,7 +2046,7 @@ local block_state deflate_slow(s, flush)
  783              s->prev_length -= 2;
  784              do {
  785                  if (++s->strstart <= max_insert) {
  786 -                    INSERT_STRING(s, s->strstart, hash_head);
  787 +                    hash_head = insert_string(s, s->strstart);
  788                  }
  789              } while (--s->prev_length != 0);
  790              s->match_available = 0;
  791 @@ -2161,3 +2206,37 @@ local block_state deflate_huff(s, flush)
  792          FLUSH_BLOCK(s, 0);
  793      return block_done;
  794  }
  795 +
  796 +/* Safe to inline this as GCC/clang will use inline asm and Visual Studio will
  797 + * use intrinsic without extra params
  798 + */
  799 +local INLINE Pos insert_string_sse(deflate_state *const s, const Pos str)
  800 +{
  801 +    Pos ret;
  802 +    unsigned *ip, val, h = 0;
  803 +
  804 +    ip = (unsigned *)&s->window[str];
  805 +    val = *ip;
  806 +
  807 +    if (s->level >= 6)
  808 +        val &= 0xFFFFFF;
  809 +
  810 +/* Windows clang should use inline asm */
  811 +#if defined(_MSC_VER) && !defined(__clang__)
  812 +    h = _mm_crc32_u32(h, val);
  813 +#elif defined(__i386__) || defined(__amd64__)
  814 +    __asm__ __volatile__ (
  815 +        "crc32 %1,%0\n\t"
  816 +    : "+r" (h)
  817 +    : "r" (val)
  818 +    );
  819 +#else
  820 +    /* This should never happen */
  821 +    assert(0);
  822 +#endif
  823 +
  824 +    ret = s->head[h & s->hash_mask];
  825 +    s->head[h & s->hash_mask] = str;
  826 +    s->prev[str & s->w_mask] = ret;
  827 +    return ret;
  828 +}
  829 diff --git a/deflate.h b/deflate.h
  830 index 23ecdd312bc0..ab56df7663b6 100644
  831 --- a/deflate.h
  832 +++ b/deflate.h
  833 @@ -109,7 +109,7 @@ typedef struct internal_state {
  834      ulg   gzindex;       /* where in extra, name, or comment */
  835      Byte  method;        /* can only be DEFLATED */
  836      int   last_flush;    /* value of flush param for previous deflate call */
  837 -
  838 +    unsigned zalign(16) crc0[4 * 5];
  839                  /* used by deflate.c: */
  840  
  841      uInt  w_size;        /* LZ77 window size (32K by default) */
  842 @@ -346,4 +346,14 @@ void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf,
  843                flush = _tr_tally(s, distance, length)
  844  #endif
  845  
  846 +/* Functions that are SIMD optimised on x86 */
  847 +void ZLIB_INTERNAL crc_fold_init(deflate_state* const s);
  848 +void ZLIB_INTERNAL crc_fold_copy(deflate_state* const s,
  849 +                                 unsigned char* dst,
  850 +                                 const unsigned char* src,
  851 +                                 long len);
  852 +unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state* const s);
  853 +
  854 +void ZLIB_INTERNAL fill_window_sse(deflate_state* s);
  855 +
  856  #endif /* DEFLATE_H */
  857 diff --git a/fill_window_sse.c b/fill_window_sse.c
  858 new file mode 100644
  859 index 000000000000..949ccce1ba9c
  860 --- /dev/null
  861 +++ b/fill_window_sse.c
  862 @@ -0,0 +1,177 @@
  863 +/*
  864 + * Fill Window with SSE2-optimized hash shifting
  865 + *
  866 + * Copyright (C) 2013 Intel Corporation
  867 + * Authors:
  868 + *  Arjan van de Ven    <arjan@linux.intel.com>
  869 + *  Jim Kukunas         <james.t.kukunas@linux.intel.com>
  870 + *
  871 + * For conditions of distribution and use, see copyright notice in zlib.h
  872 + */
  873 +
  874 +#include <immintrin.h>
  875 +#include "deflate.h"
  876 +
  877 +#define UPDATE_HASH(s,h,i) \
  878 +    {\
  879 +        if (s->level < 6) { \
  880 +            h = (3483 * (s->window[i]) +\
  881 +                 23081* (s->window[i+1]) +\
  882 +                 6954 * (s->window[i+2]) +\
  883 +                 20947* (s->window[i+3])) & s->hash_mask;\
  884 +        } else {\
  885 +            h = (25881* (s->window[i]) +\
  886 +                 24674* (s->window[i+1]) +\
  887 +                 25811* (s->window[i+2])) & s->hash_mask;\
  888 +        }\
  889 +    }\
  890 +
  891 +extern int deflate_read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
  892 +
  893 +void fill_window_sse(deflate_state *s)
  894 +{
  895 +    const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
  896 +
  897 +    register unsigned n;
  898 +    register Posf *p;
  899 +    unsigned more;    /* Amount of free space at the end of the window. */
  900 +    uInt wsize = s->w_size;
  901 +
  902 +    Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
  903 +
  904 +    do {
  905 +        more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);
  906 +
  907 +        /* Deal with !@#$% 64K limit: */
  908 +        if (sizeof(int) <= 2) {
  909 +            if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
  910 +                more = wsize;
  911 +
  912 +            } else if (more == (unsigned)(-1)) {
  913 +                /* Very unlikely, but possible on 16 bit machine if
  914 +                 * strstart == 0 && lookahead == 1 (input done a byte at time)
  915 +                 */
  916 +                more--;
  917 +            }
  918 +        }
  919 +
  920 +        /* If the window is almost full and there is insufficient lookahead,
  921 +         * move the upper half to the lower one to make room in the upper half.
  922 +         */
  923 +        if (s->strstart >= wsize+MAX_DIST(s)) {
  924 +
  925 +            zmemcpy(s->window, s->window+wsize, (unsigned)wsize);
  926 +            s->match_start -= wsize;
  927 +            s->strstart    -= wsize; /* we now have strstart >= MAX_DIST */
  928 +            s->block_start -= (long) wsize;
  929 +
  930 +            /* Slide the hash table (could be avoided with 32 bit values
  931 +               at the expense of memory usage). We slide even when level == 0
  932 +               to keep the hash table consistent if we switch back to level > 0
  933 +               later. (Using level 0 permanently is not an optimal usage of
  934 +               zlib, so we don't care about this pathological case.)
  935 +             */
  936 +            n = s->hash_size;
  937 +            p = &s->head[n];
  938 +            p -= 8;
  939 +            do {
  940 +                __m128i value, result;
  941 +
  942 +                value = _mm_loadu_si128((__m128i *)p);
  943 +                result = _mm_subs_epu16(value, xmm_wsize);
  944 +                _mm_storeu_si128((__m128i *)p, result);
  945 +
  946 +                p -= 8;
  947 +                n -= 8;
  948 +            } while (n > 0);
  949 +
  950 +            n = wsize;
  951 +#ifndef FASTEST
  952 +            p = &s->prev[n];
  953 +            p -= 8;
  954 +            do {
  955 +                __m128i value, result;
  956 +
  957 +                value = _mm_loadu_si128((__m128i *)p);
  958 +                result = _mm_subs_epu16(value, xmm_wsize);
  959 +                _mm_storeu_si128((__m128i *)p, result);
  960 +
  961 +                p -= 8;
  962 +                n -= 8;
  963 +            } while (n > 0);
  964 +#endif
  965 +            more += wsize;
  966 +        }
  967 +        if (s->strm->avail_in == 0) break;
  968 +
  969 +        /* If there was no sliding:
  970 +         *    strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
  971 +         *    more == window_size - lookahead - strstart
  972 +         * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
  973 +         * => more >= window_size - 2*WSIZE + 2
  974 +         * In the BIG_MEM or MMAP case (not yet supported),
  975 +         *   window_size == input_size + MIN_LOOKAHEAD  &&
  976 +         *   strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
  977 +         * Otherwise, window_size == 2*WSIZE so more >= 2.
  978 +         * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
  979 +         */
  980 +        Assert(more >= 2, "more < 2");
  981 +
  982 +        n = deflate_read_buf(s->strm,
  983 +                             s->window + s->strstart + s->lookahead,
  984 +                             more);
  985 +        s->lookahead += n;
  986 +
  987 +        /* Initialize the hash value now that we have some input: */
  988 +        if (s->lookahead >= MIN_MATCH) {
  989 +            uInt str = s->strstart;
  990 +            s->ins_h = s->window[str];
  991 +            if (str >= 1)
  992 +                UPDATE_HASH(s, s->ins_h, str + 1 - (MIN_MATCH-1));
  993 +#if MIN_MATCH != 3
  994 +            Call UPDATE_HASH() MIN_MATCH-3 more times
  995 +#endif
  996 +        }
  997 +        /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
  998 +         * but this is not important since only literal bytes will be emitted.
  999 +         */
 1000 +
 1001 +    } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
 1002 +
 1003 +    /* If the WIN_INIT bytes after the end of the current data have never been
 1004 +     * written, then zero those bytes in order to avoid memory check reports of
 1005 +     * the use of uninitialized (or uninitialised as Julian writes) bytes by
 1006 +     * the longest match routines.  Update the high water mark for the next
 1007 +     * time through here.  WIN_INIT is set to MAX_MATCH since the longest match
 1008 +     * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
 1009 +     */
 1010 +    if (s->high_water < s->window_size) {
 1011 +        ulg curr = s->strstart + (ulg)(s->lookahead);
 1012 +        ulg init;
 1013 +
 1014 +        if (s->high_water < curr) {
 1015 +            /* Previous high water mark below current data -- zero WIN_INIT
 1016 +             * bytes or up to end of window, whichever is less.
 1017 +             */
 1018 +            init = s->window_size - curr;
 1019 +            if (init > WIN_INIT)
 1020 +                init = WIN_INIT;
 1021 +            zmemzero(s->window + curr, (unsigned)init);
 1022 +            s->high_water = curr + init;
 1023 +        }
 1024 +        else if (s->high_water < (ulg)curr + WIN_INIT) {
 1025 +            /* High water mark at or above current data, but below current data
 1026 +             * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
 1027 +             * to end of window, whichever is less.
 1028 +             */
 1029 +            init = (ulg)curr + WIN_INIT - s->high_water;
 1030 +            if (init > s->window_size - s->high_water)
 1031 +                init = s->window_size - s->high_water;
 1032 +            zmemzero(s->window + s->high_water, (unsigned)init);
 1033 +            s->high_water += init;
 1034 +        }
 1035 +    }
 1036 +
 1037 +    Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
 1038 +           "not enough room for search");
 1039 +}
 1040 diff --git a/simd_stub.c b/simd_stub.c
 1041 new file mode 100644
 1042 index 000000000000..c6d46051498f
 1043 --- /dev/null
 1044 +++ b/simd_stub.c
 1045 @@ -0,0 +1,35 @@
 1046 +/* simd_stub.c -- stub implementations
 1047 +* Copyright (C) 2014 Intel Corporation
 1048 +* For conditions of distribution and use, see copyright notice in zlib.h
 1049 +*/
 1050 +#include <assert.h>
 1051 +
 1052 +#include "deflate.h"
 1053 +#include "x86.h"
 1054 +
 1055 +int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
 1056 +
 1057 +void ZLIB_INTERNAL crc_fold_init(deflate_state *const s) {
 1058 +    assert(0);
 1059 +}
 1060 +
 1061 +void ZLIB_INTERNAL crc_fold_copy(deflate_state *const s,
 1062 +                                 unsigned char *dst,
 1063 +                                 const unsigned char *src,
 1064 +                                 long len) {
 1065 +    assert(0);
 1066 +}
 1067 +
 1068 +unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
 1069 +    assert(0);
 1070 +    return 0;
 1071 +}
 1072 +
 1073 +void ZLIB_INTERNAL fill_window_sse(deflate_state *s)
 1074 +{
 1075 +    assert(0);
 1076 +}
 1077 +
 1078 +void x86_check_features(void)
 1079 +{
 1080 +}
 1081 diff --git a/x86.c b/x86.c
 1082 new file mode 100644
 1083 index 000000000000..e56fe8b85a39
 1084 --- /dev/null
 1085 +++ b/x86.c
 1086 @@ -0,0 +1,92 @@
 1087 +/*
 1088 + * x86 feature check
 1089 + *
 1090 + * Copyright (C) 2013 Intel Corporation. All rights reserved.
 1091 + * Author:
 1092 + *  Jim Kukunas
 1093 + * 
 1094 + * For conditions of distribution and use, see copyright notice in zlib.h
 1095 + */
 1096 +
 1097 +#include "x86.h"
 1098 +#include "zutil.h"
 1099 +
 1100 +int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
 1101 +
 1102 +#ifndef _MSC_VER
 1103 +#include <pthread.h>
 1104 +
 1105 +pthread_once_t cpu_check_inited_once = PTHREAD_ONCE_INIT;
 1106 +static void _x86_check_features(void);
 1107 +
 1108 +void x86_check_features(void)
 1109 +{
 1110 +  pthread_once(&cpu_check_inited_once, _x86_check_features);
 1111 +}
 1112 +
 1113 +static void _x86_check_features(void)
 1114 +{
 1115 +    int x86_cpu_has_sse2;
 1116 +    int x86_cpu_has_sse42;
 1117 +    int x86_cpu_has_pclmulqdq;
 1118 +    unsigned eax, ebx, ecx, edx;
 1119 +
 1120 +    eax = 1;
 1121 +#ifdef __i386__
 1122 +    __asm__ __volatile__ (
 1123 +        "xchg %%ebx, %1\n\t"
 1124 +        "cpuid\n\t"
 1125 +        "xchg %1, %%ebx\n\t"
 1126 +    : "+a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx)
 1127 +    );
 1128 +#else
 1129 +    __asm__ __volatile__ (
 1130 +        "cpuid\n\t"
 1131 +    : "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
 1132 +    );
 1133 +#endif  /* (__i386__) */
 1134 +
 1135 +    x86_cpu_has_sse2 = edx & 0x4000000;
 1136 +    x86_cpu_has_sse42 = ecx & 0x100000;
 1137 +    x86_cpu_has_pclmulqdq = ecx & 0x2;
 1138 +
 1139 +    x86_cpu_enable_simd = x86_cpu_has_sse2 &&
 1140 +                          x86_cpu_has_sse42 &&
 1141 +                          x86_cpu_has_pclmulqdq;
 1142 +}
 1143 +#else
 1144 +#include <intrin.h>
 1145 +#include <windows.h>
 1146 +
 1147 +static BOOL CALLBACK _x86_check_features(PINIT_ONCE once,
 1148 +                                         PVOID param,
 1149 +                                         PVOID *context);
 1150 +static INIT_ONCE cpu_check_inited_once = INIT_ONCE_STATIC_INIT;
 1151 +
 1152 +void x86_check_features(void)
 1153 +{
 1154 +    InitOnceExecuteOnce(&cpu_check_inited_once, _x86_check_features,
 1155 +                        NULL, NULL);
 1156 +}
 1157 +
 1158 +static BOOL CALLBACK _x86_check_features(PINIT_ONCE once,
 1159 +                                         PVOID param,
 1160 +                                         PVOID *context)
 1161 +{
 1162 +    int x86_cpu_has_sse2;
 1163 +    int x86_cpu_has_sse42;
 1164 +    int x86_cpu_has_pclmulqdq;
 1165 +    int regs[4];
 1166 +
 1167 +    __cpuid(regs, 1);
 1168 +
 1169 +    x86_cpu_has_sse2 = regs[3] & 0x4000000;
 1170 +    x86_cpu_has_sse42= regs[2] & 0x100000;
 1171 +    x86_cpu_has_pclmulqdq = regs[2] & 0x2;
 1172 +
 1173 +    x86_cpu_enable_simd = x86_cpu_has_sse2 &&
 1174 +                          x86_cpu_has_sse42 &&
 1175 +                          x86_cpu_has_pclmulqdq;
 1176 +    return TRUE;
 1177 +}
 1178 +#endif  /* _MSC_VER */
 1179 diff --git a/x86.h b/x86.h
 1180 new file mode 100644
 1181 index 000000000000..ebcf10ab09d2
 1182 --- /dev/null
 1183 +++ b/x86.h
 1184 @@ -0,0 +1,15 @@
 1185 +/* x86.h -- check for x86 CPU features
 1186 +* Copyright (C) 2013 Intel Corporation Jim Kukunas
 1187 +* For conditions of distribution and use, see copyright notice in zlib.h
 1188 +*/
 1189 +
 1190 +#ifndef X86_H
 1191 +#define X86_H
 1192 +
 1193 +#include "zlib.h"
 1194 +
 1195 +extern int x86_cpu_enable_simd;
 1196 +
 1197 +void x86_check_features(void);
 1198 +
 1199 +#endif  /* X86_H */
 1200 diff --git a/zutil.h b/zutil.h
 1201 index 80375b8b6109..4425bcf75eb3 100644
 1202 --- a/zutil.h
 1203 +++ b/zutil.h
 1204 @@ -283,4 +283,10 @@ extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
 1205  #define ZSWAP32(q) ((((q) >> 24) & 0xff) + (((q) >> 8) & 0xff00) + \
 1206                      (((q) & 0xff00) << 8) + (((q) & 0xff) << 24))
 1207  
 1208 +#ifdef _MSC_VER
 1209 +#define zalign(x) __declspec(align(x))
 1210 +#else
 1211 +#define zalign(x) __attribute__((aligned((x))))
 1212 +#endif
 1213 +
 1214  #endif /* ZUTIL_H */