"Fossies" - the Fresh Open Source Software Archive

Member "hashcat-6.2.6/deps/zlib/contrib/amd64/amd64-match.S" (2 Sep 2022, 12418 Bytes) of package /linux/privat/hashcat-6.2.6.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) PowerPC Assembler source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file.

    1 /*
    2  * match.S -- optimized version of longest_match()
    3  * based on the similar work by Gilles Vollant, and Brian Raiter, written 1998
    4  *
    5  * This is free software; you can redistribute it and/or modify it
    6  * under the terms of the BSD License. Use by owners of Che Guevarra
    7  * parafernalia is prohibited, where possible, and highly discouraged
    8  * elsewhere.
    9  */
   10 
   11 #ifndef NO_UNDERLINE
   12 #   define  match_init  _match_init
   13 #   define  longest_match   _longest_match
   14 #endif
   15 
   16 #define scanend     ebx
   17 #define scanendw    bx
   18 #define chainlenwmask   edx /* high word: current chain len low word: s->wmask */
   19 #define curmatch    rsi
   20 #define curmatchd   esi
   21 #define windowbestlen   r8
   22 #define scanalign   r9
   23 #define scanalignd  r9d
   24 #define window      r10
   25 #define bestlen     r11
   26 #define bestlend    r11d
   27 #define scanstart   r12d
   28 #define scanstartw  r12w
   29 #define scan        r13
   30 #define nicematch   r14d
   31 #define limit       r15
   32 #define limitd      r15d
   33 #define prev        rcx
   34 
   35 /*
   36  * The 258 is a "magic number, not a parameter -- changing it
   37  * breaks the hell loose
   38  */
   39 #define MAX_MATCH   (258)
   40 #define MIN_MATCH   (3)
   41 #define MIN_LOOKAHEAD   (MAX_MATCH + MIN_MATCH + 1)
   42 #define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7)
   43 
   44 /* stack frame offsets */
   45 #define LocalVarsSize   (112)
   46 #define _chainlenwmask  ( 8-LocalVarsSize)(%rsp)
   47 #define _windowbestlen  (16-LocalVarsSize)(%rsp)
   48 #define save_r14        (24-LocalVarsSize)(%rsp)
   49 #define save_rsi        (32-LocalVarsSize)(%rsp)
   50 #define save_rbx        (40-LocalVarsSize)(%rsp)
   51 #define save_r12        (56-LocalVarsSize)(%rsp)
   52 #define save_r13        (64-LocalVarsSize)(%rsp)
   53 #define save_r15        (80-LocalVarsSize)(%rsp)
   54 
   55 
   56 .globl  match_init, longest_match
   57 
   58 /*
   59  * On AMD64 the first argument of a function (in our case -- the pointer to
   60  * deflate_state structure) is passed in %rdi, hence our offsets below are
   61  * all off of that.
   62  */
   63 
   64 /* you can check the structure offset by running
   65 
   66 #include <stdlib.h>
   67 #include <stdio.h>
   68 #include "deflate.h"
   69 
   70 void print_depl()
   71 {
   72 deflate_state ds;
   73 deflate_state *s=&ds;
   74 printf("size pointer=%u\n",(int)sizeof(void*));
   75 
   76 printf("#define dsWSize         (%3u)(%%rdi)\n",(int)(((char*)&(s->w_size))-((char*)s)));
   77 printf("#define dsWMask         (%3u)(%%rdi)\n",(int)(((char*)&(s->w_mask))-((char*)s)));
   78 printf("#define dsWindow        (%3u)(%%rdi)\n",(int)(((char*)&(s->window))-((char*)s)));
   79 printf("#define dsPrev          (%3u)(%%rdi)\n",(int)(((char*)&(s->prev))-((char*)s)));
   80 printf("#define dsMatchLen      (%3u)(%%rdi)\n",(int)(((char*)&(s->match_length))-((char*)s)));
   81 printf("#define dsPrevMatch     (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_match))-((char*)s)));
   82 printf("#define dsStrStart      (%3u)(%%rdi)\n",(int)(((char*)&(s->strstart))-((char*)s)));
   83 printf("#define dsMatchStart    (%3u)(%%rdi)\n",(int)(((char*)&(s->match_start))-((char*)s)));
   84 printf("#define dsLookahead     (%3u)(%%rdi)\n",(int)(((char*)&(s->lookahead))-((char*)s)));
   85 printf("#define dsPrevLen       (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_length))-((char*)s)));
   86 printf("#define dsMaxChainLen   (%3u)(%%rdi)\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
   87 printf("#define dsGoodMatch     (%3u)(%%rdi)\n",(int)(((char*)&(s->good_match))-((char*)s)));
   88 printf("#define dsNiceMatch     (%3u)(%%rdi)\n",(int)(((char*)&(s->nice_match))-((char*)s)));
   89 }
   90 
   91 */
   92 
   93 
   94 /*
   95   to compile for XCode 3.2 on MacOSX x86_64
   96   - run "gcc -g -c -DXCODE_MAC_X64_STRUCTURE amd64-match.S"
   97  */
   98 
   99 
  100 #ifndef CURRENT_LINX_XCODE_MAC_X64_STRUCTURE
  101 #define dsWSize     ( 68)(%rdi)
  102 #define dsWMask     ( 76)(%rdi)
  103 #define dsWindow    ( 80)(%rdi)
  104 #define dsPrev      ( 96)(%rdi)
  105 #define dsMatchLen  (144)(%rdi)
  106 #define dsPrevMatch (148)(%rdi)
  107 #define dsStrStart  (156)(%rdi)
  108 #define dsMatchStart    (160)(%rdi)
  109 #define dsLookahead (164)(%rdi)
  110 #define dsPrevLen   (168)(%rdi)
  111 #define dsMaxChainLen   (172)(%rdi)
  112 #define dsGoodMatch (188)(%rdi)
  113 #define dsNiceMatch (192)(%rdi)
  114 
  115 #else 
  116 
  117 #ifndef STRUCT_OFFSET
  118 #   define STRUCT_OFFSET    (0)
  119 #endif
  120 
  121 
  122 #define dsWSize     ( 56 + STRUCT_OFFSET)(%rdi)
  123 #define dsWMask     ( 64 + STRUCT_OFFSET)(%rdi)
  124 #define dsWindow    ( 72 + STRUCT_OFFSET)(%rdi)
  125 #define dsPrev      ( 88 + STRUCT_OFFSET)(%rdi)
  126 #define dsMatchLen  (136 + STRUCT_OFFSET)(%rdi)
  127 #define dsPrevMatch (140 + STRUCT_OFFSET)(%rdi)
  128 #define dsStrStart  (148 + STRUCT_OFFSET)(%rdi)
  129 #define dsMatchStart    (152 + STRUCT_OFFSET)(%rdi)
  130 #define dsLookahead (156 + STRUCT_OFFSET)(%rdi)
  131 #define dsPrevLen   (160 + STRUCT_OFFSET)(%rdi)
  132 #define dsMaxChainLen   (164 + STRUCT_OFFSET)(%rdi)
  133 #define dsGoodMatch (180 + STRUCT_OFFSET)(%rdi)
  134 #define dsNiceMatch (184 + STRUCT_OFFSET)(%rdi)
  135 
  136 #endif
  137 
  138 
  139 
  140 
  141 .text
  142 
  143 /* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
  144 
  145 longest_match:
  146 /*
  147  * Retrieve the function arguments. %curmatch will hold cur_match
  148  * throughout the entire function (passed via rsi on amd64).
  149  * rdi will hold the pointer to the deflate_state (first arg on amd64)
  150  */
  151         mov     %rsi, save_rsi
  152         mov     %rbx, save_rbx
  153         mov %r12, save_r12
  154         mov     %r13, save_r13
  155         mov     %r14, save_r14
  156         mov     %r15, save_r15
  157 
  158 /* uInt wmask = s->w_mask;                      */
  159 /* unsigned chain_length = s->max_chain_length;             */
  160 /* if (s->prev_length >= s->good_match) {               */
  161 /*     chain_length >>= 2;                      */
  162 /* }                                    */
  163 
  164         movl    dsPrevLen, %eax
  165         movl    dsGoodMatch, %ebx
  166         cmpl    %ebx, %eax
  167         movl    dsWMask, %eax
  168         movl    dsMaxChainLen, %chainlenwmask
  169         jl  LastMatchGood
  170         shrl    $2, %chainlenwmask
  171 LastMatchGood:
  172 
  173 /* chainlen is decremented once beforehand so that the function can */
  174 /* use the sign flag instead of the zero flag for the exit test.    */
  175 /* It is then shifted into the high word, to make room for the wmask    */
  176 /* value, which it will always accompany.               */
  177 
  178         decl    %chainlenwmask
  179         shll    $16, %chainlenwmask
  180         orl %eax, %chainlenwmask
  181 
  182 /* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;  */
  183 
  184         movl    dsNiceMatch, %eax
  185         movl    dsLookahead, %ebx
  186         cmpl    %eax, %ebx
  187         jl  LookaheadLess
  188         movl    %eax, %ebx
  189 LookaheadLess:  movl    %ebx, %nicematch
  190 
  191 /* register Bytef *scan = s->window + s->strstart;          */
  192 
  193         mov dsWindow, %window
  194         movl    dsStrStart, %limitd
  195         lea (%limit, %window), %scan
  196 
  197 /* Determine how many bytes the scan ptr is off from being      */
  198 /* dword-aligned.                           */
  199 
  200         mov %scan, %scanalign
  201         negl    %scanalignd
  202         andl    $3, %scanalignd
  203 
  204 /* IPos limit = s->strstart > (IPos)MAX_DIST(s) ?           */
  205 /*     s->strstart - (IPos)MAX_DIST(s) : NIL;               */
  206 
  207         movl    dsWSize, %eax
  208         subl    $MIN_LOOKAHEAD, %eax
  209         xorl    %ecx, %ecx
  210         subl    %eax, %limitd
  211         cmovng  %ecx, %limitd
  212 
  213 /* int best_len = s->prev_length;                   */
  214 
  215         movl    dsPrevLen, %bestlend
  216 
  217 /* Store the sum of s->window + best_len in %windowbestlen locally, and in memory.  */
  218 
  219         lea (%window, %bestlen), %windowbestlen
  220         mov %windowbestlen, _windowbestlen
  221 
  222 /* register ush scan_start = *(ushf*)scan;              */
  223 /* register ush scan_end   = *(ushf*)(scan+best_len-1);         */
  224 /* Posf *prev = s->prev;                        */
  225 
  226         movzwl  (%scan), %scanstart
  227         movzwl  -1(%scan, %bestlen), %scanend
  228         mov dsPrev, %prev
  229 
  230 /* Jump into the main loop.                     */
  231 
  232         movl    %chainlenwmask, _chainlenwmask
  233         jmp LoopEntry
  234 
  235 .balign 16
  236 
  237 /* do {
  238  *     match = s->window + cur_match;
  239  *     if (*(ushf*)(match+best_len-1) != scan_end ||
  240  *         *(ushf*)match != scan_start) continue;
  241  *     [...]
  242  * } while ((cur_match = prev[cur_match & wmask]) > limit
  243  *          && --chain_length != 0);
  244  *
  245  * Here is the inner loop of the function. The function will spend the
  246  * majority of its time in this loop, and majority of that time will
  247  * be spent in the first ten instructions.
  248  */
  249 LookupLoop:
  250         andl    %chainlenwmask, %curmatchd
  251         movzwl  (%prev, %curmatch, 2), %curmatchd
  252         cmpl    %limitd, %curmatchd
  253         jbe LeaveNow
  254         subl    $0x00010000, %chainlenwmask
  255         js  LeaveNow
  256 LoopEntry:  cmpw    -1(%windowbestlen, %curmatch), %scanendw
  257         jne LookupLoop
  258         cmpw    %scanstartw, (%window, %curmatch)
  259         jne LookupLoop
  260 
  261 /* Store the current value of chainlen.                 */
  262         movl    %chainlenwmask, _chainlenwmask
  263 
  264 /* %scan is the string under scrutiny, and %prev to the string we   */
  265 /* are hoping to match it up with. In actuality, %esi and %edi are  */
  266 /* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is  */
  267 /* initialized to -(MAX_MATCH_8 - scanalign).               */
  268 
  269         mov $(-MAX_MATCH_8), %rdx
  270         lea (%curmatch, %window), %windowbestlen
  271         lea MAX_MATCH_8(%windowbestlen, %scanalign), %windowbestlen
  272         lea MAX_MATCH_8(%scan, %scanalign), %prev
  273 
  274 /* the prefetching below makes very little difference... */
  275         prefetcht1  (%windowbestlen, %rdx)
  276         prefetcht1  (%prev, %rdx)
  277 
  278 /*
  279  * Test the strings for equality, 8 bytes at a time. At the end,
  280  * adjust %rdx so that it is offset to the exact byte that mismatched.
  281  *
  282  * It should be confessed that this loop usually does not represent
  283  * much of the total running time. Replacing it with a more
  284  * straightforward "rep cmpsb" would not drastically degrade
  285  * performance -- unrolling it, for example, makes no difference.
  286  */
  287 
  288 #undef USE_SSE  /* works, but is 6-7% slower, than non-SSE... */
  289 
  290 LoopCmps:
  291 #ifdef USE_SSE
  292         /* Preload the SSE registers */
  293         movdqu    (%windowbestlen, %rdx), %xmm1
  294         movdqu    (%prev, %rdx), %xmm2
  295         pcmpeqb %xmm2, %xmm1
  296         movdqu  16(%windowbestlen, %rdx), %xmm3
  297         movdqu  16(%prev, %rdx), %xmm4
  298         pcmpeqb %xmm4, %xmm3
  299         movdqu  32(%windowbestlen, %rdx), %xmm5
  300         movdqu  32(%prev, %rdx), %xmm6
  301         pcmpeqb %xmm6, %xmm5
  302         movdqu  48(%windowbestlen, %rdx), %xmm7
  303         movdqu  48(%prev, %rdx), %xmm8
  304         pcmpeqb %xmm8, %xmm7
  305 
  306         /* Check the comparisions' results */
  307         pmovmskb %xmm1, %rax
  308         notw    %ax
  309         bsfw    %ax, %ax
  310         jnz LeaveLoopCmps
  311         
  312         /* this is the only iteration of the loop with a possibility of having
  313            incremented rdx by 0x108 (each loop iteration add 16*4 = 0x40 
  314            and (0x40*4)+8=0x108 */
  315         add $8, %rdx
  316         jz LenMaximum
  317         add $8, %rdx
  318 
  319         
  320         pmovmskb %xmm3, %rax
  321         notw    %ax
  322         bsfw    %ax, %ax
  323         jnz LeaveLoopCmps
  324         
  325         
  326         add $16, %rdx
  327 
  328 
  329         pmovmskb %xmm5, %rax
  330         notw    %ax
  331         bsfw    %ax, %ax
  332         jnz LeaveLoopCmps
  333         
  334         add $16, %rdx
  335 
  336 
  337         pmovmskb %xmm7, %rax
  338         notw    %ax
  339         bsfw    %ax, %ax
  340         jnz LeaveLoopCmps
  341         
  342         add $16, %rdx
  343         
  344         jmp LoopCmps
  345 LeaveLoopCmps:  add %rax, %rdx
  346 #else
  347         mov (%windowbestlen, %rdx), %rax
  348         xor (%prev, %rdx), %rax
  349         jnz LeaveLoopCmps
  350         
  351         mov 8(%windowbestlen, %rdx), %rax
  352         xor 8(%prev, %rdx), %rax
  353         jnz LeaveLoopCmps8
  354 
  355         mov 16(%windowbestlen, %rdx), %rax
  356         xor 16(%prev, %rdx), %rax
  357         jnz LeaveLoopCmps16
  358                 
  359         add $24, %rdx
  360         jnz LoopCmps
  361         jmp LenMaximum
  362 #   if 0
  363 /*
  364  * This three-liner is tantalizingly simple, but bsf is a slow instruction,
  365  * and the complicated alternative down below is quite a bit faster. Sad...
  366  */
  367 
  368 LeaveLoopCmps:  bsf %rax, %rax /* find the first non-zero bit */
  369         shrl    $3, %eax /* divide by 8 to get the byte */
  370         add %rax, %rdx
  371 #   else
  372 LeaveLoopCmps16:
  373         add $8, %rdx
  374 LeaveLoopCmps8:
  375         add $8, %rdx
  376 LeaveLoopCmps:  testl   $0xFFFFFFFF, %eax /* Check the first 4 bytes */
  377         jnz     Check16
  378         add     $4, %rdx
  379         shr     $32, %rax
  380 Check16:        testw   $0xFFFF, %ax
  381         jnz     LenLower
  382         add $2, %rdx
  383         shrl    $16, %eax
  384 LenLower:   subb    $1, %al
  385         adc $0, %rdx
  386 #   endif
  387 #endif
  388 
  389 /* Calculate the length of the match. If it is longer than MAX_MATCH,   */
  390 /* then automatically accept it as the best possible match and leave.   */
  391 
  392         lea (%prev, %rdx), %rax
  393         sub %scan, %rax
  394         cmpl    $MAX_MATCH, %eax
  395         jge LenMaximum
  396 
  397 /* If the length of the match is not longer than the best match we  */
  398 /* have so far, then forget it and return to the lookup loop.       */
  399 
  400         cmpl    %bestlend, %eax
  401         jg  LongerMatch
  402         mov _windowbestlen, %windowbestlen
  403         mov dsPrev, %prev
  404         movl    _chainlenwmask, %edx
  405         jmp LookupLoop
  406 
  407 /*         s->match_start = cur_match;                  */
  408 /*         best_len = len;                      */
  409 /*         if (len >= nice_match) break;                */
  410 /*         scan_end = *(ushf*)(scan+best_len-1);            */
  411 
  412 LongerMatch:
  413         movl    %eax, %bestlend
  414         movl    %curmatchd, dsMatchStart
  415         cmpl    %nicematch, %eax
  416         jge LeaveNow
  417 
  418         lea (%window, %bestlen), %windowbestlen
  419         mov %windowbestlen, _windowbestlen
  420 
  421         movzwl  -1(%scan, %rax), %scanend
  422         mov dsPrev, %prev
  423         movl    _chainlenwmask, %chainlenwmask
  424         jmp LookupLoop
  425 
  426 /* Accept the current string, with the maximum possible length.     */
  427 
  428 LenMaximum:
  429         movl    $MAX_MATCH, %bestlend
  430         movl    %curmatchd, dsMatchStart
  431 
  432 /* if ((uInt)best_len <= s->lookahead) return (uInt)best_len;       */
  433 /* return s->lookahead;                         */
  434 
  435 LeaveNow:
  436         movl    dsLookahead, %eax
  437         cmpl    %eax, %bestlend
  438         cmovngl %bestlend, %eax
  439 LookaheadRet:
  440 
  441 /* Restore the registers and return from whence we came.            */
  442 
  443     mov save_rsi, %rsi
  444     mov save_rbx, %rbx
  445     mov save_r12, %r12
  446     mov save_r13, %r13
  447     mov save_r14, %r14
  448     mov save_r15, %r15
  449 
  450     ret
  451 
  452 match_init: ret