"Fossies" - the Fresh Open Source Software Archive

Member "zsync-0.6.2/librcksum/rsum.c" (16 Sep 2010, 17657 Bytes) of package /linux/privat/old/zsync-0.6.2.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "rsum.c" see the Fossies "Dox" file reference documentation.

    1 
    2 /*
    3  *   rcksum/lib - library for using the rsync algorithm to determine
    4  *               which parts of a file you have and which you need.
    5  *   Copyright (C) 2004,2005,2007,2009 Colin Phipps <cph@moria.org.uk>
    6  *
    7  *   This program is free software; you can redistribute it and/or modify
    8  *   it under the terms of the Artistic License v2 (see the accompanying 
    9  *   file COPYING for the full license terms), or, at your option, any later 
   10  *   version of the same license.
   11  *
   12  *   This program is distributed in the hope that it will be useful,
   13  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
   14  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   15  *   COPYING file for details.
   16  */
   17 
   18 /* This is the core of the rsync rolling checksum algorithm - this is what it's
   19  * all about. */
   20 
   21 #include "zsglobal.h"
   22 
   23 #include <stdio.h>
   24 #include <stdlib.h>
   25 #include <string.h>
   26 #include <errno.h>
   27 #include <unistd.h>
   28 
   29 #ifdef WITH_DMALLOC
   30 # include <dmalloc.h>
   31 #endif
   32 
   33 #include "md4.h"
   34 #include "rcksum.h"
   35 #include "internal.h"
   36 
   37 #define UPDATE_RSUM(a, b, oldc, newc, bshift) do { (a) += ((unsigned char)(newc)) - ((unsigned char)(oldc)); (b) += (a) - ((oldc) << (bshift)); } while (0)
   38 
   39 /* rcksum_calc_rsum_block(data, data_len)
   40  * Calculate the rsum for a single block of data. */
   41 struct rsum __attribute__ ((pure)) rcksum_calc_rsum_block(const unsigned char *data, size_t len) {
   42     register unsigned short a = 0;
   43     register unsigned short b = 0;
   44 
   45     while (len) {
   46         unsigned char c = *data++;
   47         a += c;
   48         b += len * c;
   49         len--;
   50     }
   51     {
   52         struct rsum r = { a, b };
   53         return r;
   54     }
   55 }
   56 
   57 /* rcksum_calc_checksum(checksum_buf, data, data_len)
   58  * Returns the MD4 checksum (in checksum_buf) of the given data block */
   59 void rcksum_calc_checksum(unsigned char *c, const unsigned char *data,
   60                           size_t len) {
   61     MD4_CTX ctx;
   62     MD4Init(&ctx);
   63     MD4Update(&ctx, data, len);
   64     MD4Final(c, &ctx);
   65 }
   66 
   67 #ifndef HAVE_PWRITE
   68 /* Fallback pwrite(2) implementation if needed (but not strictly complete, as
   69  * it moves the file pointer - we don't care). */
   70 ssize_t pwrite(int d, const void *buf, size_t nbytes, off_t offset) {
   71     if (lseek(d, offset, SEEK_SET) == -1)
   72         return -1;
   73     return write(d, buf, nbytes);
   74 }
   75 #endif
   76 
   77 /* write_blocks(rcksum_state, buf, startblock, endblock)
   78  * Writes the block range (inclusive) from the supplied buffer to our
   79  * under-construction output file */
   80 static void write_blocks(struct rcksum_state *z, const unsigned char *data,
   81                          zs_blockid bfrom, zs_blockid bto) {
   82     off_t len = ((off_t) (bto - bfrom + 1)) << z->blockshift;
   83     off_t offset = ((off_t) bfrom) << z->blockshift;
   84 
   85     while (len) {
   86         size_t l = len;
   87         int rc;
   88 
   89         /* On some platforms, the bytes-to-write could be more than pwrite(2)
   90          * will accept. Write in blocks of 2^31 bytes in that case. */
   91         if ((off_t) l < len)
   92             l = 0x8000000;
   93 
   94         /* Write */
   95         rc = pwrite(z->fd, data, l, offset);
   96         if (rc == -1) {
   97             fprintf(stderr, "IO error: %s\n", strerror(errno));
   98             exit(-1);
   99         }
  100 
  101         /* Keep track of any data still to do */
  102         len -= rc;
  103         if (len) {              /* More to write */
  104             data += rc;
  105             offset += rc;
  106         }
  107     }
  108 
  109     {   /* Having written those blocks, discard them from the rsum hashes (as
  110          * we don't need to identify data for those blocks again, and this may
  111          * speed up lookups (in particular if there are lots of identical
  112          * blocks), and add the written blocks to the record of blocks that we
  113          * have received and stored the data for */
  114         int id;
  115         for (id = bfrom; id <= bto; id++) {
  116             remove_block_from_hash(z, id);
  117             add_to_ranges(z, id);
  118         }
  119     }
  120 }
  121 
  122 /* rcksum_read_known_data(self, buf, offset, len)
  123  * Read back data from the working output - read len bytes from offset into
  124  * buf[] (which must be at least len bytes long) */
  125 int rcksum_read_known_data(struct rcksum_state *z, unsigned char *buf,
  126                            off_t offset, size_t len) {
  127     int rc = pread(z->fd, buf, len, offset);
  128     return rc;
  129 }
  130 
  131 /* rcksum_submit_blocks(self, data, startblock, endblock)
  132  * The data in data[] (which should be (endblock - startblock + 1) * blocksize * bytes)
  133  * is tested block-by-block as valid data against the target checksums for
  134  * those blocks and, if valid, accepted and written to the working output.
  135  *
  136  * Use this when you have obtained data that you know corresponds to given
  137  * blocks in the output file (i.e. you've downloaded them from a real copy of
  138  * the target).
  139  */
  140 int rcksum_submit_blocks(struct rcksum_state *const z, const unsigned char *data,
  141                          zs_blockid bfrom, zs_blockid bto) {
  142     zs_blockid x;
  143     unsigned char md4sum[CHECKSUM_SIZE];
  144 
  145     /* Build checksum hash tables if we don't have them yet */
  146     if (!z->rsum_hash)
  147         if (!build_hash(z))
  148             return -1;
  149 
  150     /* Check each block */
  151     for (x = bfrom; x <= bto; x++) {
  152         rcksum_calc_checksum(&md4sum[0], data + ((x - bfrom) << z->blockshift),
  153                              z->blocksize);
  154         if (memcmp(&md4sum, &(z->blockhashes[x].checksum[0]), z->checksum_bytes)) {
  155             if (x > bfrom)      /* Write any good blocks we did get */
  156                 write_blocks(z, data, bfrom, x - 1);
  157             return -1;
  158         }
  159     }
  160 
  161     /* All blocks are valid; write them and update our state */
  162     write_blocks(z, data, bfrom, bto);
  163     return 0;
  164 }
  165 
  166 /* check_checksums_on_hash_chain(self, &hash_entry, data[], onlyone)
  167  * Given a hash table entry, check the data in this block against every entry
  168  * in the linked list for this hash entry, checking the checksums for this
  169  * block against those recorded in the hash entries.
  170  *
  171  * If we get a hit (checksums match a desired block), write the data to that
  172  * block in the target file and update our state accordingly to indicate that
  173  * we have got that block successfully.
  174  *
  175  * Return the number of blocks successfully obtained.
  176  */
  177 static int check_checksums_on_hash_chain(struct rcksum_state *const z,
  178                                          const struct hash_entry *e,
  179                                          const unsigned char *data,
  180                                          int onlyone) {
  181     unsigned char md4sum[2][CHECKSUM_SIZE];
  182     signed int done_md4 = -1;
  183     int got_blocks = 0;
  184     register struct rsum r = z->r[0];
  185 
  186     /* This is a hint to the caller that they should try matching the next
  187      * block against a particular hash entry (because at least z->seq_matches
  188      * prior blocks to it matched in sequence). Clear it here and set it below
  189      * if and when we get such a set of matches. */
  190     z->next_match = NULL;
  191 
  192     /* This is essentially a for (;e;e=e->next), but we want to remove links from
  193      * the list as we find matches, without keeping too many temp variables.
  194      */
  195     z->rover = e;
  196     while (z->rover) {
  197         zs_blockid id;
  198 
  199         e = z->rover;
  200         z->rover = onlyone ? NULL : e->next;
  201 
  202         /* Check weak checksum first */
  203 
  204         z->stats.hashhit++;
  205         if (e->r.a != (r.a & z->rsum_a_mask) || e->r.b != r.b) {
  206             continue;
  207         }
  208 
  209         id = get_HE_blockid(z, e);
  210 
  211         if (!onlyone && z->seq_matches > 1
  212             && (z->blockhashes[id + 1].r.a != (z->r[1].a & z->rsum_a_mask)
  213                 || z->blockhashes[id + 1].r.b != z->r[1].b))
  214             continue;
  215 
  216         z->stats.weakhit++;
  217 
  218         {
  219             int ok = 1;
  220             signed int check_md4 = 0;
  221             zs_blockid next_known = -1;
  222 
  223             /* This block at least must match; we must match at least
  224              * z->seq_matches-1 others, which could either be trailing stuff,
  225              * or these could be preceding blocks that we have verified
  226              * already. */
  227             do {
  228                 /* We only calculate the MD4 once we need it; but need not do so twice */
  229                 if (check_md4 > done_md4) {
  230                     rcksum_calc_checksum(&md4sum[check_md4][0],
  231                                          data + z->blocksize * check_md4,
  232                                          z->blocksize);
  233                     done_md4 = check_md4;
  234                     z->stats.checksummed++;
  235                 }
  236 
  237                 /* Now check the strong checksum for this block */
  238                 if (memcmp(&md4sum[check_md4],
  239                      z->blockhashes[id + check_md4].checksum,
  240                      z->checksum_bytes))
  241                     ok = 0;
  242 
  243                 else if (next_known == -1)
  244 
  245                 check_md4++;
  246             } while (ok && !onlyone && check_md4 < z->seq_matches);
  247 
  248             if (ok) {
  249                 int num_write_blocks;
  250 
  251                 /* Find the next block that we already have data for. If this
  252                  * is part of a run of matches then we have this stored already
  253                  * as ->next_known. */
  254                 zs_blockid next_known = onlyone ? z->next_known : next_known_block(z, id);
  255 
  256                 z->stats.stronghit += check_md4;
  257 
  258                 if (next_known > id + check_md4) {
  259                     num_write_blocks = check_md4;
  260 
  261                     /* Save state for this run of matches */
  262                     z->next_match = &(z->blockhashes[id + check_md4]);
  263                     if (!onlyone) z->next_known = next_known;
  264                 }
  265                 else {
  266                     /* We've reached the EOF, or data we already know. Just
  267                      * write out the blocks we don't know, and that's the end
  268                      * of this run of matches. */
  269                     num_write_blocks = next_known - id;
  270                 }
  271 
  272                 /* Write out the matched blocks that we don't yet know */
  273                 write_blocks(z, data, id, id + num_write_blocks - 1);
  274                 got_blocks += num_write_blocks;
  275             }
  276         }
  277     }
  278     return got_blocks;
  279 }
  280 
  281 /* rcksum_submit_source_data(self, data, datalen, offset)
  282  * Reads the supplied data (length datalen) and identifies any contained blocks
  283  * of data that can be used to make up the target file.
  284  *
  285  * offset should be 0 for a new data stream (or if our position in the data
  286  * stream has been changed and does not match the last call) or should be the
  287  * offset in the whole source stream otherwise.
  288  *
  289  * Returns the number of blocks in the target file that we obtained as a result
  290  * of reading this buffer. 
  291  *
  292  * IMPLEMENTATION:
  293  * We maintain the following state:
  294  * skip - the number of bytes to skip next time we enter rcksum_submit_source_data
  295  *        e.g. because we've just matched a block and the forward jump takes 
  296  *        us past the end of the buffer
  297  * r[0] - rolling checksum of the first blocksize bytes of the buffer
  298  * r[1] - rolling checksum of the next blocksize bytes of the buffer (if seq_matches > 1)
  299  */
  300 int rcksum_submit_source_data(struct rcksum_state *const z, unsigned char *data,
  301                               size_t len, off_t offset) {
  302     /* The window in data[] currently being considered is 
  303      * [x, x+bs)
  304      */
  305     int x = 0;
  306     register int bs = z->blocksize;
  307     int got_blocks = 0;
  308 
  309     if (offset) {
  310         x = z->skip;
  311     }
  312     else {
  313         z->next_match = NULL;
  314     }
  315 
  316     if (x || !offset) {
  317         z->r[0] = rcksum_calc_rsum_block(data + x, bs);
  318         if (z->seq_matches > 1)
  319             z->r[1] = rcksum_calc_rsum_block(data + x + bs, bs);
  320     }
  321     z->skip = 0;
  322 
  323     /* Work through the block until the current blocksize bytes being
  324      * considered, starting at x, is at the end of the buffer */
  325     for (;;) {
  326         if (x + z->context == len) {
  327             return got_blocks;
  328         }
  329 
  330 #if 0
  331         {   /* Catch rolling checksum failure */
  332             int k = 0;
  333             struct rsum c = rcksum_calc_rsum_block(data + x + bs * k, bs);
  334             if (c.a != z->r[k].a || c.b != z->r[k].b) {
  335                 fprintf(stderr, "rsum miscalc (%d) at %lld\n", k, offset + x);
  336                 exit(3);
  337             }
  338         }
  339 #endif
  340 
  341         {
  342             /* # of blocks of the output file we got from this data */
  343             int thismatch = 0;
  344             /* # of blocks to advance if thismatch > 0. Can be less than
  345              * thismatch as thismatch could be N*blocks_matched, if a block was
  346              * duplicated to multiple locations in the output file. */
  347             int blocks_matched = 0; 
  348 
  349             /* If the previous block was a match, but we're looking for
  350              * sequential matches, then test this block against the block in
  351              * the target immediately after our previous hit. */
  352             if (z->next_match && z->seq_matches > 1) {
  353                 if (0 != (thismatch = check_checksums_on_hash_chain(z, z->next_match, data + x, 1))) {
  354                     blocks_matched = 1;
  355                 }
  356             }
  357             if (!thismatch) {
  358                 const struct hash_entry *e;
  359 
  360                 /* Do a hash table lookup - first in the bithash (fast negative
  361                  * check) and then in the rsum hash */
  362                 unsigned hash = z->r[0].b;
  363                 hash ^= ((z->seq_matches > 1) ? z->r[1].b
  364                         : z->r[0].a & z->rsum_a_mask) << BITHASHBITS;
  365                 if ((z->bithash[(hash & z->bithashmask) >> 3] & (1 << (hash & 7))) != 0
  366                     && (e = z->rsum_hash[hash & z->hashmask]) != NULL) {
  367 
  368                     /* Okay, we have a hash hit. Follow the hash chain and
  369                      * check our block against all the entries. */
  370                     thismatch = check_checksums_on_hash_chain(z, e, data + x, 0);
  371                     if (thismatch)
  372                         blocks_matched = z->seq_matches;
  373                 }
  374             }
  375             got_blocks += thismatch;
  376 
  377             /* If we got a hit, skip forward (if a block in the target matches
  378              * at x, it's highly unlikely to get a hit at x+1 as all the
  379              * target's blocks are multiples of the blocksize apart. */
  380             if (blocks_matched) {
  381                 x += bs + (blocks_matched > 1 ? bs : 0);
  382 
  383                 if (x + z->context > len) {
  384                     /* can't calculate rsum for block after this one, because
  385                      * it's not in the buffer. So leave a hint for next time so
  386                      * we know we need to recalculate */
  387                     z->skip = x + z->context - len;
  388                     return got_blocks;
  389                 }
  390 
  391                 /* If we are moving forward just 1 block, we already have the
  392                  * following block rsum. If we are skipping both, then
  393                  * recalculate both */
  394                 if (z->seq_matches > 1 && blocks_matched == 1)
  395                     z->r[0] = z->r[1];
  396                 else
  397                     z->r[0] = rcksum_calc_rsum_block(data + x, bs);
  398                 if (z->seq_matches > 1)
  399                     z->r[1] = rcksum_calc_rsum_block(data + x + bs, bs);
  400                 continue;
  401             }
  402         }
  403 
  404         /* Else - advance the window by 1 byte - update the rolling checksum
  405          * and our offset in the buffer */
  406         {
  407             unsigned char Nc = data[x + bs * 2];
  408             unsigned char nc = data[x + bs];
  409             unsigned char oc = data[x];
  410             UPDATE_RSUM(z->r[0].a, z->r[0].b, oc, nc, z->blockshift);
  411             if (z->seq_matches > 1)
  412                 UPDATE_RSUM(z->r[1].a, z->r[1].b, nc, Nc, z->blockshift);
  413         }
  414         x++;
  415     }
  416 }
  417 
  418 /* rcksum_submit_source_file(self, stream, progress)
  419  * Read the given stream, applying the rsync rolling checksum algorithm to
  420  * identify any blocks of data in common with the target file. Blocks found are
  421  * written to our working target output. Progress reports if progress != 0
  422  */
  423 int rcksum_submit_source_file(struct rcksum_state *z, FILE * f, int progress) {
  424     /* Track progress */
  425     int got_blocks = 0;
  426     off_t in = 0;
  427     int in_mb = 0;
  428 
  429     /* Allocate buffer of 16 blocks */
  430     register int bufsize = z->blocksize * 16;
  431     unsigned char *buf = malloc(bufsize + z->context);
  432     if (!buf)
  433         return 0;
  434 
  435     /* Build checksum hash tables ready to analyse the blocks we find */
  436     if (!z->rsum_hash)
  437         if (!build_hash(z)) {
  438             free(buf);
  439             return 0;
  440         }
  441 
  442     while (!feof(f)) {
  443         size_t len;
  444         off_t start_in = in;
  445 
  446         /* If this is the start, fill the buffer for the first time */
  447         if (!in) {
  448             len = fread(buf, 1, bufsize, f);
  449             in += len;
  450         }
  451 
  452         /* Else, move the last context bytes from the end of the buffer to the
  453          * start, and refill the rest of the buffer from the stream. */
  454         else {
  455             memcpy(buf, buf + (bufsize - z->context), z->context);
  456             in += bufsize - z->context;
  457             len = z->context + fread(buf + z->context, 1, bufsize - z->context, f);
  458         }
  459 
  460         /* If either fread above failed, or EOFed */
  461         if (ferror(f)) {
  462             perror("fread");
  463             free(buf);
  464             return got_blocks;
  465         }
  466         if (feof(f)) {          /* 0 pad to complete a block */
  467             memset(buf + len, 0, z->context);
  468             len += z->context;
  469         }
  470 
  471         /* Process the data in the buffer, and report progress */
  472         got_blocks += rcksum_submit_source_data(z, buf, len, start_in);
  473         if (progress && in_mb != in / 1000000) {
  474             in_mb = in / 1000000;
  475             fputc('*', stderr);
  476         }
  477     }
  478     free(buf);
  479     return got_blocks;
  480 }