"Fossies" - the Fresh Open Source Software Archive 
Member "zsync-0.6.2/librcksum/rsum.c" (16 Sep 2010, 17657 Bytes) of package /linux/privat/old/zsync-0.6.2.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can here
view or
download the uninterpreted source code file.
For more information about "rsum.c" see the
Fossies "Dox" file reference documentation.
1
2 /*
3 * rcksum/lib - library for using the rsync algorithm to determine
4 * which parts of a file you have and which you need.
5 * Copyright (C) 2004,2005,2007,2009 Colin Phipps <cph@moria.org.uk>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the Artistic License v2 (see the accompanying
9 * file COPYING for the full license terms), or, at your option, any later
10 * version of the same license.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * COPYING file for details.
16 */
17
18 /* This is the core of the rsync rolling checksum algorithm - this is what it's
19 * all about. */
20
21 #include "zsglobal.h"
22
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <errno.h>
27 #include <unistd.h>
28
29 #ifdef WITH_DMALLOC
30 # include <dmalloc.h>
31 #endif
32
33 #include "md4.h"
34 #include "rcksum.h"
35 #include "internal.h"
36
37 #define UPDATE_RSUM(a, b, oldc, newc, bshift) do { (a) += ((unsigned char)(newc)) - ((unsigned char)(oldc)); (b) += (a) - ((oldc) << (bshift)); } while (0)
38
39 /* rcksum_calc_rsum_block(data, data_len)
40 * Calculate the rsum for a single block of data. */
41 struct rsum __attribute__ ((pure)) rcksum_calc_rsum_block(const unsigned char *data, size_t len) {
42 register unsigned short a = 0;
43 register unsigned short b = 0;
44
45 while (len) {
46 unsigned char c = *data++;
47 a += c;
48 b += len * c;
49 len--;
50 }
51 {
52 struct rsum r = { a, b };
53 return r;
54 }
55 }
56
57 /* rcksum_calc_checksum(checksum_buf, data, data_len)
58 * Returns the MD4 checksum (in checksum_buf) of the given data block */
59 void rcksum_calc_checksum(unsigned char *c, const unsigned char *data,
60 size_t len) {
61 MD4_CTX ctx;
62 MD4Init(&ctx);
63 MD4Update(&ctx, data, len);
64 MD4Final(c, &ctx);
65 }
66
67 #ifndef HAVE_PWRITE
68 /* Fallback pwrite(2) implementation if needed (but not strictly complete, as
69 * it moves the file pointer - we don't care). */
70 ssize_t pwrite(int d, const void *buf, size_t nbytes, off_t offset) {
71 if (lseek(d, offset, SEEK_SET) == -1)
72 return -1;
73 return write(d, buf, nbytes);
74 }
75 #endif
76
77 /* write_blocks(rcksum_state, buf, startblock, endblock)
78 * Writes the block range (inclusive) from the supplied buffer to our
79 * under-construction output file */
80 static void write_blocks(struct rcksum_state *z, const unsigned char *data,
81 zs_blockid bfrom, zs_blockid bto) {
82 off_t len = ((off_t) (bto - bfrom + 1)) << z->blockshift;
83 off_t offset = ((off_t) bfrom) << z->blockshift;
84
85 while (len) {
86 size_t l = len;
87 int rc;
88
89 /* On some platforms, the bytes-to-write could be more than pwrite(2)
90 * will accept. Write in blocks of 2^31 bytes in that case. */
91 if ((off_t) l < len)
92 l = 0x8000000;
93
94 /* Write */
95 rc = pwrite(z->fd, data, l, offset);
96 if (rc == -1) {
97 fprintf(stderr, "IO error: %s\n", strerror(errno));
98 exit(-1);
99 }
100
101 /* Keep track of any data still to do */
102 len -= rc;
103 if (len) { /* More to write */
104 data += rc;
105 offset += rc;
106 }
107 }
108
109 { /* Having written those blocks, discard them from the rsum hashes (as
110 * we don't need to identify data for those blocks again, and this may
111 * speed up lookups (in particular if there are lots of identical
112 * blocks), and add the written blocks to the record of blocks that we
113 * have received and stored the data for */
114 int id;
115 for (id = bfrom; id <= bto; id++) {
116 remove_block_from_hash(z, id);
117 add_to_ranges(z, id);
118 }
119 }
120 }
121
122 /* rcksum_read_known_data(self, buf, offset, len)
123 * Read back data from the working output - read len bytes from offset into
124 * buf[] (which must be at least len bytes long) */
125 int rcksum_read_known_data(struct rcksum_state *z, unsigned char *buf,
126 off_t offset, size_t len) {
127 int rc = pread(z->fd, buf, len, offset);
128 return rc;
129 }
130
131 /* rcksum_submit_blocks(self, data, startblock, endblock)
132 * The data in data[] (which should be (endblock - startblock + 1) * blocksize * bytes)
133 * is tested block-by-block as valid data against the target checksums for
134 * those blocks and, if valid, accepted and written to the working output.
135 *
136 * Use this when you have obtained data that you know corresponds to given
137 * blocks in the output file (i.e. you've downloaded them from a real copy of
138 * the target).
139 */
140 int rcksum_submit_blocks(struct rcksum_state *const z, const unsigned char *data,
141 zs_blockid bfrom, zs_blockid bto) {
142 zs_blockid x;
143 unsigned char md4sum[CHECKSUM_SIZE];
144
145 /* Build checksum hash tables if we don't have them yet */
146 if (!z->rsum_hash)
147 if (!build_hash(z))
148 return -1;
149
150 /* Check each block */
151 for (x = bfrom; x <= bto; x++) {
152 rcksum_calc_checksum(&md4sum[0], data + ((x - bfrom) << z->blockshift),
153 z->blocksize);
154 if (memcmp(&md4sum, &(z->blockhashes[x].checksum[0]), z->checksum_bytes)) {
155 if (x > bfrom) /* Write any good blocks we did get */
156 write_blocks(z, data, bfrom, x - 1);
157 return -1;
158 }
159 }
160
161 /* All blocks are valid; write them and update our state */
162 write_blocks(z, data, bfrom, bto);
163 return 0;
164 }
165
166 /* check_checksums_on_hash_chain(self, &hash_entry, data[], onlyone)
167 * Given a hash table entry, check the data in this block against every entry
168 * in the linked list for this hash entry, checking the checksums for this
169 * block against those recorded in the hash entries.
170 *
171 * If we get a hit (checksums match a desired block), write the data to that
172 * block in the target file and update our state accordingly to indicate that
173 * we have got that block successfully.
174 *
175 * Return the number of blocks successfully obtained.
176 */
177 static int check_checksums_on_hash_chain(struct rcksum_state *const z,
178 const struct hash_entry *e,
179 const unsigned char *data,
180 int onlyone) {
181 unsigned char md4sum[2][CHECKSUM_SIZE];
182 signed int done_md4 = -1;
183 int got_blocks = 0;
184 register struct rsum r = z->r[0];
185
186 /* This is a hint to the caller that they should try matching the next
187 * block against a particular hash entry (because at least z->seq_matches
188 * prior blocks to it matched in sequence). Clear it here and set it below
189 * if and when we get such a set of matches. */
190 z->next_match = NULL;
191
192 /* This is essentially a for (;e;e=e->next), but we want to remove links from
193 * the list as we find matches, without keeping too many temp variables.
194 */
195 z->rover = e;
196 while (z->rover) {
197 zs_blockid id;
198
199 e = z->rover;
200 z->rover = onlyone ? NULL : e->next;
201
202 /* Check weak checksum first */
203
204 z->stats.hashhit++;
205 if (e->r.a != (r.a & z->rsum_a_mask) || e->r.b != r.b) {
206 continue;
207 }
208
209 id = get_HE_blockid(z, e);
210
211 if (!onlyone && z->seq_matches > 1
212 && (z->blockhashes[id + 1].r.a != (z->r[1].a & z->rsum_a_mask)
213 || z->blockhashes[id + 1].r.b != z->r[1].b))
214 continue;
215
216 z->stats.weakhit++;
217
218 {
219 int ok = 1;
220 signed int check_md4 = 0;
221 zs_blockid next_known = -1;
222
223 /* This block at least must match; we must match at least
224 * z->seq_matches-1 others, which could either be trailing stuff,
225 * or these could be preceding blocks that we have verified
226 * already. */
227 do {
228 /* We only calculate the MD4 once we need it; but need not do so twice */
229 if (check_md4 > done_md4) {
230 rcksum_calc_checksum(&md4sum[check_md4][0],
231 data + z->blocksize * check_md4,
232 z->blocksize);
233 done_md4 = check_md4;
234 z->stats.checksummed++;
235 }
236
237 /* Now check the strong checksum for this block */
238 if (memcmp(&md4sum[check_md4],
239 z->blockhashes[id + check_md4].checksum,
240 z->checksum_bytes))
241 ok = 0;
242
243 else if (next_known == -1)
244
245 check_md4++;
246 } while (ok && !onlyone && check_md4 < z->seq_matches);
247
248 if (ok) {
249 int num_write_blocks;
250
251 /* Find the next block that we already have data for. If this
252 * is part of a run of matches then we have this stored already
253 * as ->next_known. */
254 zs_blockid next_known = onlyone ? z->next_known : next_known_block(z, id);
255
256 z->stats.stronghit += check_md4;
257
258 if (next_known > id + check_md4) {
259 num_write_blocks = check_md4;
260
261 /* Save state for this run of matches */
262 z->next_match = &(z->blockhashes[id + check_md4]);
263 if (!onlyone) z->next_known = next_known;
264 }
265 else {
266 /* We've reached the EOF, or data we already know. Just
267 * write out the blocks we don't know, and that's the end
268 * of this run of matches. */
269 num_write_blocks = next_known - id;
270 }
271
272 /* Write out the matched blocks that we don't yet know */
273 write_blocks(z, data, id, id + num_write_blocks - 1);
274 got_blocks += num_write_blocks;
275 }
276 }
277 }
278 return got_blocks;
279 }
280
281 /* rcksum_submit_source_data(self, data, datalen, offset)
282 * Reads the supplied data (length datalen) and identifies any contained blocks
283 * of data that can be used to make up the target file.
284 *
285 * offset should be 0 for a new data stream (or if our position in the data
286 * stream has been changed and does not match the last call) or should be the
287 * offset in the whole source stream otherwise.
288 *
289 * Returns the number of blocks in the target file that we obtained as a result
290 * of reading this buffer.
291 *
292 * IMPLEMENTATION:
293 * We maintain the following state:
294 * skip - the number of bytes to skip next time we enter rcksum_submit_source_data
295 * e.g. because we've just matched a block and the forward jump takes
296 * us past the end of the buffer
297 * r[0] - rolling checksum of the first blocksize bytes of the buffer
298 * r[1] - rolling checksum of the next blocksize bytes of the buffer (if seq_matches > 1)
299 */
300 int rcksum_submit_source_data(struct rcksum_state *const z, unsigned char *data,
301 size_t len, off_t offset) {
302 /* The window in data[] currently being considered is
303 * [x, x+bs)
304 */
305 int x = 0;
306 register int bs = z->blocksize;
307 int got_blocks = 0;
308
309 if (offset) {
310 x = z->skip;
311 }
312 else {
313 z->next_match = NULL;
314 }
315
316 if (x || !offset) {
317 z->r[0] = rcksum_calc_rsum_block(data + x, bs);
318 if (z->seq_matches > 1)
319 z->r[1] = rcksum_calc_rsum_block(data + x + bs, bs);
320 }
321 z->skip = 0;
322
323 /* Work through the block until the current blocksize bytes being
324 * considered, starting at x, is at the end of the buffer */
325 for (;;) {
326 if (x + z->context == len) {
327 return got_blocks;
328 }
329
330 #if 0
331 { /* Catch rolling checksum failure */
332 int k = 0;
333 struct rsum c = rcksum_calc_rsum_block(data + x + bs * k, bs);
334 if (c.a != z->r[k].a || c.b != z->r[k].b) {
335 fprintf(stderr, "rsum miscalc (%d) at %lld\n", k, offset + x);
336 exit(3);
337 }
338 }
339 #endif
340
341 {
342 /* # of blocks of the output file we got from this data */
343 int thismatch = 0;
344 /* # of blocks to advance if thismatch > 0. Can be less than
345 * thismatch as thismatch could be N*blocks_matched, if a block was
346 * duplicated to multiple locations in the output file. */
347 int blocks_matched = 0;
348
349 /* If the previous block was a match, but we're looking for
350 * sequential matches, then test this block against the block in
351 * the target immediately after our previous hit. */
352 if (z->next_match && z->seq_matches > 1) {
353 if (0 != (thismatch = check_checksums_on_hash_chain(z, z->next_match, data + x, 1))) {
354 blocks_matched = 1;
355 }
356 }
357 if (!thismatch) {
358 const struct hash_entry *e;
359
360 /* Do a hash table lookup - first in the bithash (fast negative
361 * check) and then in the rsum hash */
362 unsigned hash = z->r[0].b;
363 hash ^= ((z->seq_matches > 1) ? z->r[1].b
364 : z->r[0].a & z->rsum_a_mask) << BITHASHBITS;
365 if ((z->bithash[(hash & z->bithashmask) >> 3] & (1 << (hash & 7))) != 0
366 && (e = z->rsum_hash[hash & z->hashmask]) != NULL) {
367
368 /* Okay, we have a hash hit. Follow the hash chain and
369 * check our block against all the entries. */
370 thismatch = check_checksums_on_hash_chain(z, e, data + x, 0);
371 if (thismatch)
372 blocks_matched = z->seq_matches;
373 }
374 }
375 got_blocks += thismatch;
376
377 /* If we got a hit, skip forward (if a block in the target matches
378 * at x, it's highly unlikely to get a hit at x+1 as all the
379 * target's blocks are multiples of the blocksize apart. */
380 if (blocks_matched) {
381 x += bs + (blocks_matched > 1 ? bs : 0);
382
383 if (x + z->context > len) {
384 /* can't calculate rsum for block after this one, because
385 * it's not in the buffer. So leave a hint for next time so
386 * we know we need to recalculate */
387 z->skip = x + z->context - len;
388 return got_blocks;
389 }
390
391 /* If we are moving forward just 1 block, we already have the
392 * following block rsum. If we are skipping both, then
393 * recalculate both */
394 if (z->seq_matches > 1 && blocks_matched == 1)
395 z->r[0] = z->r[1];
396 else
397 z->r[0] = rcksum_calc_rsum_block(data + x, bs);
398 if (z->seq_matches > 1)
399 z->r[1] = rcksum_calc_rsum_block(data + x + bs, bs);
400 continue;
401 }
402 }
403
404 /* Else - advance the window by 1 byte - update the rolling checksum
405 * and our offset in the buffer */
406 {
407 unsigned char Nc = data[x + bs * 2];
408 unsigned char nc = data[x + bs];
409 unsigned char oc = data[x];
410 UPDATE_RSUM(z->r[0].a, z->r[0].b, oc, nc, z->blockshift);
411 if (z->seq_matches > 1)
412 UPDATE_RSUM(z->r[1].a, z->r[1].b, nc, Nc, z->blockshift);
413 }
414 x++;
415 }
416 }
417
418 /* rcksum_submit_source_file(self, stream, progress)
419 * Read the given stream, applying the rsync rolling checksum algorithm to
420 * identify any blocks of data in common with the target file. Blocks found are
421 * written to our working target output. Progress reports if progress != 0
422 */
423 int rcksum_submit_source_file(struct rcksum_state *z, FILE * f, int progress) {
424 /* Track progress */
425 int got_blocks = 0;
426 off_t in = 0;
427 int in_mb = 0;
428
429 /* Allocate buffer of 16 blocks */
430 register int bufsize = z->blocksize * 16;
431 unsigned char *buf = malloc(bufsize + z->context);
432 if (!buf)
433 return 0;
434
435 /* Build checksum hash tables ready to analyse the blocks we find */
436 if (!z->rsum_hash)
437 if (!build_hash(z)) {
438 free(buf);
439 return 0;
440 }
441
442 while (!feof(f)) {
443 size_t len;
444 off_t start_in = in;
445
446 /* If this is the start, fill the buffer for the first time */
447 if (!in) {
448 len = fread(buf, 1, bufsize, f);
449 in += len;
450 }
451
452 /* Else, move the last context bytes from the end of the buffer to the
453 * start, and refill the rest of the buffer from the stream. */
454 else {
455 memcpy(buf, buf + (bufsize - z->context), z->context);
456 in += bufsize - z->context;
457 len = z->context + fread(buf + z->context, 1, bufsize - z->context, f);
458 }
459
460 /* If either fread above failed, or EOFed */
461 if (ferror(f)) {
462 perror("fread");
463 free(buf);
464 return got_blocks;
465 }
466 if (feof(f)) { /* 0 pad to complete a block */
467 memset(buf + len, 0, z->context);
468 len += z->context;
469 }
470
471 /* Process the data in the buffer, and report progress */
472 got_blocks += rcksum_submit_source_data(z, buf, len, start_in);
473 if (progress && in_mb != in / 1000000) {
474 in_mb = in / 1000000;
475 fputc('*', stderr);
476 }
477 }
478 free(buf);
479 return got_blocks;
480 }