|
|
chunker.c - dedup - deduplicating backup program |
|
|
 |
git clone git://bitreich.org/dedup/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/dedup/ (git://bitreich.org) |
|
|
 |
Log |
|
|
 |
Files |
|
|
 |
Refs |
|
|
 |
Tags |
|
|
 |
README |
|
|
 |
LICENSE |
|
|
|
--- |
|
|
|
chunker.c (6135B) |
|
|
|
--- |
|
|
|
1 #include <stdint.h> |
|
|
|
2 #include <stdio.h> |
|
|
|
3 #include <stdlib.h> |
|
|
|
4 #include <string.h> |
|
|
|
5 #include <unistd.h> |
|
|
|
6 |
|
|
|
7 #include "misc.h" |
|
|
|
8 |
|
|
|
9 #define ROTL(x, y) (((x) << (y)) | ((x) >> (32 - (y)))) |
|
|
|
10 |
|
|
|
11 struct chunker { |
|
|
|
12 unsigned char *buf; |
|
|
|
13 int fd; |
|
|
|
14 size_t rp; |
|
|
|
15 size_t wp; |
|
|
|
16 size_t minsize; |
|
|
|
17 size_t maxsize; |
|
|
|
18 size_t mask; |
|
|
|
19 size_t winsize; |
|
|
|
20 uint32_t seed; |
|
|
|
21 }; |
|
|
|
22 |
|
|
|
23 /* |
|
|
|
24 * Static table for use in buzhash algorithm. |
|
|
|
25 * 256 * 32 bits randomly generated unique integers |
|
|
|
26 * |
|
|
|
27 * To get better pseudo-random results, there is exactly the same number |
|
|
|
28 * of 0 and 1 spread amongst these integers. It means that there is |
|
|
|
29 * exactly 50% chance that a XOR operation would flip all the bits in |
|
|
|
30 * the hash. |
|
|
|
31 */ |
|
|
|
32 static uint32_t buztbl[] = { |
|
|
|
33 0xbc9fa594,0x30a8f827,0xced627a7,0xdb46a745,0xcfa4a9e8,0x77cccb59,0xddb66276,0x3adc532f, |
|
|
|
34 0xfe8b67d3,0x8155b59e,0x0c893666,0x1d757009,0x17394ee4,0x85d94c07,0xcacd52da,0x076c6f79, |
|
|
|
35 0xead0a798,0x6c7ccb4a,0x2639a1b8,0x3aa5ae32,0x3e6218d2,0xb290d980,0xa5149521,0x4b426119, |
|
|
|
36 0xd3230fc7,0x677c1cc4,0x2b64603c,0x01fe92a8,0xbe358296,0xa7e7fac7,0xf509bf41,0x04b017ad, |
|
|
|
37 0xf900344c,0x8e14e202,0xb2a6e9b4,0x3db3c311,0x960286a8,0xf6bf0468,0xed54ec94,0xf358070c, |
|
|
|
38 0x6a4795dd,0x3f7b925c,0x5e13a060,0xfaecbafe,0x03c8bb55,0x8a56ba88,0x633e3b49,0xe036bbbe, |
|
|
|
39 0x1ed3dbb5,0x76e8ad74,0x79d346ab,0x44b4ccc4,0x71eb22d3,0xa1aa3f24,0x50e05b81,0xa3b450d3, |
|
|
|
40 0x7f5caffb,0xa1990650,0x54c44800,0xda134b65,0x72362eea,0xbd12b8e6,0xf7c99fdc,0x020d48c7, |
|
|
|
41 0x9d9c3d46,0x32b75615,0xe61923cf,0xadc09d8f,0xab11376b,0xd66fe4cd,0xb3b086b6,0xb8345b9f, |
|
|
|
42 0x59029667,0xae0e937c,0xcbd4d4ba,0x720bb3fb,0x5f7d2ca3,0xec24ba15,0x6b40109b,0xf0a54587, |
|
|
|
43 0x3acf9420,0x466e981d,0xc66dc124,0x150ef7b4,0xc3ce718e,0x136774f5,0x46684ab4,0xb4b490f0, |
|
|
|
44 0x26508a8b,0xf12febc8,0x4b99171b,0xfc373c84,0x339b5677,0x41703ff3,0x7cadbbd7,0x15ea24e2, |
|
|
|
45 0x7a2f9783,0xed6a383a,0x649eb072,0x79970941,0x2abd28ad,0x4375e00c,0x9df084f7,0x6fdeec6c, |
|
|
|
46 0x6619ac6d,0x7d256f4d,0x9b8e658a,0x3d7627e9,0xd5a98d45,0x15f84223,0x9b6acef5,0xf876be67, |
|
|
|
47 0xe3ae7089,0x84e2b64a,0x6818a969,0x86e9ba4e,0xa24a5b57,0x61570cf1,0xa5f8fc91,0x879d8383, |
|
|
|
48 0x91b13866,0x75e87961,0x16db8138,0x5a2ff6b8,0x8f664e9b,0x894e1496,0x88235c5b,0xcdb3b580, |
|
|
|
49 0xa2e80109,0xb0f88a82,0xd12cd340,0x93fbc37d,0xf4d1eb82,0xce42f309,0x16ffd2c2,0xb4dfef2b, |
|
|
|
50 0xb8b1a33e,0x4708a5e6,0xba66dd88,0xa9ec0da6,0x6f8ee2c9,0xad8b9993,0x1d6a25a8,0x1f3d08ce, |
|
|
|
51 0x149c04e7,0x5cd1fa51,0xb84c89c7,0xeced6f8c,0xe328b30f,0x084fa836,0x6d1bb1b7,0x94c78ea5, |
|
|
|
52 0x14973034,0xf1a1bcef,0x48b798d2,0xded9ca9e,0x5fd965d0,0x92544eb1,0x5e80f189,0xcbbf5e15, |
|
|
|
53 0x4d8121f0,0x5dd3b92f,0xd9ea98fb,0x2dbf5644,0x0fbcb9b7,0x20a1db53,0x7c3fcc98,0x36744fbd, |
|
|
|
54 0xced08954,0x8e7c5efe,0x3c5f6733,0x657477be,0x3630a02d,0x38bcbda0,0xb7702575,0x4a7f4bce, |
|
|
|
55 0x0e7660fe,0x4dcb91b5,0x4fd7ffd3,0x041821c1,0xa846a181,0xc8048e9e,0xd4b05072,0x986e0509, |
|
|
|
56 0xa00aaeeb,0x02e3526a,0x2fac4843,0xfa98e805,0x923ecd8d,0x395d9546,0x8674c3cd,0xae5a8a71, |
|
|
|
57 0x966dfe45,0x5c9ceba5,0x0830a1cf,0xa1750981,0x8f604480,0x28ea0c9a,0x0da12413,0x98b0b3c5, |
|
|
|
58 0xa21d473a,0x96ce4308,0xe9a1001b,0x8bbacb44,0x18bad3f4,0xe3121acb,0x46a9b45f,0x92cd9704, |
|
|
|
59 0xc1a7c619,0x3281e361,0x462e8c79,0x9e572f93,0x7239e5f0,0x67d8e6ba,0x13747ce3,0xf01ee64a, |
|
|
|
60 0xe7d0ae12,0xeea04088,0xe5b36767,0x17558eae,0x678ffbe6,0xe0bbc866,0x0c24adec,0xa9cbb869, |
|
|
|
61 0x3fd44ee1,0x9ca4ca06,0x04c0ef00,0x04589a21,0x9cf9c819,0x976f6ca1,0x8a30e66a,0x004d6f7e, |
|
|
|
62 0x384c8851,0x5bc97eb8,0xc6c49339,0x5aa386c7,0x74bdf8af,0x9b713750,0x4112f8c2,0x2895dae1, |
|
|
|
63 0xf576d905,0x9de98bce,0xb2b26bcd,0xd46707a0,0x147fbb46,0xa52c6e50,0xe43128fc,0x374ad964, |
|
|
|
64 0x8dfd4d53,0xc4d0c087,0x31dfb5ca,0xa44589b5,0x6b637e2e,0x663f6b45,0xd2d8baa0,0x1dac7e4c |
|
|
|
65 }; |
|
|
|
66 |
|
|
|
67 /* Buzhash: https://en.wikipedia.org/wiki/Rolling_hash#Cyclic_polynomial */ |
|
|
|
68 static inline uint32_t |
|
|
|
69 hinit(unsigned char *buf, size_t size) |
|
|
|
70 { |
|
|
|
71 uint32_t sum; |
|
|
|
72 size_t i; |
|
|
|
73 |
|
|
|
74 for (i = 1, sum = 0; i < size; i++, buf++) |
|
|
|
75 sum ^= ROTL(buztbl[*buf], (size - i) % 32); |
|
|
|
76 return sum ^ buztbl[*buf]; |
|
|
|
77 } |
|
|
|
78 |
|
|
|
79 static inline uint32_t |
|
|
|
80 hupdate(uint32_t sum, unsigned char out, unsigned char in, size_t size) |
|
|
|
81 { |
|
|
|
82 return ROTL(sum, 1) ^ ROTL(buztbl[out], size % 32) ^ buztbl[in]; |
|
|
|
83 } |
|
|
|
84 |
|
|
|
85 static size_t |
|
|
|
86 cgetsize(struct chunker *c) |
|
|
|
87 { |
|
|
|
88 size_t maxcsize, winsize, i; |
|
|
|
89 uint32_t sum; |
|
|
|
90 unsigned char *bp; |
|
|
|
91 |
|
|
|
92 maxcsize = c->wp - c->rp; |
|
|
|
93 winsize = c->winsize; |
|
|
|
94 if (maxcsize < winsize) |
|
|
|
95 return maxcsize; |
|
|
|
96 |
|
|
|
97 /* |
|
|
|
98 * To achieve better deduplication, we chunk blocks based on a |
|
|
|
99 * recurring pattern on the data stream. We slide a fixed window |
|
|
|
100 * of WINSIZE bytes over the data, and a rolling hash is computed |
|
|
|
101 * for this window. |
|
|
|
102 * When the rolling hash matches a given pattern the block is chunked |
|
|
|
103 * at the end of that window. |
|
|
|
104 */ |
|
|
|
105 bp = &c->buf[c->rp]; |
|
|
|
106 sum = hinit(bp, winsize); |
|
|
|
107 for (i = 0; i < maxcsize - winsize; i++) { |
|
|
|
108 size_t csize = i + winsize; |
|
|
|
109 |
|
|
|
110 if (i > 0) { |
|
|
|
111 unsigned char out = bp[i - 1]; |
|
|
|
112 unsigned char in = bp[csize - 1]; |
|
|
|
113 |
|
|
|
114 sum = hupdate(sum, out, in, winsize); |
|
|
|
115 } |
|
|
|
116 |
|
|
|
117 if (csize < c->minsize) |
|
|
|
118 continue; |
|
|
|
119 |
|
|
|
120 if ((sum & c->mask) == 0) |
|
|
|
121 return csize; |
|
|
|
122 } |
|
|
|
123 return maxcsize; |
|
|
|
124 } |
|
|
|
125 |
|
|
|
126 struct chunker * |
|
|
|
127 copen(int fd, size_t minsize, size_t maxsize, |
|
|
|
128 size_t mask, size_t winsize, uint32_t seed) |
|
|
|
129 { |
|
|
|
130 struct chunker *c; |
|
|
|
131 size_t i; |
|
|
|
132 |
|
|
|
133 c = calloc(1, sizeof(*c)); |
|
|
|
134 if (c == NULL) { |
|
|
|
135 seterr("calloc: out of memory"); |
|
|
|
136 return NULL; |
|
|
|
137 } |
|
|
|
138 |
|
|
|
139 c->buf = calloc(1, maxsize); |
|
|
|
140 if (c->buf == NULL) { |
|
|
|
141 free(c); |
|
|
|
142 seterr("calloc: out of memory"); |
|
|
|
143 return NULL; |
|
|
|
144 } |
|
|
|
145 |
|
|
|
146 c->fd = fd; |
|
|
|
147 c->minsize = minsize; |
|
|
|
148 c->maxsize = maxsize; |
|
|
|
149 c->mask = mask; |
|
|
|
150 c->winsize = winsize; |
|
|
|
151 c->seed = seed; |
|
|
|
152 |
|
|
|
153 for (i = 0; i < sizeof(buztbl) / sizeof(buztbl[0]); i++) |
|
|
|
154 buztbl[i] ^= c->seed; |
|
|
|
155 return c; |
|
|
|
156 } |
|
|
|
157 |
|
|
|
158 void |
|
|
|
159 cclose(struct chunker *c) |
|
|
|
160 { |
|
|
|
161 free(c->buf); |
|
|
|
162 free(c); |
|
|
|
163 } |
|
|
|
164 |
|
|
|
165 ssize_t |
|
|
|
166 cfill(struct chunker *c) |
|
|
|
167 { |
|
|
|
168 unsigned char *bp; |
|
|
|
169 ssize_t n; |
|
|
|
170 |
|
|
|
171 bp = &c->buf[c->wp]; |
|
|
|
172 n = xread(c->fd, bp, c->maxsize - c->wp); |
|
|
|
173 c->wp += n; |
|
|
|
174 return c->wp; |
|
|
|
175 } |
|
|
|
176 |
|
|
|
177 void * |
|
|
|
178 cget(struct chunker *c, size_t *csize) |
|
|
|
179 { |
|
|
|
180 unsigned char *bp; |
|
|
|
181 |
|
|
|
182 if (c->rp == c->wp) { |
|
|
|
183 seterr("chunker underflow"); |
|
|
|
184 return NULL; |
|
|
|
185 } |
|
|
|
186 |
|
|
|
187 bp = &c->buf[c->rp]; |
|
|
|
188 *csize = cgetsize(c); |
|
|
|
189 c->rp += *csize; |
|
|
|
190 return bp; |
|
|
|
191 } |
|
|
|
192 |
|
|
|
193 size_t |
|
|
|
194 cdrain(struct chunker *c) |
|
|
|
195 { |
|
|
|
196 unsigned char *src, *dst; |
|
|
|
197 |
|
|
|
198 src = &c->buf[c->rp]; |
|
|
|
199 dst = c->buf; |
|
|
|
200 memmove(dst, src, c->wp - c->rp); |
|
|
|
201 c->wp -= c->rp; |
|
|
|
202 c->rp = 0; |
|
|
|
203 return c->wp; |
|
|
|
204 } |
|