libaaruformat 1.0
Aaru Data Preservation Suite - Format Library
Loading...
Searching...
No Matches
spamsum.c
Go to the documentation of this file.
1/*
2 * This file is part of the Aaru Data Preservation Suite.
3 * Copyright (c) 2019-2025 Natalia Portillo.
4 * Copyright (C) 2002 Andrew Tridgell <tridge@samba.org>
5 * Copyright (C) 2006 ManTech International Corporation
6 * Copyright (C) 2013 Helmut Grohne <helmut@subdivi.de>
7 *
8 * This library is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU Lesser General Public License as
10 * published by the Free Software Foundation; either version 2.1 of the
11 * License, or (at your option) any later version.
12 *
13 * This library is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 */
21
22#include <errno.h>
23#include <stdint.h>
24#include <stdio.h>
25#include <stdlib.h>
26#include <string.h>
27
28#include "aaruformat.h"
29
30#include "spamsum.h"
31
/* Base64 alphabet (A-Z, a-z, 0-9, '+', '/') stored as ASCII codes. Each
 * digest character is b64[hash % 64]. The table is only ever read, so it is
 * declared const. */
static const uint8_t b64[64] = {
    0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50,
    0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66,
    0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76,
    0x77, 0x78, 0x79, 0x7A, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x2B, 0x2F};
36
38{
39 spamsum_ctx *ctx = malloc(sizeof(spamsum_ctx));
40 if(!ctx) return NULL;
41
42 memset(ctx, 0, sizeof(spamsum_ctx));
43
44 ctx->bh_end = 1;
45 ctx->bh[0].h = HASH_INIT;
46 ctx->bh[0].half_h = HASH_INIT;
47
48 return ctx;
49}
50
59AARU_EXPORT int AARU_CALL aaruf_spamsum_update(spamsum_ctx *ctx, const uint8_t *data, const uint32_t len)
60{
61 if(!ctx || !data) return -1;
62
63 for(int i = 0; i < len; i++) fuzzy_engine_step(ctx, data[i]);
64
65 ctx->total_size += len;
66
67 return 0;
68}
69
76{
77 if(ctx) free(ctx);
78}
79
/* Sum of the three rolling-hash components of a context. */
#define ROLL_SUM(ctx) ((ctx)->roll.h1 + (ctx)->roll.h2 + (ctx)->roll.h3)
/* One step of the per-block hash: multiply by HASH_PRIME, then XOR in the
 * byte. There is deliberately no trailing ';' so the macro can be used inside
 * larger expressions. */
#define SUM_HASH(c, h) (((h) * HASH_PRIME) ^ (c))
/* Block size for a block-hash index. The cast keeps the shift unsigned so the
 * highest indexes cannot overflow a signed int. */
#define SSDEEP_BS(index) (((uint32_t)MIN_BLOCKSIZE) << (index))
83
84AARU_LOCAL inline void fuzzy_engine_step(spamsum_ctx *ctx, uint8_t c)
85{
86 uint32_t i = 0;
87 /* At each character we update the rolling hash and the normal hashes.
88 * When the rolling hash hits a reset value then we emit a normal hash
89 * as a element of the signature and reset the normal hash. */
90 roll_hash(ctx, c);
91 const uint64_t h = ROLL_SUM(ctx);
92
93 for(i = ctx->bh_start; i < ctx->bh_end; ++i)
94 {
95 ctx->bh[i].h = SUM_HASH(c, ctx->bh[i].h);
96 ctx->bh[i].half_h = SUM_HASH(c, ctx->bh[i].half_h);
97 }
98
99 for(i = ctx->bh_start; i < ctx->bh_end; ++i)
100 {
101 /* With growing blocksize almost no runs fail the next test. */
102 if(h % SSDEEP_BS(i) != SSDEEP_BS(i) - 1)
103 /* Once this condition is false for one bs, it is
104 * automatically false for all further bs. I.e. if
105 * h === -1 (mod 2*bs) then h === -1 (mod bs). */
106 break;
107
108 /* We have hit a reset point. We now emit hashes which are
109 * based on all characters in the piece of the message between
110 * the last reset point and this one */
111 if(0 == ctx->bh[i].d_len) fuzzy_try_fork_blockhash(ctx);
112
113 ctx->bh[i].digest[ctx->bh[i].d_len] = b64[ctx->bh[i].h % 64];
114 ctx->bh[i].half_digest = b64[ctx->bh[i].half_h % 64];
115
116 if(ctx->bh[i].d_len < SPAMSUM_LENGTH - 1)
117 {
118 /* We can have a problem with the tail overflowing. The
119 * easiest way to cope with this is to only reset the
120 * normal hash if we have room for more characters in
121 * our signature. This has the effect of combining the
122 * last few pieces of the message into a single piece
123 * */
124 ctx->bh[i].digest[++ctx->bh[i].d_len] = 0;
125 ctx->bh[i].h = HASH_INIT;
126
127 if(ctx->bh[i].d_len >= SPAMSUM_LENGTH / 2) continue;
128
129 ctx->bh[i].half_h = HASH_INIT;
130 ctx->bh[i].half_digest = 0;
131 }
132 else
134 }
135}
136
137AARU_LOCAL inline void roll_hash(spamsum_ctx *ctx, uint8_t c)
138{
139 ctx->roll.h2 -= ctx->roll.h1;
140 ctx->roll.h2 += ROLLING_WINDOW * c;
141
142 ctx->roll.h1 += c;
143 ctx->roll.h1 -= ctx->roll.window[ctx->roll.n % ROLLING_WINDOW];
144
145 ctx->roll.window[ctx->roll.n % ROLLING_WINDOW] = c;
146 ctx->roll.n++;
147
148 /* The original spamsum AND'ed this value with 0xFFFFFFFF which
149 * in theory should have no effect. This AND has been removed
150 * for performance (jk) */
151 ctx->roll.h3 <<= 5;
152 ctx->roll.h3 ^= c;
153}
154
156{
157 // assert(ctx->bh_start < ctx->bh_end);
158
159 if(ctx->bh_end - ctx->bh_start < 2) /* Need at least two working hashes. */
160 return;
161
162 if((uint64_t)SSDEEP_BS(ctx->bh_start) * SPAMSUM_LENGTH >= ctx->total_size)
163 /* Initial blocksize estimate would select this or a smaller
164 * blocksize. */
165 return;
166
167 if(ctx->bh[ctx->bh_start + 1].d_len < SPAMSUM_LENGTH / 2) /* Estimate adjustment would select this blocksize. */
168 return;
169
170 /* At this point we are clearly no longer interested in the
171 * start_blocksize. Get rid of it. */
172 ++ctx->bh_start;
173}
174
176{
177 if(ctx->bh_end >= NUM_BLOCKHASHES) return;
178
179 // assert(ctx->bh_end != 0);
180
181 const uint32_t obh = ctx->bh_end - 1;
182 const uint32_t nbh = ctx->bh_end;
183 ctx->bh[nbh].h = ctx->bh[obh].h;
184 ctx->bh[nbh].half_h = ctx->bh[obh].half_h;
185 ctx->bh[nbh].digest[0] = 0;
186 ctx->bh[nbh].half_digest = 0;
187 ctx->bh[nbh].d_len = 0;
188 ++ctx->bh_end;
189}
190
192{
193 uint32_t bi = ctx->bh_start;
194 uint32_t h = ROLL_SUM(ctx);
195 int remain = FUZZY_MAX_RESULT - 1; /* Exclude terminating '\0'. */
196
197 if(!result) return -1;
198
199 /* Verify that our elimination was not overeager. */
200 // assert(bi == 0 || (uint64_t)SSDEEP_BS(bi) / 2 * SPAMSUM_LENGTH < ctx->total_size);
201
202 /* Initial blocksize guess. */
203 while((uint64_t)SSDEEP_BS(bi) * SPAMSUM_LENGTH < ctx->total_size)
204 {
205 ++bi;
206
207 if(bi >= NUM_BLOCKHASHES)
208 {
209 errno = EOVERFLOW;
210 return -1;
211 }
212 }
213
214 /* Adapt blocksize guess to actual digest length. */
215 while(bi >= ctx->bh_end) --bi;
216
217 while(bi > ctx->bh_start && ctx->bh[bi].d_len < SPAMSUM_LENGTH / 2) --bi;
218
219 // assert(!(bi > 0 && ctx->bh[bi].d_len < SPAMSUM_LENGTH / 2));
220
221 int i = snprintf((char *)result, (size_t)remain, "%lu:", (unsigned long)SSDEEP_BS(bi));
222
223 if(i <= 0) /* Maybe snprintf has set errno here? */
224 return -1;
225
226 // assert(i < remain);
227
228 remain -= i;
229 result += i;
230
231 i = (int)ctx->bh[bi].d_len;
232
233 // assert(i <= remain);
234
235 memcpy(result, ctx->bh[bi].digest, (size_t)i);
236 result += i;
237 remain -= i;
238
239 if(h != 0)
240 {
241 // assert(remain > 0);
242
243 *result = b64[ctx->bh[bi].h % 64];
244
245 if(i < 3 || *result != result[-1] || *result != result[-2] || *result != result[-3])
246 {
247 ++result;
248 --remain;
249 }
250 }
251 else if(ctx->bh[bi].digest[i] != 0)
252 {
253 // assert(remain > 0);
254
255 *result = ctx->bh[bi].digest[i];
256
257 if(i < 3 || *result != result[-1] || *result != result[-2] || *result != result[-3])
258 {
259 ++result;
260 --remain;
261 }
262 }
263
264 // assert(remain > 0);
265
266 *result++ = ':';
267 --remain;
268
269 if(bi < ctx->bh_end - 1)
270 {
271 ++bi;
272 i = (int)ctx->bh[bi].d_len;
273
274 if(i <= remain)
275 ;
276
277 memcpy(result, ctx->bh[bi].digest, (size_t)i);
278 result += i;
279 remain -= i;
280
281 if(h != 0)
282 {
283 // assert(remain > 0);
284
285 h = ctx->bh[bi].half_h;
286 *result = b64[h % 64];
287
288 if(i < 3 || *result != result[-1] || *result != result[-2] || *result != result[-3])
289 {
290 ++result;
291 --remain;
292 }
293 }
294 else
295 {
296 i = ctx->bh[bi].half_digest;
297
298 if(i != 0)
299 {
300 // assert(remain > 0);
301
302 *result = (uint8_t)i;
303
304 if(i < 3 || *result != result[-1] || *result != result[-2] || *result != result[-3])
305 {
306 ++result;
307 --remain;
308 }
309 }
310 }
311 }
312 else if(h != 0)
313 {
314 // assert(ctx->bh[bi].d_len == 0);
315
316 // assert(remain > 0);
317
318 *result++ = b64[ctx->bh[bi].h % 64];
319 /* No need to bother with FUZZY_FLAG_ELIMSEQ, because this
320 * digest has length 1. */
321 --remain;
322 }
323
324 *result = 0;
325
326 return 0;
327}
#define AARU_CALL
Definition decls.h:45
#define AARU_LOCAL
Definition decls.h:55
#define AARU_EXPORT
Definition decls.h:54
#define SUM_HASH(c, h)
Definition spamsum.c:81
static uint8_t b64[]
Definition spamsum.c:32
void fuzzy_try_fork_blockhash(spamsum_ctx *ctx)
Definition spamsum.c:175
#define SSDEEP_BS(index)
Definition spamsum.c:82
void fuzzy_engine_step(spamsum_ctx *ctx, uint8_t c)
Definition spamsum.c:84
void aaruf_spamsum_free(spamsum_ctx *ctx)
Frees a spamsum (fuzzy hash) context.
Definition spamsum.c:75
#define ROLL_SUM(ctx)
Definition spamsum.c:80
spamsum_ctx * aaruf_spamsum_init(void)
Definition spamsum.c:37
void fuzzy_try_reduce_blockhash(spamsum_ctx *ctx)
Definition spamsum.c:155
void roll_hash(spamsum_ctx *ctx, uint8_t c)
Definition spamsum.c:137
int aaruf_spamsum_final(spamsum_ctx *ctx, uint8_t *result)
Definition spamsum.c:191
int aaruf_spamsum_update(spamsum_ctx *ctx, const uint8_t *data, const uint32_t len)
Updates the spamsum context with new data.
Definition spamsum.c:59
#define SPAMSUM_LENGTH
Definition spamsum.h:24
#define FUZZY_MAX_RESULT
Definition spamsum.h:30
#define ROLLING_WINDOW
Definition spamsum.h:26
#define NUM_BLOCKHASHES
Definition spamsum.h:25
#define HASH_INIT
Definition spamsum.h:27
uint32_t d_len
Definition spamsum.h:38
uint32_t h
Definition spamsum.h:34
uint8_t half_digest
Definition spamsum.h:37
uint8_t digest[64]
Definition spamsum.h:36
uint32_t half_h
Definition spamsum.h:35
uint32_t h2
Definition spamsum.h:45
uint32_t n
Definition spamsum.h:47
uint32_t h1
Definition spamsum.h:44
uint32_t h3
Definition spamsum.h:46
uint8_t window[7]
Definition spamsum.h:43
blockhash_ctx bh[31]
Definition spamsum.h:54
roll_state roll
Definition spamsum.h:56
uint32_t bh_end
Definition spamsum.h:53
uint64_t total_size
Definition spamsum.h:55
uint32_t bh_start
Definition spamsum.h:52