From 807e1cf2448f407a17e5d881d9e6fd6d262f2d73 Mon Sep 17 00:00:00 2001 From: Natalia Portillo Date: Sat, 28 May 2022 13:17:05 +0100 Subject: [PATCH] Implement SpamSum. --- CMakeLists.txt | 2 +- README.md | 2 +- include/aaruformat.h | 1 + include/aaruformat/decls.h | 11 ++ include/aaruformat/spamsum.h | 61 +++++++ src/spamsum.c | 314 +++++++++++++++++++++++++++++++++++ tests/CMakeLists.txt | 2 +- tests/spamsum.cpp | 170 +++++++++++++++++++ 8 files changed, 560 insertions(+), 3 deletions(-) create mode 100644 include/aaruformat/spamsum.h create mode 100644 src/spamsum.c create mode 100644 tests/spamsum.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b93f29f..b4392b4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ add_compile_definitions(__STDC_FORMAT_MACROS=1) add_library(aaruformat SHARED include/aaruformat/consts.h include/aaruformat/enums.h include/aaru.h include/aaruformat.h include/aaruformat/decls.h include/aaruformat/structs.h src/identify.c src/open.c include/aaruformat/context.h src/close.c include/aaruformat/errors.h src/read.c include/aaruformat/crc64.h src/cst.c src/ecc_cd.c src/helpers.c - src/simd.c include/aaruformat/simd.h src/crc64/crc64.c src/crc64/crc64_clmul.c src/crc64/crc64_vmull.c src/crc64/arm_vmull.c src/crc64/arm_vmull.h) + src/simd.c include/aaruformat/simd.h src/crc64/crc64.c src/crc64/crc64_clmul.c src/crc64/crc64_vmull.c src/crc64/arm_vmull.c src/crc64/arm_vmull.h src/spamsum.c include/aaruformat/spamsum.h) include_directories(include include/aaruformat) diff --git a/README.md b/README.md index 4b07c9d..9408446 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Things still to be implemented that are already in the C# version: - Automatic media type generation from C# enumeration - Nuget package for linking with Aaru - Writing -- Hashing while writing (requires MD5, SHA1, SHA256 and SpamSum) +- Hashing while writing (requires MD5, SHA1 and SHA256) - Deduplication (requires SHA256) - Compression (requires 
FLAC and LZMA) diff --git a/include/aaruformat.h b/include/aaruformat.h index b8758b0..fcf7136 100644 --- a/include/aaruformat.h +++ b/include/aaruformat.h @@ -29,6 +29,7 @@ #include "aaruformat/enums.h" #include "aaruformat/errors.h" #include "aaruformat/simd.h" +#include "aaruformat/spamsum.h" #include "aaruformat/structs.h" #endif // LIBAARUFORMAT_AARUFORMAT_H diff --git a/include/aaruformat/decls.h b/include/aaruformat/decls.h index 1a3c5d7..1fa0b81 100644 --- a/include/aaruformat/decls.h +++ b/include/aaruformat/decls.h @@ -20,6 +20,7 @@ #define LIBAARUFORMAT_DECLS_H #include "simd.h" +#include "spamsum.h" #ifdef __cplusplus #define EXTERNC extern "C" #else @@ -131,6 +132,16 @@ AARU_LOCAL int32_t AARU_CALL aaruf_get_media_tag_type_for_datatype(int32_t type) AARU_LOCAL int32_t AARU_CALL aaruf_get_xml_mediatype(int32_t type); +AARU_EXPORT spamsum_ctx* AARU_CALL aaruf_spamsum_init(void); +AARU_EXPORT int AARU_CALL aaruf_spamsum_update(spamsum_ctx* ctx, const uint8_t* data, uint32_t len); +AARU_EXPORT int AARU_CALL aaruf_spamsum_final(spamsum_ctx* ctx, uint8_t* result); +AARU_EXPORT void AARU_CALL aaruf_spamsum_free(spamsum_ctx* ctx); + +AARU_LOCAL void fuzzy_engine_step(spamsum_ctx* ctx, uint8_t c); +AARU_LOCAL void roll_hash(spamsum_ctx* ctx, uint8_t c); +AARU_LOCAL void fuzzy_try_reduce_blockhash(spamsum_ctx* ctx); +AARU_LOCAL void fuzzy_try_fork_blockhash(spamsum_ctx* ctx); + #if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \ defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86) diff --git a/include/aaruformat/spamsum.h b/include/aaruformat/spamsum.h new file mode 100644 index 0000000..2176aa9 --- /dev/null +++ b/include/aaruformat/spamsum.h @@ -0,0 +1,61 @@ +/* + * This file is part of the Aaru Data Preservation Suite. + * Copyright (c) 2019-2022 Natalia Portillo. 
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of the
+ * License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBAARUFORMAT_SPAMSUM_H_
+#define LIBAARUFORMAT_SPAMSUM_H_
+
+#include <stdint.h>
+
+#include "decls.h"
+
+#define SPAMSUM_LENGTH 64   /* capacity of one block digest, including its terminating 0 */
+#define NUM_BLOCKHASHES 31  /* number of block sizes tracked: MIN_BLOCKSIZE << 0 .. << 30 */
+#define ROLLING_WINDOW 7    /* bytes covered by the rolling hash window */
+#define HASH_INIT 0x28021967 /* start value of every block hash */
+#define HASH_PRIME 0x01000193 /* multiplier applied by SUM_HASH for each input byte */
+#define MIN_BLOCKSIZE 3     /* smallest block size */
+#define FUZZY_MAX_RESULT ((2 * SPAMSUM_LENGTH) + 20) /* result buffer size expected by aaruf_spamsum_final() */
+
+typedef struct
+{
+    uint32_t h;                      /* hash of the current piece; reset to HASH_INIT after a character is emitted */
+    uint32_t half_h;                 /* like h, but no longer reset once d_len reaches SPAMSUM_LENGTH / 2 */
+    uint8_t  digest[SPAMSUM_LENGTH]; /* emitted base64 characters; digest[d_len] is kept at 0 */
+    uint8_t  half_digest;            /* character derived from half_h at the last reset point, or 0 */
+    uint32_t d_len;                  /* number of characters stored in digest */
+} blockhash_ctx;
+
+typedef struct
+{
+    uint8_t  window[ROLLING_WINDOW]; /* last ROLLING_WINDOW input bytes, written at index n % ROLLING_WINDOW */
+    uint32_t h1;                     /* sum of the bytes currently in window */
+    uint32_t h2;                     /* running sum updated as h2 - h1 + ROLLING_WINDOW * c */
+    uint32_t h3;                     /* shift-by-5 / xor hash of the input bytes */
+    uint32_t n;                      /* count of bytes fed to the rolling hash */
+} roll_state;
+
+typedef struct
+{
+    uint32_t      bh_start;            /* index of the first block hash still being updated */
+    uint32_t      bh_end;              /* one past the last block hash in use */
+    blockhash_ctx bh[NUM_BLOCKHASHES]; /* bh[i] tracks block size MIN_BLOCKSIZE << i */
+    uint64_t      total_size;          /* bytes accumulated by aaruf_spamsum_update() */
+    roll_state    roll;                /* rolling hash state that decides reset points */
+} spamsum_ctx;
+
+#endif // LIBAARUFORMAT_SPAMSUM_H_
diff --git a/src/spamsum.c b/src/spamsum.c
new file mode 100644
index 0000000..20c3412
--- /dev/null
+++ b/src/spamsum.c
@@ -0,0 +1,314 @@
+/*
+* This file is part of the Aaru Data Preservation Suite.
+* Copyright (c) 2019-2021 Natalia Portillo.
+* Copyright (C) 2002 Andrew Tridgell
+* Copyright (C) 2006 ManTech International Corporation
+* Copyright (C) 2013 Helmut Grohne
+*
+* This library is free software; you can redistribute it and/or modify
+* it under the terms of the GNU Lesser General Public License as
+* published by the Free Software Foundation; either version 2.1 of the
+* License, or (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <aaruformat.h>
+#include "spamsum.h"
+
+static const uint8_t b64[] = {0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50,
+                              0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66,
+                              0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76,
+                              0x77, 0x78, 0x79, 0x7A, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x2B, 0x2F};
+
+AARU_EXPORT spamsum_ctx* AARU_CALL aaruf_spamsum_init(void)
+{
+    spamsum_ctx* ctx = (spamsum_ctx*)malloc(sizeof(spamsum_ctx));
+    if(!ctx) return NULL; /* allocation failed; caller must check for NULL */
+
+    memset(ctx, 0, sizeof(spamsum_ctx));
+
+    ctx->bh_end       = 1; /* start with a single active block hash, bh[0] */
+    ctx->bh[0].h      = HASH_INIT;
+    ctx->bh[0].half_h = HASH_INIT;
+
+    return ctx; /* released with aaruf_spamsum_free() */
+}
+
+AARU_EXPORT int AARU_CALL aaruf_spamsum_update(spamsum_ctx* ctx, const uint8_t* data, uint32_t len)
+{
+    uint32_t i; /* same type as len, so the counter cannot overflow for len > INT_MAX */
+    if(!ctx || !data) return -1;
+
+    for(i = 0; i < len; i++) fuzzy_engine_step(ctx, data[i]);
+
+    ctx->total_size += len;
+
+    return 0; /* 0 on success, -1 when ctx or data is NULL */
+}
+
+AARU_EXPORT void AARU_CALL aaruf_spamsum_free(spamsum_ctx* ctx)
+{
+    free(ctx); /* free(NULL) is a no-op, so no guard is needed */
+}
+
+#define ROLL_SUM(ctx) ((ctx)->roll.h1 + (ctx)->roll.h2 + (ctx)->roll.h3)
+#define SUM_HASH(c, h) (((h)*HASH_PRIME) ^ (c)) /* expression macro: no trailing ';' */
+#define SSDEEP_BS(index) (((uint32_t)MIN_BLOCKSIZE) << (index)) /* unsigned: 3 << 30 does not fit a signed int */
+
+AARU_LOCAL inline void fuzzy_engine_step(spamsum_ctx* ctx, uint8_t c)
+{
+    uint32_t i;
+    /* At each character we update the rolling hash and the normal hashes.
+     * When the rolling hash hits a reset value then we emit a normal hash
+     * as a element of the signature and reset the normal hash. */
+    roll_hash(ctx, c);
+    uint64_t h = ROLL_SUM(ctx);
+
+    for(i = ctx->bh_start; i < ctx->bh_end; ++i)
+    {
+        ctx->bh[i].h      = SUM_HASH(c, ctx->bh[i].h);
+        ctx->bh[i].half_h = SUM_HASH(c, ctx->bh[i].half_h);
+    }
+
+    for(i = ctx->bh_start; i < ctx->bh_end; ++i)
+    {
+        /* With growing blocksize almost no runs fail the next test. */
+        if(h % SSDEEP_BS(i) != SSDEEP_BS(i) - 1)
+            /* Once this condition is false for one bs, it is
+             * automatically false for all further bs. I.e. if
+             * h === -1 (mod 2*bs) then h === -1 (mod bs). */
+            break;
+
+        /* We have hit a reset point. We now emit hashes which are
+         * based on all characters in the piece of the message between
+         * the last reset point and this one */
+        if(0 == ctx->bh[i].d_len) fuzzy_try_fork_blockhash(ctx);
+
+        ctx->bh[i].digest[ctx->bh[i].d_len] = b64[ctx->bh[i].h % 64];
+        ctx->bh[i].half_digest              = b64[ctx->bh[i].half_h % 64];
+
+        if(ctx->bh[i].d_len < SPAMSUM_LENGTH - 1)
+        {
+            /* We can have a problem with the tail overflowing. The
+             * easiest way to cope with this is to only reset the
+             * normal hash if we have room for more characters in
+             * our signature. This has the effect of combining the
+             * last few pieces of the message into a single piece
+             * */
+            ctx->bh[i].digest[++ctx->bh[i].d_len] = 0;
+            ctx->bh[i].h                          = HASH_INIT;
+
+            if(ctx->bh[i].d_len >= SPAMSUM_LENGTH / 2) continue;
+
+            ctx->bh[i].half_h      = HASH_INIT;
+            ctx->bh[i].half_digest = 0;
+        }
+        else
+            fuzzy_try_reduce_blockhash(ctx);
+    }
+}
+
+AARU_LOCAL inline void roll_hash(spamsum_ctx* ctx, uint8_t c)
+{
+    ctx->roll.h2 -= ctx->roll.h1;
+    ctx->roll.h2 += ROLLING_WINDOW * c;
+
+    ctx->roll.h1 += c;
+    ctx->roll.h1 -= ctx->roll.window[ctx->roll.n % ROLLING_WINDOW]; /* drop the byte leaving the window */
+
+    ctx->roll.window[ctx->roll.n % ROLLING_WINDOW] = c;
+    ctx->roll.n++;
+
+    /* The original spamsum AND'ed this value with 0xFFFFFFFF which
+     * in theory should have no effect. This AND has been removed
+     * for performance (jk) */
+    ctx->roll.h3 <<= 5;
+    ctx->roll.h3 ^= c;
+}
+
+AARU_LOCAL inline void fuzzy_try_reduce_blockhash(spamsum_ctx* ctx)
+{
+    // assert(ctx->bh_start < ctx->bh_end);
+
+    if(ctx->bh_end - ctx->bh_start < 2) /* Need at least two working hashes. */
+        return;
+
+    if((uint64_t)SSDEEP_BS(ctx->bh_start) * SPAMSUM_LENGTH >= ctx->total_size)
+        /* Initial blocksize estimate would select this or a smaller
+         * blocksize. */
+        return;
+
+    if(ctx->bh[ctx->bh_start + 1].d_len < SPAMSUM_LENGTH / 2) /* Estimate adjustment would select this blocksize. */
+        return;
+
+    /* At this point we are clearly no longer interested in the
+     * start_blocksize. Get rid of it. */
+    ++ctx->bh_start;
+}
+
+AARU_LOCAL inline void fuzzy_try_fork_blockhash(spamsum_ctx* ctx)
+{
+    if(ctx->bh_end >= NUM_BLOCKHASHES) return; /* every slot of bh[] is already in use */
+
+    // assert(ctx->bh_end != 0);
+
+    uint32_t obh             = ctx->bh_end - 1;
+    uint32_t nbh             = ctx->bh_end;
+    ctx->bh[nbh].h           = ctx->bh[obh].h; /* the new block hash continues from the previous one's state */
+    ctx->bh[nbh].half_h      = ctx->bh[obh].half_h;
+    ctx->bh[nbh].digest[0]   = 0;
+    ctx->bh[nbh].half_digest = 0;
+    ctx->bh[nbh].d_len       = 0;
+    ++ctx->bh_end;
+}
+
+AARU_EXPORT int AARU_CALL aaruf_spamsum_final(spamsum_ctx* ctx, uint8_t* result)
+{
+    uint32_t bi     = 0;
+    uint32_t h      = 0;
+    int      remain = (int)(FUZZY_MAX_RESULT - 1); /* Exclude terminating '\0'. */
+
+    if(!ctx || !result) return -1; /* checked before ctx is first dereferenced */
+
+    bi = ctx->bh_start;
+    h  = ROLL_SUM(ctx);
+
+    /* Verify that our elimination was not overeager. */
+    // assert(bi == 0 || (uint64_t)SSDEEP_BS(bi) / 2 * SPAMSUM_LENGTH < ctx->total_size);
+
+    /* Initial blocksize guess. */
+    while((uint64_t)SSDEEP_BS(bi) * SPAMSUM_LENGTH < ctx->total_size)
+    {
+        ++bi;
+
+        if(bi >= NUM_BLOCKHASHES)
+        {
+            errno = EOVERFLOW;
+            return -1;
+        }
+    }
+
+    /* Adapt blocksize guess to actual digest length. */
+    while(bi >= ctx->bh_end) --bi;
+
+    while(bi > ctx->bh_start && ctx->bh[bi].d_len < SPAMSUM_LENGTH / 2) --bi;
+
+    // assert(!(bi > 0 && ctx->bh[bi].d_len < SPAMSUM_LENGTH / 2));
+
+    int i = snprintf((char*)result, (size_t)remain, "%lu:", (unsigned long)SSDEEP_BS(bi));
+
+    if(i <= 0) /* Maybe snprintf has set errno here? */
+        return -1;
+
+    // assert(i < remain);
+
+    remain -= i;
+    result += i;
+
+    i = (int)ctx->bh[bi].d_len;
+
+    // assert(i <= remain);
+
+    memcpy(result, ctx->bh[bi].digest, (size_t)i);
+    result += i;
+    remain -= i;
+
+    if(h != 0)
+    {
+        // assert(remain > 0);
+
+        *result = b64[ctx->bh[bi].h % 64];
+
+        if(i < 3 || *result != result[-1] || *result != result[-2] || *result != result[-3])
+        {
+            ++result;
+            --remain;
+        }
+    }
+    else if(ctx->bh[bi].digest[i] != 0)
+    {
+        // assert(remain > 0);
+
+        *result = ctx->bh[bi].digest[i];
+
+        if(i < 3 || *result != result[-1] || *result != result[-2] || *result != result[-3])
+        {
+            ++result;
+            --remain;
+        }
+    }
+
+    // assert(remain > 0);
+
+    *result++ = ':';
+    --remain;
+
+    if(bi < ctx->bh_end - 1)
+    {
+        ++bi;
+        i = (int)ctx->bh[bi].d_len;
+
+        memcpy(result, ctx->bh[bi].digest, (size_t)i);
+        result += i;
+        remain -= i;
+
+        if(h != 0)
+        {
+            // assert(remain > 0);
+
+            h       = ctx->bh[bi].half_h;
+            *result = b64[h % 64];
+
+            if(i < 3 || *result != result[-1] || *result != result[-2] || *result != result[-3])
+            {
+                ++result;
+                --remain;
+            }
+        }
+        else
+        {
+            i = ctx->bh[bi].half_digest; /* NOTE(review): i now holds a character value, no longer a length */
+
+            if(i != 0)
+            {
+                // assert(remain > 0);
+
+                *result = (uint8_t)i;
+
+                if(i < 3 || *result != result[-1] || *result != result[-2] || *result != result[-3])
+                {
+                    ++result;
+                    --remain;
+                }
+            }
+        }
+    }
+    else if(h != 0)
+    {
+        // assert(ctx->bh[bi].d_len == 0);
+
+        // assert(remain > 0);
+
+        *result++ = b64[ctx->bh[bi].h % 64];
+        /* No need to bother with FUZZY_FLAG_ELIMSEQ, because this
+         * digest has length 1. */
+        --remain;
+    }
+
+    *result = 0;
+
+    return 0;
+}
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 8ab387d..9ee06af 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -10,5 +10,5 @@ file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/data/random
 
 # 'Google_Tests_run' is the target name
 # 'test1.cpp tests2.cpp' are source files with tests
-add_executable(tests_run crc64.cpp)
+add_executable(tests_run crc64.cpp spamsum.cpp)
 target_link_libraries(tests_run gtest gtest_main "aaruformat")
diff --git a/tests/spamsum.cpp b/tests/spamsum.cpp
new file mode 100644
index 0000000..f04ec4c
--- /dev/null
+++ b/tests/spamsum.cpp
@@ -0,0 +1,170 @@
+/*
+ * This file is part of the Aaru Data Preservation Suite.
+ * Copyright (c) 2019-2022 Natalia Portillo.
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of the
+ * License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <climits>
+#include <cstdint>
+#include <cstring>
+
+#include "../include/aaruformat.h"
+#include "gtest/gtest.h"
+
+#define EXPECTED_SPAMSUM "24576:3dvzuAsHTQ16pc7O1Q/gS9qze+Swwn9s6IX:8/TQQpaVqze+JN6IX"
+#define EXPECTED_SPAMSUM_15BYTES "3:Ac4E9t:Ac4E9t"
+#define EXPECTED_SPAMSUM_31BYTES "3:Ac4E9E5+S09qn:Ac4E9EgSsq"
+#define EXPECTED_SPAMSUM_63BYTES "3:Ac4E9E5+S09q2kABV9:Ac4E9EgSs7kW9"
+#define EXPECTED_SPAMSUM_2352BYTES "48:pasCLoANDXmjCz1p2OpPm+Gek3xmZfJJ5DD4BacmmlodQMQa/58Z:csK1Nxz7XFGeJS/flHMQu2Z"
+
+static const uint8_t* buffer;
+static const uint8_t* buffer_misaligned;
+
+class spamsumFixture : public ::testing::Test
+{
+  public:
+    spamsumFixture()
+    {
+        // initialization;
+        // can also be done in SetUp()
+    }
+
+  protected:
+    void SetUp()
+    {
+        char path[PATH_MAX];
+        char filename[PATH_MAX];
+
+        getcwd(path, PATH_MAX);
+        snprintf(filename, PATH_MAX, "%s/data/random", path);
+
+        FILE* file = fopen(filename, "rb"); // NOTE(review): result is not checked before fread/fclose
+        buffer     = (const uint8_t*)malloc(1048576);
+        fread((void*)buffer, 1, 1048576, file);
+        fclose(file);
+
+        buffer_misaligned = (const uint8_t*)malloc(1048577);
+        memcpy((void*)(buffer_misaligned + 1), buffer, 1048576); // same data, shifted by one byte
+    }
+
+    void TearDown()
+    {
+        free((void*)buffer);
+        free((void*)buffer_misaligned);
+    }
+
+    ~spamsumFixture()
+    {
+        // resources cleanup, no exceptions allowed
+    }
+
+    // shared user data
+};
+
+TEST_F(spamsumFixture, spamsum_auto)
+{
+    spamsum_ctx* ctx     = aaruf_spamsum_init();
+    const char*  spamsum = (const char*)malloc(FUZZY_MAX_RESULT);
+
+    ASSERT_NE(ctx, nullptr); // abort the test instead of hashing with a null pointer
+    ASSERT_NE(spamsum, nullptr);
+
+    aaruf_spamsum_update(ctx, buffer, 1048576);
+    aaruf_spamsum_final(ctx, (uint8_t*)spamsum);
+
+    EXPECT_STREQ(spamsum, EXPECTED_SPAMSUM);
+    aaruf_spamsum_free(ctx); // the context is heap-allocated by aaruf_spamsum_init()
+    free((void*)spamsum);
+}
+
+TEST_F(spamsumFixture, spamsum_auto_misaligned)
+{
+    spamsum_ctx* ctx     = aaruf_spamsum_init();
+    const char*  spamsum = (const char*)malloc(FUZZY_MAX_RESULT);
+
+    ASSERT_NE(ctx, nullptr);
+    ASSERT_NE(spamsum, nullptr);
+
+    aaruf_spamsum_update(ctx, buffer_misaligned + 1, 1048576);
+    aaruf_spamsum_final(ctx, (uint8_t*)spamsum);
+
+    EXPECT_STREQ(spamsum, EXPECTED_SPAMSUM);
+    aaruf_spamsum_free(ctx);
+    free((void*)spamsum);
+}
+
+TEST_F(spamsumFixture, spamsum_auto_15bytes)
+{
+    spamsum_ctx* ctx     = aaruf_spamsum_init();
+    const char*  spamsum = (const char*)malloc(FUZZY_MAX_RESULT);
+
+    ASSERT_NE(ctx, nullptr);
+    ASSERT_NE(spamsum, nullptr);
+
+    aaruf_spamsum_update(ctx, buffer, 15);
+    aaruf_spamsum_final(ctx, (uint8_t*)spamsum);
+
+    EXPECT_STREQ(spamsum, EXPECTED_SPAMSUM_15BYTES);
+    aaruf_spamsum_free(ctx);
+    free((void*)spamsum);
+}
+
+TEST_F(spamsumFixture, spamsum_auto_31bytes)
+{
+    spamsum_ctx* ctx     = aaruf_spamsum_init();
+    const char*  spamsum = (const char*)malloc(FUZZY_MAX_RESULT);
+
+    ASSERT_NE(ctx, nullptr);
+    ASSERT_NE(spamsum, nullptr);
+
+    aaruf_spamsum_update(ctx, buffer, 31);
+    aaruf_spamsum_final(ctx, (uint8_t*)spamsum);
+
+    EXPECT_STREQ(spamsum, EXPECTED_SPAMSUM_31BYTES);
+    aaruf_spamsum_free(ctx);
+    free((void*)spamsum);
+}
+
+TEST_F(spamsumFixture, spamsum_auto_63bytes)
+{
+    spamsum_ctx* ctx     = aaruf_spamsum_init();
+    const char*  spamsum = (const char*)malloc(FUZZY_MAX_RESULT);
+
+    ASSERT_NE(ctx, nullptr);
+    ASSERT_NE(spamsum, nullptr);
+
+    aaruf_spamsum_update(ctx, buffer, 63);
+    aaruf_spamsum_final(ctx, (uint8_t*)spamsum);
+
+    EXPECT_STREQ(spamsum, EXPECTED_SPAMSUM_63BYTES);
+    aaruf_spamsum_free(ctx);
+    free((void*)spamsum);
+}
+
+TEST_F(spamsumFixture, spamsum_auto_2352bytes)
+{
+    spamsum_ctx* ctx     = aaruf_spamsum_init();
+    const char*  spamsum = (const char*)malloc(FUZZY_MAX_RESULT);
+
+    ASSERT_NE(ctx, nullptr);
+    ASSERT_NE(spamsum, nullptr);
+
+    aaruf_spamsum_update(ctx, buffer, 2352);
+    aaruf_spamsum_final(ctx, (uint8_t*)spamsum);
+
+    EXPECT_STREQ(spamsum, EXPECTED_SPAMSUM_2352BYTES);
+    aaruf_spamsum_free(ctx);
+    free((void*)spamsum);
+}