Add hash map implementation for sector deduplication

This commit is contained in:
2025-09-30 20:10:40 +01:00
parent 88816c37fd
commit f6ed0f1856
6 changed files with 284 additions and 0 deletions

View File

@@ -20,6 +20,7 @@
#define LIBAARUFORMAT_CONTEXT_H
#include "crc64.h"
#include "hash_map.h"
#include "lru.h"
#include "structs.h"
#include "utarray.h"
@@ -127,6 +128,8 @@ typedef struct aaruformatContext
int writingBufferPosition;
uint64_t nextBlockPosition;
UT_array *indexEntries;
hash_map_t *sectorHashMap;
bool deduplicate;
} aaruformatContext;
typedef struct DumpHardwareEntriesWithData

View File

@@ -0,0 +1,43 @@
/*
* This file is part of the Aaru Data Preservation Suite.
* Copyright (c) 2019-2025 Natalia Portillo.
*
* This library is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of the
* License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef LIBAARUFORMAT_HASH_MAP_H
#define LIBAARUFORMAT_HASH_MAP_H
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
/**
 * @brief One slot of the hash table: a 64-bit key and the 64-bit value stored for it.
 *
 * The table is zero-initialized and the implementation treats a key of 0 as
 * "slot is empty", so 0 cannot be stored as a real key.
 */
typedef struct
{
/** Key of the entry; 0 marks an unused slot. */
uint64_t key;
/** Value associated with the key. */
uint64_t value;
} kv_pair_t;
/**
 * @brief Hash map from 64-bit keys to 64-bit values, stored as a flat array of slots.
 *
 * Collisions are resolved by probing the following slots in order, wrapping
 * around at the end of the array.
 */
typedef struct
{
/** Array of `size` slots. */
kv_pair_t *table;
/** Number of slots allocated in `table`. */
size_t size;
/** Number of slots currently holding an entry. */
size_t count;
} hash_map_t;
/** Allocates a hash map whose table initially holds `size` slots. Release it with free_map(). */
hash_map_t *create_map(size_t size);
/** Releases the slot table and the map structure itself. */
void free_map(hash_map_t *map);
/** Stores `value` under `key`. Returns true when a new entry was added, false when `key` is already stored. */
bool insert_map(hash_map_t *map, uint64_t key, uint64_t value);
/** Searches for `key`. On success writes the stored value to `*out_value` and returns true; otherwise returns false. */
bool lookup_map(const hash_map_t *map, uint64_t key, uint64_t *out_value);
#endif // LIBAARUFORMAT_HASH_MAP_H

View File

@@ -593,6 +593,14 @@ int aaruf_close(void *context)
TRACE("Failed to write index header");
return AARUF_ERROR_CANNOT_WRITE_HEADER;
}
if(ctx->deduplicate && ctx->sectorHashMap != NULL)
{
TRACE("Clearing sector hash map");
// Clear sector hash map
free_map(ctx->sectorHashMap);
ctx->sectorHashMap = NULL;
}
}
TRACE("Freeing memory pointers");

View File

@@ -292,6 +292,10 @@ void *aaruf_create(const char *filepath, uint32_t media_type, uint32_t sector_si
return NULL;
}
ctx->deduplicate = parsed_options.deduplicate;
if(ctx->deduplicate)
ctx->sectorHashMap = create_map(ctx->userDataDdtHeader.blocks * 25 / 100); // 25% of total sectors
// Is writing
ctx->isWriting = true;

212
src/ddt/hash_map.c Normal file
View File

@@ -0,0 +1,212 @@
/*
* This file is part of the Aaru Data Preservation Suite.
* Copyright (c) 2019-2025 Natalia Portillo.
*
* This library is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of the
* License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include "hash_map.h"
#define INITIAL_SIZE 1024
#define LOAD_FACTOR 0.75

/**
 * @brief Creates a new hash map with the specified initial size.
 *
 * Allocates and initializes a new hash map structure with the given number of slots. The hash
 * map uses open addressing with linear probing for collision resolution. The table is
 * zero-initialized, making empty slots identifiable by having a key value of 0.
 *
 * @param size Initial number of slots in the hash table. A value of 0 selects the default
 *             (INITIAL_SIZE), because a zero-slot table cannot be indexed.
 *
 * @return Returns a pointer to the newly created hash map, or NULL if allocation fails.
 * @retval hash_map_t* Successfully created hash map with:
 *                     - Allocated and zero-initialized table
 *                     - Size set to the number of slots allocated
 *                     - Count initialized to 0 (empty map)
 * @retval NULL Memory allocation failed; nothing is leaked.
 *
 * @note The caller is responsible for freeing the returned hash map using free_map().
 * @note A key value of 0 is reserved to indicate empty slots and cannot be used as a valid key.
 *
 * @see free_map()
 */
hash_map_t *create_map(size_t size)
{
    // A zero-slot table would turn every `key % map->size` into a division by zero
    if(size == 0) size = INITIAL_SIZE;

    hash_map_t *map = malloc(sizeof *map);
    if(map == NULL) return NULL;

    // calloc() also rejects size * sizeof(kv_pair_t) overflowing size_t
    map->table = calloc(size, sizeof *map->table);
    if(map->table == NULL)
    {
        free(map);
        return NULL;
    }

    map->size  = size;
    map->count = 0;

    return map;
}
/**
 * @brief Frees all memory associated with a hash map.
 *
 * Deallocates the hash table and the hash map structure itself. After calling this function,
 * the hash map pointer becomes invalid and should not be used.
 *
 * @param map Pointer to the hash map to free. Can be NULL (no operation performed).
 *
 * @note This function does not free any memory pointed to by the values stored in the map.
 *       If the values are dynamically allocated, they must be freed separately before
 *       calling this function.
 *
 * @see create_map()
 */
void free_map(hash_map_t *map)
{
    // Honour the documented contract: a NULL map is silently ignored
    if(map == NULL) return;

    free(map->table);
    free(map);
}
/**
 * @brief Resizes the hash map to a new size and rehashes all entries.
 *
 * This is an internal function that creates a new hash table with the specified size,
 * rehashes all existing key-value pairs from the old table, and replaces the old table
 * with the new one. It is called by insert_map() when the table needs to grow.
 *
 * The new table is allocated before the map is touched, so a failed allocation leaves the
 * map exactly as it was (same table, size and count).
 *
 * @param map Pointer to the hash map to resize. Must not be NULL.
 * @param new_size New number of slots. Must be strictly greater than the number of stored
 *                 entries; otherwise the request is ignored.
 *
 * @note This is a static (internal) function and should not be called directly.
 * @note The function preserves all existing key-value pairs during the resize operation.
 * @note After resizing, the physical positions of entries in the table will change,
 *       but the logical key-value mappings remain intact.
 * @note The old table is freed only after all entries have been migrated.
 * @note Callers can detect a failed or ignored resize by checking that map->size is unchanged.
 *
 * @see insert_map()
 */
static void resize_map(hash_map_t *map, size_t new_size)
{
    // Re-insertion probes until it finds an empty slot, so the new table must be able to
    // hold every entry and still keep one slot free
    if(new_size <= map->count) return;

    kv_pair_t *new_table = calloc(new_size, sizeof *new_table);
    if(new_table == NULL) return; // Keep the current table usable

    kv_pair_t   *old_table = map->table;
    const size_t old_size  = map->size;
    size_t       moved     = 0;

    for(size_t i = 0; i < old_size; i++)
    {
        if(old_table[i].key == 0) continue; // Empty slot, nothing to migrate

        // Re-insert at the position dictated by the new table size
        size_t idx = old_table[i].key % new_size;
        while(new_table[idx].key != 0) idx = (idx + 1) % new_size;

        new_table[idx] = old_table[i];
        moved++;
    }

    map->table = new_table;
    map->size  = new_size;
    map->count = moved;

    free(old_table);
}
/**
 * @brief Inserts a key-value pair into the hash map.
 *
 * Adds a new key-value pair to the hash map using open addressing with linear probing
 * for collision resolution. If the key already exists, the insertion fails and returns
 * false. The table is grown to twice its size whenever storing one more entry would push
 * the load factor above the threshold (0.75), which also guarantees that at least one slot
 * always stays empty so that probing terminates.
 *
 * @param map Pointer to the hash map. A NULL or unusable map makes the call return false.
 * @param key The key to insert. 0 is reserved for empty slots and is rejected.
 * @param value The value to associate with the key.
 *
 * @return Returns the result of the insertion operation.
 * @retval true Successfully inserted the key-value pair. The map count is incremented.
 * @retval false Nothing was stored: the key already exists, the key is 0, the map is
 *               NULL/unusable, or the table is full and could not be grown.
 *
 * @note Time complexity: O(1) average case, O(n) worst case with poor hash distribution.
 * @note Space complexity: O(1) unless resizing occurs, in which case it's O(n).
 *
 * @see lookup_map()
 * @see resize_map()
 */
bool insert_map(hash_map_t *map, uint64_t key, uint64_t value)
{
    if(map == NULL || map->table == NULL || map->size == 0) return false;
    if(key == 0) return false; // 0 marks an empty slot and cannot be stored

    // Evaluate the load factor as it would be *after* this insertion; checking the current
    // load lets tiny tables (1 to 4 slots) fill up completely, after which a lookup miss
    // never finds the empty slot that ends its probe sequence.
    const bool over_threshold = (double)(map->count + 1) / (double)map->size > LOAD_FACTOR;
    if(over_threshold && map->size <= SIZE_MAX / 2) // Skip growth if doubling would overflow
    {
        resize_map(map, map->size * 2);
        if(map->table == NULL) return false; // Growing failed and left no table to use
    }

    // Whether or not the table grew, never consume the last empty slot
    if(map->count + 1 >= map->size) return false;

    size_t idx = key % map->size;
    while(map->table[idx].key != 0 && map->table[idx].key != key) idx = (idx + 1) % map->size;

    if(map->table[idx].key == key) return false; // Already present

    map->table[idx].key   = key;
    map->table[idx].value = value;
    map->count++;

    return true;
}
/**
* @brief Looks up a value by key in the hash map.
*
* Searches for the specified key in the hash map and retrieves its associated value.
* Uses linear probing to handle collisions during the search. The function does not
* modify the hash map in any way.
*
* @param map Pointer to the hash map to search. Must not be NULL.
* @param key The key to search for. Must not be 0.
* @param out_value Pointer to store the found value. Must not be NULL.
* Only modified if the key is found.
*
* @return Returns whether the key was found in the map.
* @retval true Key found. The associated value is written to *out_value.
* @retval false Key not found. *out_value is not modified.
*
* @note Time complexity: O(1) average case, O(n) worst case with poor hash distribution
* or high load factor.
* @note The function is read-only and does not modify the hash map structure.
* @note Searching for key value 0 will always return false as 0 indicates empty slots.
*
* @warning The out_value parameter must point to valid memory location.
* Passing NULL will result in undefined behavior.
*
* @see insert_map()
*/
bool lookup_map(const hash_map_t *map, uint64_t key, uint64_t *out_value)
{
size_t idx = key % map->size;
while(map->table[idx].key != 0)
{
if(map->table[idx].key == key)
{
*out_value = map->table[idx].value;
return true;
}
idx = (idx + 1) % map->size;
}
return false;
}

View File

@@ -25,6 +25,7 @@
#include "aaruformat.h"
#include "internal.h"
#include "log.h"
#include "xxhash.h"
/**
* @brief Writes a sector to the AaruFormat image.
@@ -155,6 +156,19 @@ int32_t aaruf_write_sector(void *context, uint64_t sector_address, const uint8_t
}
}
uint64_t ddt_entry = 0;
if(ctx->deduplicate)
{
// Calculate 64-bit XXH3 hash of the sector
TRACE("Hashing sector data for deduplication");
uint64_t hash = XXH3_64bits(data, length);
// Check if the hash is already in the map
bool existing = lookup_map(ctx->sectorHashMap, hash, &ddt_entry);
TRACE("Block does %s exist in deduplication map", existing ? "already" : "not yet");
}
bool ddt_ok = set_ddt_entry_v2(ctx, sector_address, ctx->currentBlockOffset, ctx->nextBlockPosition, sector_status);
if(!ddt_ok)