diff --git a/include/aaruformat/context.h b/include/aaruformat/context.h
index 33e500a..8931fa5 100644
--- a/include/aaruformat/context.h
+++ b/include/aaruformat/context.h
@@ -20,6 +20,7 @@
#define LIBAARUFORMAT_CONTEXT_H
#include "crc64.h"
+#include "hash_map.h"
#include "lru.h"
#include "structs.h"
#include "utarray.h"
@@ -127,6 +128,8 @@ typedef struct aaruformatContext
int writingBufferPosition;
uint64_t nextBlockPosition;
UT_array *indexEntries;
+ hash_map_t *sectorHashMap;
+ bool deduplicate;
} aaruformatContext;
typedef struct DumpHardwareEntriesWithData
diff --git a/include/aaruformat/hash_map.h b/include/aaruformat/hash_map.h
new file mode 100644
index 0000000..787e6e6
--- /dev/null
+++ b/include/aaruformat/hash_map.h
@@ -0,0 +1,43 @@
+/*
+ * This file is part of the Aaru Data Preservation Suite.
+ * Copyright (c) 2019-2025 Natalia Portillo.
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of the
+ * License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBAARUFORMAT_HASH_MAP_H
+#define LIBAARUFORMAT_HASH_MAP_H
+
+#include
+#include
+
+/**
+ * @brief One slot of the open-addressing hash table.
+ *
+ * The hash map implementation treats a key of 0 as "slot is empty", so 0 can never be stored as a real key.
+ */
+typedef struct
+{
+ /** Key identifying the entry; 0 marks the slot as empty. */
+ uint64_t key;
+ /** Value associated with the key. */
+ uint64_t value;
+} kv_pair_t;
+
+/**
+ * @brief Hash map from 64-bit keys to 64-bit values, using open addressing with linear probing.
+ *
+ * Instances are created with create_map() and released with free_map().
+ */
+typedef struct
+{
+ /** Array of `size` slots; a slot whose key is 0 is empty. */
+ kv_pair_t *table;
+ /** Number of slots allocated in `table`. */
+ size_t size;
+ /** Number of slots currently holding an entry. */
+ size_t count;
+} hash_map_t;
+
+/** Allocates a hash map whose table starts with room for `size` zero-initialized slots. Release it with free_map(). */
+hash_map_t *create_map(size_t size);
+/** Frees the slot table of `map` and then the map structure itself. */
+void free_map(hash_map_t *map);
+/**
+ * Stores `value` under `key` (key 0 is reserved for empty slots and cannot be stored).
+ * Returns true if the pair was stored, false otherwise (e.g. the key is already present).
+ */
+bool insert_map(hash_map_t *map, uint64_t key, uint64_t value);
+/** Searches for `key`; when found, writes its value to `*out_value` and returns true, otherwise returns false. */
+bool lookup_map(const hash_map_t *map, uint64_t key, uint64_t *out_value);
+
+#endif // LIBAARUFORMAT_HASH_MAP_H
diff --git a/src/close.c b/src/close.c
index 02cb839..cbe6ba1 100644
--- a/src/close.c
+++ b/src/close.c
@@ -593,6 +593,14 @@ int aaruf_close(void *context)
TRACE("Failed to write index header");
return AARUF_ERROR_CANNOT_WRITE_HEADER;
}
+
+ if(ctx->deduplicate && ctx->sectorHashMap != NULL)
+ {
+ TRACE("Clearing sector hash map");
+ // Clear sector hash map
+ free_map(ctx->sectorHashMap);
+ ctx->sectorHashMap = NULL;
+ }
}
TRACE("Freeing memory pointers");
diff --git a/src/create.c b/src/create.c
index 99392a2..10da618 100644
--- a/src/create.c
+++ b/src/create.c
@@ -292,6 +292,10 @@ void *aaruf_create(const char *filepath, uint32_t media_type, uint32_t sector_si
return NULL;
}
+ ctx->deduplicate = parsed_options.deduplicate;
+ if(ctx->deduplicate)
+ ctx->sectorHashMap = create_map(ctx->userDataDdtHeader.blocks * 25 / 100); // 25% of total sectors
+
// Is writing
ctx->isWriting = true;
diff --git a/src/ddt/hash_map.c b/src/ddt/hash_map.c
new file mode 100644
index 0000000..edf259d
--- /dev/null
+++ b/src/ddt/hash_map.c
@@ -0,0 +1,212 @@
+/*
+ * This file is part of the Aaru Data Preservation Suite.
+ * Copyright (c) 2019-2025 Natalia Portillo.
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of the
+ * License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include
+#include
+#include
+
+#include "hash_map.h"
+
+#define INITIAL_SIZE 1024
+#define LOAD_FACTOR 0.75
+
+/**
+ * @brief Creates a new hash map with (at least) the specified initial size.
+ *
+ * Allocates the map structure and a zero-initialized slot table. The map uses open addressing with
+ * linear probing; an empty slot is identified by a key value of 0.
+ *
+ * @param size Requested number of slots. Requests smaller than a small internal minimum (including 0)
+ *             are rounded up to that minimum, so callers that derive the size from a possibly tiny
+ *             quantity always receive a usable map.
+ *
+ * @return Pointer to the newly created, empty hash map, or NULL if any allocation fails.
+ *         On failure nothing is leaked.
+ *
+ * @note The caller owns the returned map and must release it with free_map().
+ * @note A key value of 0 is reserved to indicate empty slots and cannot be used as a valid key.
+ *
+ * @see free_map()
+ */
+hash_map_t *create_map(size_t size)
+{
+    // A zero-sized table would turn every `key % size` into a division by zero, and a table with no
+    // spare empty slot cannot terminate a linear probe for an absent key. Enforcing a small minimum
+    // keeps both situations from ever arising.
+    const size_t min_size = 8;
+
+    if(size < min_size) size = min_size;
+
+    hash_map_t *map = malloc(sizeof *map);
+
+    if(map == NULL) return NULL;
+
+    // calloc() zero-fills the table (all slots empty) and checks size * sizeof for overflow
+    map->table = calloc(size, sizeof *map->table);
+
+    if(map->table == NULL)
+    {
+        free(map);
+        return NULL;
+    }
+
+    map->size  = size;
+    map->count = 0;
+
+    return map;
+}
+
+/**
+ * @brief Frees all memory associated with a hash map.
+ *
+ * Deallocates the slot table and then the hash map structure itself. After this call the pointer is
+ * invalid and must not be used again.
+ *
+ * @param map Pointer to the hash map to free. May be NULL, in which case nothing happens.
+ *
+ * @note Only the map's own storage is released. The 64-bit values stored in the map are plain
+ *       integers; if a caller uses them to refer to other resources, those must be released
+ *       separately.
+ *
+ * @see create_map()
+ */
+void free_map(hash_map_t *map)
+{
+    // Honour the documented contract: freeing a NULL map is a no-op rather than a NULL dereference
+    if(map == NULL) return;
+
+    free(map->table);
+    free(map);
+}
+
+/**
+ * @brief Moves all entries of a hash map into a freshly allocated table of a new size.
+ *
+ * Internal helper of insert_map(). A new zero-initialized table is allocated, every occupied slot of
+ * the old table is re-inserted using linear probing, and only then is the old table released. If the
+ * new table cannot be allocated the map is left exactly as it was.
+ *
+ * @param map      Pointer to the hash map to resize. Must not be NULL.
+ * @param new_size Number of slots of the new table. Must be greater than the current entry count so
+ *                 that at least one slot stays empty.
+ *
+ * @return Whether the table was replaced.
+ * @retval true  The map now uses a table of `new_size` slots containing all previous entries.
+ * @retval false `new_size` is too small or memory allocation failed; the map is unchanged.
+ *
+ * @see insert_map()
+ */
+static bool resize_map(hash_map_t *map, size_t new_size)
+{
+    // Every existing entry must fit and one slot must remain empty: the probing loop below, and the
+    // probing done on insertion and lookup, stop only when they reach an empty slot.
+    if(new_size <= map->count) return false;
+
+    kv_pair_t *new_table = calloc(new_size, sizeof *new_table);
+
+    if(new_table == NULL) return false; // Out of memory: the old table is untouched and still valid
+
+    size_t moved = 0;
+
+    for(size_t i = 0; i < map->size; i++)
+    {
+        if(map->table[i].key == 0) continue; // Empty slot, nothing to migrate
+
+        // Re-insert at the position dictated by the new table size
+        size_t idx = map->table[i].key % new_size;
+
+        while(new_table[idx].key != 0) idx = (idx + 1) % new_size;
+
+        new_table[idx] = map->table[i];
+        moved++;
+    }
+
+    free(map->table);
+    map->table = new_table;
+    map->size  = new_size;
+    map->count = moved;
+
+    return true;
+}
+
+/**
+ * @brief Inserts a key-value pair into the hash map.
+ *
+ * Adds a new pair using open addressing with linear probing. Existing keys are never overwritten.
+ * Before inserting, the table is doubled whenever adding one more entry would exceed the load factor
+ * threshold (0.75); because the new entry is included in that check, the table always keeps at least
+ * one empty slot, which is what terminates probing for absent keys.
+ *
+ * @param map   Pointer to the hash map. NULL is tolerated and makes the call fail.
+ * @param key   The key to insert. 0 is reserved for empty slots and is rejected.
+ * @param value The value to associate with the key.
+ *
+ * @return Whether the pair was stored.
+ * @retval true  The pair was stored and the map count was incremented.
+ * @retval false Nothing was stored: the key is already present, the key is 0, `map` is NULL, or the
+ *               table is out of free slots and could not be grown (out of memory).
+ *
+ * @note If growing the table fails but a spare slot is still available, the insertion proceeds in
+ *       the existing table; growth is simply retried on a later insertion.
+ * @note Time complexity: O(1) average case, O(n) worst case with poor hash distribution or when a
+ *       resize takes place.
+ *
+ * @see lookup_map()
+ * @see resize_map()
+ */
+bool insert_map(hash_map_t *map, uint64_t key, uint64_t value)
+{
+    // Key 0 is the empty-slot marker, so it can never be stored
+    if(map == NULL || key == 0) return false;
+
+    // Include the entry about to be added, so the table grows *before* it could become completely
+    // full. The multiplication form avoids dividing by a zero size.
+    if(map->size == 0 || (double)(map->count + 1) > LOAD_FACTOR * (double)map->size)
+    {
+        bool grown = false;
+
+        if(map->size == 0)
+            grown = resize_map(map, 8); // Map was created without any slots: give it a small table
+        else if(map->size <= SIZE_MAX / 2)
+            grown = resize_map(map, map->size * 2); // Guarded so that doubling cannot wrap around
+
+        // Failing to grow is tolerable only while this insertion still leaves one slot empty
+        if(!grown && map->count + 1 >= map->size) return false;
+    }
+
+    size_t idx = key % map->size;
+
+    // Walk the probe sequence until the key itself or an empty slot is found
+    while(map->table[idx].key != 0 && map->table[idx].key != key) idx = (idx + 1) % map->size;
+
+    if(map->table[idx].key == key) return false; // Already present
+
+    map->table[idx].key   = key;
+    map->table[idx].value = value;
+    map->count++;
+
+    return true;
+}
+
+/**
+ * @brief Looks up a value by key in the hash map.
+ *
+ * Starts at the slot selected by `key % size` and follows the linear probe sequence until the key is
+ * found, an empty slot is reached, or every slot has been visited once. The map is never modified.
+ *
+ * @param map       Pointer to the hash map to search. NULL or a map without a table yields false.
+ * @param key       The key to search for. 0 marks empty slots and is therefore never found.
+ * @param out_value Where to store the value that was found. Only written when the key is found.
+ *                  May be NULL when the caller only needs to know whether the key exists.
+ *
+ * @return Whether the key was found in the map.
+ * @retval true  Key found. If `out_value` is not NULL the associated value is written to it.
+ * @retval false Key not found (or invalid arguments). `*out_value` is not modified.
+ *
+ * @note Time complexity: O(1) average case, O(n) worst case with poor hash distribution or a high
+ *       load factor. The number of probes never exceeds the table size, so the search terminates
+ *       even if the table has no empty slot left.
+ *
+ * @see insert_map()
+ */
+bool lookup_map(const hash_map_t *map, uint64_t key, uint64_t *out_value)
+{
+    // A zero size would make the modulo below a division by zero; key 0 cannot be stored at all
+    if(map == NULL || map->table == NULL || map->size == 0 || key == 0) return false;
+
+    size_t idx = key % map->size;
+
+    // Visit each slot at most once, so a completely full table cannot trap the search in a loop
+    for(size_t probes = 0; probes < map->size; probes++)
+    {
+        const kv_pair_t *slot = &map->table[idx];
+
+        if(slot->key == 0) return false; // An empty slot ends the probe sequence: key is absent
+
+        if(slot->key == key)
+        {
+            if(out_value != NULL) *out_value = slot->value;
+
+            return true;
+        }
+
+        idx = (idx + 1) % map->size;
+    }
+
+    return false;
+}
\ No newline at end of file
diff --git a/src/write.c b/src/write.c
index 57b0d1f..ed1be43 100644
--- a/src/write.c
+++ b/src/write.c
@@ -25,6 +25,7 @@
#include "aaruformat.h"
#include "internal.h"
#include "log.h"
+#include "xxhash.h"
/**
* @brief Writes a sector to the AaruFormat image.
@@ -155,6 +156,19 @@ int32_t aaruf_write_sector(void *context, uint64_t sector_address, const uint8_t
}
}
+ uint64_t ddt_entry = 0;
+
+ if(ctx->deduplicate)
+ {
+ // Calculate 64-bit XXH3 hash of the sector
+ TRACE("Hashing sector data for deduplication");
+ uint64_t hash = XXH3_64bits(data, length);
+
+ // Check if the hash is already in the map
+ bool existing = lookup_map(ctx->sectorHashMap, hash, &ddt_entry);
+ TRACE("Block does %s exist in deduplication map", existing ? "already" : "not yet");
+ }
+
bool ddt_ok = set_ddt_entry_v2(ctx, sector_address, ctx->currentBlockOffset, ctx->nextBlockPosition, sector_status);
if(!ddt_ok)