Enhance documentation for various structures with detailed descriptions and formatting improvements

This commit is contained in:
2025-10-01 05:35:39 +01:00
parent 1f91ad1e08
commit 41aee42c53
16 changed files with 1935 additions and 1273 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -24,43 +24,90 @@
#pragma ide diagnostic ignored "OCUnusedMacroInspection"
#endif
/** Magic identifier = "DICMFRMT". */
#define DIC_MAGIC 0x544D52464D434944
/** Magic identifier = "AARUFRMT". */
#define AARU_MAGIC 0x544D524655524141
/** Image format version. A change in this number indicates an incompatible change to the format that prevents older
* implementations from reading it correctly, if at all. */
#define AARUF_VERSION 2
/** First version of AaruFormat, created in C#.
* CRC64 was byte-swapped
/** \file aaruformat/consts.h
* \brief Core public constants and compile-time limits for the Aaru container format implementation.
*
* This header exposes magic identifiers, format version selectors, resource limits, codec parameter bounds,
* and bit masks used across libaaruformat. All values are immutable interface contracts; changing them breaks
* backward compatibility unless a new format version is declared.
*
* Summary:
* - Magic numbers (DIC_MAGIC, AARU_MAGIC) identify container families (legacy DiscImageChef vs AaruFormat).
* - Version macros distinguish format generations (V1 C# / legacy CRC endianness, V2 current C implementation).
* - Cache and table size limits provide protective upper bounds against runaway memory consumption.
* - Audio constants (SAMPLES_PER_SECTOR, MIN/MAX_FLAKE_BLOCK) align with Red Book (CDDA) and FLAC encoding best
* practices.
* - CD_* masks assist with extracting flags / positional subfields in deduplicated Compact Disc sector tables.
* - CRC64 constants implement the ECMA-182 polynomial and standard seed, enabling deterministic end-to-end block
* integrity.
*
* Notes:
* - Magic values are stored little-endian on disk when written as 64-bit integers; when inspecting raw bytes make
* sure to account for host endianness.
* - AARUF_VERSION must be incremented only when an incompatible on-disk layout change is introduced.
* - MAX_DDT_ENTRY_CACHE is a soft upper bound sized to balance deduplication hit rate vs RAM; tune in future builds
* via configuration if adaptive heuristics are introduced.
* - The LZMA properties length (5) derives from the standard LZMA header (lc/lp/pb + dict size) and is constant for
* raw LZMA streams used here.
* - FLAC sample block guidance: empirical evaluation shows >4608 samples per block does not yield meaningful ratio
* gains for typical optical audio captures while increasing decode buffer size.
*
* Thread safety: All macros are compile-time constants; no synchronization required.
* Portability: Constants chosen to fit within 64-bit targets; arithmetic assumes two's complement.
*/
#define AARUF_VERSION_V1 1
/** Second version of AaruFormat, created in C.
* Introduced new header, many new features, and blocks.
*/
#define AARUF_VERSION_V2 2
/** Maximum read cache size, 512MiB. */
#define MAX_CACHE_SIZE 536870912
/** Size in bytes of LZMA properties. */
#define LZMA_PROPERTIES_LENGTH 5
/** Maximum number of entries for the DDT cache. */
#define MAX_DDT_ENTRY_CACHE 16000000
/** How many samples are contained in a RedBook sector. */
#define SAMPLES_PER_SECTOR 588
/** Maximum number of samples for a FLAC block. Bigger than 4608 gives no benefit. */
#define MAX_FLAKE_BLOCK 4608
/** Minimum number of samples for a FLAC block. CUETools.Codecs.FLAKE does not support it to be smaller than 256. */
#define MIN_FLAKE_BLOCK 256
/** This mask is to check for flags in CompactDisc suffix/prefix DDT */
#define CD_XFIX_MASK 0xFF000000
/** This mask is to check for position in CompactDisc suffix/prefix deduplicated block */
#define CD_DFIX_MASK 0x00FFFFFF
#define CRC64_ECMA_POLY 0xC96C5795D7870F42
#define CRC64_ECMA_SEED 0xFFFFFFFFFFFFFFFF
/** Magic identifier for legacy DiscImageChef container (ASCII "DICMFRMT").
* Retained for backward compatibility / migration tooling. */
#define DIC_MAGIC 0x544D52464D434944ULL
/** Magic identifier for AaruFormat container (ASCII "AARUFRMT").
* Used in the primary header to assert correct file type. */
#define AARU_MAGIC 0x544D524655524141ULL
/** Current image format major version (incompatible changes bump this).
* Readers should reject headers with a higher number unless explicitly forward compatible. */
#define AARUF_VERSION 2
/** First on-disk version (C# implementation).
* Quirk: CRC64 values were stored byte-swapped relative to ECMA-182 canonical output. */
#define AARUF_VERSION_V1 1
/** Second on-disk version (C implementation).
* Introduced: extended header (GUID, feature bitmaps), hierarchical DDT v2, improved index (v2/v3),
* multi-codec compression, refined metadata blocks. */
#define AARUF_VERSION_V2 2
/** Maximum read cache size (bytes). 512 MiB chosen to prevent excessive resident memory while
* still enabling efficient sequential and moderate random access patterns. */
#define MAX_CACHE_SIZE 536870912ULL
/** Size in bytes of the fixed LZMA properties header (lc/lp/pb + dictionary size). */
#define LZMA_PROPERTIES_LENGTH 5
/** Maximum number of cached DDT entry descriptors retained in memory for fast duplicate detection.
* At 16,000,000 entries with a compact structure, this caps hash_map overhead while covering large images.
* (Approx memory just for lookup bookkeeping: ~16 bytes * N ≈ 256 MB worst case; typical effective <50% of cap.) */
#define MAX_DDT_ENTRY_CACHE 16000000
/** Red Book (CDDA) PCM samples per 2352-byte sector: 44,100 Hz / 75 sectors per second = 588 samples. */
#define SAMPLES_PER_SECTOR 588
/** FLAC maximum block size used for encoding audio sectors.
* Empirically >4608 samples yields diminishing compression returns and higher decode latency. */
#define MAX_FLAKE_BLOCK 4608
/** FLAC minimum block size. CUETools.Codecs.FLAKE does not accept blocks smaller than 256 samples. */
#define MIN_FLAKE_BLOCK 256
/** Mask for extracting correction / fix flags in Compact Disc suffix/prefix DDT entries.
* High 8 bits store status (see SectorStatus / CdFixFlags relationships). */
#define CD_XFIX_MASK 0xFF000000U
/** Mask for extracting positional index (lower 24 bits) in Compact Disc suffix/prefix deduplicated block entries. */
#define CD_DFIX_MASK 0x00FFFFFFU
/** ECMA-182 CRC64 polynomial (reflected form used in standard implementations). */
#define CRC64_ECMA_POLY 0xC96C5795D7870F42ULL
/** Initial seed value for CRC64 computations (all bits set). */
#define CRC64_ECMA_SEED 0xFFFFFFFFFFFFFFFFULL
#ifndef _MSC_VER
#pragma clang diagnostic pop
#endif
#endif // LIBAARUFORMAT_CONSTS_H
#endif // LIBAARUFORMAT_CONSTS_H

View File

@@ -25,6 +25,41 @@
#include "structs.h"
#include "utarray.h"
/** \file aaruformat/context.h
* \brief Central runtime context structures for libaaruformat (image state, caches, checksum buffers).
*
* The principal structure, \ref aaruformatContext, aggregates: header metadata, open stream handle, deduplication
* tables (DDT) currently in memory, optical disc auxiliary data (sector prefix/suffix/subchannel), track listings,
* geometry & metadata blocks, checksum accumulators, CRC & ECC helper contexts, hash map for deduplication, and
* transient write buffers.
*
* Memory ownership model (unless otherwise stated): if a pointer field is non-NULL it is owned by the context and
* will be freed (or otherwise released) during context close / destruction. Callers must not free or reallocate
* these pointers directly. External callers should treat all internal buffers as read-only unless explicitly writing.
*
* Threading: a single context instance is NOT thread-safe; serialize access if used across threads.
* Lifetime: allocate, initialize/open, perform read/write/verify operations, then close/free.
*
* Deduplication tables (DDT): only a subset (primary table + an active secondary + optional cache) is retained in RAM;
* large images may rely on lazy loading of secondary tables. Flags (inMemoryDdt, userDataDdt*, cachedSecondary*)
* indicate what is currently resident.
*
* Optical auxiliary buffers (sectorPrefix / sectorSuffix / subchannel / corrected variants) are populated only for
* images where those components exist (e.g., raw CD dumps). They may be NULL for block devices / non-optical media.
*
* Index handling: indexEntries (UT_array) holds a flattened list of \ref IndexEntry structures (regardless of
* v1/v2/v3). hash_map_t *sectorHashMap provides fast duplicate detection keyed by content fingerprint / sparse sector
* key.
*
* Invariants / sanity expectations (not strictly enforced everywhere):
* - magic == AARU_MAGIC after successful open/create.
* - header.imageMajorVersion <= AARUF_VERSION.
* - imageStream != NULL when any I/O method is in progress.
* - If deduplicate == false, sectorHashMap may still be populated for bookkeeping but duplicates are stored
* independently.
* - If userDataDdtMini != NULL then userDataDdtBig == NULL (and vice versa) for a given level.
*/
#ifndef MD5_DIGEST_LENGTH
#define MD5_DIGEST_LENGTH 16
#endif
@@ -37,121 +72,186 @@
#define SHA256_DIGEST_LENGTH 32
#endif
/** \struct Crc64Context
* \brief Internal (legacy) CRC64 computation context (superseded by \ref crc64_ctx usage).
*
* Kept for compatibility with earlier code paths; new code should prefer the opaque crc64_ctx API.
*/
typedef struct Crc64Context
{
uint64_t finalSeed;
uint64_t table[256];
uint64_t hashInt;
uint64_t finalSeed; ///< Final CRC value (post processing) or running seed.
uint64_t table[256]; ///< Precomputed 256-entry lookup table for the ECMA polynomial.
uint64_t hashInt; ///< Intermediate accumulator.
} Crc64Context;
/** \struct CdEccContext
* \brief Lookup tables and state for Compact Disc EDC/ECC (P/Q) regeneration / verification.
*
* Fields may be lazily allocated; inited_edc indicates tables are ready.
*/
typedef struct CdEccContext
{
bool inited_edc;
uint8_t *ecc_b_table;
uint8_t *ecc_f_table;
uint32_t *edc_table;
bool inited_edc; ///< True once EDC/ECC tables have been initialized.
uint8_t *ecc_b_table; ///< Backward (B) ECC table (allocated, size implementation-defined).
uint8_t *ecc_f_table; ///< Forward (F) ECC table.
uint32_t *edc_table; ///< EDC (CRC) lookup table.
} CdEccContext;
/** \struct Checksums
* \brief Collected whole-image checksums / hashes present in a checksum block.
*
* Only hash arrays with corresponding has* flags set contain valid data. spamsum is a dynamically allocated
* NUL-terminated buffer (original SpamSum signature bytes followed by appended '\0').
*/
typedef struct Checksums
{
bool hasMd5;
bool hasSha1;
bool hasSha256;
bool hasSpamSum;
uint8_t md5[MD5_DIGEST_LENGTH];
uint8_t sha1[SHA1_DIGEST_LENGTH];
uint8_t sha256[SHA256_DIGEST_LENGTH];
uint8_t *spamsum;
bool hasMd5; ///< True if md5[] buffer populated.
bool hasSha1; ///< True if sha1[] buffer populated.
bool hasSha256; ///< True if sha256[] buffer populated.
bool hasSpamSum; ///< True if spamsum pointer allocated and signature read.
uint8_t md5[MD5_DIGEST_LENGTH]; ///< MD5 digest (16 bytes).
uint8_t sha1[SHA1_DIGEST_LENGTH]; ///< SHA-1 digest (20 bytes).
uint8_t sha256[SHA256_DIGEST_LENGTH]; ///< SHA-256 digest (32 bytes).
uint8_t *spamsum; ///< SpamSum fuzzy hash (ASCII), allocated length+1 with trailing 0.
} Checksums;
/** \struct mediaTagEntry
* \brief Hash table entry for an arbitrary media tag (e.g., proprietary drive/medium descriptor).
*
* Stored via uthash (hh handle). Type is a format-specific integer identifier mapping to external interpretation.
*/
typedef struct mediaTagEntry
{
uint8_t *data;
int32_t type;
uint32_t length;
UT_hash_handle hh;
uint8_t *data; ///< Tag data blob (opaque to library core); length bytes long.
int32_t type; ///< Numeric type identifier.
uint32_t length; ///< Length in bytes of data.
UT_hash_handle hh; ///< uthash linkage.
} mediaTagEntry;
/** \struct aaruformatContext
* \brief Master context representing an open or in-creation Aaru image.
*
* Contains stream handle, parsed headers, deduplication structures, optical extras, metadata blocks, checksum
* information, caches, and write-state. Allocate with library factory (or zero-init + explicit open) and destroy
* with corresponding close/free routine.
*
* Field grouping:
* - Core & header: magic, library*Version, imageStream, header.
* - Optical sector adjuncts: sectorPrefix/sectorSuffix/subchannel plus corrected variants & mode2Subheaders.
* - Deduplication: inMemoryDdt, userDataDdt*, userDataDdtHeader, mini/big/cached secondary arrays, version tags.
* - Metadata & geometry: geometryBlock, metadataBlockHeader+metadataBlock, cicmBlockHeader+cicmBlock, tracksHeader.
* - Tracks & hardware: trackEntries, dataTracks, dumpHardwareHeader, dumpHardwareEntriesWithData.
* - Integrity & ECC: checksums, eccCdContext, crc64Context.
* - Index & dedup lookup: indexEntries (UT_array of IndexEntry), sectorHashMap (duplicate detection), deduplicate
* flag.
* - Write path: isWriting, currentBlockHeader, writingBuffer(+position/offset), nextBlockPosition.
*
* Notes:
* - userDataDdt points to memory-mapped or fully loaded DDT (legacy path); userDataDdtMini / userDataDdtBig
* supersede.
* - shift retained for backward compatibility with earlier single-level address shift semantics.
* - mappedMemoryDdtSize is meaningful only if userDataDdt references an mmapped region.
*/
typedef struct aaruformatContext
{
uint64_t magic;
uint8_t libraryMajorVersion;
uint8_t libraryMinorVersion;
FILE *imageStream;
AaruHeaderV2 header;
uint8_t *sectorPrefix;
uint8_t *sectorPrefixCorrected;
uint8_t *sectorSuffix;
uint8_t *sectorSuffixCorrected;
uint8_t *sectorSubchannel;
uint8_t *mode2Subheaders;
uint8_t shift;
bool inMemoryDdt;
uint64_t *userDataDdt;
size_t mappedMemoryDdtSize;
uint32_t *sectorPrefixDdt;
uint32_t *sectorSuffixDdt;
GeometryBlockHeader geometryBlock;
MetadataBlockHeader metadataBlockHeader;
uint8_t *metadataBlock;
TracksHeader tracksHeader;
TrackEntry *trackEntries;
CicmMetadataBlock cicmBlockHeader;
uint8_t *cicmBlock;
DumpHardwareHeader dumpHardwareHeader;
struct DumpHardwareEntriesWithData *dumpHardwareEntriesWithData;
ImageInfo imageInfo;
CdEccContext *eccCdContext;
uint8_t numberOfDataTracks;
TrackEntry *dataTracks;
bool *readableSectorTags;
struct CacheHeader blockHeaderCache;
struct CacheHeader blockCache;
Checksums checksums;
mediaTagEntry *mediaTags;
DdtHeader2 userDataDdtHeader;
int ddtVersion;
uint16_t *userDataDdtMini;
uint32_t *userDataDdtBig;
uint16_t *sectorPrefixDdtMini;
uint16_t *sectorSuffixDdtMini;
uint64_t cachedDdtOffset;
uint64_t cachedDdtPosition;
uint64_t primaryDdtOffset;
uint16_t *cachedSecondaryDdtSmall;
uint32_t *cachedSecondaryDdtBig;
bool isWriting;
BlockHeader currentBlockHeader;
uint8_t *writingBuffer;
int currentBlockOffset;
crc64_ctx *crc64Context;
int writingBufferPosition;
uint64_t nextBlockPosition;
UT_array *indexEntries;
hash_map_t *sectorHashMap;
bool deduplicate;
uint64_t magic; ///< File magic (AARU_MAGIC) post-open.
uint8_t libraryMajorVersion; ///< Linked library major version.
uint8_t libraryMinorVersion; ///< Linked library minor version.
FILE *imageStream; ///< Underlying FILE* stream (binary mode).
AaruHeaderV2 header; ///< Parsed container header (v2).
/* Optical auxiliary buffers (NULL if not present) */
uint8_t *sectorPrefix; ///< Raw per-sector prefix (e.g., sync+header) uncorrected.
uint8_t *sectorPrefixCorrected; ///< Corrected variant (post error correction) if stored.
uint8_t *sectorSuffix; ///< Raw per-sector suffix (EDC/ECC) uncorrected.
uint8_t *sectorSuffixCorrected; ///< Corrected suffix if stored separately.
uint8_t *sectorSubchannel; ///< Raw 96-byte subchannel (if captured).
uint8_t *mode2Subheaders; ///< MODE2 Form1/Form2 8-byte subheaders (concatenated).
uint8_t shift; ///< Legacy overall shift (deprecated by data_shift/table_shift).
bool inMemoryDdt; ///< True if primary (and possibly secondary) DDT loaded.
uint64_t *userDataDdt; ///< Legacy flat DDT pointer (NULL when using v2 mini/big arrays).
size_t mappedMemoryDdtSize; ///< Length of mmapped DDT if userDataDdt is mmapped.
uint32_t *sectorPrefixDdt; ///< Legacy CD sector prefix DDT (deprecated by *_Mini/Big).
uint32_t *sectorSuffixDdt; ///< Legacy CD sector suffix DDT.
GeometryBlockHeader geometryBlock; ///< Logical geometry block (if present).
MetadataBlockHeader metadataBlockHeader; ///< Metadata block header.
uint8_t *metadataBlock; ///< Raw metadata UTF-16LE concatenated strings.
TracksHeader tracksHeader; ///< Tracks header (optical) if present.
TrackEntry *trackEntries; ///< Full track list (tracksHeader.entries elements).
CicmMetadataBlock cicmBlockHeader; ///< CICM metadata header (if present).
uint8_t *cicmBlock; ///< CICM XML payload.
DumpHardwareHeader dumpHardwareHeader; ///< Dump hardware header.
struct DumpHardwareEntriesWithData *dumpHardwareEntriesWithData; ///< Array of dump hardware entries + strings.
ImageInfo imageInfo; ///< Exposed high-level image info summary.
CdEccContext *eccCdContext; ///< CD ECC/EDC helper tables (allocated on demand).
uint8_t numberOfDataTracks; ///< Count of tracks considered "data" (sequence 1..99 heuristics).
TrackEntry *dataTracks; ///< Filtered list of data tracks (subset of trackEntries).
bool *readableSectorTags; ///< Per-sector boolean array (optical tags read successfully?).
struct CacheHeader blockHeaderCache; ///< LRU/Cache header for block headers.
struct CacheHeader blockCache; ///< LRU/Cache header for block payloads.
Checksums checksums; ///< Whole-image checksums discovered.
mediaTagEntry *mediaTags; ///< Hash table of extra media tags (uthash root).
DdtHeader2 userDataDdtHeader; ///< Active user data DDT v2 header (primary table meta).
int ddtVersion; ///< DDT version in use (1=legacy, 2=v2 hierarchical).
uint16_t *userDataDdtMini; ///< DDT entries (small variant) primary/secondary current.
uint32_t *userDataDdtBig; ///< DDT entries (big variant) primary/secondary current.
uint16_t *sectorPrefixDdtMini; ///< CD sector prefix corrected DDT (small) if present.
uint16_t *sectorSuffixDdtMini; ///< CD sector suffix corrected DDT (small) if present.
uint64_t cachedDdtOffset; ///< File offset of currently cached secondary DDT (0=none).
uint64_t cachedDdtPosition; ///< Position index of cached secondary DDT.
uint64_t primaryDdtOffset; ///< File offset of the primary DDT v2 table.
uint16_t *cachedSecondaryDdtSmall; ///< Cached secondary table (small entries) or NULL.
uint32_t *cachedSecondaryDdtBig; ///< Cached secondary table (big entries) or NULL.
bool isWriting; ///< True if context opened/created for writing.
BlockHeader currentBlockHeader; ///< Header for block currently being assembled (write path).
uint8_t *writingBuffer; ///< Accumulation buffer for current block data.
int currentBlockOffset; ///< Logical offset inside block (units: bytes or sectors depending on path).
crc64_ctx *crc64Context; ///< Opaque CRC64 context for streaming updates.
int writingBufferPosition; ///< Current size / position within writingBuffer.
uint64_t nextBlockPosition; ///< Absolute file offset where next block will be written.
UT_array *indexEntries; ///< Flattened index entries (UT_array of IndexEntry).
hash_map_t *sectorHashMap; ///< Deduplication hash map (fingerprint->entry mapping).
bool deduplicate; ///< Storage deduplication active (duplicates coalesce).
} aaruformatContext;
/** \struct DumpHardwareEntriesWithData
* \brief In-memory representation of a dump hardware entry plus decoded variable-length fields & extents.
*
* All string pointers are NUL-terminated UTF-8 copies of on-disk data (or NULL if absent). extents array may be NULL
* when no ranges were recorded. Freed during context teardown.
*/
typedef struct DumpHardwareEntriesWithData
{
DumpHardwareEntry entry;
struct DumpExtent *extents;
uint8_t *manufacturer;
uint8_t *model;
uint8_t *revision;
uint8_t *firmware;
uint8_t *serial;
uint8_t *softwareName;
uint8_t *softwareVersion;
uint8_t *softwareOperatingSystem;
DumpHardwareEntry entry; ///< Fixed-size header with lengths & counts.
struct DumpExtent *extents; ///< Array of extents (entry.extents elements) or NULL.
uint8_t *manufacturer; ///< Manufacturer string (UTF-8) or NULL.
uint8_t *model; ///< Model string or NULL.
uint8_t *revision; ///< Hardware revision string or NULL.
uint8_t *firmware; ///< Firmware version string or NULL.
uint8_t *serial; ///< Serial number string or NULL.
uint8_t *softwareName; ///< Dump software name or NULL.
uint8_t *softwareVersion; ///< Dump software version or NULL.
uint8_t *softwareOperatingSystem; ///< Host operating system string or NULL.
} DumpHardwareEntriesWithData;
#pragma pack(push, 1)
/** \struct DumpExtent
* \brief Inclusive [start,end] logical sector range contributed by a single hardware environment.
*/
typedef struct DumpExtent
{
uint64_t start;
uint64_t end;
uint64_t start; ///< Starting LBA (inclusive).
uint64_t end; ///< Ending LBA (inclusive); >= start.
} DumpExtent;
#pragma pack(pop)

View File

@@ -20,11 +20,49 @@
#define LIBAARUFORMAT_CRC64_H
#include <stdint.h>
/** \file aaruformat/crc64.h
* \brief CRC64 (ECMA-182) core context and precomputed slicing-by-4 tables.
*
* Exposes:
* - \ref crc64_ctx: minimal incremental state (initialize crc to CRC64_ECMA_SEED).
* - crc64_table[4][256]: 4-way (slicing-by-4) lookup tables for high-throughput updates.
* - CRC64_ECMA_POLY / CRC64_ECMA_SEED macros matching ECMA-182 (reflected polynomial, all-bits-set seed).
*
* Algorithm characteristics:
* - Polynomial: 0xC96C5795D7870F42 (reflected form).
* - Seed / initial value: 0xFFFFFFFFFFFFFFFFULL.
* - Final XOR: none (raw accumulator is the result).
* - Bit order: reflected; least significant bit processed first.
*
* Table layout & optimization:
* Four 256-entry tables are used (slicing-by-4) allowing 4-byte chunks to be folded per iteration, reducing data
* dependency chains compared to a single-table approach. This improves throughput on modern CPUs with abundant ILP.
*
* Incremental usage (pseudo-code):
* \code{.c}
* crc64_ctx ctx = { .crc = CRC64_ECMA_SEED };
* ctx.crc = crc64_update(ctx.crc, buf, len); // internal helper using crc64_table
* // ctx.crc now holds ECMA-182 CRC64 value.
* \endcode
*
* Thread safety: The table is read-only; each thread must use its own crc64_ctx.
* Endianness: Table values are host-endian 64-bit constants; algorithm result is endianness-agnostic.
*/
/** \struct crc64_ctx
* \brief Minimal ECMA-182 CRC64 incremental state container (running value only).
*/
typedef struct
{
uint64_t crc;
uint64_t crc; ///< Running CRC value (initialize to CRC64_ECMA_SEED before first update).
} crc64_ctx;
/** \var crc64_table
* \brief Precomputed slicing-by-4 ECMA-182 CRC64 lookup tables (4 * 256 * 8 = 8192 bytes).
*
* Each row corresponds to one byte lane in a 4-byte block update; actual folding logic resides in the implementation.
* Content generated offline; do not modify manually.
*/
const static uint64_t crc64_table[4][256] = {
{0x0000000000000000, 0xB32E4CBE03A75F6F, 0xF4843657A840A05B, 0x47AA7AE9ABE7FF34, 0x7BD0C384FF8F5E33,
0xC8FE8F3AFC28015C, 0x8F54F5D357CFFE68, 0x3C7AB96D5468A107, 0xF7A18709FF1EBC66, 0x448FCBB7FCB9E309,
@@ -236,7 +274,9 @@ const static uint64_t crc64_table[4][256] = {
0x1E5CD90C6EC2440D}
};
#define CRC64_ECMA_POLY 0xC96C5795D7870F42
#define CRC64_ECMA_SEED 0xFFFFFFFFFFFFFFFF
/** ECMA-182 reflected polynomial constant. */
#define CRC64_ECMA_POLY 0xC96C5795D7870F42ULL
/** ECMA-182 initial seed (all bits set). */
#define CRC64_ECMA_SEED 0xFFFFFFFFFFFFFFFFULL
#endif // LIBAARUFORMAT_CRC64_H

View File

@@ -19,35 +19,136 @@
#ifndef LIBAARUFORMAT_ERRORS_H
#define LIBAARUFORMAT_ERRORS_H
#define AARUF_ERROR_NOT_AARUFORMAT (-1)
#define AARUF_ERROR_FILE_TOO_SMALL (-2)
#define AARUF_ERROR_INCOMPATIBLE_VERSION (-3)
#define AARUF_ERROR_CANNOT_READ_INDEX (-4)
#define AARUF_ERROR_SECTOR_OUT_OF_BOUNDS (-5)
#define AARUF_ERROR_CANNOT_READ_HEADER (-6)
#define AARUF_ERROR_CANNOT_READ_BLOCK (-7)
#define AARUF_ERROR_UNSUPPORTED_COMPRESSION (-8)
#define AARUF_ERROR_NOT_ENOUGH_MEMORY (-9)
#define AARUF_ERROR_BUFFER_TOO_SMALL (-10)
#define AARUF_ERROR_MEDIA_TAG_NOT_PRESENT (-11)
#define AARUF_ERROR_INCORRECT_MEDIA_TYPE (-12)
#define AARUF_ERROR_TRACK_NOT_FOUND (-13)
#define AARUF_ERROR_REACHED_UNREACHABLE_CODE (-14)
#define AARUF_ERROR_INVALID_TRACK_FORMAT (-15)
#define AARUF_ERROR_SECTOR_TAG_NOT_PRESENT (-16)
#define AARUF_ERROR_CANNOT_DECOMPRESS_BLOCK (-17)
#define AARUF_ERROR_INVALID_BLOCK_CRC (-18)
#define AARUF_ERROR_CANNOT_CREATE_FILE (-19)
#define AARUF_ERROR_INVALID_APP_NAME_LENGTH (-20)
#define AARUF_ERROR_CANNOT_WRITE_HEADER (-21)
#define AARUF_READ_ONLY (-22)
#define AARUF_ERROR_CANNOT_WRITE_BLOCK_HEADER (-23)
#define AARUF_ERROR_CANNOT_WRITE_BLOCK_DATA (-24)
#define AARUF_ERROR_CANNOT_SET_DDT_ENTRY (-25)
/** \file aaruformat/errors.h
* \brief Public error and status code definitions for libaaruformat.
*
* Negative values represent fatal / non-recoverable error conditions returned by library functions.
* Non-negative values (>=0) are either success (0) or sector-level status annotations used when
* decoding per-sector metadata (e.g. a sector not dumped or with corrected/unrecoverable errors).
*
* Usage guidelines:
* - Always test for < 0 to check generic failure without enumerating all codes.
* - Use exact comparisons for caller-specific handling (e.g. retry on AARUF_ERROR_CANNOT_READ_BLOCK).
* - Sector status codes are never returned as fatal function results; they appear in output parameters
* populated by read/identify routines.
*
* Helper: see aaruformat_error_string() for a human-readable textual description suitable for logs.
*/
#define AARUF_STATUS_OK 0
#define AARUF_STATUS_SECTOR_NOT_DUMPED 1
#define AARUF_STATUS_SECTOR_WITH_ERRORS 2
#define AARUF_STATUS_SECTOR_DELETED 3
/** \name Fatal / library-level error codes (negative)
 * @{ */
#define AARUF_ERROR_NOT_AARUFORMAT (-1) ///< Input file/stream failed magic or structural validation.
#define AARUF_ERROR_FILE_TOO_SMALL (-2) ///< File size insufficient for mandatory header / structures.
#define AARUF_ERROR_INCOMPATIBLE_VERSION (-3) ///< Image uses a newer incompatible on-disk version.
#define AARUF_ERROR_CANNOT_READ_INDEX (-4) ///< Index block unreadable / truncated / bad identifier.
#define AARUF_ERROR_SECTOR_OUT_OF_BOUNDS (-5) ///< Requested logical sector outside media bounds.
#define AARUF_ERROR_CANNOT_READ_HEADER (-6) ///< Failed to read container header.
#define AARUF_ERROR_CANNOT_READ_BLOCK (-7) ///< Generic block read failure (seek/read error).
#define AARUF_ERROR_UNSUPPORTED_COMPRESSION (-8) ///< Block marked with unsupported compression algorithm.
#define AARUF_ERROR_NOT_ENOUGH_MEMORY (-9) ///< Memory allocation failure (critical).
#define AARUF_ERROR_BUFFER_TOO_SMALL (-10) ///< Caller-supplied buffer insufficient for data.
#define AARUF_ERROR_MEDIA_TAG_NOT_PRESENT (-11) ///< Requested media tag absent.
#define AARUF_ERROR_INCORRECT_MEDIA_TYPE (-12) ///< Operation incompatible with image media type.
#define AARUF_ERROR_TRACK_NOT_FOUND (-13) ///< Referenced track number not present.
#define AARUF_ERROR_REACHED_UNREACHABLE_CODE (-14) ///< Internal logic assertion hit unexpected path.
#define AARUF_ERROR_INVALID_TRACK_FORMAT (-15) ///< Track metadata internally inconsistent or malformed.
#define AARUF_ERROR_SECTOR_TAG_NOT_PRESENT (-16) ///< Requested sector tag (e.g. subchannel/prefix) not stored.
#define AARUF_ERROR_CANNOT_DECOMPRESS_BLOCK (-17) ///< Decompression routine failed or size mismatch.
#define AARUF_ERROR_INVALID_BLOCK_CRC (-18) ///< CRC64 mismatch indicating corruption.
#define AARUF_ERROR_CANNOT_CREATE_FILE (-19) ///< Output file could not be created / opened for write.
#define AARUF_ERROR_INVALID_APP_NAME_LENGTH (-20) ///< Application name field length invalid (sanity limit).
#define AARUF_ERROR_CANNOT_WRITE_HEADER (-21) ///< Failure writing container header.
#define AARUF_READ_ONLY (-22) ///< Operation requires write mode but context is read-only.
#define AARUF_ERROR_CANNOT_WRITE_BLOCK_HEADER (-23) ///< Failure writing block header.
#define AARUF_ERROR_CANNOT_WRITE_BLOCK_DATA (-24) ///< Failure writing block payload.
#define AARUF_ERROR_CANNOT_SET_DDT_ENTRY (-25) ///< Failed to encode/store a DDT entry (overflow or IO).
/** @} */
/** \name Non-fatal sector status codes (non-negative)
 * Returned through output parameters to describe individual sector state.
 * @{ */
#define AARUF_STATUS_OK 0 ///< Sector present and read without uncorrectable errors.
#define AARUF_STATUS_SECTOR_NOT_DUMPED 1 ///< Sector not captured (gap / missing / intentionally skipped).
#define AARUF_STATUS_SECTOR_WITH_ERRORS 2 ///< Sector present but with unrecoverable or flagged errors.
#define AARUF_STATUS_SECTOR_DELETED 3 ///< Sector logically marked deleted (e.g. filesystem deleted area).
/** @} */
/** \brief Map an AaruFormat error or status code to a fixed, human-readable description.
 *
 * Meant for diagnostics and log output. Every returned pointer refers to a string literal, so the
 * caller must neither modify nor free it. Any code that is not one of the AARUF_ERROR_*, AARUF_READ_ONLY
 * or AARUF_STATUS_* values defined above maps to "Unknown error/status". The function is defined
 * static inline in this header, so no separate translation unit is needed to use it.
 *
 * \param code Numeric code to describe: negative for errors, zero or positive for statuses.
 * \return Pointer to a constant NUL-terminated description; never NULL.
 */
static inline const char *aaruformat_error_string(int code)
{
    const char *text;

    switch(code)
    {
        /* Fatal / library-level errors (negative codes) */
        case AARUF_ERROR_NOT_AARUFORMAT:            text = "Not an AaruFormat image";           break;
        case AARUF_ERROR_FILE_TOO_SMALL:            text = "File too small";                    break;
        case AARUF_ERROR_INCOMPATIBLE_VERSION:      text = "Incompatible image version";        break;
        case AARUF_ERROR_CANNOT_READ_INDEX:         text = "Cannot read index";                 break;
        case AARUF_ERROR_SECTOR_OUT_OF_BOUNDS:      text = "Sector out of bounds";              break;
        case AARUF_ERROR_CANNOT_READ_HEADER:        text = "Cannot read header";                break;
        case AARUF_ERROR_CANNOT_READ_BLOCK:         text = "Cannot read block";                 break;
        case AARUF_ERROR_UNSUPPORTED_COMPRESSION:   text = "Unsupported compression";           break;
        case AARUF_ERROR_NOT_ENOUGH_MEMORY:         text = "Not enough memory";                 break;
        case AARUF_ERROR_BUFFER_TOO_SMALL:          text = "Buffer too small";                  break;
        case AARUF_ERROR_MEDIA_TAG_NOT_PRESENT:     text = "Media tag not present";             break;
        case AARUF_ERROR_INCORRECT_MEDIA_TYPE:      text = "Incorrect media type";              break;
        case AARUF_ERROR_TRACK_NOT_FOUND:           text = "Track not found";                   break;
        case AARUF_ERROR_REACHED_UNREACHABLE_CODE:  text = "Internal unreachable code reached"; break;
        case AARUF_ERROR_INVALID_TRACK_FORMAT:      text = "Invalid track format";              break;
        case AARUF_ERROR_SECTOR_TAG_NOT_PRESENT:    text = "Sector tag not present";            break;
        case AARUF_ERROR_CANNOT_DECOMPRESS_BLOCK:   text = "Cannot decompress block";           break;
        case AARUF_ERROR_INVALID_BLOCK_CRC:         text = "Invalid block CRC";                 break;
        case AARUF_ERROR_CANNOT_CREATE_FILE:        text = "Cannot create file";                break;
        case AARUF_ERROR_INVALID_APP_NAME_LENGTH:   text = "Invalid application name length";   break;
        case AARUF_ERROR_CANNOT_WRITE_HEADER:       text = "Cannot write header";               break;
        case AARUF_READ_ONLY:                       text = "Read-only context";                 break;
        case AARUF_ERROR_CANNOT_WRITE_BLOCK_HEADER: text = "Cannot write block header";         break;
        case AARUF_ERROR_CANNOT_WRITE_BLOCK_DATA:   text = "Cannot write block data";           break;
        case AARUF_ERROR_CANNOT_SET_DDT_ENTRY:      text = "Cannot set DDT entry";              break;

        /* Per-sector statuses (non-negative codes) */
        case AARUF_STATUS_OK:                       text = "OK";                                break;
        case AARUF_STATUS_SECTOR_NOT_DUMPED:        text = "Sector not dumped";                 break;
        case AARUF_STATUS_SECTOR_WITH_ERRORS:       text = "Sector with errors";                break;
        case AARUF_STATUS_SECTOR_DELETED:           text = "Sector deleted";                    break;

        /* Anything else is not a code this header defines. */
        default:                                    text = "Unknown error/status";              break;
    }

    return text;
}
#endif // LIBAARUFORMAT_ERRORS_H

View File

@@ -22,22 +22,40 @@
#include <stdbool.h>
#include <stdlib.h>
/** \struct kv_pair_t
* \brief Single key/value slot used internally by the open-addressing hash map.
*
* Collision resolution strategy (implementation detail): linear or quadratic probing (see source). An empty
* slot is typically represented by a key sentinel (e.g. 0 or another reserved value). Callers never interact
* with individual kv_pair_t entries directly; they are managed through the map API.
*/
typedef struct
{
uint64_t key;
uint64_t value;
uint64_t key; ///< Stored key (64-bit). May use a reserved sentinel to denote an empty slot.
uint64_t value; ///< Associated value payload (64-bit) stored alongside the key.
} kv_pair_t;
/** \struct hash_map_t
* \brief Minimal open-addressing hash map for 64-bit key/value pairs used in deduplication lookup.
*
* Fields:
* - table: Pointer to contiguous array of kv_pair_t entries (capacity == size).
* - size: Total number of slots allocated in table (must be >= 1).
* - count: Number of occupied (non-empty) slots currently in use.
*
* Load factor guidance: insert performance degrades as count approaches size; callers may rebuild with a larger
* size when (count * 100 / size) exceeds a chosen threshold (e.g. 70-80%). No automatic resizing is performed.
*/
typedef struct
{
kv_pair_t *table;
size_t size;
size_t count;
kv_pair_t *table; ///< Array of key/value slots of length == size.
size_t size; ///< Allocated slot capacity of table.
size_t count; ///< Number of active (filled) entries.
} hash_map_t;
hash_map_t *create_map(size_t size);
void free_map(hash_map_t *map);
bool insert_map(hash_map_t *map, uint64_t key, uint64_t value);
bool lookup_map(const hash_map_t *map, uint64_t key, uint64_t *out_value);
void free_map(hash_map_t *map);
bool insert_map(hash_map_t *map, uint64_t key, uint64_t value);
bool lookup_map(const hash_map_t *map, uint64_t key, uint64_t *out_value);
#endif // LIBAARUFORMAT_HASH_MAP_H

View File

@@ -8,49 +8,49 @@
#include <stdint.h>
#include <uthash.h>
/** \struct CacheEntry
* \brief Single hash entry in the in-memory cache.
*
* This structure is managed by uthash (a macro-based hash table that resolves collisions by per-bucket chaining).
* It represents one key/value association tracked by the cache. The cache implementation supports
* both string keys (null-terminated) and 64-bit numeric keys; numeric keys are stored by casting
* to a temporary string buffer upstream (see implementation). Callers do not allocate or free
* individual entries directly; use the cache API helpers.
*
* Lifetime & ownership:
* - key points either to a heap-allocated C string owned by the cache or to a short-lived buffer
* duplicated internally; callers must not free it after insertion.
* - value is an opaque pointer supplied by caller; the cache does not take ownership of the pointee
* (caller remains responsible for the underlying object unless documented otherwise).
*/
struct CacheEntry
{
char *key;
void *value;
UT_hash_handle hh;
char *key; ///< Null-terminated key string (unique within the cache). May encode numeric keys.
void *value; ///< Opaque value pointer associated with key (not freed automatically on eviction/clear).
UT_hash_handle hh; ///< uthash bookkeeping handle linking this entry into the hash table (last member by convention only).
};
/** \struct CacheHeader
* \brief Cache top-level descriptor encapsulating the hash table root and capacity limit.
*
* The cache enforces an upper bound (max_items) on the number of tracked entries. Insert helpers are expected
* to evict (or refuse) when the limit is exceeded (strategy defined in implementation; current behavior may be
* simple non-evicting if not yet implemented as a true LRU). The cache pointer holds the uthash root (NULL when
* empty).
*
* Fields:
* - max_items: Maximum number of entries allowed; 0 means "no explicit limit" if accepted by implementation.
* - cache: uthash root pointer; NULL when the cache is empty.
*/
struct CacheHeader
{
uint64_t max_items;
struct CacheEntry *cache;
uint64_t max_items; ///< Hard limit for number of entries (policy: enforce/ignore depends on implementation).
struct CacheEntry *cache; ///< Hash root (uthash). NULL when empty.
};
/**
* Finds an item in the specified cache
* @param cache Pointer to the cache header
* @param key Key
* @return Value if found, NULL if not
*/
void *find_in_cache(struct CacheHeader *cache, const char *key);
/**
* Adds an item to the specified cache
* @param cache Pointer to the cache header
* @param key Key
* @param value Value
*/
void add_to_cache(struct CacheHeader *cache, const char *key, void *value);
/**
* Finds an item in the specified cache using a 64-bit integer key
* @param cache Pointer to the cache header
* @param key Key
* @return Value if found, NULL if not
*/
void add_to_cache(struct CacheHeader *cache, const char *key, void *value);
void *find_in_cache_uint64(struct CacheHeader *cache, uint64_t key);
/**
* Adds an item to the specified cache using a 64-bit integer key
* @param cache Pointer to the cache header
* @param key Key
* @param value Value
*/
void add_to_cache_uint64(struct CacheHeader *cache, uint64_t key, void *value);
void add_to_cache_uint64(struct CacheHeader *cache, uint64_t key, void *value);
#endif // LIBAARUFORMAT_LRU_H

View File

@@ -19,29 +19,80 @@
#ifndef LIBAARUFORMAT_CHECKSUM_H
#define LIBAARUFORMAT_CHECKSUM_H
#include <stdint.h> // Fixed-width integer types for on-disk structures.
#pragma pack(push, 1)
/**
* Checksum block, contains a checksum of all user data sectors (except for optical discs that is 2352 uint8_ts raw
* sector if available
* */
typedef struct ChecksumHeader {
/**Identifier, <see cref="BlockType.ChecksumBlock" /> */
uint32_t identifier;
/**Length in uint8_ts of the block */
uint32_t length;
/**How many checksums follow */
uint8_t entries;
* \file aaruformat/structs/checksum.h
* \brief On-disk layout definitions for the checksum block (BlockType::ChecksumBlock).
*
* A checksum block stores one or more whole-image (user data) checksums. For optical media the
* user data definition follows the format's raw sector rules (e.g. 2352-byte raw sector when available).
*
* Binary layout (all integers are little-endian, structure is packed):
*
* +------------------------------+-------------------------------+
* | Field | Size (bytes) |
* +==============================+===============================+
* | ChecksumHeader | sizeof(ChecksumHeader)=9 |
* | identifier | 4 (BlockType::ChecksumBlock) |
* | length | 4 (payload bytes that follow)|
* | entries | 1 (number of checksum entries)|
* +------------------------------+-------------------------------+
* | Repeated for each entry: |
* | ChecksumEntry | sizeof(ChecksumEntry)=5 |
* | type | 1 (ChecksumAlgorithm) |
* | length | 4 (digest length) |
* | digest bytes | length |
* +------------------------------+-------------------------------+
*
* Thus, the payload size (ChecksumHeader.length) MUST equal the sum over all entries of:
* sizeof(ChecksumEntry) + entry.length.
*
* Typical digest lengths:
* - Md5: 16 bytes
* - Sha1: 20 bytes
* - Sha256: 32 bytes
* - SpamSum: variable length ASCII, NOT null-terminated on disk (a terminating '\0' may be appended in memory).
*
* \warning The structures are packed; never rely on host compiler default padding or directly casting from a buffer
* without ensuring correct endianness if porting to big-endian systems (current implementation assumes LE).
*
* \see BlockType
* \see ChecksumAlgorithm
*/
/**
* \struct ChecksumHeader
* \brief Header that precedes the sequence of checksum entries for a checksum block.
*
* After this header, exactly \ref ChecksumHeader::length bytes follow containing \ref ChecksumHeader::entries
* consecutive \ref ChecksumEntry records, each immediately followed by its digest payload.
*/
typedef struct ChecksumHeader
{
uint32_t identifier; ///< Block identifier, must be BlockType::ChecksumBlock.
uint32_t length; ///< Length in bytes of the payload (all entries + their digest data, excluding this header).
uint8_t entries; ///< Number of checksum entries that follow in the payload.
} ChecksumHeader;
/**Checksum entry, followed by checksum data itself */
typedef struct ChecksumEntry {
/**Checksum algorithm */
uint8_t type;
/**Length in uint8_ts of checksum that follows this structure */
uint32_t length;
/**
* \struct ChecksumEntry
* \brief Per-checksum metadata immediately followed by the digest / signature bytes.
*
* For fixed-length algorithms the \ref length MUST match the known digest size. For SpamSum it is variable.
* The bytes immediately following this structure (not null-terminated) constitute the digest and are exactly
* \ref length bytes long.
*
* Order of entries is not mandated; readers should scan all entries and match by \ref type.
*/
typedef struct ChecksumEntry
{
uint8_t type; ///< Algorithm used (value from \ref ChecksumAlgorithm).
uint32_t length; ///< Length in bytes of the digest that immediately follows this structure.
} ChecksumEntry;
#pragma pack(pop)
#endif //LIBAARUFORMAT_CHECKSUM_H
#endif // LIBAARUFORMAT_CHECKSUM_H

View File

@@ -19,37 +19,82 @@
#ifndef LIBAARUFORMAT_DATA_H
#define LIBAARUFORMAT_DATA_H
#include <stdint.h> // Fixed width integer types used in on-disk packed structs.
#pragma pack(push, 1)
/**Block header, precedes block data */
typedef struct BlockHeader {
/**Identifier, <see cref="BlockType.DataBlock" /> */
uint32_t identifier;
/**Type of data contained by this block */
uint16_t type;
/**Compression algorithm used to compress the block */
uint16_t compression;
/**Size in uint8_ts of each sector contained in this block */
uint32_t sectorSize;
/**Compressed length for the block */
uint32_t cmpLength;
/**Uncompressed length for the block */
uint32_t length;
/**CRC64-ECMA of the compressed block */
uint64_t cmpCrc64;
/**CRC64-ECMA of the uncompressed block */
uint64_t crc64;
/**
* \file aaruformat/structs/data.h
* \brief On-disk layout structures for data-bearing and geometry blocks.
*
* These packed structures describe the headers that precede variable-length payloads
* inside blocks whose identifiers are enumerated in \ref BlockType.
* All integer fields are stored little-endian on disk. The library currently assumes a
* little-endian host; if ported to a big-endian architecture explicit byte swapping will be required.
*
* Layout of a data block (BlockType::DataBlock):
* BlockHeader (sizeof(BlockHeader) bytes)
* Compressed payload (cmpLength bytes)
*
* Payload decoding:
* - Apply the algorithm indicated by \ref BlockHeader::compression (\ref CompressionType) to the
* cmpLength bytes following the header to obtain exactly \ref BlockHeader::length bytes.
* - The uncompressed data MUST be an integer multiple of \ref BlockHeader::sectorSize.
* - A CRC64-ECMA is provided for both compressed (cmpCrc64) and uncompressed (crc64) forms to allow
* validation at either stage of the pipeline.
*
* Geometry block (BlockType::GeometryBlock) has a \ref GeometryBlockHeader followed by no additional
* fixed payload in the current format version; it conveys legacy CHS-style logical geometry metadata.
*
* \warning These structs are packed; do not take their address and assume natural alignment.
* \see BlockType
* \see DataType
* \see CompressionType
*/
/**
* \struct BlockHeader
* \brief Header preceding the compressed data payload of a data block (BlockType::DataBlock).
*
* Invariants:
* - cmpLength > 0 unless length == 0 (empty block)
* - length == 0 implies cmpLength == 0
* - If compression == CompressionType::None then cmpLength == length
* - length % sectorSize == 0
*
* Validation strategy (recommended for readers):
* 1. Verify identifier == BlockType::DataBlock.
* 2. Verify sectorSize is non-zero and a power-of-two or a commonly used size (512/1024/2048/4096/2352).
* 3. Verify invariants above and CRCs after (de)compression.
*/
typedef struct BlockHeader
{
uint32_t identifier; ///< Block identifier, must be BlockType::DataBlock.
uint16_t type; ///< Logical data classification (value from \ref DataType).
uint16_t compression; ///< Compression algorithm used (value from \ref CompressionType).
uint32_t sectorSize; ///< Size in bytes of each logical sector represented in this block.
uint32_t cmpLength; ///< Size in bytes of the compressed payload immediately following this header.
uint32_t length; ///< Size in bytes of the uncompressed payload resulting after decompression.
uint64_t cmpCrc64; ///< CRC64-ECMA of the compressed payload (cmpLength bytes).
uint64_t crc64; ///< CRC64-ECMA of the uncompressed payload (length bytes).
} BlockHeader;
/**Geometry block, contains physical geometry information */
typedef struct GeometryBlockHeader {
/**Identifier, <see cref="BlockType.GeometryBlock" /> */
uint32_t identifier;
uint32_t cylinders;
uint32_t heads;
uint32_t sectorsPerTrack;
/**
* \struct GeometryBlockHeader
* \brief Legacy CHS style logical geometry metadata (BlockType::GeometryBlock).
*
* Total logical sectors implied by this header is cylinders * heads * sectorsPerTrack.
* Sector size is not included here and must be derived from context (e.g., accompanying metadata
* or defaulting to 512 for many block devices).
*/
typedef struct GeometryBlockHeader
{
uint32_t identifier; ///< Block identifier, must be BlockType::GeometryBlock.
uint32_t cylinders; ///< Number of cylinders.
uint32_t heads; ///< Number of heads (tracks per cylinder).
uint32_t sectorsPerTrack; ///< Number of sectors per track.
} GeometryBlockHeader;
#pragma pack(pop)
#endif //LIBAARUFORMAT_DATA_H
#endif // LIBAARUFORMAT_DATA_H

View File

@@ -19,71 +19,149 @@
#ifndef LIBAARUFORMAT_DDT_H
#define LIBAARUFORMAT_DDT_H
#include <stdint.h> // fixed-width types for on-disk layout
#pragma pack(push, 1)
/**Header for a deduplication table. Table follows it */
/** \file aaruformat/structs/ddt.h
* \brief On-disk headers for Deduplication Data Tables (DDT) versions 1 and 2.
*
* A DDT maps logical sector indices (LBAs within an image's logical address space) to (block, sector)
* pairs plus a base file offset, enabling content de-duplication inside the container. Two generations
* exist:
* - DdtHeader ("version 1") flat table.
* - DdtHeader2 ("version 2") hierarchical, multi-level subtables for scalability.
*
* All integers are little-endian. Structures are packed (1-byte alignment). When porting to a big-endian
* architecture callers must perform byte swapping. Do not rely on compiler-introduced padding.
*
* Compression of the table body (entries array) follows the same conventions as data blocks: first
* decompress according to the compression enum, then validate CRC64 for uncompressed contents.
*
* Related enumerations:
* - BlockType::DeDuplicationTable / BlockType::DeDuplicationTable2
* - CompressionType
* - DataType
* - DdtSizeType (for DdtHeader2::sizeType)
*/
/**
* \struct DdtHeader
* \brief Header preceding a version 1 (flat) deduplication table body.
*
* Immediately after this header there are \ref entries table records (compressed if \ref compression != None).
* Each table record encodes a pointer using a file byte-offset component and a sector offset inside a block:
* logicalEntryValue = ((uint64_t)fileByteOffset << shift) + sectorOffsetWithinBlock
* where fileByteOffset is measured in bytes (granularity depends on shift) and sectorOffsetWithinBlock is
* relative to the start of the referenced data block. The sector size must be taken from the corresponding
* data block(s) (see BlockHeader::sectorSize) or higher-level metadata.
*
* Invariants:
* - cmpLength == length if compression == CompressionType::None
* - length % (entrySize) == 0 after decompression (implementation-defined entry size)
* - entries * entrySize == length
* - entries > 0 implies length > 0
*/
typedef struct DdtHeader
{
/**Identifier, <see cref="BlockType.DeDuplicationTable" /> */
uint32_t identifier;
/**Type of data pointed by this DDT */
uint16_t type;
/**Compression algorithm used to compress the DDT */
uint16_t compression;
/**Each entry is ((uint8_t offset in file) &lt;&lt; shift) + (sector offset in block) */
uint8_t shift;
/**How many entries are in the table */
uint64_t entries;
/**Compressed length for the DDT */
uint64_t cmpLength;
/**Uncompressed length for the DDT */
uint64_t length;
/**CRC64-ECMA of the compressed DDT */
uint64_t cmpCrc64;
/**CRC64-ECMA of the uncompressed DDT */
uint64_t crc64;
uint32_t identifier; ///< Block identifier, must be BlockType::DeDuplicationTable.
uint16_t type; ///< Data classification (\ref DataType) for sectors referenced by this table.
uint16_t compression; ///< Compression algorithm for the table body (\ref CompressionType).
uint8_t shift; ///< Left shift applied to per-entry file offset component forming logicalEntryValue.
uint64_t entries; ///< Number of deduplication entries contained in (uncompressed) table.
uint64_t cmpLength; ///< Size in bytes of compressed entries payload.
uint64_t length; ///< Size in bytes of uncompressed entries payload.
uint64_t cmpCrc64; ///< CRC64-ECMA of the compressed payload.
uint64_t crc64; ///< CRC64-ECMA of the uncompressed payload.
} DdtHeader;
/**
* \struct DdtHeader2
* \brief Header preceding a version 2 hierarchical deduplication table.
*
* Version 2 introduces multi-level tables to efficiently address very large images by subdividing
* the logical address space. Tables at higher levels partition regions; leaves contain direct
* (block, sector) entry mappings. Navigation uses \ref tableLevel (0 = root) and \ref levels (total depth).
*
* Logical sector (LBA) mapping (actual implementation in decode_ddt_{single,multi}_level_v2):
* 1. Let L be the requested logical sector (can be negative externally). Internal index I = L + negative.
* Valid range: 0 <= I < blocks. (Total user-data sectors often = blocks - negative - overflow.)
* 2. If tableShift == 0 (single-level): entryIndex = I.
* Else (multi-level):
* itemsPerPrimaryEntry = 1 << tableShift
* primaryIndex = I / itemsPerPrimaryEntry
* secondaryIndex = I % itemsPerPrimaryEntry
* The primary table entry at primaryIndex yields a secondary DDT file offset (scaled by 2^blockAlignmentShift),
* whose table entries are then indexed by secondaryIndex.
* 3. Read raw DDT entry value E (16-bit if sizeType == SmallDdtSizeType, 32-bit if BigDdtSizeType).
* 4. If E == 0: sector_status = SectorStatusNotDumped; offset=block_offset=0.
* Otherwise extract:
* statusBits = E >> 12 (small) or E >> 28 (big)
* baseBits = E & 0x0FFF (small) or E & 0x0FFFFFFF (big)
* sectorOffsetWithinBlock = baseBits & ((1 << dataShift) - 1)
* blockIndex = baseBits >> dataShift
* block_offset (bytes) = blockIndex << blockAlignmentShift
* offset (sector units inside block) = sectorOffsetWithinBlock
* 5. The consumer combines block_offset, offset, and the (external) logical sector size to locate data.
*
* Field roles:
* - negative: Count of leading negative LBAs supported; added to L to form internal index.
* - overflow: Count of trailing LBAs beyond the user area upper bound that are still dumped and have
* normal DDT entries (e.g. optical disc lead-out). Symmetrical to 'negative' on the high end.
* - start: For secondary tables, base internal index covered (written when creating new tables). Current decoding
* logic does not consult this field (future-proof placeholder).
* - blockAlignmentShift: log2 alignment of stored data blocks (byte granularity of block_offset).
* - dataShift: log2 of the number of addressable sectors per increment of blockIndex bitfield unit.
* - tableShift: log2 of number of logical sectors covered by a single primary-table pointer (multi-level only).
* - sizeType: Selects entry width (small=16b, big=32b) impacting available bits for blockIndex+offset.
*
* Notes & current limitations:
* - User area sector count = blocks - negative - overflow.
* - Valid external LBA range exposed by the image = [-negative, (blocks - negative - 1)].
* * Negative range: [-negative, -1]
* * User area range: [0, (blocks - negative - overflow - 1)]
* * Overflow range: [(blocks - negative - overflow), (blocks - negative - 1)]
* - Both negative and overflow ranges are stored with normal DDT entries (if present), enabling complete
* reproduction of lead-in / lead-out or similar padding regions.
* - start is presently ignored during decoding; integrity checks against it may be added in future revisions.
* - No masking is applied to I besides array bounds; callers must ensure L is within representable range.
*
* Example (Compact Disc):
* Disc has 360000 user sectors. Lead-in captured as 15000 negative sectors and lead-out as 15000 overflow sectors.
* negative = 15000
* overflow = 15000
* user sectors = 360000
* blocks (internal span) = negative + user + overflow = 390000
* External LBA spans: -15000 .. 374999
* * Negative: -15000 .. -1 (15000 sectors)
* * User: 0 .. 359999 (360000 sectors)
* * Overflow: 360000 .. 374999 (15000 sectors)
* Internal index I for any external L is I = L + negative.
* User area sector count reported to callers (ctx->imageInfo.Sectors) = blocks - negative - overflow = 360000.
*/
typedef struct DdtHeader2
{
/**Identifier, <see cref="BlockType.DeDuplicationTable" /> */
uint32_t identifier;
/**Type of data pointed by this DDT */
uint16_t type;
/**Compression algorithm used to compress the DDT */
uint16_t compression;
/**How many levels of subtables are present */
uint8_t levels;
/**Which level this table belongs to */
uint8_t tableLevel;
/**Pointer to absolute byte offset in file where the previous level table is located */
uint64_t previousLevelOffset;
/**Negative displacement of LBAs */
uint16_t negative;
/**Number of blocks in media */
uint64_t blocks;
/**Positive overflow displacement of LBAs */
uint16_t overflow;
/**First LBA contained in this table */
uint64_t start;
/**Block alignment boundaries */
uint8_t blockAlignmentShift;
/**Data shift */
uint8_t dataShift;
/**Table shift */
uint8_t tableShift;
/**Size type */
uint8_t sizeType;
/**Entries in this table */
uint64_t entries;
/**Compressed length for the DDT */
uint64_t cmpLength;
/**Uncompressed length for the DDT */
uint64_t length;
/**CRC64-ECMA of the compressed DDT */
uint64_t cmpCrc64;
/**CRC64-ECMA of the uncompressed DDT */
uint64_t crc64;
uint32_t identifier; ///< Block identifier, must be BlockType::DeDuplicationTable2.
uint16_t type; ///< Data classification (\ref DataType) for sectors referenced by this table.
uint16_t compression; ///< Compression algorithm for this table body (\ref CompressionType).
uint8_t levels; ///< Total number of hierarchy levels (root depth); > 0.
uint8_t tableLevel; ///< Zero-based level index of this table (0 = root, increases downward).
uint64_t previousLevelOffset; ///< Absolute byte offset of the parent (previous) level table; 0 if root.
uint16_t negative; ///< Leading negative LBA count; added to external L to build internal index.
uint64_t blocks; ///< Total internal span (negative + usable + overflow) in logical sectors.
uint16_t overflow; ///< Trailing dumped sectors beyond user area (overflow range), still mapped with entries.
uint64_t
start; ///< Base internal index covered by this table (used for secondary tables; currently informational).
uint8_t blockAlignmentShift; ///< 2^blockAlignmentShift = block alignment boundary in bytes.
uint8_t dataShift; ///< 2^dataShift = sectors represented per increment in blockIndex field.
uint8_t tableShift; ///< 2^tableShift = number of logical sectors per primary entry (multi-level only; 0 for
///< single-level or secondary tables).
uint8_t sizeType; ///< Entry size variant (\ref DdtSizeType) controlling width of E.
uint64_t entries; ///< Number of entries contained in (uncompressed) table payload.
uint64_t cmpLength; ///< Compressed payload size in bytes.
uint64_t length; ///< Uncompressed payload size in bytes.
uint64_t cmpCrc64; ///< CRC64-ECMA of compressed table payload.
uint64_t crc64; ///< CRC64-ECMA of uncompressed table payload.
} DdtHeader2;
#pragma pack(pop)

View File

@@ -19,42 +19,109 @@
#ifndef LIBAARUFORMAT_DUMP_H
#define LIBAARUFORMAT_DUMP_H
#include <stdint.h> /* Fixed-width integer types for ondisk packed structures */
#pragma pack(push, 1)
/**Dump hardware block, contains a list of hardware used to dump the media on this image */
typedef struct DumpHardwareHeader {
/**Identifier, <see cref="BlockType.DumpHardwareBlock" /> */
uint32_t identifier;
/**How many entries follow this header */
uint16_t entries;
/**Size of the whole block, not including this header, in uint8_ts */
uint32_t length;
/**CRC64-ECMA of the block */
uint64_t crc64;
/** \file aaruformat/structs/dump.h
* \brief Packed on-disk structures describing hardware and software used during image acquisition.
*
* A Dump Hardware block (identifier = BlockType::DumpHardwareBlock) records one or more dump "environments" —
* typically combinations of a physical device (drive, controller, adapter) and the software stack that
* performed the read operation. Each environment is represented by a \ref DumpHardwareEntry followed by a
* sequence of UTF-8 strings and an optional array of extent ranges (\ref DumpExtent, defined in context.h) that
* delimit portions of the medium this environment contributed to.
*
* Binary layout (little-endian, packed, all multi-byte integers LE):
*
* DumpHardwareHeader (sizeof = 18 bytes)
* identifier (4) -> BlockType::DumpHardwareBlock
* entries (2) -> number of following hardware entries
* length (4) -> total bytes of payload that follow this header
* crc64 (8) -> CRC64-ECMA of the payload bytes
*
* Repeated for i in [0, entries):
* DumpHardwareEntry (36 bytes)
* manufacturerLength (4)
* modelLength (4)
* revisionLength (4)
* firmwareLength (4)
* serialLength (4)
* softwareNameLength (4)
* softwareVersionLength (4)
* softwareOperatingSystemLength (4)
* extents (4) -> number of DumpExtent structs after the strings
*
* Variable-length UTF-8 strings (not NUL-terminated on disk) appear immediately after the entry, in the
* exact order of the length fields above; each string is present only if its length > 0. The reader allocates
* an extra byte to append '\0' for in-memory convenience.
*
* Array of 'extents' DumpExtent structures (each 16 bytes: start, end) follows the strings if extents > 0.
* The semantic of each extent is an inclusive [start, end] logical sector (or unit) range contributed by
* this hardware/software combination.
*
* CRC semantics:
* - crc64 covers exactly 'length' bytes immediately following the header.
* - For legacy images with header.imageMajorVersion <= AARUF_VERSION_V1 the original C# writer produced a
* byte-swapped CRC; the library compensates internally (see process_dumphw_block()).
*
* Invariants / validation recommendations:
* - identifier == BlockType::DumpHardwareBlock
* - Accumulated size of all (entry + strings + extents arrays) == length
* - All length fields are trusted only after bounds checking against remaining payload bytes
* - Strings are raw UTF-8 data with no implicit terminator
* - extents * sizeof(DumpExtent) fits inside remaining payload
*
* Memory management notes (runtime library):
* - Each string is malloc'ed with +1 byte for terminator during processing.
* - Extents array is malloc'ed per entry when extents > 0.
* - See aaruformatContext::dumpHardwareEntriesWithData for owning pointers.
*
* \warning Structures are packed; never rely on natural alignment when mapping from a byte buffer.
* \see DumpHardwareHeader
* \see DumpHardwareEntry
* \see DumpExtent (in context.h)
* \see BlockType
*/
/** \struct DumpHardwareHeader
* \brief Header that precedes a sequence of dump hardware entries and their variable-length payload.
*/
typedef struct DumpHardwareHeader
{
uint32_t identifier; ///< Block identifier, must be BlockType::DumpHardwareBlock.
uint16_t entries; ///< Number of DumpHardwareEntry records that follow.
uint32_t length; ///< Total payload bytes after this header (sum of entries, strings, and extents arrays).
uint64_t crc64; ///< CRC64-ECMA of the payload (byte-swapped for legacy v1 images, handled automatically).
} DumpHardwareHeader;
/**Dump hardware entry, contains length of strings that follow, in the same order as the length, this structure */
typedef struct DumpHardwareEntry {
/**Length of UTF-8 manufacturer string */
uint32_t manufacturerLength;
/**Length of UTF-8 model string */
uint32_t modelLength;
/**Length of UTF-8 revision string */
uint32_t revisionLength;
/**Length of UTF-8 firmware version string */
uint32_t firmwareLength;
/**Length of UTF-8 serial string */
uint32_t serialLength;
/**Length of UTF-8 software name string */
uint32_t softwareNameLength;
/**Length of UTF-8 software version string */
uint32_t softwareVersionLength;
/**Length of UTF-8 software operating system string */
uint32_t softwareOperatingSystemLength;
/**How many extents are after the strings */
uint32_t extents;
/** \struct DumpHardwareEntry
* \brief Per-environment length table describing subsequent UTF-8 strings and optional extent array.
*
* Immediately after this structure the variable-length UTF-8 strings appear in the documented order, each
* present only if its corresponding length is non-zero. No padding is present between strings. When all
* strings are consumed, an array of \ref DumpExtent follows if \ref extents > 0.
*
* All length fields measure bytes (not characters) and exclude any in-memory NUL terminator added by the reader.
*
* Typical semantics:
* - manufacturer/model/revision/firmware/serial identify the hardware device.
* - softwareName/softwareVersion/softwareOperatingSystem identify the acquisition software environment.
* - extents list which logical ranges this environment actually dumped (useful for multi-device composites).
*/
typedef struct DumpHardwareEntry
{
uint32_t manufacturerLength; ///< Length in bytes of manufacturer UTF-8 string.
uint32_t modelLength; ///< Length in bytes of model UTF-8 string.
uint32_t revisionLength; ///< Length in bytes of revision / hardware revision string.
uint32_t firmwareLength; ///< Length in bytes of firmware version string.
uint32_t serialLength; ///< Length in bytes of device serial number string.
uint32_t softwareNameLength; ///< Length in bytes of dumping software name string.
uint32_t softwareVersionLength; ///< Length in bytes of dumping software version string.
uint32_t softwareOperatingSystemLength; ///< Length in bytes of host operating system string.
uint32_t extents; ///< Number of DumpExtent records following the strings (0 = none).
} DumpHardwareEntry;
#pragma pack(pop)
#endif //LIBAARUFORMAT_DUMP_H
#endif // LIBAARUFORMAT_DUMP_H

View File

@@ -19,73 +19,111 @@
#ifndef LIBAARUFORMAT_HEADER_H
#define LIBAARUFORMAT_HEADER_H
#define AARU_HEADER_APP_NAME_LEN 64
#define GUID_SIZE 16
/** \file aaruformat/structs/header.h
* \brief On-disk container header structures (v1 and v2) for Aaru images.
*
* These packed headers appear at the very beginning (offset 0) of every Aaru image file and
* advertise container format version, creator application, indexing offset and optional extended
* feature capability bitfields (v2+). All multi-byte integers are little-endian. Strings stored
* in the fixed-size application field are UTF-16LE and zero padded (not necessarily NUL-terminated
* if fully filled). The GUID field (v2) allows derivative / child images to reference an origin.
*
* Version progression:
* - v1: \ref AaruHeader (no GUID, no alignment or shift metadata, no feature bitfields).
* - v2: \ref AaruHeaderV2 introduces GUID, block/data/table shift hints (mirroring DDT metadata),
* and three 64-bit feature bitmaps to negotiate reader/writer compatibility.
*
* Compatibility handling (recommended logic for consumers):
* 1. If any bit set in featureIncompatible is not implemented by the reader: abort (cannot safely read/write).
* 2. Else if any bit set in featureCompatibleRo is not implemented: allow read-only operations.
* 3. Bits only present in featureCompatible but not implemented MAY be ignored for both read/write while
* still preserving round-trip capability (writer should not clear unknown bits when re-saving).
*
* Alignment & shift semantics (duplicated here for quick reference, see DdtHeader2 for full details):
* - blockAlignmentShift: underlying blocks are aligned to 2^blockAlignmentShift bytes.
* - dataShift: data pointer / DDT entry low bits encode offsets modulo 2^dataShift sectors/items.
* - tableShift: primary DDT entries span 2^tableShift logical sectors (0 implies single-level tables).
*
* Invariants:
* - identifier == AARU_MAGIC (external constant; not defined here).
* - For v1: sizeof(AaruHeader) exact and indexOffset > 0 (indexOffset == 0 => corrupt/unreadable image).
* - For v2: sizeof(AaruHeaderV2) exact; indexOffset > 0; blockAlignmentShift, dataShift, tableShift within
* sane bounds (e.g. < 63). Zero is permissible only for the shift fields (not for indexOffset).
*
* Security / robustness considerations:
* - Always bounds-check indexOffset against file size before seeking.
* - Treat application field as untrusted UTF-16LE; validate surrogate pairs if necessary.
* - Unknown feature bits MUST be preserved if a file is rewritten to avoid capability loss.
*/
#define AARU_HEADER_APP_NAME_LEN 64 /**< Size in bytes (UTF-16LE) of application name field (32 UTF-16 code units). */
#define GUID_SIZE 16 /**< Size in bytes of GUID / UUID-like binary identifier. */
#pragma pack(push, 1)
/**Header, at start of file */
typedef struct AaruHeader {
/**Header identifier, <see cref="AARU_MAGIC" /> */
uint64_t identifier;
/**UTF-16LE name of the application that created the image */
uint8_t application[AARU_HEADER_APP_NAME_LEN];
/**Image format major version. A new major version means a possibly incompatible change of format */
uint8_t imageMajorVersion;
/**Image format minor version. A new minor version indicates a compatible change of format */
uint8_t imageMinorVersion;
/**Major version of the application that created the image */
uint8_t applicationMajorVersion;
/**Minor version of the application that created the image */
uint8_t applicationMinorVersion;
/**Type of media contained on image */
uint32_t mediaType;
/**Offset to index */
uint64_t indexOffset;
/**Windows filetime (100 nanoseconds since 1601/01/01 00:00:00 UTC) of image creation time */
int64_t creationTime;
/**Windows filetime (100 nanoseconds since 1601/01/01 00:00:00 UTC) of image last written time */
int64_t lastWrittenTime;
/** \struct AaruHeader
* \brief Version 1 container header placed at offset 0 for legacy / initial format.
*
* Field summary:
* - identifier: magic signature (AARU_MAGIC) identifying the container.
* - application: UTF-16LE creator application name (fixed 64 bytes, zero padded).
* - imageMajorVersion / imageMinorVersion: container format version of the file itself (not the app).
* - applicationMajorVersion / applicationMinorVersion: version of the creating application.
* - mediaType: media type enumeration (\ref MediaType).
* - indexOffset: byte offset to the first index block (must be > 0).
* - creationTime / lastWrittenTime: 64-bit Windows FILETIME timestamps (100 ns intervals since 1601-01-01 UTC).
*/
typedef struct AaruHeader
{
uint64_t identifier; ///< File magic (AARU_MAGIC).
uint8_t application[AARU_HEADER_APP_NAME_LEN]; ///< UTF-16LE creator application name (fixed-size buffer).
uint8_t imageMajorVersion; ///< Container format major version (incompatible changes when incremented).
uint8_t imageMinorVersion; ///< Container format minor version (backward compatible evolutions).
uint8_t applicationMajorVersion; ///< Creator application major version.
uint8_t applicationMinorVersion; ///< Creator application minor / patch version.
uint32_t mediaType; ///< Media type enumeration (value from \ref MediaType).
uint64_t indexOffset; ///< Absolute byte offset to primary index block (MUST be > 0; 0 => corrupt/unreadable).
int64_t creationTime; ///< Creation FILETIME (100 ns since 1601-01-01 UTC).
int64_t lastWrittenTime; ///< Last modification FILETIME (100 ns since 1601-01-01 UTC).
} AaruHeader;
/**Header, at start of file */
typedef struct AaruHeaderV2 {
/**Header identifier, see AARU_MAGIC */
uint64_t identifier;
/**UTF-16LE name of the application that created the image */
uint8_t application[AARU_HEADER_APP_NAME_LEN];
/**Image format major version. A new major version means a possibly incompatible change of format */
uint8_t imageMajorVersion;
/**Image format minor version. A new minor version indicates a compatible change of format */
uint8_t imageMinorVersion;
/**Major version of the application that created the image */
uint8_t applicationMajorVersion;
/**Minor version of the application that created the image */
uint8_t applicationMinorVersion;
/**Type of media contained on image */
uint32_t mediaType;
/**Offset to index */
uint64_t indexOffset;
/**Windows filetime (100 nanoseconds since 1601/01/01 00:00:00 UTC) of image creation time */
int64_t creationTime;
/**Windows filetime (100 nanoseconds since 1601/01/01 00:00:00 UTC) of image last written time */
int64_t lastWrittenTime;
/**Unique identifier that allows children images to recognize and find this image.*/
uint8_t guid[GUID_SIZE];
/**Block alignment shift. All blocks in the image are aligned at 2 << blockAlignmentShift bytes */
uint8_t blockAlignmentShift;
/**Data shift. All data blocks in the image contain 2 << dataShift items at most */
uint8_t dataShift;
/**Table shift. All deduplication tables in the image use this shift to calculate the position of an item */
uint8_t tableShift;
/**Features used in this image that if unsupported are still compatible for reading and writing implementations */
uint64_t featureCompatible;
/**Features used in this image that if unsupported are still compatible for reading implementations but not for writing */
uint64_t featureCompatibleRo;
/**Featured used in this image that if unsupported prevent reading or writing the image*/
uint64_t featureIncompatible;
/** \struct AaruHeaderV2
* \brief Version 2 container header with GUID, alignment shifts, and feature negotiation bitmaps.
*
* Additions over v1:
* - guid: stable 128-bit identifier enabling linkage by derivative images.
* - blockAlignmentShift / dataShift / tableShift: global structural hints copied into data & DDT blocks.
* - featureCompatible / featureCompatibleRo / featureIncompatible: capability bitmasks.
*
* Feature bitmask semantics:
* - featureCompatible: Optional features; absence of implementation should not impact R/W correctness.
* - featureCompatibleRo: If unimplemented, image MAY be opened read-only.
* - featureIncompatible: If any bit unimplemented, image MUST NOT be opened (prevent misinterpretation).
*
* Readers should AND their supported bit set with the header masks to decide access level (see file
* documentation). Writers must preserve unknown bits when saving an existing image.
*/
typedef struct AaruHeaderV2
{
uint64_t identifier; ///< File magic (AARU_MAGIC).
uint8_t application[AARU_HEADER_APP_NAME_LEN]; ///< UTF-16LE creator application name (fixed 64 bytes).
uint8_t imageMajorVersion; ///< Container format major version.
uint8_t imageMinorVersion; ///< Container format minor version.
uint8_t applicationMajorVersion; ///< Creator application major version.
uint8_t applicationMinorVersion; ///< Creator application minor / patch version.
uint32_t mediaType; ///< Media type enumeration (value from \ref MediaType).
uint64_t indexOffset; ///< Absolute byte offset to primary index block (MUST be > 0; 0 => corrupt/unreadable).
int64_t creationTime; ///< Creation FILETIME (100 ns since 1601-01-01 UTC).
int64_t lastWrittenTime; ///< Last modification FILETIME (100 ns since 1601-01-01 UTC).
uint8_t guid[GUID_SIZE]; ///< 128-bit image GUID (binary, not text); stable across children.
uint8_t blockAlignmentShift; ///< log2 block alignment (block size alignment = 2^blockAlignmentShift bytes).
uint8_t dataShift; ///< log2 sectors/items per block-index increment in DDT entries (2^dataShift).
uint8_t tableShift; ///< log2 sectors spanned by each primary DDT entry (0 = single-level).
uint64_t featureCompatible; ///< Feature bits: unimplemented bits are ignorable (still R/W safe).
uint64_t featureCompatibleRo; ///< Feature bits: unimplemented -> degrade to read-only access.
uint64_t featureIncompatible; ///< Feature bits: any unimplemented -> abort (cannot open safely).
} AaruHeaderV2;
#pragma pack(pop)
#endif //LIBAARUFORMAT_HEADER_H
#endif // LIBAARUFORMAT_HEADER_H

View File

@@ -21,50 +21,95 @@
#pragma pack(push, 1)
/**Header for the index, followed by entries */
/** \file aaruformat/structs/index.h
* \brief On-disk index block header and entry structures (versions 1, 2 and 3).
*
* The index provides a directory of all blocks contained in an Aaru image. Each index block starts with
* a versioned header (IndexHeader / IndexHeader2 / IndexHeader3) followed by a contiguous array of
* fixed-size \ref IndexEntry records. Version 3 adds support for hierarchical (chained / nested) subindexes.
*
* Version mapping by block identifier (see \ref BlockType):
* - IndexBlock (v1) -> \ref IndexHeader (16-bit entry count) followed by its entries.
* - IndexBlock2 (v2) -> \ref IndexHeader2 (64-bit entry count) followed by its entries.
* - IndexBlock3 (v3) -> \ref IndexHeader3 with optional hierarchical subindex references.
*
* CRC coverage & endianness:
* - The crc64 field stores a CRC64-ECMA over the entries array ONLY (header bytes are excluded).
* - For images with imageMajorVersion <= AARUF_VERSION_V1 a legacy writer byte-swapped the CRC; readers
* compensate (see verify_index_v1/v2/v3). The value in the header remains whatever was originally written.
*
* Hierarchical (v3) behavior:
* - Entries whose blockType == IndexBlock3 refer to subindex blocks; readers recursively load and flatten.
* - IndexHeader3::previous can point to a preceding index segment (for append / incremental scenarios) or 0.
* - CRC of the main index does NOT cover subindex contents; each subindex has its own header + CRC.
*
* Invariants / validation recommendations:
* - identifier must equal the expected BlockType variant for that version.
* - entries > 0 implies the entries array byte size == entries * sizeof(IndexEntry).
* - crc64 must match recomputed CRC64( entries array ) (after legacy byte swap handling if required).
* - For v3, if previous != 0 it should point to another IndexBlock3 header (optional best-effort check).
*
* Notes:
* - Structures are packed (1-byte alignment). All multi-byte integers are little-endian on disk.
* - The index does not store per-entry CRC; integrity relies on each individual block's own CRC plus the index CRC.
* - dataType in \ref IndexEntry is meaningful only for block types that carry typed data (e.g. DataBlock,
* DumpHardwareBlock, etc.).
*
* See also: verify_index_v1(), verify_index_v2(), verify_index_v3() for integrity procedures.
*/
/** \struct IndexHeader
* \brief Index header (version 1) for legacy images (identifier == IndexBlock).
*
* Uses a 16-bit entry counter, limiting the number of indexable blocks in v1.
*/
typedef struct IndexHeader
{
/**Identifier, <see cref="BlockType.Index" /> */
uint32_t identifier;
/**How many entries follow this header */
uint16_t entries;
/**CRC64-ECMA of the index */
uint64_t crc64;
uint32_t identifier; ///< Block identifier (must be BlockType::IndexBlock).
uint16_t entries; ///< Number of \ref IndexEntry records that follow immediately.
uint64_t crc64; ///< CRC64-ECMA of the entries array (legacy byte-swapped for early images).
} IndexHeader;
/**Header for the index, followed by entries */
/** \struct IndexHeader2
* \brief Index header (version 2) with 64-bit entry counter (identifier == IndexBlock2).
*
* Enlarges the entry count field to 64 bits for large images; otherwise structurally identical to v1.
*/
typedef struct IndexHeader2
{
/**Identifier, <see cref="BlockType.Index" /> */
uint32_t identifier;
/**How many entries follow this header */
uint64_t entries;
/**CRC64-ECMA of the index */
uint64_t crc64;
uint32_t identifier; ///< Block identifier (must be BlockType::IndexBlock2).
uint64_t entries; ///< Number of \ref IndexEntry records that follow immediately.
uint64_t crc64; ///< CRC64-ECMA of the entries array (legacy byte-swapped rule still applies for old versions).
} IndexHeader2;
/**Header for the index, followed by entries */
/** \struct IndexHeader3
* \brief Index header (version 3) adding hierarchical chaining (identifier == IndexBlock3).
*
* Supports flattened hierarchical indexes: entries referencing additional IndexBlock3 subindexes.
* The 'previous' pointer allows chaining earlier index segments (e.g., incremental append) enabling
* cumulative discovery without rewriting earlier headers.
*/
typedef struct IndexHeader3
{
/**Identifier, <see cref="BlockType.Index" /> */
uint32_t identifier;
/**How many entries follow this header */
uint64_t entries;
/**CRC64-ECMA of the index */
uint64_t crc64;
/**Pointer to the previous index header */
uint64_t previous;
uint32_t identifier; ///< Block identifier (must be BlockType::IndexBlock3).
uint64_t entries; ///< Number of \ref IndexEntry records that follow in this (sub)index block.
uint64_t crc64; ///< CRC64-ECMA of the local entries array (does NOT cover subindexes or previous chains).
uint64_t previous; ///< File offset of a previous IndexBlock3 header (0 if none / root segment).
} IndexHeader3;
/**Index entry */
/** \struct IndexEntry
* \brief Single index entry describing a block's type, (optional) data classification, and file offset.
*
* Semantics by blockType (see \ref BlockType):
* - DataBlock / GeometryBlock / ChecksumBlock / etc.: dataType conveys specific stored data category (\ref DataType).
* - Deduplication (DDT) or Index blocks: dataType may be ignored or set to a sentinel.
* - IndexBlock3: this entry refers to a subindex; offset points to another IndexHeader3.
*/
typedef struct IndexEntry
{
/**Type of item pointed by this entry */
uint32_t blockType;
/**Type of data contained by the block pointed by this entry */
uint16_t dataType;
/**Offset in file where item is stored */
uint64_t offset;
uint32_t blockType; ///< Block identifier of the referenced block (value from \ref BlockType).
uint16_t dataType; ///< Data classification (value from \ref DataType) or unused for untyped blocks.
uint64_t offset; ///< Absolute byte offset in the image where the referenced block header begins.
} IndexEntry;
#pragma pack(pop)

View File

@@ -21,73 +21,95 @@
#pragma pack(push, 1)
/**Metadata block, contains metadata */
typedef struct MetadataBlockHeader {
/**Identifier, <see cref="BlockType.MetadataBlock" /> */
uint32_t identifier;
/**Size in uint8_ts of this whole metadata block */
uint32_t blockSize;
/**Sequence of media set this media belongs to */
int32_t mediaSequence;
/**Total number of media on the media set this media belongs to */
int32_t lastMediaSequence;
/**Offset to start of creator string from start of this block */
uint32_t creatorOffset;
/**Length in uint8_ts of the null-terminated UTF-16LE creator string */
uint32_t creatorLength;
/**Offset to start of creator string from start of this block */
uint32_t commentsOffset;
/**Length in uint8_ts of the null-terminated UTF-16LE creator string */
uint32_t commentsLength;
/**Offset to start of creator string from start of this block */
uint32_t mediaTitleOffset;
/**Length in uint8_ts of the null-terminated UTF-16LE creator string */
uint32_t mediaTitleLength;
/**Offset to start of creator string from start of this block */
uint32_t mediaManufacturerOffset;
/**Length in uint8_ts of the null-terminated UTF-16LE creator string */
uint32_t mediaManufacturerLength;
/**Offset to start of creator string from start of this block */
uint32_t mediaModelOffset;
/**Length in uint8_ts of the null-terminated UTF-16LE creator string */
uint32_t mediaModelLength;
/**Offset to start of creator string from start of this block */
uint32_t mediaSerialNumberOffset;
/**Length in uint8_ts of the null-terminated UTF-16LE creator string */
uint32_t mediaSerialNumberLength;
/**Offset to start of creator string from start of this block */
uint32_t mediaBarcodeOffset;
/**Length in uint8_ts of the null-terminated UTF-16LE creator string */
uint32_t mediaBarcodeLength;
/**Offset to start of creator string from start of this block */
uint32_t mediaPartNumberOffset;
/**Length in uint8_ts of the null-terminated UTF-16LE creator string */
uint32_t mediaPartNumberLength;
/**Offset to start of creator string from start of this block */
uint32_t driveManufacturerOffset;
/**Length in uint8_ts of the null-terminated UTF-16LE creator string */
uint32_t driveManufacturerLength;
/**Offset to start of creator string from start of this block */
uint32_t driveModelOffset;
/**Length in uint8_ts of the null-terminated UTF-16LE creator string */
uint32_t driveModelLength;
/**Offset to start of creator string from start of this block */
uint32_t driveSerialNumberOffset;
/**Length in uint8_ts of the null-terminated UTF-16LE creator string */
uint32_t driveSerialNumberLength;
/**Offset to start of creator string from start of this block */
uint32_t driveFirmwareRevisionOffset;
/**Length in uint8_ts of the null-terminated UTF-16LE creator string */
uint32_t driveFirmwareRevisionLength;
/** \file aaruformat/structs/metadata.h
* \brief Packed on-disk metadata block headers for descriptive strings and CICM XML (if present).
*
* Two metadata-related block header layouts are defined:
* - \ref MetadataBlockHeader (BlockType::MetadataBlock): offsets + lengths for several UTF-16LE strings.
* - \ref CicmMetadataBlock (BlockType::CicmBlock): length of embedded CICM XML metadata payload.
*
* All multi-byte integers are little-endian. Structures are packed (1-byte alignment). All textual fields
* referenced by offsets are UTF-16LE, null-terminated (0x0000). Length fields include the terminating
* null (i.e. length >= 2 and an even number). Offsets are relative to the start of the corresponding block
* header (byte 0 = first byte of the header). No padding is implicitly added between strings; producers
* may pack them tightly or align them manually (alignment not required by the specification).
*
* Metadata block layout (conceptual):
* MetadataBlockHeader (fixed size)
* <variable region holding each present UTF-16LE string in any order chosen by the writer>
*
* Invariants / validation recommendations for MetadataBlockHeader:
* - identifier == BlockType::MetadataBlock
* - blockSize >= sizeof(MetadataBlockHeader)
* - For every (offset,length) pair where length > 0:
* * offset >= sizeof(MetadataBlockHeader)
* * offset + length <= blockSize
* * length % 2 == 0
* * The 16-bit code unit at (offset + length - 2) == 0x0000 (null terminator)
* - mediaSequence >= 0 and lastMediaSequence >= 0; if lastMediaSequence > 0 then mediaSequence <=
* lastMediaSequence (strictly less when the producer numbers media from 0 rather than 1)
*
* CICM metadata block layout:
* CicmMetadataBlock (header)
* <length bytes of UTF-8 or XML text payload (implementation-defined, not null-terminated)>
*
* NOTE: The library code reading these blocks must not assume strings are present; a zero length means the
* corresponding field is omitted. Offsets for omitted fields MAY be zero or arbitrary; readers should skip them
* whenever length == 0.
*/
/** \struct MetadataBlockHeader
* \brief Header for a metadata block containing offsets and lengths to UTF-16LE descriptive strings.
*
* Descriptive fields (all optional): creator, comments, media title/manufacturer/model/serial/barcode/part number,
* drive manufacturer/model/serial/firmware revision. Strings can be used to describe both physical medium and
* acquisition hardware. Length values include the UTF-16LE null terminator (two zero bytes).
*/
typedef struct MetadataBlockHeader
{
uint32_t identifier; ///< Block identifier, must be BlockType::MetadataBlock.
uint32_t blockSize; ///< Total size in bytes of the entire metadata block (header + strings).
int32_t mediaSequence; ///< Sequence number within a multi-disc / multi-volume set (0-based or 1-based as
///< producer defines).
int32_t lastMediaSequence; ///< Total number of media in the set; 0 or 1 if single item.
uint32_t creatorOffset; ///< Offset to UTF-16LE creator string (or undefined if creatorLength==0).
uint32_t creatorLength; ///< Length in bytes (including null) of creator string (0 if absent).
uint32_t commentsOffset; ///< Offset to UTF-16LE comments string.
uint32_t commentsLength; ///< Length in bytes (including null) of comments string.
uint32_t mediaTitleOffset; ///< Offset to UTF-16LE media title string.
uint32_t mediaTitleLength; ///< Length in bytes (including null) of media title string.
uint32_t mediaManufacturerOffset; ///< Offset to UTF-16LE media manufacturer string.
uint32_t mediaManufacturerLength; ///< Length in bytes (including null) of media manufacturer string.
uint32_t mediaModelOffset; ///< Offset to UTF-16LE media model string.
uint32_t mediaModelLength; ///< Length in bytes (including null) of media model string.
uint32_t mediaSerialNumberOffset; ///< Offset to UTF-16LE media serial number string.
uint32_t mediaSerialNumberLength; ///< Length in bytes (including null) of media serial number string.
uint32_t mediaBarcodeOffset; ///< Offset to UTF-16LE media barcode string.
uint32_t mediaBarcodeLength; ///< Length in bytes (including null) of media barcode string.
uint32_t mediaPartNumberOffset; ///< Offset to UTF-16LE media part number string.
uint32_t mediaPartNumberLength; ///< Length in bytes (including null) of media part number string.
uint32_t driveManufacturerOffset; ///< Offset to UTF-16LE drive manufacturer string.
uint32_t driveManufacturerLength; ///< Length in bytes (including null) of drive manufacturer string.
uint32_t driveModelOffset; ///< Offset to UTF-16LE drive model string.
uint32_t driveModelLength; ///< Length in bytes (including null) of drive model string.
uint32_t driveSerialNumberOffset; ///< Offset to UTF-16LE drive serial number string.
uint32_t driveSerialNumberLength; ///< Length in bytes (including null) of drive serial number string.
uint32_t driveFirmwareRevisionOffset; ///< Offset to UTF-16LE drive firmware revision string.
uint32_t driveFirmwareRevisionLength; ///< Length in bytes (including null) of drive firmware revision string.
} MetadataBlockHeader;
/**Geometry block, contains physical geometry information */
typedef struct CicmMetadataBlock {
/**Identifier, <see cref="BlockType.CicmBlock" /> */
uint32_t identifier;
uint32_t length;
/** \struct CicmMetadataBlock
* \brief Header for a CICM XML metadata block (identifier == BlockType::CicmBlock).
*
* The following 'length' bytes immediately after the header contain the CICM XML payload. Encoding is typically
* UTF-8; the payload is not required to be null-terminated.
*/
typedef struct CicmMetadataBlock
{
uint32_t identifier; ///< Block identifier, must be BlockType::CicmBlock.
uint32_t length; ///< Length in bytes of the CICM metadata payload that follows.
} CicmMetadataBlock;
#pragma pack(pop)
#endif //LIBAARUFORMAT_METADATA_H
#endif // LIBAARUFORMAT_METADATA_H

View File

@@ -21,36 +21,65 @@
#pragma pack(push, 1)
/**Contains list of optical disc tracks */
typedef struct TracksHeader {
/**Identifier, <see cref="BlockType.TracksBlock" /> */
uint32_t identifier;
/**How many entries follow this header */
uint16_t entries;
/**CRC64-ECMA of the block */
uint64_t crc64;
/** \file aaruformat/structs/optical.h
* \brief On-disk structures describing optical disc tracks (Track list block).
*
* An optical tracks block (identifier == BlockType::TracksBlock) stores a list of \ref TrackEntry
* records describing the logical layout of tracks and sessions for CD/DVD/BD and similar media.
*
* Layout:
* TracksHeader (fixed)
* TrackEntry[ entries ] (array, packed)
*
* CRC semantics:
* - TracksHeader::crc64 is a CRC64-ECMA over the contiguous TrackEntry array ONLY (header excluded).
* - For legacy images (imageMajorVersion <= AARUF_VERSION_V1) a byte swap is applied when verifying.
*
* Field semantics (TrackEntry):
* - sequence: Logical track number (1..99 typical for CD). Values outside that range may encode extras.
* - type: Value from \ref TrackType (Audio, Data, Mode variants, etc.).
* - start / end: Inclusive Logical Block Address (LBA) bounds for the track. end >= start.
* - pregap: Number of sectors of pre-gap *preceding* the track's first user-accessible sector (can be 0 or negative
* if representing lead-in semantics; negative interpretation is implementation-defined).
* - session: Session number starting at 1 for multi-session discs (1 for single session).
* - isrc: 13-byte ISRC (raw code, no terminating null). If fewer significant characters, remaining bytes are 0.
* - flags: Bitmask of track/control flags. Unless otherwise specified, recommended mapping (mirrors CD subchannel Q
* control bits) is: bit0 Pre-emphasis, bit1 Copy permitted, bit2 Data track, bit3 Four-channel audio,
* bits4-7 reserved. Actual semantics may be extended by the format specification.
*
* Invariants / validation recommendations:
* - identifier == BlockType::TracksBlock
* - entries * sizeof(TrackEntry) bytes are present after the header in the block image.
* - 1 <= sequence <= 99 for standard CD tracks (non-conforming values allowed but should be documented).
* - start <= end; pregap >= 0 (if negative pregaps unsupported in implementation).
* - ISRC bytes either all zero (no ISRC) or printable ASCII (A-Z 0-9 -) per ISO 3901 (without hyphen formatting).
*/
/** \struct TracksHeader
* \brief Header for an optical tracks block listing track entries.
*/
typedef struct TracksHeader
{
uint32_t identifier; ///< Block identifier (must be BlockType::TracksBlock).
uint16_t entries; ///< Number of TrackEntry records following this header.
uint64_t crc64; ///< CRC64-ECMA of the TrackEntry array (header excluded, legacy byte-swap for early versions).
} TracksHeader;
/**Optical disc track */
typedef struct TrackEntry {
/**Track sequence */
uint8_t sequence;
/**Track type */
uint8_t type;
/**Track starting LBA */
int64_t start;
/**Track last LBA */
int64_t end;
/**Track pregap in sectors */
int64_t pregap;
/**Track session */
uint8_t session;
/**Track's ISRC in ASCII */
uint8_t isrc[13];
/**Track flags */
uint8_t flags;
/** \struct TrackEntry
* \brief Single optical disc track descriptor (sequence, type, LBAs, session, ISRC, flags).
*/
typedef struct TrackEntry
{
uint8_t sequence; ///< Track number (1..99 typical for CD audio/data). 0 may indicate placeholder/non-standard.
uint8_t type; ///< Track type (value from \ref TrackType).
int64_t start; ///< Inclusive starting LBA of the track.
int64_t end; ///< Inclusive ending LBA of the track.
int64_t pregap; ///< Pre-gap length in sectors preceding track start (0 if none).
uint8_t session; ///< Session number (1-based). 1 for single-session discs.
uint8_t isrc[13]; ///< ISRC raw 13-byte code (no null terminator). All zeros if not present.
uint8_t flags; ///< Control / attribute bitfield (see file documentation for suggested bit mapping).
} TrackEntry;
#pragma pack(pop)
#endif //LIBAARUFORMAT_OPTICAL_H
#endif // LIBAARUFORMAT_OPTICAL_H

View File

@@ -19,19 +19,214 @@
#ifndef LIBAARUFORMAT_OPTIONS_H
#define LIBAARUFORMAT_OPTIONS_H
#include <stdbool.h> ///< For bool type used in aaru_options.
#include <stdint.h> ///< For fixed-width integer types.
/** \file aaruformat/structs/options.h
* \brief Image creation / open tuning options structure and related semantics.
*
* The library accepts a semicolon-delimited key=value options string (see parse_options()). Recognized keys:
* compress=true|false Enable/disable block compression (LZMA for data blocks, FLAC for audio tracks).
* deduplicate=true|false If true, identical (duplicate) sectors are stored once (DDT entries point to same
* physical block). If false, duplicates are still tracked in DDT but each occurrence
* is stored independently (no storage savings). DDT itself is always present.
* dictionary=<bytes> LZMA dictionary size in bytes (fallback default 33554432 if 0 or invalid).
* table_shift=<n> DDT v2 table shift (default 9) (items per primary entry = 2^n when multi-level).
* data_shift=<n> Global data shift (default 12). Defines per-block address granularity: the low
* 2^n range encodes the sector (or unit) offset within a block; higher bits combine
* with block_alignment to derive block file offsets. Used by DDT but not limited to it.
* block_alignment=<n> log2 alignment of underlying data blocks (default 9 => 512 bytes) (block size = 2^n).
* md5=true|false Generate MD5 checksum (stored in checksum block if true).
* sha1=true|false Generate SHA-1 checksum.
* sha256=true|false Generate SHA-256 checksum.
* blake3=true|false Generate BLAKE3 checksum (may require build-time support; ignored if unsupported).
* spamsum=true|false Generate SpamSum fuzzy hash.
*
* Defaults (when option string NULL or key omitted):
* compress=true, deduplicate=true, dictionary=33554432, table_shift=9, data_shift=12,
* block_alignment=9, md5=false, sha1=false, sha256=false, blake3=false, spamsum=false.
*
* Validation / normalization done in parse_options():
* - Zero / missing dictionary resets to default 33554432.
* - Zero table_shift resets to 9.
* - Zero data_shift resets to 12.
* - Zero block_alignment resets to 9.
*
* Rationale:
* - table_shift, data_shift and block_alignment mirror fields stored in on-disk headers (see AaruHeaderV2 &
* DdtHeader2); data_shift is a global per-block granularity exponent (not DDT-specific) governing how in-block offsets
* are encoded.
* - compress selects adaptive codec usage: LZMA applied to generic/data blocks, FLAC applied to audio track payloads.
* - deduplicate toggles storage optimization only: the DDT directory is always built for addressing; disabling simply
* forces each sector's content to be written even if already present (useful for forensic byte-for-byte
* duplication).
* - dictionary tunes compression ratio/memory use; large values increase memory footprint.
* - Checksums are optional; enabling multiple increases CPU time at write finalization.
*
* Performance / space trade-offs (deduplicate=false):
* - Significantly larger image size: every repeated sector payload is written again.
* - Higher write I/O and longer creation time for highly redundant sources (e.g., zero-filled regions) compared to
* deduplicate=true, although CPU time spent on duplicate detection/hash lookups is reduced.
* - Potentially simpler post-process forensic validation (physical ordering preserved without logical coalescing).
* - Use when exact physical repetition is more critical than storage efficiency, or to benchmark raw device
* throughput.
* - For typical archival use-cases with large zero / repeated patterns, deduplicate=true markedly reduces footprint.
*
* Approximate in-RAM hash map usage for deduplication (deduplicate=true):
* The on-disk DDT can span many secondary tables, but only the primary table plus a currently loaded secondary (and
* possibly a small cache) reside in memory; their footprint is typically <<5% of total indexed media space and is
* often negligible compared to the hash map used to detect duplicate sectors. Therefore we focus here on the hash /
* lookup structure ("hash_map") memory, not the entire DDT on-disk size.
*
* Worst-case (all sectors unique) per 1 GiB of user data:
* sectors_per_GiB = 2^30 / sector_size
* hash_bytes ≈ sectors_per_GiB * H (H ≈ 16 bytes: 8-byte fingerprint + ~8 bytes map overhead)
*
* Resulting hash_map RAM per GiB (unique sectors):
* +--------------+------------------+------------------------------+
* | Sector size | Sectors / GiB | Hash map (~16 B / sector) |
* +--------------+------------------+------------------------------+
* | 512 bytes | 2,097,152 | ~33.5 MiB (≈32.0–36.0 MiB) |
* | 2048 bytes | 524,288 | ~ 8.0 MiB (≈7.5–8.5 MiB) |
* | 4096 bytes | 262,144 | ~ 4.0 MiB (≈3.8–4.3 MiB) |
* +--------------+------------------+------------------------------+
*
* (Range reflects allocator + load factor variation.)
*
* Targeted projections (hash map only, R=1):
* 2048-byte sectors (~8 MiB per GiB unique)
* Capacity | Hash map (MiB) | Hash map (GiB)
* ---------+---------------+----------------
* 25 GiB | ~200 | 0.20
* 50 GiB | ~400 | 0.39
*
* 512-byte sectors (~34 MiB per GiB unique; using 33.5 MiB for calc)
* Capacity | Hash map (MiB) | Hash map (GiB)
* ---------+---------------+----------------
* 128 GiB | ~4288 | 4.19
* 500 GiB | ~16750 | 16.36
* 1 TiB* | ~34304 | 33.50
* 2 TiB* | ~68608 | 67.00
*
* *TiB = 1024 GiB binary. For decimal TB reduce by ~9% (×0.91).
*
* Duplicate ratio scaling:
* Effective hash RAM ≈ table_value * R, where R = unique_sectors / total_sectors.
* Example: 500 GiB @512 B, R=0.4 ⇒ ~16750 MiB * 0.4 ≈ 6700 MiB (~6.54 GiB).
*
* Quick rule of thumb (hash only):
* hash_bytes_per_GiB ≈ 16 * (2^30 / sector_size) ≈ (17.1799e9 / sector_size) bytes
* → ≈ 32 MiB (512 B), 8 MiB (2048 B), 4 MiB (4096 B) per GiB unique (≈ 33.6 / 8.4 / 4.2 decimal MB).
*
* Memory planning tip:
* If projected hash_map usage risks exceeding available RAM, consider:
* - Increasing table_shift (reduces simultaneous secondary loads / contention)
* - Lowering data_shift (if practical) to encourage earlier big DDT adoption with fewer unique blocks
* - Segmenting the dump into phases (if workflow permits)
* - Accepting higher duplicate ratio by pre-zero detection or sparse treatment externally.
* - Resuming the dump in multiple passes: each resume rebuilds the hash_map from scratch, so peak RAM still
* matches a single-pass estimate, but average RAM over total wall time can drop if you unload between passes.
*
* NOTE: DDT in-RAM portion (primary + one secondary) usually adds only a few additional MiB even for very large
* images, hence omitted from sizing tables. Include +5% safety margin if extremely tight on memory.
*
* Guidance for table_shift / data_shift selection:
* Let:
* S = total logical sectors expected in image (estimate if unknown).
* T = table_shift (items per primary DDT entry = 2^T when multi-level; 0 => single-level).
* D = data_shift (in-block sector offset span = 2^D).
* BA = block_alignment (bytes) = 2^block_alignment.
* SS = sector size (bytes).
*
* 1. data_shift constraints:
* - For SMALL DDT entries (12 payload bits after status): D must satisfy 0 < D < 12 and (12 - D) >= 1 so that at
* least one bit remains for block index. Practical range for small DDT: 6..10 (leaves 2+ bits for block index).
* - For BIG DDT entries (28 payload bits after status): D may be larger (up to 27) but values >16 rarely useful.
* - Effective address granularity inside a block = min(2^D * SS, physical block span implied by BA).
* - Choosing D too large wastes bits (larger offset range than block actually contains) and reduces the number of
* block index bits within a small entry, potentially forcing upgrade to big DDT earlier.
*
* Recommended starting points:
* * 512-byte sectors, 512-byte block alignment: D=9 (512 offsets) or D=8 (256 offsets) keeps small DDT viable.
* * 2048-byte optical sectors, 2048-byte alignment: D=8 (256 offsets) typically sufficient.
* * Mixed / large logical block sizes: keep D so that (2^D * SS) ≈ typical dedup block region you want
* addressable.
*
* 2. block capacity within an entry:
* - SMALL DDT: usable block index bits = 12 - D.
* Max representable block index (small) = 2^(12-D) - 1.
* - BIG DDT: usable block index bits = 28 - D.
* Max representable block index (big) = 2^(28-D) - 1.
* - If (requiredBlockIndex > max) you must either reduce D or rely on big DDT.
*
* Approximate requiredBlockIndex ≈ (TotalUniqueBlocks) where
* TotalUniqueBlocks ≈ (S * SS) / (BA * 2^D) = S / (2^D * (BA / SS))
* Simplified (assuming BA = SS): TotalUniqueBlocks ≈ S / 2^D.
*
* 3. table_shift considerations (multi-level DDT):
* - Primary entries count ≈ ceil(S / 2^T). Choose T so this count fits memory and keeps lookup fast.
* - Larger T reduces primary table size, increasing secondary table dereferences.
* - Typical balanced values: T in [8..12] (256..4096 sectors per primary entry).
* - Set T=0 for single-level when S is small enough that all entries fit comfortably in memory.
*
* Memory rough estimate for single-level SMALL DDT:
* bytes ≈ S * 2 (each small entry 2 bytes). For BIG DDT: bytes ≈ S * 4.
* Multi-level: primary table bytes ≈ (S / 2^T) * entrySize + sum(secondary tables).
*
* 4. Example scenarios:
* - 50M sectors (≈25 GiB @512B), want small DDT: pick D=8 (256); block index bits=4 (max 16 blocks) insufficient.
* Even D=6 only gives 6 block index bits (64 block indices), so accept BIG DDT (28-8=20 bits => million+ blocks).
* Prefer BIG DDT here.
* - 2M sectors, 2048B alignment, optical: D=8 gives S/2^D ≈ 7812 unique offsets; small DDT block index bits=4 (max
* 16) inadequate → choose D=6 (offset span 64 sectors) giving 6 block index bits (max 64) or just use big DDT.
*
* 5. Practical recommendations:
* - If unsure and image > ~1M sectors: keep defaults (data_shift=12, table_shift=9) and allow big DDT.
* - For small archival (<100k sectors): T=0 (single-level), D≈8..10 to keep small DDT feasible.
* - Benchmark before lowering D purely to stay in small DDT; increased secondary lookups or larger primary tables
* can offset saved space.
*
* Recommended presets (approximate bands):
* +-----------------------+-----------------+----------------+--------------------------------+
* | Total logical sectors | table_shift (T) | data_shift (D) | Notes                          |
* +-----------------------+-----------------+----------------+--------------------------------+
* | < 50,000              | 0               | 8-10           | Single-level small DDT likely  |
* | 50K-1,000,000         | 8-9             | 9-10           | Still feasible small DDT       |
* | 1M-10,000,000         | 9-10            | 10-12          | Borderline small -> big DDT    |
* | 10M-100,000,000       | 10-11           | 11-12          | Prefer big DDT; tune T for mem |
* | > 100,000,000         | 11-12           | 12             | Big DDT; higher T saves memory |
* +-----------------------+-----------------+----------------+--------------------------------+
* Ranges show typical stable regions; pick the lower end of table_shift if memory is ample, higher if minimizing
* primary table size. Always validate actual unique block count vs payload bits.
*
* NOTE: The library will automatically fall back to BIG DDT where needed; these settings bias structure, they do not
* guarantee small DDT retention.
*
* Thread-safety: aaru_options is a plain POD struct; caller may copy freely. parse_options() returns by value.
*
* Future compatibility: unknown keys are ignored by current parser; consumers should preserve original option
* strings if round-tripping is required.
*/
/** \struct aaru_options
 *  \brief Parsed user-specified tunables controlling compression, deduplication, hashing and DDT geometry.
 *
 *  All shifts are exponents of two. Each field is declared exactly once; the struct holds only scalar
 *  members, so it can be copied by value.
 */
typedef struct
{
    bool     compress;        ///< Enable adaptive compression (LZMA for data blocks, FLAC for audio). Default: true.
    bool     deduplicate;     ///< Storage dedup flag (DDT always exists). true=share identical sector content,
                              ///< false=store each instance.
    uint32_t dictionary;      ///< LZMA dictionary size in bytes (>= 4096 recommended). Default: 33554432 (32 MiB).
    uint8_t  table_shift;     ///< DDT table shift (multi-level fan-out exponent). Default: 9.
    uint8_t  data_shift;      ///< Global data shift: low bits encode sector offset inside a block
                              ///< (2^data_shift span). Default: 12.
    uint8_t  block_alignment; ///< log2 underlying block alignment (2^n bytes). Default: 9 (512 bytes).
    bool     md5;             ///< Generate MD5 checksum (ChecksumAlgorithm::Md5) when finalizing image.
    bool     sha1;            ///< Generate SHA-1 checksum (ChecksumAlgorithm::Sha1) when finalizing image.
    bool     sha256;          ///< Generate SHA-256 checksum (ChecksumAlgorithm::Sha256) when finalizing image.
    bool     blake3;          ///< Generate BLAKE3 checksum if supported (not stored if algorithm unavailable).
    bool     spamsum;         ///< Generate SpamSum fuzzy hash (ChecksumAlgorithm::SpamSum) if enabled.
} aaru_options;
#endif // LIBAARUFORMAT_OPTIONS_H