Add more IDDB variants; port GetDuplicates to IDDB

This commit is contained in:
Matt Nadareski
2024-03-19 23:15:58 -04:00
parent f8c5690ddf
commit efc184627a
4 changed files with 272 additions and 0 deletions

@@ -500,6 +500,76 @@ namespace SabreTools.DatFiles
}
}
/// <summary>
/// List all duplicates found in a DAT based on a DatItem
/// </summary>
/// <param name="datItem">Item to try to match</param>
/// <param name="sorted">True if the DAT is already sorted accordingly, false otherwise (default)</param>
/// <returns>List of matched DatItem objects</returns>
public ConcurrentList<(long, DatItem)> GetDuplicates(DatItem datItem, bool sorted = false)
{
ConcurrentList<(long, DatItem)> output = [];
// Check for an empty rom list first
if (DatStatistics.TotalCount == 0)
return output;
// We want to get the proper key for the DatItem
string key = SortAndGetKey(datItem, sorted);
// If the key doesn't exist, return the empty list
var roms = GetDatItemsForBucket(key);
if (roms == null || roms.Length == 0)
return output;
// Try to find duplicates
ConcurrentList<(long, DatItem)> left = [];
for (int i = 0; i < roms.Length; i++)
{
DatItem other = roms[i].Item2;
if (other.GetBoolFieldValue(DatItem.RemoveKey) == true)
continue;
if (datItem.Equals(other))
{
other.SetFieldValue<bool?>(DatItem.RemoveKey, true);
output.Add(roms[i]);
}
else
{
left.Add(roms[i]);
}
}
// Add back all roms with the proper flags
_buckets[key] = output.Concat(left).Select(i => i.Item1).ToConcurrentList();
return output;
}
/// <summary>
/// Check if a DAT contains the given DatItem
/// </summary>
/// <param name="datItem">Item to try to match</param>
/// <param name="sorted">True if the DAT is already sorted accordingly, false otherwise (default)</param>
/// <returns>True if it contains the rom, false otherwise</returns>
public bool HasDuplicates(DatItem datItem, bool sorted = false)
{
// Check for an empty rom list first
if (DatStatistics.TotalCount == 0)
return false;
// We want to get the proper key for the DatItem
string key = SortAndGetKey(datItem, sorted);
// If the key doesn't exist, return false
var roms = GetDatItemsForBucket(key);
if (roms == null || roms.Length == 0)
return false;
// Try to find duplicates
return roms.Any(r => datItem.Equals(r.Item2));
}
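For orientation, a minimal usage sketch of the two new lookups (the datFile, fileinfo, and logger objects are assumed to exist in the caller):

// Sketch only: look up a freshly scanned file in the index-backed store
DatItem needle = new Rom(fileinfo);

// Existence check; triggers a re-bucket if the DAT is not already sorted
if (datFile.ItemsDB.HasDuplicates(needle))
{
    // Marks each match with RemoveKey and rewrites the bucket's index list
    ConcurrentList<(long, DatItem)> matches = datFile.ItemsDB.GetDuplicates(needle, sorted: true);
    logger.Verbose($"Matched {matches.Count} item(s)");
}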
/// <summary>
/// Merge an arbitrary set of item pairs based on the supplied information
/// </summary>
@@ -620,6 +690,42 @@ namespace SabreTools.DatFiles
return output;
}
/// <summary>
/// Get the highest-order Field value that represents the statistics
/// </summary>
private ItemKey GetBestAvailable()
{
// Get the required counts
long diskCount = DatStatistics.GetItemCount(ItemType.Disk);
long mediaCount = DatStatistics.GetItemCount(ItemType.Media);
long romCount = DatStatistics.GetItemCount(ItemType.Rom);
long nodumpCount = DatStatistics.GetStatusCount(ItemStatus.Nodump);
// If all items are supposed to have a SHA-512, we bucket by that
if (diskCount + mediaCount + romCount - nodumpCount == DatStatistics.GetHashCount(HashType.SHA512))
return ItemKey.SHA512;
// If all items are supposed to have a SHA-384, we bucket by that
else if (diskCount + mediaCount + romCount - nodumpCount == DatStatistics.GetHashCount(HashType.SHA384))
return ItemKey.SHA384;
// If all items are supposed to have a SHA-256, we bucket by that
else if (diskCount + mediaCount + romCount - nodumpCount == DatStatistics.GetHashCount(HashType.SHA256))
return ItemKey.SHA256;
// If all items are supposed to have a SHA-1, we bucket by that
else if (diskCount + mediaCount + romCount - nodumpCount == DatStatistics.GetHashCount(HashType.SHA1))
return ItemKey.SHA1;
// If all items are supposed to have an MD5, we bucket by that
else if (diskCount + mediaCount + romCount - nodumpCount == DatStatistics.GetHashCount(HashType.MD5))
return ItemKey.MD5;
// Otherwise, we bucket by CRC
else
return ItemKey.CRC;
}
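To make the fallback concrete, a worked example with assumed counts (illustrative only):

// Assumed counts, purely for illustration:
//   romCount = 100, diskCount = 0, mediaCount = 0, nodumpCount = 2
//   GetHashCount(SHA512) = 10 -> 98 != 10, fall through
//   GetHashCount(SHA256) = 90 -> 98 != 90, fall through
//   GetHashCount(SHA1)   = 98 -> 98 == 98, bucket by SHA-1
ItemKey best = GetBestAvailable();   // returns ItemKey.SHA1 in this scenario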
/// <summary>
/// Get the bucketing key for a given item index
/// </summary>
/// <param name="itemIndex">Index of the current item</param>
@@ -916,6 +1022,22 @@ namespace SabreTools.DatFiles
return true;
}
/// <summary>
/// Sort the input DAT and get the key to be used by the item
/// </summary>
/// <param name="datItem">Item to try to match</param>
/// <param name="sorted">True if the DAT is already sorted accordingly, false otherwise (default)</param>
/// <returns>Key to try to use</returns>
private string SortAndGetKey(DatItem datItem, bool sorted = false)
{
// If we're not already sorted, take care of it
if (!sorted)
BucketBy(GetBestAvailable(), DedupeType.None);
// Now that we have the sorted type, we get the proper key
return datItem.GetKey(_bucketedBy);
}
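The sorted flag is what separates a cheap key derivation from a full re-bucket; a small sketch of that contract:

// sorted: false (the default) re-buckets by the best available hash first
string key1 = SortAndGetKey(datItem);
// sorted: true trusts the current _bucketedBy value and only derives the key
string key2 = SortAndGetKey(datItem, sorted: true);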
#endregion
#region Filtering

@@ -108,6 +108,90 @@ namespace SabreTools.DatTools
datFile.Header.SetFieldValue<string?>(Models.Metadata.Header.NameKey, $"fixDAT_{datFile.Header.GetStringFieldValue(Models.Metadata.Header.NameKey)}");
datFile.Header.SetFieldValue<string?>(Models.Metadata.Header.DescriptionKey, $"fixDAT_{datFile.Header.GetStringFieldValue(Models.Metadata.Header.DescriptionKey)}");
datFile.Items.ClearMarked();
return success;
}
/// <summary>
/// Verify a DatFile against a set of depots, leaving only missing files
/// </summary>
/// <param name="datFile">Current DatFile object to verify against</param>
/// <param name="inputs">List of input directories to compare against</param>
/// <returns>True if verification was a success, false otherwise</returns>
public static bool VerifyDepotDB(DatFile datFile, List<string> inputs)
{
bool success = true;
var watch = new InternalStopwatch("Verifying all from supplied depots");
// Now loop through and get only directories from the input paths
List<string> directories = [];
foreach (string input in inputs)
{
// Add to the list if the input is a directory
if (Directory.Exists(input))
{
logger.Verbose($"Adding depot: {input}");
directories.Add(input);
}
}
// If we don't have any directories, we want to exit
if (directories.Count == 0)
return success;
// Now that we have a list of depots, we want to bucket the input DAT by SHA-1
datFile.ItemsDB.BucketBy(ItemKey.SHA1, DedupeType.None);
// Then we want to loop through each of the hashes and see if we can rebuild
var keys = datFile.ItemsDB.SortedKeys.ToList();
foreach (string hash in keys)
{
// Pre-empt any issues that could arise from string length
if (hash.Length != Constants.SHA1Length)
continue;
logger.User($"Checking hash '{hash}'");
// Get the extension path for the hash
string? subpath = Utilities.GetDepotPath(hash, datFile.Header.GetFieldValue<DepotInformation?>(DatHeader.InputDepotKey)?.Depth ?? 0);
if (subpath == null)
continue;
// Find the first depot that includes the hash
string? foundpath = null;
foreach (string directory in directories)
{
if (System.IO.File.Exists(Path.Combine(directory, subpath)))
{
foundpath = Path.Combine(directory, subpath);
break;
}
}
// If we didn't find a path, then we continue
if (foundpath == null)
continue;
// If we have a path, we want to try to get the rom information
GZipArchive tgz = new(foundpath);
BaseFile? fileinfo = tgz.GetTorrentGZFileInfo();
// If the file information is null, then we continue
if (fileinfo == null)
continue;
// Now we want to remove all duplicates from the DAT
datFile.ItemsDB.GetDuplicates(new Rom(fileinfo));
datFile.ItemsDB.GetDuplicates(new Disk(fileinfo));
}
watch.Stop();
// Set fixdat headers in case of writing out
datFile.Header.SetFieldValue<string?>(DatHeader.FileNameKey, $"fixDAT_{datFile.Header.GetStringFieldValue(DatHeader.FileNameKey)}");
datFile.Header.SetFieldValue<string?>(Models.Metadata.Header.NameKey, $"fixDAT_{datFile.Header.GetStringFieldValue(Models.Metadata.Header.NameKey)}");
datFile.Header.SetFieldValue<string?>(Models.Metadata.Header.DescriptionKey, $"fixDAT_{datFile.Header.GetStringFieldValue(Models.Metadata.Header.DescriptionKey)}");
datFile.ItemsDB.ClearMarked();
return success;
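For context, a sketch of the per-hash depot probe the loop above performs; the directory layout is an assumption about GetDepotPath, not something this diff confirms:

// Assumption: a TorrentGZ depot nests files by leading hex pairs of the SHA-1,
// e.g. a depth of 4 might map "abcdef01..." to "ab/cd/ef/01/abcdef01....gz"
string hash = "abcdef0123456789abcdef0123456789abcdef01";
string? subpath = Utilities.GetDepotPath(hash, 4);
if (subpath != null)
{
    // Probe each configured depot root until the file is found
    string? foundpath = directories
        .Select(dir => Path.Combine(dir, subpath))
        .FirstOrDefault(System.IO.File.Exists);
}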
@@ -158,6 +242,50 @@ namespace SabreTools.DatTools
datFile.Header.SetFieldValue<string?>(Models.Metadata.Header.NameKey, $"fixDAT_{datFile.Header.GetStringFieldValue(Models.Metadata.Header.NameKey)}");
datFile.Header.SetFieldValue<string?>(Models.Metadata.Header.DescriptionKey, $"fixDAT_{datFile.Header.GetStringFieldValue(Models.Metadata.Header.DescriptionKey)}");
datFile.Items.ClearMarked();
return success;
}
/// <summary>
/// Verify a DatFile against a set of inputs, leaving only missing files
/// </summary>
/// <param name="datFile">Current DatFile object to verify against</param>
/// <param name="hashOnly">True if only hashes should be checked, false for full file information</param>
/// <returns>True if verification was a success, false otherwise</returns>
public static bool VerifyGenericDB(DatFile datFile, bool hashOnly)
{
bool success = true;
var watch = new InternalStopwatch("Verifying all from supplied paths");
// Force bucketing according to the flags
if (hashOnly)
datFile.ItemsDB.BucketBy(ItemKey.CRC, DedupeType.Full);
else
datFile.ItemsDB.BucketBy(ItemKey.Machine, DedupeType.Full);
// Then mark items for removal
var keys = datFile.ItemsDB.SortedKeys.ToList();
foreach (string key in keys)
{
var items = datFile.ItemsDB.GetDatItemsForBucket(key);
if (items == null)
continue;
for (int i = 0; i < items.Length; i++)
{
// Unmatched items will have a source ID of int.MaxValue, remove all others
if (items[i].Item2.GetFieldValue<Source?>(DatItem.SourceKey)?.Index != int.MaxValue)
items[i].Item2.SetFieldValue<bool?>(DatItem.RemoveKey, true);
}
}
watch.Stop();
// Set fixdat headers in case of writing out
datFile.Header.SetFieldValue<string?>(DatHeader.FileNameKey, $"fixDAT_{datFile.Header.GetStringFieldValue(DatHeader.FileNameKey)}");
datFile.Header.SetFieldValue<string?>(Models.Metadata.Header.NameKey, $"fixDAT_{datFile.Header.GetStringFieldValue(Models.Metadata.Header.NameKey)}");
datFile.Header.SetFieldValue<string?>(Models.Metadata.Header.DescriptionKey, $"fixDAT_{datFile.Header.GetStringFieldValue(Models.Metadata.Header.DescriptionKey)}");
datFile.ItemsDB.ClearMarked();
return success;
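A hedged sketch of how this DB variant slots into the verify features further down, where the calls are still commented out; Writer.Write is an assumed entry point, not part of this diff:

// Sketch only, mirroring the commented-out call sites below
Verification.VerifyGenericDB(datdata, hashOnly);

// Matched items are now marked for removal, so only missing files remain
// for the fixdat; the write entry point here is assumed
Writer.Write(datdata, outDir);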

@@ -69,9 +69,11 @@ namespace SabreTools.DatTools
// Bucket roms by game name, if not already
datFile.Items.BucketBy(ItemKey.Machine, DedupeType.None);
datFile.ItemsDB.BucketBy(ItemKey.Machine, DedupeType.None);
// Output the number of items we're going to be writing
logger.User($"A total of {datFile.Items.DatStatistics.TotalCount - datFile.Items.DatStatistics.RemovedCount} items will be written out to '{datFile.Header.GetStringFieldValue(DatHeader.FileNameKey)}'");
//logger.User($"A total of {datFile.ItemsDB.DatStatistics.TotalCount - datFile.ItemsDB.DatStatistics.RemovedCount} items will be written out to '{datFile.Header.GetStringFieldValue(DatHeader.FileNameKey)}'");
// Get the outfile names
Dictionary<DatFormat, string> outfiles = datFile.Header.CreateOutFileNames(outDir!, overwrite);
@@ -128,15 +130,28 @@ namespace SabreTools.DatTools
if (diskCount + mediaCount + romCount == 0)
datFile.Items.RecalculateStats();
diskCount = datFile.ItemsDB.DatStatistics.GetItemCount(ItemType.Disk);
mediaCount = datFile.ItemsDB.DatStatistics.GetItemCount(ItemType.Media);
romCount = datFile.ItemsDB.DatStatistics.GetItemCount(ItemType.Rom);
if (diskCount + mediaCount + romCount == 0)
datFile.ItemsDB.RecalculateStats();
datFile.Items.BucketBy(ItemKey.Machine, DedupeType.None, norename: true);
datFile.ItemsDB.BucketBy(ItemKey.Machine, DedupeType.None, norename: true);
datFile.Items.DatStatistics.DisplayName = datFile.Header.GetStringFieldValue(DatHeader.FileNameKey);
datFile.Items.DatStatistics.MachineCount = datFile.Items.Keys.Count;
datFile.Items.DatStatistics.IsDirectory = false;
datFile.ItemsDB.DatStatistics.DisplayName = datFile.Header.GetStringFieldValue(DatHeader.FileNameKey);
datFile.ItemsDB.DatStatistics.MachineCount = datFile.Items.Keys.Count;
datFile.ItemsDB.DatStatistics.IsDirectory = false;
var statsList = new List<DatStatistics>
{
datFile.Items.DatStatistics,
//datFile.ItemsDB.DatStatistics,
};
var consoleOutput = BaseReport.Create(StatReportFormat.None, statsList);
consoleOutput!.WriteToFile(null, true, true);
@@ -206,14 +221,19 @@ namespace SabreTools.DatTools
{
// Force a statistics recheck, just in case
datFile.Items.RecalculateStats();
datFile.ItemsDB.RecalculateStats();
// If there's nothing there, abort
if (datFile.Items.DatStatistics.TotalCount == 0)
return false;
if (datFile.ItemsDB.DatStatistics.TotalCount == 0)
return false;
// If every item is removed, abort
if (datFile.Items.DatStatistics.TotalCount == datFile.Items.DatStatistics.RemovedCount)
return false;
if (datFile.ItemsDB.DatStatistics.TotalCount == datFile.ItemsDB.DatStatistics.RemovedCount)
return false;
return true;
}

@@ -94,6 +94,7 @@ namespace SabreTools.Features
}
Verification.VerifyGeneric(datdata, hashOnly);
//Verification.VerifyGenericDB(datdata, hashOnly);
}
// Now write out if there are any items left
@@ -145,6 +146,7 @@ namespace SabreTools.Features
}
Verification.VerifyGeneric(datdata, hashOnly);
//Verification.VerifyGenericDB(datdata, hashOnly);
}
// Now write out if there are any items left