Split deduplication from bucketing, add short-circuit

This commit is contained in:
Matt Nadareski
2025-01-14 20:21:54 -05:00
parent 0e67113200
commit 6e365c3f03
16 changed files with 196 additions and 181 deletions

View File

@@ -322,7 +322,7 @@ namespace SabreTools.DatFiles
private void SetOneGamePerRegionImpl(List<string> regionList)
{
// For sake of ease, the first thing we want to do is bucket by game
BucketBy(ItemKey.Machine, DedupeType.None, norename: true);
BucketBy(ItemKey.Machine, norename: true);
// Then we want to get a mapping of all machines to parents
Dictionary<string, List<string>> parents = [];

View File

@@ -19,7 +19,7 @@ namespace SabreTools.DatFiles
_logger.User("Creating device non-merged sets from the DAT");
// For sake of ease, the first thing we want to do is bucket by game
BucketBy(ItemKey.Machine, DedupeType.None, norename: true);
BucketBy(ItemKey.Machine, norename: true);
// Now we want to loop through all of the games and set the correct information
while (AddItemsFromDevices(false, false)) ;
@@ -38,7 +38,7 @@ namespace SabreTools.DatFiles
_logger.User("Creating fully merged sets from the DAT");
// For sake of ease, the first thing we want to do is bucket by game
BucketBy(ItemKey.Machine, DedupeType.None, norename: true);
BucketBy(ItemKey.Machine, norename: true);
// Now we want to loop through all of the games and set the correct information
AddItemsFromChildren(true, false);
@@ -59,7 +59,7 @@ namespace SabreTools.DatFiles
_logger.User("Creating fully non-merged sets from the DAT");
// For sake of ease, the first thing we want to do is bucket by game
BucketBy(ItemKey.Machine, DedupeType.None, norename: true);
BucketBy(ItemKey.Machine, norename: true);
// Now we want to loop through all of the games and set the correct information
while (AddItemsFromDevices(true, true)) ;
@@ -82,7 +82,7 @@ namespace SabreTools.DatFiles
_logger.User("Creating merged sets from the DAT");
// For sake of ease, the first thing we want to do is bucket by game
BucketBy(ItemKey.Machine, DedupeType.None, norename: true);
BucketBy(ItemKey.Machine, norename: true);
// Now we want to loop through all of the games and set the correct information
AddItemsFromChildren(true, true);
@@ -103,7 +103,7 @@ namespace SabreTools.DatFiles
_logger.User("Creating non-merged sets from the DAT");
// For sake of ease, the first thing we want to do is bucket by game
BucketBy(ItemKey.Machine, DedupeType.None, norename: true);
BucketBy(ItemKey.Machine, norename: true);
// Now we want to loop through all of the games and set the correct information
AddItemsFromCloneOfParent();
@@ -124,7 +124,7 @@ namespace SabreTools.DatFiles
_logger.User("Creating split sets from the DAT");
// For sake of ease, the first thing we want to do is bucket by game
BucketBy(ItemKey.Machine, DedupeType.None, norename: true);
BucketBy(ItemKey.Machine, norename: true);
// Now we want to loop through all of the games and set the correct information
RemoveItemsFromCloneOfChild();

View File

@@ -254,13 +254,22 @@ namespace SabreTools.DatFiles
/// Take the arbitrarily bucketed Files Dictionary and convert to one bucketed by a user-defined method
/// </summary>
/// <param name="bucketBy">ItemKey enum representing how to bucket the individual items</param>
/// <param name="dedupeType">Dedupe type that should be used</param>
/// <param name="lower">True if the key should be lowercased (default), false otherwise</param>
/// <param name="norename">True if games should only be compared on game and file name, false if system and source are counted</param>
public void BucketBy(ItemKey bucketBy, DedupeType dedupeType, bool lower = true, bool norename = true)
public void BucketBy(ItemKey bucketBy, bool lower = true, bool norename = true)
{
Items.BucketBy(bucketBy, dedupeType, lower, norename);
ItemsDB.BucketBy(bucketBy, dedupeType, lower, norename);
Items.BucketBy(bucketBy, lower, norename);
ItemsDB.BucketBy(bucketBy, lower, norename);
}
/// <summary>
/// Perform deduplication based on the deduplication type provided
/// </summary>
/// <param name="dedupeType">Dedupe type that should be used</param>
/// <remarks>
/// The request is forwarded unchanged to <c>Items</c> first and then to <c>ItemsDB</c>.
/// </remarks>
public void Deduplicate(DedupeType dedupeType)
{
// Forward the same dedupe type to Items and then to ItemsDB
Items.Deduplicate(dedupeType);
ItemsDB.Deduplicate(dedupeType);
}
/// <summary>

View File

@@ -461,8 +461,9 @@ namespace SabreTools.DatFiles
if (itemFieldNames.Count > 0)
{
// For comparison's sake, we want to use CRC as the base bucketing
datFile.BucketBy(ItemKey.CRC, DedupeType.Full);
intDat.BucketBy(ItemKey.CRC, DedupeType.None);
datFile.BucketBy(ItemKey.CRC);
datFile.Deduplicate(DedupeType.Full);
intDat.BucketBy(ItemKey.CRC);
// Then we do a hashwise comparison against the base DAT
#if NET452_OR_GREATER || NETCOREAPP
@@ -509,8 +510,9 @@ namespace SabreTools.DatFiles
if (machineFieldNames.Count > 0)
{
// For comparison's sake, we want to use Machine Name as the base bucketing
datFile.BucketBy(ItemKey.Machine, DedupeType.Full);
intDat.BucketBy(ItemKey.Machine, DedupeType.None);
datFile.BucketBy(ItemKey.Machine);
datFile.Deduplicate(DedupeType.Full);
intDat.BucketBy(ItemKey.Machine);
// Then we do a namewise comparison against the base DAT
#if NET452_OR_GREATER || NETCOREAPP
@@ -579,8 +581,9 @@ namespace SabreTools.DatFiles
if (itemFieldNames.Count > 0)
{
// For comparison's sake, we want to use CRC as the base bucketing
datFile.BucketBy(ItemKey.CRC, DedupeType.Full);
intDat.BucketBy(ItemKey.CRC, DedupeType.None);
datFile.BucketBy(ItemKey.CRC);
datFile.Deduplicate(DedupeType.Full);
intDat.BucketBy(ItemKey.CRC);
// Then we do a hashwise comparison against the base DAT
#if NET452_OR_GREATER || NETCOREAPP
@@ -620,8 +623,9 @@ namespace SabreTools.DatFiles
if (machineFieldNames.Count > 0)
{
// For comparison's sake, we want to use Machine Name as the base bucketing
datFile.BucketBy(ItemKey.Machine, DedupeType.Full);
intDat.BucketBy(ItemKey.Machine, DedupeType.None);
datFile.BucketBy(ItemKey.Machine);
datFile.Deduplicate(DedupeType.Full);
intDat.BucketBy(ItemKey.Machine);
// Then we do a namewise comparison against the base DAT
#if NET452_OR_GREATER || NETCOREAPP
@@ -669,19 +673,18 @@ namespace SabreTools.DatFiles
/// <param name="useGames">True to diff using games, false to use hashes</param>
public static void DiffAgainst(DatFile datFile, DatFile intDat, bool useGames)
{
// For comparison's sake, we want to use a base ordering
if (useGames)
datFile.BucketBy(ItemKey.Machine, DedupeType.None);
else
datFile.BucketBy(ItemKey.CRC, DedupeType.None);
InternalStopwatch watch = new($"Comparing '{intDat.Header.GetStringFieldValue(DatHeader.FileNameKey)}' to base DAT");
// For comparison's sake, we want to use the base bucketing
if (useGames)
intDat.BucketBy(ItemKey.Machine, DedupeType.None);
{
intDat.BucketBy(ItemKey.Machine);
}
else
intDat.BucketBy(ItemKey.CRC, DedupeType.Full);
{
intDat.BucketBy(ItemKey.CRC);
intDat.Deduplicate(DedupeType.Full);
}
// Then we compare against the base DAT
#if NET452_OR_GREATER || NETCOREAPP
@@ -781,7 +784,7 @@ namespace SabreTools.DatFiles
List<DatFile> outDats = [];
// Ensure the current DatFile is sorted optimally
datFile.BucketBy(ItemKey.CRC, DedupeType.None);
datFile.BucketBy(ItemKey.CRC);
// Loop through each of the inputs and get or create a new DatData object
InternalStopwatch watch = new("Initializing and filling all output DATs");

View File

@@ -417,10 +417,9 @@ namespace SabreTools.DatFiles
/// Take the arbitrarily bucketed Files Dictionary and convert to one bucketed by a user-defined method
/// </summary>
/// <param name="bucketBy">ItemKey enum representing how to bucket the individual items</param>
/// <param name="dedupeType">Dedupe type that should be used</param>
/// <param name="lower">True if the key should be lowercased (default), false otherwise</param>
/// <param name="norename">True if games should only be compared on game and file name, false if system and source are counted</param>
public void BucketBy(ItemKey bucketBy, DedupeType dedupeType, bool lower = true, bool norename = true)
public void BucketBy(ItemKey bucketBy, bool lower = true, bool norename = true)
{
// If we have a situation where there's no dictionary or no keys at all, we skip
if (_items == null || _items.Count == 0)
@@ -433,18 +432,50 @@ namespace SabreTools.DatFiles
PerformBucketing(bucketBy, lower, norename);
}
// If the merge type isn't the same, we want to merge the dictionary accordingly
if (_mergedBy != dedupeType)
// Sort the dictionary to be consistent
_logger.User($"Sorting roms by {bucketBy}");
PerformSorting();
}
/// <summary>
/// Perform deduplication based on the deduplication type provided
/// </summary>
/// <param name="dedupeType">Dedupe type that should be used; <see cref="DedupeType.None"/> returns before any bucket is processed</param>
/// <remarks>
/// The requested type is stored in <c>_mergedBy</c> before the <see cref="DedupeType.None"/> check.
/// Each bucket is sorted; merging is applied only for <see cref="DedupeType.Full"/>, or for
/// <see cref="DedupeType.Game"/> when <c>_bucketedBy</c> is <see cref="ItemKey.Machine"/>.
/// </remarks>
public void Deduplicate(DedupeType dedupeType)
{
// Record the requested dedupe type; this happens even when the type is None
_mergedBy = dedupeType;
// If no deduplication is requested, just return
if (dedupeType == DedupeType.None)
return;
#if NET452_OR_GREATER || NETCOREAPP
Parallel.ForEach(SortedKeys, Core.Globals.ParallelOptions, key =>
#elif NET40_OR_GREATER
Parallel.ForEach(SortedKeys, key =>
#else
foreach (var key in SortedKeys)
#endif
{
_logger.User($"Deduping roms by {dedupeType}");
PerformDeduplication(bucketBy, dedupeType);
}
// If the merge type is the same, we want to sort the dictionary to be consistent
else
{
_logger.User($"Sorting roms by {bucketBy}");
PerformSorting();
// Get the possibly unsorted list
List<DatItem> sortedList = GetItemsForBucket(key);
// Sort the list of items to be consistent
Sort(ref sortedList, false);
// Merge for full dedupe, or for game dedupe when the items are bucketed by machine
if (dedupeType == DedupeType.Full || (dedupeType == DedupeType.Game && _bucketedBy == ItemKey.Machine))
sortedList = DatFileTool.Merge(sortedList);
// Replace the bucket: remove it, then re-add the processed items one by one
RemoveBucket(key);
sortedList.ForEach(item => AddItem(key, item));
#if NET40_OR_GREATER || NETCOREAPP
});
#else
}
#endif
}
/// <summary>
@@ -638,44 +669,6 @@ namespace SabreTools.DatFiles
#endif
}
/// <summary>
/// Sort every bucket and merge its items when the dedupe type calls for it
/// </summary>
/// <param name="bucketBy">ItemKey enum representing how the individual items are bucketed</param>
/// <param name="dedupeType">Dedupe type that should be used</param>
private void PerformDeduplication(ItemKey bucketBy, DedupeType dedupeType)
{
    // Remember which dedupe type is being applied
    _mergedBy = dedupeType;

    // Merging applies to full dedupe, or to game dedupe when bucketed by machine
    bool shouldMerge = dedupeType == DedupeType.Full
        || (dedupeType == DedupeType.Game && bucketBy == ItemKey.Machine);

#if NET452_OR_GREATER || NETCOREAPP
    Parallel.ForEach(SortedKeys, Core.Globals.ParallelOptions, bucketKey =>
#elif NET40_OR_GREATER
    Parallel.ForEach(SortedKeys, bucketKey =>
#else
    foreach (var bucketKey in SortedKeys)
#endif
    {
        // Fetch the bucket contents and put them in a consistent order
        List<DatItem> bucketItems = GetItemsForBucket(bucketKey);
        Sort(ref bucketItems, false);

        // Merge the items if that was requested
        if (shouldMerge)
            bucketItems = DatFileTool.Merge(bucketItems);

        // Swap the old bucket out for the processed items
        RemoveBucket(bucketKey);
        foreach (DatItem bucketItem in bucketItems)
        {
            AddItem(bucketKey, bucketItem);
        }
#if NET40_OR_GREATER || NETCOREAPP
    });
#else
    }
#endif
}
/// <summary>
/// Perform inplace sorting of the dictionary
/// </summary>
@@ -772,7 +765,7 @@ namespace SabreTools.DatFiles
{
// If we're not already sorted, take care of it
if (!sorted)
BucketBy(GetBestAvailable(), DedupeType.None);
BucketBy(GetBestAvailable());
// Now that we have the sorted type, we get the proper key
return GetBucketKey(datItem, _bucketedBy, lower: true, norename: true);

View File

@@ -693,11 +693,10 @@ namespace SabreTools.DatFiles
/// Update the bucketing dictionary
/// </summary>
/// <param name="bucketBy">ItemKey enum representing how to bucket the individual items</param>
/// <param name="dedupeType">Dedupe type that should be used</param>
/// <param name="lower">True if the key should be lowercased (default), false otherwise</param>
/// <param name="norename">True if games should only be compared on game and file name, false if system and source are counted</param>
/// <returns></returns>
public void BucketBy(ItemKey bucketBy, DedupeType dedupeType, bool lower = true, bool norename = true)
public void BucketBy(ItemKey bucketBy, bool lower = true, bool norename = true)
{
// If the sorted type isn't the same, we want to sort the dictionary accordingly
if (_bucketedBy != bucketBy && bucketBy != ItemKey.NULL)
@@ -706,18 +705,60 @@ namespace SabreTools.DatFiles
PerformBucketing(bucketBy, lower, norename);
}
// If the merge type isn't the same, we want to merge the dictionary accordingly
if (dedupeType != DedupeType.None)
// Sort the dictionary to be consistent
_logger.User($"Sorting roms by {bucketBy}");
PerformSorting(norename);
}
/// <summary>
/// Perform deduplication based on the deduplication type provided
/// </summary>
/// <param name="dedupeType">Dedupe type that should be used; <see cref="DedupeType.None"/> returns before any bucket is processed</param>
/// <remarks>
/// Works on a snapshot of the bucket keys. Each bucket's items are sorted; merging is applied only for
/// <see cref="DedupeType.Full"/>, or for <see cref="DedupeType.Game"/> when <c>_bucketedBy</c> is
/// <see cref="ItemKey.Machine"/>.
/// </remarks>
public void Deduplicate(DedupeType dedupeType)
{
// If no deduplication is requested, just return
if (dedupeType == DedupeType.None)
return;
// Get the current list of bucket keys
string[] bucketKeys = [.. _buckets.Keys];
#if NET452_OR_GREATER || NETCOREAPP
Parallel.For(0, bucketKeys.Length, Core.Globals.ParallelOptions, i =>
#elif NET40_OR_GREATER
Parallel.For(0, bucketKeys.Length, i =>
#else
for (int i = 0; i < bucketKeys.Length; i++)
#endif
{
_logger.User($"Deduping roms by {dedupeType}");
PerformDeduplication(bucketBy, dedupeType);
}
// If the merge type is the same, we want to sort the dictionary to be consistent
else
{
_logger.User($"Sorting roms by {bucketBy}");
PerformSorting(norename);
#if NET40_OR_GREATER || NETCOREAPP
// Skip this bucket if it can no longer be found
if (!_buckets.TryGetValue(bucketKeys[i], out var itemIndices))
return;
#else
var itemIndices = _buckets[bucketKeys[i]];
#endif
// NOTE(review): in the plain for-loop build this return leaves the method instead of moving on to the next bucket - confirm whether continue was intended
if (itemIndices == null || itemIndices.Count == 0)
return;
// Pair each index that still exists in _items with its item
var datItems = itemIndices
.FindAll(i => _items.ContainsKey(i))
.Select(i => new KeyValuePair<long, DatItem>(i, _items[i]))
.ToList();
Sort(ref datItems, false);
// Merge for full dedupe, or for game dedupe when the items are bucketed by machine
if (dedupeType == DedupeType.Full || (dedupeType == DedupeType.Game && _bucketedBy == ItemKey.Machine))
datItems = Merge(datItems);
#if NET40_OR_GREATER || NETCOREAPP
// NOTE(review): this key was just read from _buckets above and TryAdd does not replace an existing entry,
// so the processed index list looks like it is never stored on this path - confirm, and consider
// assigning through the indexer as the #else branch does
_buckets.TryAdd(bucketKeys[i], [.. datItems.Select(kvp => kvp.Key)]);
});
#else
_buckets[bucketKeys[i]] = [.. datItems.Select(kvp => kvp.Key)];
}
#endif
}
/// <summary>
@@ -1034,54 +1075,6 @@ namespace SabreTools.DatFiles
}
}
/// <summary>
/// Perform deduplication based on the deduplication type provided
/// </summary>
/// <param name="bucketBy">ItemKey enum representing how to bucket the individual items</param>
/// <param name="dedupeType">Dedupe type that should be used</param>
/// <remarks>
/// Works on a snapshot of the bucket keys. Each non-empty bucket is sorted; merging is applied only for
/// <see cref="DedupeType.Full"/>, or for <see cref="DedupeType.Game"/> when bucketing by
/// <see cref="ItemKey.Machine"/>. The processed index list always replaces the bucket's previous list.
/// </remarks>
private void PerformDeduplication(ItemKey bucketBy, DedupeType dedupeType)
{
    // Get the current list of bucket keys
    string[] bucketKeys = [.. _buckets.Keys];

#if NET452_OR_GREATER || NETCOREAPP
    Parallel.For(0, bucketKeys.Length, Core.Globals.ParallelOptions, i =>
#elif NET40_OR_GREATER
    Parallel.For(0, bucketKeys.Length, i =>
#else
    for (int i = 0; i < bucketKeys.Length; i++)
#endif
    {
        // Only process buckets that still exist and have items. A positive condition is used
        // instead of an early return so that the plain for-loop build skips to the next bucket
        // rather than leaving the method.
        if (_buckets.TryGetValue(bucketKeys[i], out var itemIndices)
            && itemIndices != null
            && itemIndices.Count > 0)
        {
            // Pair each index with its item, dropping indices that are no longer present.
            // A single TryGetValue avoids the ContainsKey-then-indexer double lookup.
            var datItems = new List<KeyValuePair<long, DatItem>>(itemIndices.Count);
            foreach (var itemIndex in itemIndices)
            {
                if (_items.TryGetValue(itemIndex, out var datItem))
                    datItems.Add(new KeyValuePair<long, DatItem>(itemIndex, datItem));
            }

            Sort(ref datItems, false);

            // If we're merging the roms, do so
            if (dedupeType == DedupeType.Full || (dedupeType == DedupeType.Game && bucketBy == ItemKey.Machine))
                datItems = Merge(datItems);

            // Overwrite the bucket through the indexer. TryAdd never replaces an existing key,
            // and this key is known to exist, so TryAdd would silently discard the result.
            _buckets[bucketKeys[i]] = [.. datItems.Select(kvp => kvp.Key)];
        }
#if NET40_OR_GREATER || NETCOREAPP
    });
#else
    }
#endif
}
/// <summary>
/// Sort existing buckets for consistency
/// </summary>
@@ -1197,7 +1190,7 @@ namespace SabreTools.DatFiles
{
// If we're not already sorted, take care of it
if (!sorted)
BucketBy(GetBestAvailable(), DedupeType.None);
BucketBy(GetBestAvailable());
// Now that we have the sorted type, we get the proper key
return GetBucketKey(datItem.Key, _bucketedBy, lower: true, norename: true);