using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

using SabreTools.Core;
using SabreTools.DatFiles;
using SabreTools.DatItems;
using SabreTools.Logging;

[assembly: InternalsVisibleTo("SabreTools.Test")]
namespace SabreTools.Filtering
{
    /// <summary>
    /// Represents the cleaning operations that need to be performed on a set of items, usually a DAT
    /// </summary>
    public class Cleaner
    {
        #region Fields

        /// <summary>
        /// Clean all names to WoD standards
        /// </summary>
        public bool Clean { get; set; }

        /// <summary>
        /// Deduplicate items using the given method
        /// </summary>
        public DedupeType DedupeRoms { get; set; }

        /// <summary>
        /// Set Machine Description from Machine Name
        /// </summary>
        public bool DescriptionAsName { get; set; }

        /// <summary>
        /// Keep machines that don't contain any items
        /// </summary>
        public bool KeepEmptyGames { get; set; }

        /// <summary>
        /// Enable 1G1R mode, which keeps a single machine per parent/clone set
        /// </summary>
        public bool OneGamePerRegion { get; set; }

        /// <summary>
        /// Ordered list of regions for 1G1R mode; earlier entries take priority
        /// </summary>
        public List<string> RegionList { get; set; }

        /// <summary>
        /// Ensure each rom is in their own game
        /// </summary>
        public bool OneRomPerGame { get; set; }

        /// <summary>
        /// Remove all unicode characters
        /// </summary>
        public bool RemoveUnicode { get; set; }

        /// <summary>
        /// Include root directory when determining trim sizes
        /// </summary>
        public string Root { get; set; }

        /// <summary>
        /// Remove scene dates from the beginning of machine names
        /// </summary>
        public bool SceneDateStrip { get; set; }

        /// <summary>
        /// Change all machine names to "!"
        /// </summary>
        public bool Single { get; set; }

        /// <summary>
        /// Trim total machine and item name to not exceed NTFS limits
        /// </summary>
        public bool Trim { get; set; }

        #endregion

        #region Logging

        /// <summary>
        /// Logging object
        /// </summary>
        private readonly Logger logger = new();

        #endregion

        #region Lookup Tables

        /// <summary>
        /// Maximum combined length of root, machine name and item name used by the trim logic
        /// </summary>
        private const int MaxPathLength = 260;

        /// <summary>
        /// Pattern used by <see cref="CleanGameName(string)"/> to pick the name out of the filtered string
        /// </summary>
        private static readonly Regex GameNameRegex = new(@"(([[(].*[\)\]] )?([^([]+))");

        /// <summary>
        /// Accented character to plain replacement pairs, applied in order by <see cref="NormalizeChars(string)"/>
        /// </summary>
        private static readonly string[,] AccentMap =
        {
            { "Á", "A" }, { "á", "a" },
            { "À", "A" }, { "à", "a" },
            { "Â", "A" }, { "â", "a" },
            { "Ä", "Ae" }, { "ä", "ae" },
            { "Ã", "A" }, { "ã", "a" },
            { "Å", "A" }, { "å", "a" },
            { "Æ", "Ae" }, { "æ", "ae" },
            { "Ç", "C" }, { "ç", "c" },
            { "Ð", "D" }, { "ð", "d" },
            { "É", "E" }, { "é", "e" },
            { "È", "E" }, { "è", "e" },
            { "Ê", "E" }, { "ê", "e" },
            { "Ë", "E" }, { "ë", "e" },
            { "ƒ", "f" },
            { "Í", "I" }, { "í", "i" },
            { "Ì", "I" }, { "ì", "i" },
            { "Î", "I" }, { "î", "i" },
            { "Ï", "I" }, { "ï", "i" },
            { "Ñ", "N" }, { "ñ", "n" },
            { "Ó", "O" }, { "ó", "o" },
            { "Ò", "O" }, { "ò", "o" },
            { "Ô", "O" }, { "ô", "o" },
            { "Ö", "Oe" }, { "ö", "oe" },
            { "Õ", "O" }, { "õ", "o" },
            { "Ø", "O" }, { "ø", "o" },
            { "Š", "S" }, { "š", "s" },
            { "ß", "ss" },
            { "Þ", "B" }, { "þ", "b" },
            { "Ú", "U" }, { "ú", "u" },
            { "Ù", "U" }, { "ù", "u" },
            { "Û", "U" }, { "û", "u" },
            { "Ü", "Ue" }, { "ü", "ue" },
            { "ÿ", "y" },
            { "Ý", "Y" }, { "ý", "y" },
            { "Ž", "Z" }, { "ž", "z" },
        };

        /// <summary>
        /// Cyrillic to Latin replacement pairs, applied in order by <see cref="RussianToLatin(string)"/>
        /// </summary>
        /// <remarks>
        /// NOTE(review): several uppercase letters ("Ф", "Ы", "Э", "Ю", "Я") map to lowercase
        /// Latin output. This is kept as-is so cleaned names stay stable; confirm whether it
        /// is intentional before changing it.
        /// </remarks>
        private static readonly string[,] CyrillicMap =
        {
            { "А", "A" }, { "Б", "B" }, { "В", "V" }, { "Г", "G" }, { "Д", "D" },
            { "Е", "E" }, { "Ё", "Yo" }, { "Ж", "Zh" }, { "З", "Z" }, { "И", "I" },
            { "Й", "J" }, { "К", "K" }, { "Л", "L" }, { "М", "M" }, { "Н", "N" },
            { "О", "O" }, { "П", "P" }, { "Р", "R" }, { "С", "S" }, { "Т", "T" },
            { "У", "U" }, { "Ф", "f" }, { "Х", "Kh" }, { "Ц", "Ts" }, { "Ч", "Ch" },
            { "Ш", "Sh" }, { "Щ", "Sch" }, { "Ъ", string.Empty }, { "Ы", "y" }, { "Ь", string.Empty },
            { "Э", "e" }, { "Ю", "yu" }, { "Я", "ya" }, { "а", "a" }, { "б", "b" },
            { "в", "v" }, { "г", "g" }, { "д", "d" }, { "е", "e" }, { "ё", "yo" },
            { "ж", "zh" }, { "з", "z" }, { "и", "i" }, { "й", "j" }, { "к", "k" },
            { "л", "l" }, { "м", "m" }, { "н", "n" }, { "о", "o" }, { "п", "p" },
            { "р", "r" }, { "с", "s" }, { "т", "t" }, { "у", "u" }, { "ф", "f" },
            { "х", "kh" }, { "ц", "ts" }, { "ч", "ch" }, { "ш", "sh" }, { "щ", "sch" },
            { "ъ", string.Empty }, { "ы", "y" }, { "ь", string.Empty }, { "э", "e" }, { "ю", "yu" },
            { "я", "ya" },
        };

        /// <summary>
        /// Regex pattern to replacement pairs, applied in order by <see cref="SearchPattern(string)"/>.
        /// Later entries operate on the output of earlier ones, so the order matters.
        /// </summary>
        private static readonly string[,] PatternMap =
        {
            { @"~", " - " },
            { @"_", " " },
            { @":", " " },
            { @">", ")" },
            { @"<", "(" },
            { @"\|", "-" },
            { "\"", "'" },
            { @"\*", "." },
            { @"\\", "-" },
            { @"/", "-" },
            { @"\?", " " },
            { @"\(([^)(]*)\(([^)]*)\)([^)(]*)\)", " " },
            { @"\(([^)]+)\)", " " },
            { @"\[([^]]+)\]", " " },
            { @"\{([^}]+)\}", " " },
            { @"(ZZZJUNK|ZZZ-UNK-|ZZZ-UNK |zzz unknow |zzz unk |Copy of |[.][a-z]{3}[.][a-z]{3}[.]|[.][a-z]{3}[.])", " " },
            { @" (r|rev|v|ver)\s*[\d\.]+[^\s]*", " " },
            { @"(( )|(\A))(\d{6}|\d{8})(( )|(\Z))", " " },
            { @"(( )|(\A))(\d{1,2})-(\d{1,2})-(\d{4}|\d{2})", " " },
            { @"(( )|(\A))(\d{4}|\d{2})-(\d{1,2})-(\d{1,2})", " " },
            { @"[-]+", "-" },
            { @"\A\s*\)", " " },
            { @"\A\s*(,|-)", " " },
            { @"\s+", " " },
            { @"\s+,", "," },
            { @"\s*(,|-)\s*\Z", " " },
        };

        #endregion

        #region Running

        /// <summary>
        /// Apply cleaning methods to the DatFile
        /// </summary>
        /// <param name="datFile">Current DatFile object to run operations on</param>
        /// <param name="throwOnError">True if the error that is thrown should be thrown back to the caller, false otherwise</param>
        /// <returns>True if cleaning was successful, false on error</returns>
        public bool ApplyCleaning(DatFile datFile, bool throwOnError = false)
        {
            InternalStopwatch watch = new("Applying cleaning steps to DAT");

            try
            {
                // Perform item-level cleaning
                CleanDatItems(datFile);

                // Bucket and dedupe according to the flag
                if (DedupeRoms == DedupeType.Full)
                    datFile.Items.BucketBy(ItemKey.CRC, DedupeRoms);
                else if (DedupeRoms == DedupeType.Game)
                    datFile.Items.BucketBy(ItemKey.Machine, DedupeRoms);

                // Process description to machine name; forward the flag so that
                // a failure in this step is not silently swallowed when the
                // caller asked for exceptions
                if (DescriptionAsName)
                    MachineDescriptionToName(datFile, throwOnError);

                // If we are removing scene dates, do that now
                if (SceneDateStrip)
                    StripSceneDatesFromItems(datFile);

                // Run the one game per region logic, if required
                if (OneGamePerRegion)
                    SetOneGamePerRegion(datFile);

                // Run the one rom per game logic, if required
                if (OneRomPerGame)
                    SetOneRomPerGame(datFile);

                // Remove all marked items
                datFile.Items.ClearMarked();

                // We remove any blanks, if we aren't supposed to have any
                if (!KeepEmptyGames)
                    datFile.Items.ClearEmpty();
            }
            catch (Exception ex) when (!throwOnError)
            {
                logger.Error(ex);
                return false;
            }
            finally
            {
                watch.Stop();
            }

            return true;
        }

        /// <summary>
        /// Clean individual items based on the current filter
        /// </summary>
        /// <param name="datFile">Current DatFile object to run operations on</param>
        internal void CleanDatItems(DatFile datFile)
        {
            // Snapshot the keys since the buckets are assigned back inside the loop
            List<string> keys = datFile.Items.Keys.ToList();
            foreach (string key in keys)
            {
                ConcurrentList<DatItem> items = datFile.Items[key];
                foreach (DatItem item in items)
                {
                    // If we have a null item, we can't clean it
                    if (item == null)
                        continue;

                    CleanDatItem(item);
                }

                // Assign back for caution
                datFile.Items[key] = items;
            }
        }

        /// <summary>
        /// Clean a DatItem according to the cleaner
        /// </summary>
        /// <param name="datItem">DatItem to clean</param>
        internal void CleanDatItem(DatItem datItem)
        {
            // If we're stripping unicode characters, strip machine name, description and item name
            if (RemoveUnicode)
            {
                datItem.Machine.Name = RemoveUnicodeCharacters(datItem.Machine.Name);
                datItem.Machine.Description = RemoveUnicodeCharacters(datItem.Machine.Description);
                datItem.SetName(RemoveUnicodeCharacters(datItem.GetName()));
            }

            // If we're in cleaning mode, sanitize machine name and description
            if (Clean)
            {
                datItem.Machine.Name = CleanGameName(datItem.Machine.Name);
                datItem.Machine.Description = CleanGameName(datItem.Machine.Description);
            }

            // If we are in single game mode, rename the machine
            if (Single)
                datItem.Machine.Name = "!";

            // If we are in NTFS trim mode, trim the item name
            if (!Trim)
                return;

            string itemName = datItem.GetName();
            if (itemName == null)
                return;

            // A missing machine name or root contributes nothing to the total
            int usableLength = MaxPathLength
                - (datItem.Machine.Name?.Length ?? 0)
                - (Root?.Length ?? 0);

            if (itemName.Length > usableLength)
            {
                string ext = Path.GetExtension(itemName);

                // Clamp at zero: machine name, root and extension alone may
                // already exceed the limit, which would otherwise produce a
                // negative range end and throw
                int keepLength = Math.Max(usableLength - ext.Length, 0);
                datItem.SetName(itemName[..keepLength] + ext);
            }
        }

        /// <summary>
        /// Clean a game (or rom) name to the WoD standard
        /// </summary>
        /// <param name="game">Name of the game to be cleaned</param>
        /// <returns>The cleaned name, or null if the input was null</returns>
        internal static string CleanGameName(string game)
        {
            if (game == null)
                return null;

            // Run the name through the filters to make sure that it's correct
            game = NormalizeChars(game);
            game = RussianToLatin(game);
            game = SearchPattern(game);

            // Keep only the first capture group of the name pattern
            game = GameNameRegex.Match(game).Groups[1].Value;
            return game.Trim();
        }

        /// <summary>
        /// Use game descriptions as names in the DAT, updating cloneof/romof/sampleof
        /// </summary>
        /// <param name="datFile">Current DatFile object to run operations on</param>
        /// <param name="throwOnError">True if the error that is thrown should be thrown back to the caller, false otherwise</param>
        internal void MachineDescriptionToName(DatFile datFile, bool throwOnError = false)
        {
            try
            {
                // First we want to get a mapping for all games to description
                ConcurrentDictionary<string, string> mapping = new();
                Parallel.ForEach(datFile.Items.Keys, Globals.ParallelOptions, key =>
                {
                    ConcurrentList<DatItem> items = datFile.Items[key];
                    foreach (DatItem item in items)
                    {
                        if (item == null)
                            continue;

                        // A machine without a name cannot be a dictionary key and a
                        // machine without a description has nothing to map to
                        string name = item.Machine.Name;
                        string description = item.Machine.Description;
                        if (name == null || description == null)
                            continue;

                        // If the key mapping doesn't exist, add it
                        mapping.TryAdd(name, description.Replace('/', '_').Replace("\"", "''").Replace(":", " -"));
                    }
                });

                // Now we loop through every item and update accordingly
                Parallel.ForEach(datFile.Items.Keys, Globals.ParallelOptions, key =>
                {
                    ConcurrentList<DatItem> items = datFile.Items[key];
                    ConcurrentList<DatItem> newItems = new();
                    foreach (DatItem item in items)
                    {
                        // Null items are carried over untouched
                        if (item != null)
                        {
                            // Update machine name
                            if (!string.IsNullOrWhiteSpace(item.Machine.Name)
                                && mapping.TryGetValue(item.Machine.Name, out string newName))
                            {
                                item.Machine.Name = newName;
                            }

                            // Update cloneof
                            if (!string.IsNullOrWhiteSpace(item.Machine.CloneOf)
                                && mapping.TryGetValue(item.Machine.CloneOf, out string newCloneOf))
                            {
                                item.Machine.CloneOf = newCloneOf;
                            }

                            // Update romof
                            if (!string.IsNullOrWhiteSpace(item.Machine.RomOf)
                                && mapping.TryGetValue(item.Machine.RomOf, out string newRomOf))
                            {
                                item.Machine.RomOf = newRomOf;
                            }

                            // Update sampleof
                            if (!string.IsNullOrWhiteSpace(item.Machine.SampleOf)
                                && mapping.TryGetValue(item.Machine.SampleOf, out string newSampleOf))
                            {
                                item.Machine.SampleOf = newSampleOf;
                            }
                        }

                        // Add the new item to the output list
                        newItems.Add(item);
                    }

                    // Replace the old list of roms with the new one
                    datFile.Items.Remove(key);
                    datFile.Items.AddRange(key, newItems);
                });
            }
            catch (Exception ex) when (!throwOnError)
            {
                logger.Warning(ex.ToString());
            }
        }

        /// <summary>
        /// Replace accented characters
        /// </summary>
        /// <param name="input">String to be parsed</param>
        /// <returns>String with characters replaced, or null if the input was null</returns>
        internal static string NormalizeChars(string input)
        {
            if (input == null)
                return null;

            for (int i = 0; i < AccentMap.GetLength(0); i++)
            {
                input = input.Replace(AccentMap[i, 0], AccentMap[i, 1]);
            }

            return input;
        }

        /// <summary>
        /// Remove all unicode-specific chars from a string
        /// </summary>
        /// <param name="s">Input string to clean</param>
        /// <returns>String containing only the characters with a code of 255 or lower, or null if the input was null</returns>
        internal static string RemoveUnicodeCharacters(string s)
        {
            if (s == null)
                return null;

            return new string(s.Where(c => c <= 255).ToArray());
        }

        /// <summary>
        /// Convert Cyrillic lettering to Latin lettering
        /// </summary>
        /// <param name="input">String to be parsed</param>
        /// <returns>String with characters replaced, or null if the input was null</returns>
        internal static string RussianToLatin(string input)
        {
            if (input == null)
                return null;

            for (int i = 0; i < CyrillicMap.GetLength(0); i++)
            {
                input = input.Replace(CyrillicMap[i, 0], CyrillicMap[i, 1]);
            }

            return input;
        }

        /// <summary>
        /// Replace special characters and patterns
        /// </summary>
        /// <param name="input">String to be parsed</param>
        /// <returns>String with characters replaced, or null if the input was null</returns>
        internal static string SearchPattern(string input)
        {
            if (input == null)
                return null;

            for (int i = 0; i < PatternMap.GetLength(0); i++)
            {
                input = Regex.Replace(input, PatternMap[i, 0], PatternMap[i, 1]);
            }

            return input;
        }

        /// <summary>
        /// Filter a DAT using 1G1R logic given an ordered set of regions
        /// </summary>
        /// <param name="datFile">Current DatFile object to run operations on</param>
        /// <remarks>
        /// In the most technical sense, the way that the region list is being used does not
        /// confine its values to be just regions. Since it's essentially acting like a
        /// specialized version of the machine name filter, anything that is usually encapsulated
        /// in parenthesis would be matched on, including disc numbers, languages, editions,
        /// and anything else commonly used. Please note that, unlike other existing 1G1R
        /// solutions, this does not have the ability to contain custom mappings of parent
        /// to clone sets based on name, nor does it have the ability to match on the
        /// Release DatItem type.
        /// </remarks>
        internal void SetOneGamePerRegion(DatFile datFile)
        {
            // If we have null region list, make it empty
            RegionList ??= new List<string>();

            // For sake of ease, the first thing we want to do is bucket by game
            datFile.Items.BucketBy(ItemKey.Machine, DedupeType.None, norename: true);

            // Then we want to get a mapping of all parents to the machines in their set
            Dictionary<string, List<string>> parents = new();
            foreach (string key in datFile.Items.Keys)
            {
                // An empty bucket has no machine to inspect
                ConcurrentList<DatItem> bucket = datFile.Items[key];
                if (bucket == null || bucket.Count == 0)
                    continue;

                DatItem item = bucket[0];
                string machineName = item.Machine.Name.ToLowerInvariant();

                // Match on CloneOf first, then by RomOf, otherwise treat it as a parent
                string parentName;
                if (!string.IsNullOrEmpty(item.Machine.CloneOf))
                    parentName = item.Machine.CloneOf.ToLowerInvariant();
                else if (!string.IsNullOrEmpty(item.Machine.RomOf))
                    parentName = item.Machine.RomOf.ToLowerInvariant();
                else
                    parentName = machineName;

                if (!parents.TryGetValue(parentName, out List<string> members))
                {
                    members = new List<string>();
                    parents.Add(parentName, members);
                }

                members.Add(machineName);
            }

            // Once we have the full list of mappings, filter out games to keep
            foreach (KeyValuePair<string, List<string>> set in parents)
            {
                List<string> members = set.Value;

                // Find the first machine that matches the regions in order, if possible.
                // The region text is inserted into the pattern as-is, so any regex
                // syntax it contains is interpreted as such.
                string machine = null;
                foreach (string region in RegionList)
                {
                    machine = members.FirstOrDefault(m => Regex.IsMatch(m, @"\(.*" + region + @".*\)", RegexOptions.IgnoreCase));
                    if (machine != null)
                        break;
                }

                // If we didn't get a match, use the parent
                machine ??= set.Key;

                // Remove the machine we are keeping from the list
                members.Remove(machine);

                // Remove the rest of the items from this set
                members.ForEach(k => datFile.Items.Remove(k));
            }

            // Finally, strip out the parent tags
            Splitter.RemoveTagsFromChild(datFile);
        }

        /// <summary>
        /// Ensure that all roms are in their own game (or at least try to ensure)
        /// </summary>
        /// <param name="datFile">Current DatFile object to run operations on</param>
        internal static void SetOneRomPerGame(DatFile datFile)
        {
            // Because this introduces subfolders, we need to set the SuperDAT type
            datFile.Header.Type = "SuperDAT";

            // For each rom, we want to update the game to be "<game name>/<rom name>"
            Parallel.ForEach(datFile.Items.Keys, Globals.ParallelOptions, key =>
            {
                ConcurrentList<DatItem> items = datFile.Items[key];
                for (int i = 0; i < items.Count; i++)
                {
                    SetOneRomPerGame(items[i]);
                }
            });
        }

        /// <summary>
        /// Set internal names to match One Rom Per Game (ORPG) logic
        /// </summary>
        /// <param name="datItem">DatItem to run logic on</param>
        internal static void SetOneRomPerGame(DatItem datItem)
        {
            // Nothing to do for a missing item or an item without a name
            string itemName = datItem?.GetName();
            if (itemName == null)
                return;

            // Append the item name, minus its last dot-separated segment when
            // there is more than one, to the machine name
            string[] splitname = itemName.Split('.');
            int keepCount = splitname.Length > 1 ? splitname.Length - 1 : 1;
            datItem.Machine.Name += $"/{string.Join(".", splitname.Take(keepCount))}";

            // The item itself keeps only the file name portion
            datItem.SetName(Path.GetFileName(itemName));
        }

        /// <summary>
        /// Strip the dates from the beginning of scene-style set names
        /// </summary>
        /// <param name="datFile">Current DatFile object to run operations on</param>
        internal void StripSceneDatesFromItems(DatFile datFile)
        {
            // Output the logging statement
            logger.User("Stripping scene-style dates");

            // Set the regex pattern to use
            const string pattern = @"([0-9]{2}\.[0-9]{2}\.[0-9]{2}-)(.*?-.*?)";

            // Now process all of the roms
            Parallel.ForEach(datFile.Items.Keys, Globals.ParallelOptions, key =>
            {
                ConcurrentList<DatItem> items = datFile.Items[key];
                for (int j = 0; j < items.Count; j++)
                {
                    DatItem item = items[j];
                    if (item == null)
                        continue;

                    // Regex.Replace returns the input unchanged when nothing matches,
                    // so only null values need to be skipped
                    if (item.Machine.Name != null)
                        item.Machine.Name = Regex.Replace(item.Machine.Name, pattern, "$2");

                    if (item.Machine.Description != null)
                        item.Machine.Description = Regex.Replace(item.Machine.Description, pattern, "$2");

                    items[j] = item;
                }

                datFile.Items.Remove(key);
                datFile.Items.AddRange(key, items);
            });
        }

        #endregion
    }
}