From cce8a18b03b9af6c6e7110cd1c0c62b5c69db5e9 Mon Sep 17 00:00:00 2001
From: HeroponRikiBestest <50224630+HeroponRikiBestest@users.noreply.github.com>
Date: Wed, 24 Sep 2025 21:32:11 -0400
Subject: [PATCH] Add SpamSum fuzzy compare (#3)

* Pre-cleaned drop in fuzzycompare

* Finish cleaning up code.

* Figure out why i can't debug my unit tests

* Fix stupid mistake

* First round of changes

* Extra tests

* Revert TestHelper.cs

* Roll back TestHelper.cs correctly.

* Roll back Options.cs and Program.cs
---
 SabreTools.Hashing.Test/SpamSumTests.cs   |  42 +++++
 SabreTools.Hashing/SpamSum/Comparisons.cs | 214 ++++++++++++++++++++++
 SabreTools.Hashing/SpamSum/SpamSum.cs     |   9 ++
 3 files changed, 265 insertions(+)
 create mode 100644 SabreTools.Hashing.Test/SpamSumTests.cs
 create mode 100644 SabreTools.Hashing/SpamSum/Comparisons.cs

diff --git a/SabreTools.Hashing.Test/SpamSumTests.cs b/SabreTools.Hashing.Test/SpamSumTests.cs
new file mode 100644
index 0000000..275ae19
--- /dev/null
+++ b/SabreTools.Hashing.Test/SpamSumTests.cs
@@ -0,0 +1,42 @@
+using System.Collections.Generic;
+using Xunit;
+
+namespace SabreTools.Hashing.Test;
+
+// TODO: More thorough testing on the various things like block size checks, but this is likely enough as it is.
+public class SpamSumTests
+{
+    [Theory]
+    [InlineData("3:hMCPQCE6AFQxWyENFACBE+rW6Tj7SMQmKozr9MVERkL:hZRdxZENFs+rPSromekL",
+        "3:hMCERJAFQxWyENFACBE+rW6Tj7SMQmKozr9MVERkL:huRJdxZENFs+rPSromekL", 41)] // Basic small data tests
+    [InlineData("12:Y+VH/3Ckg3xqMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMn:xHqVwMMMMMMMMMMMMMMMMMMMMMMMMMM0",
+        "12:Oqkg3xqMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMu:OqVwMMMMMMMMMMMMMMMMMMMMMMMMMMMd", 44)]
+    [InlineData("6:l+lq/MtlM8pJ0gt6lXWogE61UlT1Uqj1akMD5n:l+l6Mtl/n0gtOXmEuUl5UqpakM9n",
+        "6:mTj3qJskr+V+1o21+n0rtD2noPWKlAyjllZmMt6120EK+wlsS6T1oLwXuk4tk7:m/bk/1oQrJL3jTu20EK+wlsp5oO4tk7", 0)]
+    [InlineData("196608:Gbxf3F4OQK3IuUGM8Ylv1kqCLuDKeo5cRld6iZL6HAGpX7g08WCWDc4NNgs4NEv:qcgxU+UxR2gl5qAGpXjHDcCNgs4N","196608:EqKRzGWxtDOadbDCbZStQxNy+fox3UgOYorlhjolL0K1WJj5lYA:EbNf76db9xNVox3MRlh+sf", 0)] // Basic large data tests
+    [InlineData("24576:p+QxhkAcV6cUdRxczoy3NmO0ne3HFVjSeQ229SVjeONr+v:YQ/q6baz5Nqe3H2eQzStBa","24576:fCQxhkAcV6cUdRxczoyVQQFDSVRNihk24vXDj20sq:6Q/q6bazwMgRNihk24jtsq", 54)]
+    [InlineData("3:hMCERJAFQxWyENFACBE+rW6Tj7SMQmKozr9MVERkL:huRJdxZENFs+rPSromekL",
+        "3:hMCPQCE6AFQxWyENFACBE+rW6Tj7SMQmKozr9MVERkL:hZRdxZENFs+rPSromekL", 41)] // Basic small mirror data tests
+    [InlineData("12:Oqkg3xqMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMu:OqVwMMMMMMMMMMMMMMMMMMMMMMMMMMMd","12:Y+VH/3Ckg3xqMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMn:xHqVwMMMMMMMMMMMMMMMMMMMMMMMMMM0", 44)]
+    [InlineData("6:mTj3qJskr+V+1o21+n0rtD2noPWKlAyjllZmMt6120EK+wlsS6T1oLwXuk4tk7:m/bk/1oQrJL3jTu20EK+wlsp5oO4tk7","6:l+lq/MtlM8pJ0gt6lXWogE61UlT1Uqj1akMD5n:l+l6Mtl/n0gtOXmEuUl5UqpakM9n", 0)]
+    [InlineData("196608:EqKRzGWxtDOadbDCbZStQxNy+fox3UgOYorlhjolL0K1WJj5lYA:EbNf76db9xNVox3MRlh+sf","196608:Gbxf3F4OQK3IuUGM8Ylv1kqCLuDKeo5cRld6iZL6HAGpX7g08WCWDc4NNgs4NEv:qcgxU+UxR2gl5qAGpXjHDcCNgs4N", 0)] // Basic large mirror data tests
+    [InlineData("24576:fCQxhkAcV6cUdRxczoyVQQFDSVRNihk24vXDj20sq:6Q/q6bazwMgRNihk24jtsq","24576:p+QxhkAcV6cUdRxczoy3NmO0ne3HFVjSeQ229SVjeONr+v:YQ/q6baz5Nqe3H2eQzStBa", 54)]
+    [InlineData("500:AAAAAAAAAAAAAAAAAAAAAAAAyENFACBE+rW6Tj7SMQmK:4","500:AAAyENFACBE+rW6Tj7SMQmK:4", 100)] // Test duplicate sequence truncation
+    [InlineData("500:7SMQmKa:3","500:7SMQmKr:3", 0)] // Test rolling window - larger than 7
+    [InlineData("500:7QmKa:3","500:7QmKr:3", 0)] // Test rolling window - smaller than 7
+    [InlineData("9287:hMCPQCE6AFQxWyENFACBE+rW6Tj7SMQmKozr9MVERkL:hZRdxZENFs+rPSromekL","5893:hMCPQCE6AFQxWyENFACBE+rW6Tj7SMQmKozr9MVERkL:hZRdxZENFs+rPSromekL", 0)] // Test different blocksizes
+    [InlineData("3:hMCPQCE6AFQxWyENFACBE+rW6Tj7SMQmKozr9MVERkL:hZRdxZENFs+rPSromekL","3:hMCPQCE6AFQxWyENFACBE+rW6Tj7SMQmKozr9MVERkL:hZRdxZENFs+rPSromekL", 100)]
+    [InlineData("3:ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/:abc","3:ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/A:abc", -1)] // Test over-long (65 character) block rejection
+    [InlineData("3:ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/A:abc","3:ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/:abc", -1)]
+    [InlineData(null,null, -1)]
+    [InlineData("3:hMCPQCE6AFQxWyENFACBE+rW6Tj7SMQmKozr9MVERkL:hZRdxZENFs+rPSromekL",null, -1)]
+    [InlineData(null,"3:hMCPQCE6AFQxWyENFACBE+rW6Tj7SMQmKozr9MVERkL:hZRdxZENFs+rPSromekL", -1)]
+    [InlineData("","", -1)]
+    [InlineData("3:hMCPQCE6AFQxWyENFACBE+rW6Tj7SMQmKozr9MVERkL:hZRdxZENFs+rPSromekL","", -1)]
+    [InlineData("","3:hMCPQCE6AFQxWyENFACBE+rW6Tj7SMQmKozr9MVERkL:hZRdxZENFs+rPSromekL", -1)]
+    public void FuzzyCompareTest(string? stringOne, string? stringTwo, int expected)
+    {
+        var result = SpamSum.SpamSum.FuzzyCompare(stringOne, stringTwo);
+        Assert.Equal(expected, result);
+    }
+}
\ No newline at end of file
diff --git a/SabreTools.Hashing/SpamSum/Comparisons.cs b/SabreTools.Hashing/SpamSum/Comparisons.cs
new file mode 100644
index 0000000..7a81a0a
--- /dev/null
+++ b/SabreTools.Hashing/SpamSum/Comparisons.cs
@@ -0,0 +1,214 @@
+using System;
+using System.Text.RegularExpressions;
+
+namespace SabreTools.Hashing.SpamSum;
+
+internal static class Comparisons
+{
+    /// <summary>Maximum accepted length of a single SpamSum block (ssdeep's SPAMSUM_LENGTH).</summary>
+    private const int SpamSumLength = 64;
+
+    /// <summary>Minimum common substring length required to score two blocks (ssdeep's ROLLING_WINDOW).</summary>
+    private const int RollingWindow = 7;
+
+    /// <summary>Block size divisor used when capping scores for small block sizes (ssdeep's MIN_BLOCKSIZE).</summary>
+    private const int MinBlocksize = 3;
+
+    /// <summary>
+    /// Matches every character that is the fourth or later in a run of identical characters.
+    /// Built once and shared, because constructing a compiled <see cref="Regex"/> on every call is expensive.
+    /// </summary>
+    private static readonly Regex RepeatedSequenceRegex = new Regex("(.)(?<=\\1\\1\\1\\1)", RegexOptions.Compiled);
+
+    /// <summary>
+    /// Compares how similar two SpamSums are to each other. Implements ssdeep's fuzzy_compare.
+    /// </summary>
+    /// <param name="firstHash">First hash to compare</param>
+    /// <param name="secondHash">Second hash to compare</param>
+    /// <returns>-1 on validity failure, 0 if they're not comparable, score from 0 (least similar) to 100 (most similar) otherwise.</returns>
+    public static int FuzzyCompare(string? firstHash, string? secondHash)
+    {
+        if (firstHash == null || secondHash == null)
+            return -1;
+
+        // Each SpamSum string starts with its block size before the first colon.
+        if (!TryParseBlockSize(firstHash, out int bodyIndexOne, out uint blockSizeOne))
+            return -1;
+        if (!TryParseBlockSize(secondHash, out int bodyIndexTwo, out uint blockSizeTwo))
+            return -1;
+
+        // The first block of a SpamSum is scored at its block size and the second at double that size, so two
+        // hashes stay comparable when their block sizes are equal or differ by exactly a factor of two.
+        // The range guard keeps the doubling from overflowing.
+        bool sameBlockSize = blockSizeOne == blockSizeTwo;
+        bool secondIsDouble = blockSizeOne <= uint.MaxValue / 2 && blockSizeOne * 2 == blockSizeTwo;
+        bool secondIsHalf = blockSizeOne % 2 == 0 && blockSizeOne / 2 == blockSizeTwo;
+        if (!sameBlockSize && !secondIsDouble && !secondIsHalf)
+            return 0;
+
+        // A hash without two blocks after the block size prefix is malformed.
+        if (!TryParseBlocks(firstHash, bodyIndexOne, out string oneBlockOne, out string oneBlockTwo))
+            return -1;
+        if (!TryParseBlocks(secondHash, bodyIndexTwo, out string twoBlockOne, out string twoBlockTwo))
+            return -1;
+
+        // A block that is still longer than SpamSumLength after run reduction is treated as invalid.
+        // Rejecting it here keeps malformed input from reaching the scoring code.
+        if (oneBlockOne.Length > SpamSumLength || oneBlockTwo.Length > SpamSumLength ||
+            twoBlockOne.Length > SpamSumLength || twoBlockTwo.Length > SpamSumLength)
+            return -1;
+
+        // Return 100 immediately if both spamSums are identical.
+        if (sameBlockSize && oneBlockOne == twoBlockOne && oneBlockTwo == twoBlockTwo)
+            return 100;
+
+        // Choose which blocks to score against each other depending on how the block sizes relate.
+        uint score;
+        if (sameBlockSize)
+        {
+            score = ScoreStrings(oneBlockOne, twoBlockOne, blockSizeOne);
+
+            // The second blocks are scored at double the block size, which is skipped if it would overflow.
+            if (blockSizeOne <= uint.MaxValue / 2)
+                score = Math.Max(score, ScoreStrings(oneBlockTwo, twoBlockTwo, blockSizeOne * 2));
+        }
+        else if (secondIsDouble)
+        {
+            score = ScoreStrings(twoBlockOne, oneBlockTwo, blockSizeTwo);
+        }
+        else
+        {
+            score = ScoreStrings(oneBlockOne, twoBlockTwo, blockSizeOne);
+        }
+
+        return (int)score;
+    }
+
+    /// <summary>
+    /// Reads the block size prefix of a SpamSum string.
+    /// </summary>
+    /// <param name="hash">SpamSum string to read</param>
+    /// <param name="bodyIndex">Index of the first character after the prefix's colon</param>
+    /// <param name="blockSize">Parsed block size, 0 on failure</param>
+    /// <returns>False if the colon is missing or the prefix does not parse as an unsigned integer, true otherwise.</returns>
+    private static bool TryParseBlockSize(string hash, out int bodyIndex, out uint blockSize)
+    {
+        blockSize = 0;
+        int prefixIndex = hash.IndexOf(':');
+        bodyIndex = prefixIndex + 1;
+        if (prefixIndex == -1)
+            return false;
+
+        return uint.TryParse(hash.Substring(0, prefixIndex), out blockSize);
+    }
+
+    /// <summary>
+    /// Splits the part of a SpamSum string after the block size prefix into its two blocks and
+    /// reduces runs of identical characters in each block.
+    /// </summary>
+    /// <param name="hash">SpamSum string to read</param>
+    /// <param name="bodyIndex">Index of the first character after the block size prefix</param>
+    /// <param name="blockOne">First block with runs reduced</param>
+    /// <param name="blockTwo">Second block with runs reduced</param>
+    /// <returns>False if the string does not contain a non-empty second part after the prefix, true otherwise.</returns>
+    private static bool TryParseBlocks(string hash, int bodyIndex, out string blockOne, out string blockTwo)
+    {
+        blockOne = string.Empty;
+        blockTwo = string.Empty;
+
+        string[] parts = hash.Substring(bodyIndex).Split(':');
+        if (parts.Length == 1 || string.IsNullOrEmpty(parts[1]))
+            return false;
+
+        // The second block ends at the first comma, as it does in fuzzy_compare; anything after it is ignored.
+        // Runs of a single character carry very little information, so any run longer than three
+        // characters is cut down to three, i.e. "HAAAAAOK" becomes "HAAAOK".
+        blockOne = RepeatedSequenceRegex.Replace(parts[0], string.Empty);
+        blockTwo = RepeatedSequenceRegex.Replace(parts[1].Split(',')[0], string.Empty);
+        return true;
+    }
+
+    /// <summary>
+    /// Checks whether two blocks share a common substring of at least <see cref="RollingWindow"/> characters.
+    /// </summary>
+    /// <param name="stringOne">First string to check</param>
+    /// <param name="stringTwo">Second string to check</param>
+    /// <returns>True if such a common substring exists, false otherwise.</returns>
+    private static bool HasCommonSubstring(string stringOne, string stringTwo)
+    {
+        // A common substring of at least RollingWindow characters exists exactly when some pair of
+        // RollingWindow-character windows matches, so the search can stop at the first match.
+        for (int i = 0; i + RollingWindow <= stringOne.Length; i++)
+        {
+            for (int j = 0; j + RollingWindow <= stringTwo.Length; j++)
+            {
+                if (string.CompareOrdinal(stringOne, i, stringTwo, j, RollingWindow) == 0)
+                    return true;
+            }
+        }
+
+        return false;
+    }
+
+    /// <summary>
+    /// Scores how similar two blocks are using a weighted edit distance.
+    /// </summary>
+    /// <param name="stringOne">First string to score</param>
+    /// <param name="stringTwo">Second string to score</param>
+    /// <param name="blockSize">Current blocksize</param>
+    /// <returns>0 if the blocks share no sufficiently long substring, score from 0 (least similar) to 100 (most similar) otherwise.</returns>
+    private static uint ScoreStrings(string stringOne, string stringTwo, uint blockSize)
+    {
+        if (!HasCommonSubstring(stringOne, stringTwo))
+            return 0;
+
+        const uint insertCost = 1;
+        const uint removeCost = 1;
+        const uint replaceCost = 2;
+
+        // Only two rows of the edit distance table are needed at a time. They are sized from the
+        // input so an unexpectedly long block cannot index past the end of a row.
+        var previousRow = new uint[stringTwo.Length + 1];
+        var currentRow = new uint[stringTwo.Length + 1];
+
+        for (int indexTwo = 0; indexTwo <= stringTwo.Length; indexTwo++)
+            previousRow[indexTwo] = (uint)indexTwo * removeCost;
+
+        for (int indexOne = 0; indexOne < stringOne.Length; indexOne++)
+        {
+            currentRow[0] = (uint)(indexOne + 1) * insertCost;
+            for (int indexTwo = 0; indexTwo < stringTwo.Length; indexTwo++)
+            {
+                uint costA = previousRow[indexTwo + 1] + insertCost;
+                uint costD = currentRow[indexTwo] + removeCost;
+                uint costR = previousRow[indexTwo] + (stringOne[indexOne] == stringTwo[indexTwo] ? 0 : replaceCost);
+                currentRow[indexTwo + 1] = Math.Min(Math.Min(costA, costD), costR);
+            }
+
+            uint[] swap = previousRow;
+            previousRow = currentRow;
+            currentRow = swap;
+        }
+
+        // Scale the edit distance by the combined length so it ranges from 0 (strongest match) to
+        // SpamSumLength (weakest match).
+        long score = previousRow[stringTwo.Length];
+        score = (score * SpamSumLength) / (stringOne.Length + stringTwo.Length);
+
+        // Change scale to 0-100
+        score = (100 * score) / SpamSumLength;
+
+        // Invert scale so 0 is the weakest possible match and 100 is the strongest
+        score = 100 - score;
+
+        // Compensate for small blocksizes, so match isn't reported as overly strong.
+        if (blockSize >= (99 + RollingWindow) / RollingWindow * MinBlocksize)
+            return (uint)score;
+
+        long cap = blockSize / MinBlocksize * Math.Min(stringOne.Length, stringTwo.Length);
+        if (score > cap)
+            score = cap;
+
+        return (uint)score;
+    }
+}
\ No newline at end of file
diff --git a/SabreTools.Hashing/SpamSum/SpamSum.cs b/SabreTools.Hashing/SpamSum/SpamSum.cs
index 0ac7971..1929f53 100644
--- a/SabreTools.Hashing/SpamSum/SpamSum.cs
+++ b/SabreTools.Hashing/SpamSum/SpamSum.cs
@@ -352,5 +352,14 @@ namespace SabreTools.Hashing.SpamSum
 
             return n;
         }
+
+        /// <summary>
+        /// Compares how similar two SpamSums are to each other. Implements ssdeep's fuzzy_compare.
+        /// </summary>
+        /// <param name="firstHash">First hash to compare</param>
+        /// <param name="secondHash">Second hash to compare</param>
+        /// <returns>-1 on validity failure, 0 if they're not comparable, score from 0 (least similar) to 100 (most similar) otherwise.</returns>
+        public static int FuzzyCompare(string? firstHash, string? secondHash)
+            => Comparisons.FuzzyCompare(firstHash, secondHash);
     }
 }