From ba4ba01606ceb808057d30730b2d24013c6f783b Mon Sep 17 00:00:00 2001 From: Natalia Portillo Date: Sat, 15 Feb 2014 20:04:49 +0000 Subject: [PATCH] Added code --- DedupStat.sln | 20 ++++ DedupStat/DedupStat.csproj | 41 +++++++ DedupStat/Program.cs | 172 +++++++++++++++++++++++++++ DedupStat/Properties/AssemblyInfo.cs | 22 ++++ 4 files changed, 255 insertions(+) create mode 100644 DedupStat.sln create mode 100644 DedupStat/DedupStat.csproj create mode 100644 DedupStat/Program.cs create mode 100644 DedupStat/Properties/AssemblyInfo.cs diff --git a/DedupStat.sln b/DedupStat.sln new file mode 100644 index 0000000..78b16a1 --- /dev/null +++ b/DedupStat.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 11.00 +# Visual Studio 2010 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DedupStat", "DedupStat\DedupStat.csproj", "{E7C119C0-43C3-4211-8CFC-5FDD3B383F16}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x86 = Debug|x86 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {E7C119C0-43C3-4211-8CFC-5FDD3B383F16}.Debug|x86.ActiveCfg = Debug|x86 + {E7C119C0-43C3-4211-8CFC-5FDD3B383F16}.Debug|x86.Build.0 = Debug|x86 + {E7C119C0-43C3-4211-8CFC-5FDD3B383F16}.Release|x86.ActiveCfg = Release|x86 + {E7C119C0-43C3-4211-8CFC-5FDD3B383F16}.Release|x86.Build.0 = Release|x86 + EndGlobalSection + GlobalSection(MonoDevelopProperties) = preSolution + StartupItem = DedupStat\DedupStat.csproj + EndGlobalSection +EndGlobal diff --git a/DedupStat/DedupStat.csproj b/DedupStat/DedupStat.csproj new file mode 100644 index 0000000..d2bd9db --- /dev/null +++ b/DedupStat/DedupStat.csproj @@ -0,0 +1,41 @@ + + + + Debug + x86 + 10.0.0 + 2.0 + {E7C119C0-43C3-4211-8CFC-5FDD3B383F16} + Exe + DedupStat + DedupStat + + + true + full + false + bin\Debug + DEBUG; + prompt + 4 + true + x86 + + + full + true + bin\Release + prompt + 4 + true + x86 + + + + + + + + + + \ No 
newline at end of file diff --git a/DedupStat/Program.cs b/DedupStat/Program.cs new file mode 100644 index 0000000..1fcba40 --- /dev/null +++ b/DedupStat/Program.cs @@ -0,0 +1,172 @@ +/******************************************************************************************* + DedupStat - Shows an estimation of deduplication advantages for specified block size. + Copyright (C) 2014 Natalia Portillo + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
using System;
using System.Collections.Generic;
using System.IO;
using System.Security.Cryptography;
using System.Text;

namespace DedupStat
{
    /// <summary>
    /// Command-line tool that walks a folder, cuts every file into fixed-size blocks,
    /// hashes each block with SHA-1 and reports how many blocks are unique and how many
    /// are repeats of a block that was already seen.
    /// </summary>
    class MainClass
    {
        /// <summary>Number of times each distinct block was seen, keyed by its SHA-1 digest in upper-case hex.</summary>
        static Dictionary<string, ulong> hashes;

        /// <summary>Paths of the files that were read to the end without an error.</summary>
        static List<string> files;

        /// <summary>Turns on the per-file detail messages. Nothing on the command line sets it.</summary>
        static readonly bool verbose = false;

        /// <summary>
        /// Program entry point. Expects exactly two arguments: the block size in bytes and the
        /// folder to scan. Prints the usage text and returns when the arguments are not usable.
        /// </summary>
        /// <param name="args">Command-line arguments: <c>args[0]</c> block size, <c>args[1]</c> folder path.</param>
        public static void Main(string[] args)
        {
            uint blocksize;
            if (!TryParseArguments(args, out blocksize))
            {
                ShowHelp();
                return;
            }

            hashes = new Dictionary<string, ulong>();
            files = new List<string>();
            ulong blocks = 0;
            ulong overhead = 0;
            ulong totalsize = 0;
            int skipped = 0;

            Console.WriteLine("DedupStat - Shows an estimation of deduplication advantages for specified block size.");
            Console.WriteLine("© 2014 Natalia Portillo");
            Console.WriteLine();

            // Stopwatch measures elapsed time and is not affected by wall-clock adjustments.
            System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();

            Console.WriteLine("Searching files...");
            List<string> found;
            try
            {
                found = new List<string>(Directory.EnumerateFiles(args[1], "*", SearchOption.AllDirectories));
            }
            catch (UnauthorizedAccessException ex)
            {
                Console.Error.WriteLine("Could not enumerate \"{0}\": {1}", args[1], ex.Message);
                return;
            }
            catch (IOException ex)
            {
                Console.Error.WriteLine("Could not enumerate \"{0}\": {1}", args[1], ex.Message);
                return;
            }

            Console.WriteLine("{0} files found.", found.Count);
            Console.WriteLine("Counting {0} bytes sized blocks for found files.", blocksize);

            // One buffer is reused for every block of every file.
            byte[] buffer = new byte[blocksize];

            foreach (string filePath in found)
            {
                try
                {
                    HashFile(filePath, buffer, ref blocks, ref totalsize, ref overhead);
                    files.Add(filePath);
                }
                catch (IOException ex)
                {
                    skipped++;
                    ReportSkippedFile(filePath, ex);
                }
                catch (UnauthorizedAccessException ex)
                {
                    skipped++;
                    ReportSkippedFile(filePath, ex);
                }
            }

            timer.Stop();

            ulong usedBytes = blocks * blocksize;
            ulong unique = (ulong)hashes.Count;
            ulong duplicated = blocks - unique;
            double seconds = timer.Elapsed.TotalSeconds;
            // Divide as floating point so fractions of a megabyte are not truncated away.
            double megabytesPerSecond = seconds > 0 ? totalsize / 1048576.0 / seconds : 0;

            Console.WriteLine();
            Console.WriteLine("Summary:");
            Console.WriteLine("{0} files for a total of {1} bytes", files.Count, totalsize);
            if (skipped > 0)
            {
                Console.WriteLine("{0} files skipped because they could not be read", skipped);
            }
            Console.WriteLine("{0} bytes/block, for a total of {1} blocks used, using {2} bytes", blocksize, blocks, usedBytes);
            Console.WriteLine("{0} wasted bytes (should be {1}, difference is {2})", overhead, usedBytes - totalsize, usedBytes - totalsize - overhead);
            Console.WriteLine("{0} unique blocks, using {1} bytes, {2}%", unique, unique * blocksize, Percent(unique, blocks));
            Console.WriteLine("{0} duplicate blocks, using {1} bytes, {2}%", duplicated, duplicated * blocksize, Percent(duplicated, blocks));
            Console.WriteLine("Took {0} seconds, approx. {1} Mb/sec", seconds, megabytesPerSecond);
        }

        /// <summary>Prints the program banner and the command-line usage.</summary>
        public static void ShowHelp()
        {
            Console.WriteLine("DedupStat - Shows an estimation of deduplication advantages for specified block size.");
            Console.WriteLine("© 2014 Natalia Portillo");
            Console.WriteLine();
            Console.WriteLine("Usage: dedupstat <blocksize> <path>");
            Console.WriteLine("\t<blocksize>\tBlock size in bytes, must be multiple of 512");
            Console.WriteLine("\t<path>\t\tFolder path");
        }

        /// <summary>
        /// Validates the command line: exactly two arguments, a block size that is a non-zero
        /// multiple of 512 and an existing folder.
        /// </summary>
        /// <param name="args">Raw command-line arguments.</param>
        /// <param name="blocksize">Receives the parsed block size when the method returns true.</param>
        /// <returns>True when every argument is usable, false otherwise.</returns>
        private static bool TryParseArguments(string[] args, out uint blocksize)
        {
            blocksize = 0;
            if (args == null || args.Length != 2)
            {
                return false;
            }

            if (!UInt32.TryParse(args[0], out blocksize))
            {
                return false;
            }

            // Zero would divide by zero and allocate an empty buffer; anything above
            // int.MaxValue cannot be passed to Stream.Read, which takes an int count.
            if (blocksize == 0 || blocksize % 512 != 0 || blocksize > int.MaxValue)
            {
                return false;
            }

            return Directory.Exists(args[1]);
        }

        /// <summary>
        /// Reads one file block by block, records every block hash in <see cref="hashes"/> and
        /// adds what was read to the running totals. Totals are updated per hashed block, so
        /// they stay in step with <see cref="hashes"/> even if the file fails part-way through.
        /// </summary>
        /// <param name="filePath">File to read.</param>
        /// <param name="buffer">Scratch buffer whose length is the block size.</param>
        /// <param name="blocks">Running count of hashed blocks.</param>
        /// <param name="totalsize">Running count of bytes read from files.</param>
        /// <param name="overhead">Running count of padding bytes in short final blocks.</param>
        private static void HashFile(string filePath, byte[] buffer, ref ulong blocks, ref ulong totalsize, ref ulong overhead)
        {
            long length = new FileInfo(filePath).Length;
            long blockLength = buffer.Length;
            // Integer ceiling division; avoids the precision loss of going through double.
            long expectedBlocks = (length + blockLength - 1) / blockLength;

            if (verbose)
            {
                Console.WriteLine("File \"{0}\" is {1} bytes, uses {2} blocks of {3} bytes each, for a total of {4} bytes ({5} overhead bytes)",
                    filePath, length, expectedBlocks, blockLength, expectedBlocks * blockLength, expectedBlocks * blockLength - length);
                Console.WriteLine("Calculating block checksums");
            }

            long count = 0;
            long fileUniqueBlocks = 0;
            long fileDuplicatedBlocks = 0;
            int progressWidth = 0;

            using (FileStream fs = File.OpenRead(filePath))
            {
                int filled;
                while ((filled = ReadBlock(fs, buffer)) > 0)
                {
                    if (filled < buffer.Length)
                    {
                        // Short final block: wipe what the previous block left behind so the
                        // hash depends only on this file's bytes plus zero padding.
                        Array.Clear(buffer, filled, buffer.Length - filled);
                    }

                    count++;
                    string progress = string.Format("\rCalculating hash of block {0}/{1}", count, expectedBlocks);
                    Console.Write(progress);
                    progressWidth = Math.Max(progressWidth, progress.Length);

                    string hash = CalculateSHA1(buffer);
                    ulong refCount;
                    if (hashes.TryGetValue(hash, out refCount))
                    {
                        hashes[hash] = refCount + 1;
                        fileDuplicatedBlocks++;
                    }
                    else
                    {
                        hashes.Add(hash, 1);
                        fileUniqueBlocks++;
                    }

                    blocks++;
                    totalsize += (ulong)filled;
                    overhead += (ulong)(buffer.Length - filled);
                }
            }

            if (progressWidth > 0)
            {
                // Blank the progress line and park the cursor at its start.
                Console.Write("\r" + new string(' ', progressWidth) + "\r");
            }

            if (verbose)
            {
                Console.WriteLine("{0} blocks, {1} unique, {2} duplicated", count, fileUniqueBlocks, fileDuplicatedBlocks);
            }
        }

        /// <summary>
        /// Fills <paramref name="buffer"/> from the stream, calling Read repeatedly because a
        /// single call may return fewer bytes than requested.
        /// </summary>
        /// <param name="stream">Stream to read from.</param>
        /// <param name="buffer">Buffer to fill from index 0.</param>
        /// <returns>Number of bytes placed in the buffer; 0 at end of stream.</returns>
        private static int ReadBlock(Stream stream, byte[] buffer)
        {
            int filled = 0;
            while (filled < buffer.Length)
            {
                int got = stream.Read(buffer, filled, buffer.Length - filled);
                if (got == 0)
                {
                    break;
                }

                filled += got;
            }

            return filled;
        }

        /// <summary>Prints why a file was left out, but only in verbose mode.</summary>
        /// <param name="filePath">File that could not be read.</param>
        /// <param name="ex">Exception raised while reading it.</param>
        private static void ReportSkippedFile(string filePath, Exception ex)
        {
            if (verbose)
            {
                Console.WriteLine("Exception \"{0}\" on file \"{1}\"", ex.Message, filePath);
            }
        }

        /// <summary>Returns <paramref name="part"/> as a percentage of <paramref name="whole"/>, or 0 when whole is 0.</summary>
        private static double Percent(ulong part, ulong whole)
        {
            return whole == 0 ? 0.0 : (double)part * 100 / (double)whole;
        }

        /// <summary>Computes the SHA-1 digest of a block.</summary>
        /// <param name="block">Bytes to hash; the whole array is hashed.</param>
        /// <returns>The 40-character upper-case hexadecimal digest.</returns>
        private static string CalculateSHA1(byte[] block)
        {
            using (SHA1 sha1 = SHA1.Create())
            {
                byte[] hash = sha1.ComputeHash(block);
                StringBuilder formatted = new StringBuilder(2 * hash.Length);
                foreach (byte b in hash)
                {
                    formatted.Append(b.ToString("X2"));
                }

                return formatted.ToString();
            }
        }
    }
}
+//[assembly: AssemblyDelaySign(false)] +//[assembly: AssemblyKeyFile("")] +