2020-12-08 14:53:49 -08:00
using System.Collections.Generic ;
2020-12-07 22:32:37 -08:00
using System.IO ;
using System.Linq ;
using System.Threading.Tasks ;
2020-10-05 17:43:44 -07:00
2020-12-08 13:23:59 -08:00
using SabreTools.Core ;
2020-12-10 22:16:53 -08:00
using SabreTools.Core.Tools ;
2020-12-10 22:31:23 -08:00
using SabreTools.FileTypes.Aaru ;
using SabreTools.FileTypes.CHD ;
2020-12-07 22:32:37 -08:00
using SabreTools.IO ;
using SabreTools.Logging ;
2020-12-08 14:53:49 -08:00
using SabreTools.Skippers ;
2020-12-07 22:32:37 -08:00
using Compress.ThreadReaders ;
2018-02-15 22:06:20 -08:00
2020-12-08 14:53:49 -08:00
namespace SabreTools.FileTypes
2018-02-15 22:06:20 -08:00
{
2019-02-08 20:51:44 -08:00
public class BaseFile
{
2020-12-09 14:33:47 -08:00
#region Constants
2020-12-09 14:56:38 -08:00
// Magic-number signatures for known container and executable formats.
// Each array holds the bytes exactly as written here; several of them are
// compared against the first bytes of a file in GetFileType.

/// <summary>7-Zip signature: "7z" BC AF 27 1C</summary>
protected static readonly byte[] SevenZipSignature = { 0x37, 0x7a, 0xbc, 0xaf, 0x27, 0x1c };

/// <summary>AaruFormat signature: ASCII "AARUFRMT"</summary>
protected static readonly byte[] AaruFormatSignature = { 0x41, 0x41, 0x52, 0x55, 0x46, 0x52, 0x4d, 0x54 };

/// <summary>BZip2 signature: ASCII "BZh"</summary>
protected static readonly byte[] BZ2Signature = { 0x42, 0x5a, 0x68 };

/// <summary>Cabinet signature: ASCII "MSCF"</summary>
protected static readonly byte[] CabinetSignature = { 0x4d, 0x53, 0x43, 0x46 };

/// <summary>CHD signature: ASCII "MComprHD"</summary>
protected static readonly byte[] CHDSignature = { 0x4d, 0x43, 0x6f, 0x6d, 0x70, 0x72, 0x48, 0x44 };

/// <summary>ELF signature: 7F followed by ASCII "ELF"</summary>
protected static readonly byte[] ELFSignature = { 0x7f, 0x45, 0x4c, 0x46 };

/// <summary>FreeArc signature: ASCII "ArC" followed by 01</summary>
protected static readonly byte[] FreeArcSignature = { 0x41, 0x72, 0x43, 0x01 };

/// <summary>GZip signature: 1F 8B 08</summary>
protected static readonly byte[] GzSignature = { 0x1f, 0x8b, 0x08 };

/// <summary>LRZip signature: ASCII "LRZI"</summary>
protected static readonly byte[] LRZipSignature = { 0x4c, 0x52, 0x5a, 0x49 };

// NOTE(review): the LZ4 frame format stores its magic numbers little-endian,
// so real files are expected to begin 04 22 4D 18 (and 5x 2A 4D 18 for
// skippable frames). The three LZ4 values below are written in the opposite
// order - confirm against sample files before relying on them.

/// <summary>LZ4 frame magic number 0x184D2204</summary>
protected static readonly byte[] LZ4Signature = { 0x18, 0x4d, 0x22, 0x04 };

/// <summary>
/// Lowest LZ4 skippable-frame magic number, 0x184D2A50
/// </summary>
/// <remarks>
/// Previously a duplicate of <see cref="LZ4Signature"/>, which made the
/// skippable range start at the regular frame magic.
/// </remarks>
protected static readonly byte[] LZ4SkippableMinSignature = { 0x18, 0x4d, 0x2a, 0x50 };

/// <summary>Highest LZ4 skippable-frame magic number, 0x184D2A5F</summary>
protected static readonly byte[] LZ4SkippableMaxSignature = { 0x18, 0x4d, 0x2a, 0x5f };

/// <summary>PE/MZ executable signature: ASCII "MZ"</summary>
protected static readonly byte[] PESignature = { 0x4d, 0x5a };

/// <summary>RAR signature: ASCII "Rar!" followed by 1A 07 00</summary>
protected static readonly byte[] RarSignature = { 0x52, 0x61, 0x72, 0x21, 0x1a, 0x07, 0x00 };

/// <summary>RAR 5 signature: ASCII "Rar!" followed by 1A 07 01 00</summary>
protected static readonly byte[] RarFiveSignature = { 0x52, 0x61, 0x72, 0x21, 0x1a, 0x07, 0x01, 0x00 };

/// <summary>Tar signature: ASCII "ustar" followed by two spaces and a NUL</summary>
protected static readonly byte[] TarSignature = { 0x75, 0x73, 0x74, 0x61, 0x72, 0x20, 0x20, 0x00 };

/// <summary>Tar signature: ASCII "ustar", a NUL, then ASCII "00"</summary>
protected static readonly byte[] TarZeroSignature = { 0x75, 0x73, 0x74, 0x61, 0x72, 0x00, 0x30, 0x30 };

/// <summary>XZ signature: FD, ASCII "7zXZ", then 00 00</summary>
protected static readonly byte[] XZSignature = { 0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00, 0x00 };

/// <summary>Zip signature: ASCII "PK" followed by 03 04</summary>
protected static readonly byte[] ZipSignature = { 0x50, 0x4b, 0x03, 0x04 };

/// <summary>Zip signature variant: ASCII "PK" followed by 05 06</summary>
protected static readonly byte[] ZipSignatureEmpty = { 0x50, 0x4b, 0x05, 0x06 };

/// <summary>Zip signature variant: ASCII "PK" followed by 07 08</summary>
protected static readonly byte[] ZipSignatureSpanned = { 0x50, 0x4b, 0x07, 0x08 };

/// <summary>ZPAQ signature: ASCII "zPQ"</summary>
protected static readonly byte[] ZPAQSignature = { 0x7a, 0x50, 0x51 };

// NOTE(review): Zstandard's magic number 0xFD2FB528 is also stored
// little-endian on disk (28 B5 2F FD); this value is in the opposite order
// and omits the last byte - confirm before relying on it.

/// <summary>Zstandard signature bytes FD 2F B5</summary>
protected static readonly byte[] ZstdSignature = { 0xfd, 0x2f, 0xb5 };
2020-12-09 14:33:47 -08:00
#endregion
2019-02-08 20:51:44 -08:00
// TODO: Get all of these values automatically so there is no public "set"
2020-10-05 17:43:44 -07:00
#region Fields
/// <summary>
/// Internal type of the represented file; only this class and its
/// subclasses may assign it
/// </summary>
public FileType Type { get; protected set; }

/// <summary>
/// Filename or path to the file
/// </summary>
public string Filename { get; set; }

/// <summary>
/// Direct parent of the file
/// </summary>
public string Parent { get; set; }

/// <summary>
/// Date stamp of the file
/// </summary>
public string Date { get; set; }

/// <summary>
/// Optional size of the file; null when no size has been set
/// </summary>
public long? Size { get; set; }

/// <summary>
/// Hashes that are available for the file (defaults to Hash.Standard)
/// </summary>
public Hash AvailableHashes { get; set; } = Hash.Standard;

// Every hash below starts out as null and stays null until it is assigned.

/// <summary>
/// CRC32 hash of the file
/// </summary>
public byte[] CRC { get; set; }

/// <summary>
/// MD5 hash of the file
/// </summary>
public byte[] MD5 { get; set; }

/// <summary>
/// SHA-1 hash of the file
/// </summary>
public byte[] SHA1 { get; set; }

/// <summary>
/// SHA-256 hash of the file
/// </summary>
public byte[] SHA256 { get; set; }

/// <summary>
/// SHA-384 hash of the file
/// </summary>
public byte[] SHA384 { get; set; }

/// <summary>
/// SHA-512 hash of the file
/// </summary>
public byte[] SHA512 { get; set; }

/// <summary>
/// SpamSum fuzzy hash of the file
/// </summary>
public byte[] SpamSum { get; set; }
2019-02-08 20:51:44 -08:00
#endregion
#region Constructors
/// <summary>
/// Create a new, empty BaseFile that is not backed by any file
/// </summary>
public BaseFile()
{
}

/// <summary>
/// Create a new BaseFile from the given file
/// </summary>
/// <param name="filename">Name of the file to use</param>
/// <param name="getHashes">True if hashes for this file should be calculated (default), false otherwise</param>
/// <remarks>
/// When <paramref name="getHashes"/> is true the file is read through
/// <see cref="GetInfo(string, string, Hash, TreatAsFile)"/>. If that call
/// returns null (for example because the file does not exist), only
/// <see cref="Filename"/> is set.
/// </remarks>
public BaseFile(string filename, bool getHashes = true)
{
    this.Filename = filename;
    if (!getHashes)
        return;

    CopyInfoFrom(GetInfo(this.Filename, hashes: this.AvailableHashes));
}

/// <summary>
/// Create a new BaseFile from the given stream
/// </summary>
/// <param name="filename">Name of the file to use</param>
/// <param name="stream">Stream to populate information from</param>
/// <param name="getHashes">True if hashes for this file should be calculated (default), false otherwise</param>
/// <remarks>
/// When <paramref name="getHashes"/> is true the stream is handed to
/// <see cref="GetInfo(Stream, long, Hash, bool)"/> with its default
/// keepReadOpen value of false, so the stream is disposed by that call.
/// </remarks>
public BaseFile(string filename, Stream stream, bool getHashes = true)
{
    this.Filename = filename;
    if (!getHashes)
        return;

    CopyInfoFrom(GetInfo(stream, hashes: this.AvailableHashes));
}

/// <summary>
/// Copy the computed metadata and hashes from another BaseFile into this one
/// </summary>
/// <param name="source">BaseFile to copy from; nothing is copied when null</param>
private void CopyInfoFrom(BaseFile source)
{
    if (source == null)
        return;

    this.Parent = source.Parent;
    this.Date = source.Date;

    // The size is computed alongside the hashes; carry it over so it is not lost
    this.Size = source.Size;

    this.CRC = source.CRC;
    this.MD5 = source.MD5;
    this.SHA1 = source.SHA1;
    this.SHA256 = source.SHA256;
    this.SHA384 = source.SHA384;
    this.SHA512 = source.SHA512;
    this.SpamSum = source.SpamSum;
}
#endregion
2020-12-07 22:32:37 -08:00
#region Static Methods
2020-12-08 00:13:22 -08:00
/// <summary>
/// Returns the file type of an input file
/// </summary>
/// <param name="input">Input file to check</param>
/// <returns>
/// FileType matching the file's leading bytes; null when the path is null,
/// the extension is not a known archive extension, or no signature matches
/// </returns>
/// <remarks>
/// Exceptions raised while opening or reading the file are not caught here
/// and propagate to the caller.
/// </remarks>
public static FileType? GetFileType(string input)
{
    // If the file is null, then we have no archive type
    if (input == null)
        return null;

    // First line of defense is going to be the extension, for better or worse
    if (!HasValidArchiveExtension(input))
        return null;

    // Read the first bytes of the file and get the magic number. The using
    // block releases the file handle even if the read throws. ReadBytes may
    // return fewer than 8 bytes when the file is shorter than that.
    byte[] magic;
    using (BinaryReader br = new BinaryReader(File.OpenRead(input)))
    {
        magic = br.ReadBytes(8);
    }

    // Now try to match it to a known signature; the order of checks is significant
    if (magic.StartsWith(SevenZipSignature))
        return FileType.SevenZipArchive;

    if (magic.StartsWith(AaruFormatSignature))
        return FileType.AaruFormat;

    if (magic.StartsWith(CHDSignature))
        return FileType.CHD;

    if (magic.StartsWith(GzSignature))
        return FileType.GZipArchive;

    if (magic.StartsWith(LRZipSignature))
        return FileType.LRZipArchive;

    if (magic.StartsWith(LZ4Signature)
        || magic.StartsWith(LZ4SkippableMinSignature)
        || magic.StartsWith(LZ4SkippableMaxSignature))
    {
        return FileType.LZ4Archive;
    }

    if (magic.StartsWith(RarSignature)
        || magic.StartsWith(RarFiveSignature))
    {
        return FileType.RarArchive;
    }

    // NOTE(review): the "ustar" magic normally sits at offset 257 of a tar
    // header rather than at offset 0, so this comparison against the first
    // 8 bytes looks unlikely to match real tar files - confirm.
    if (magic.StartsWith(TarSignature)
        || magic.StartsWith(TarZeroSignature))
    {
        return FileType.TapeArchive;
    }

    if (magic.StartsWith(XZSignature))
        return FileType.XZArchive;

    if (magic.StartsWith(ZipSignature)
        || magic.StartsWith(ZipSignatureEmpty)
        || magic.StartsWith(ZipSignatureSpanned))
    {
        return FileType.ZipArchive;
    }

    if (magic.StartsWith(ZPAQSignature))
        return FileType.ZPAQArchive;

    if (magic.StartsWith(ZstdSignature))
        return FileType.ZstdArchive;

    // Nothing matched
    return null;
}
/// <summary>
/// Retrieve file information for a single file
/// </summary>
/// <param name="input">Filename to get information from</param>
/// <param name="header">Populated string representing the name of the skipper to use, a blank string to use the first available checker, null otherwise</param>
/// <param name="hashes">Hashes to include in the information</param>
/// <param name="asFiles">TreatAsFiles representing special format scanning</param>
/// <returns>
/// BaseFile with Filename and Date filled in from the file on disk; null if
/// the file does not exist or the format-specific reader produced no result
/// </returns>
public static BaseFile GetInfo(string input, string header = null, Hash hashes = Hash.Standard, TreatAsFile asFiles = 0x00)
{
    // Add safeguard if file doesn't exist
    if (!File.Exists(input))
        return null;

    // Get input information
    var fileType = GetFileType(input);
    Stream inputStream = File.OpenRead(input);

    BaseFile baseFile;
    try
    {
        // Try to match the supplied header skipper
        if (header != null)
        {
            SkipperMatch.Init();
            var rule = SkipperMatch.GetMatchingRule(input, Path.GetFileNameWithoutExtension(header));

            // If there's a match, transform the stream before getting info
            if (rule != null && rule.Tests != null && rule.Tests.Count != 0)
            {
                // Create the output stream
                MemoryStream outputStream = new MemoryStream();

                // Transform the stream; the original file stream is closed by
                // the transform (keepReadOpen: false) and the transformed
                // copy takes its place
                rule.TransformStream(inputStream, outputStream, keepReadOpen: false, keepWriteOpen: true);
                inputStream = outputStream;
            }
        }

        // Get the info in the proper manner
        if (fileType == FileType.AaruFormat && !asFiles.HasFlag(TreatAsFile.AaruFormat))
            baseFile = AaruFormat.Create(inputStream);
        else if (fileType == FileType.CHD && !asFiles.HasFlag(TreatAsFile.CHD))
            baseFile = CHDFile.Create(inputStream);
        else
            baseFile = GetInfo(inputStream, hashes: hashes, keepReadOpen: false);
    }
    finally
    {
        // Dispose of the input stream on every path, including when the
        // skipper or a format reader throws
        inputStream?.Dispose();
    }

    // A format-specific reader may not have produced anything
    if (baseFile == null)
        return null;

    // Add unique data from the file
    baseFile.Filename = Path.GetFileName(input);
    baseFile.Date = new FileInfo(input).LastWriteTime.ToString("yyyy/MM/dd HH:mm:ss");

    return baseFile;
}
2020-12-07 22:32:37 -08:00
/// <summary>
/// Retrieve file information for a single stream
/// </summary>
/// <param name="input">Stream to get information from, read from its current position</param>
/// <param name="size">Number of bytes to hash; -1 (default) uses the stream's Length</param>
/// <param name="hashes">Hashes to include in the information</param>
/// <param name="keepReadOpen">True if the underlying read stream should be kept open, false otherwise</param>
/// <returns>Populated BaseFile object if success, empty one if an IOException occurs during hashing</returns>
/// <remarks>
/// The returned Size is the <paramref name="size"/> value in effect, even if
/// the stream ends before that many bytes could be read.
/// </remarks>
public static BaseFile GetInfo(Stream input, long size = -1, Hash hashes = Hash.Standard, bool keepReadOpen = false)
{
    // If we want to automatically set the size
    if (size == -1)
        size = input.Length;

    // Declared outside the try so the hashers can be released on every path
    List<Hasher> hashers = new List<Hasher>();

    try
    {
        // Get a list of hashers to run over the buffer
        if (hashes.HasFlag(Hash.CRC))
            hashers.Add(new Hasher(Hash.CRC));
        if (hashes.HasFlag(Hash.MD5))
            hashers.Add(new Hasher(Hash.MD5));
        if (hashes.HasFlag(Hash.SHA1))
            hashers.Add(new Hasher(Hash.SHA1));
        if (hashes.HasFlag(Hash.SHA256))
            hashers.Add(new Hasher(Hash.SHA256));
        if (hashes.HasFlag(Hash.SHA384))
            hashers.Add(new Hasher(Hash.SHA384));
        if (hashes.HasFlag(Hash.SHA512))
            hashers.Add(new Hasher(Hash.SHA512));
        if (hashes.HasFlag(Hash.SpamSum))
            hashers.Add(new Hasher(Hash.SpamSum));

        // Initialize the hashing helpers
        int buffersize = 3 * 1024 * 1024;
        byte[] buffer = new byte[buffersize];

        /*
        Please note that some of the following code is adapted from
        RomVault. This is a modified version of how RomVault does
        threaded hashing. As such, some of the terminology and code
        is the same, though variable names and comments may have
        been tweaked to better fit this code base.
        */

        // Pre load the buffer
        int next = buffersize > size ? (int)size : buffersize;
        int current = input.Read(buffer, 0, next);

        long refsize = size;
        while (refsize > 0)
        {
            // A read of zero bytes means the stream ended before the
            // requested size was reached; stop instead of spinning forever
            if (current <= 0)
                break;

            // Run hashes in parallel over the bytes just read
            Parallel.ForEach(hashers, Globals.ParallelOptions, h => h.Process(buffer, current));

            // Load the next buffer
            refsize -= current;
            next = buffersize > refsize ? (int)refsize : buffersize;

            if (next > 0)
                current = input.Read(buffer, 0, next);
        }

        // Finalize all hashing helpers
        Parallel.ForEach(hashers, Globals.ParallelOptions, h => h.Terminate());

        // Get the results; a hash that was not requested is left null
        BaseFile baseFile = new BaseFile()
        {
            Size = size,
            CRC = hashes.HasFlag(Hash.CRC) ? hashers.First(h => h.HashType == Hash.CRC).GetHash() : null,
            MD5 = hashes.HasFlag(Hash.MD5) ? hashers.First(h => h.HashType == Hash.MD5).GetHash() : null,
            SHA1 = hashes.HasFlag(Hash.SHA1) ? hashers.First(h => h.HashType == Hash.SHA1).GetHash() : null,
            SHA256 = hashes.HasFlag(Hash.SHA256) ? hashers.First(h => h.HashType == Hash.SHA256).GetHash() : null,
            SHA384 = hashes.HasFlag(Hash.SHA384) ? hashers.First(h => h.HashType == Hash.SHA384).GetHash() : null,
            SHA512 = hashes.HasFlag(Hash.SHA512) ? hashers.First(h => h.HashType == Hash.SHA512).GetHash() : null,
            SpamSum = hashes.HasFlag(Hash.SpamSum) ? hashers.First(h => h.HashType == Hash.SpamSum).GetHash() : null,
        };

        return baseFile;
    }
    catch (IOException ex)
    {
        LoggerImpl.Warning(ex, "An exception occurred during hashing.");
        return new BaseFile();
    }
    finally
    {
        // Dispose of the hashers, whether or not hashing succeeded
        hashers.ForEach(h => h.Dispose());

        // Either release the stream or hand it back to the caller via SeekIfPossible
        if (!keepReadOpen)
            input.Dispose();
        else
            input.SeekIfPossible();
    }
}
2020-12-10 22:16:53 -08:00
/// <summary>
/// Extensions (as produced by GetNormalizedExtension) that are treated as
/// archive or disk-image candidates
/// </summary>
private static readonly HashSet<string> ArchiveExtensions = new HashSet<string>
{
    // Aaruformat
    "aaru",
    "aaruf",
    "aaruformat",
    "aif",
    "dicf",

    // Archives
    "7z",
    "gz",
    "lzma",
    "rar",
    "rev",
    "r00",
    "r01",
    "tar",
    "tgz",
    "tlz",
    "zip",
    "zipx",

    // CHD
    "chd",
};

/// <summary>
/// Get if the given path has a valid archive extension
/// </summary>
/// <param name="path">Path to check</param>
/// <returns>True if the extension is valid, false otherwise</returns>
private static bool HasValidArchiveExtension(string path)
{
    // Get the extension from the path, if possible
    string normalized = path.GetNormalizedExtension();

    // Exact match against the known extensions; anything else (including
    // a null extension) is rejected
    return ArchiveExtensions.Contains(normalized);
}
2020-12-07 22:32:37 -08:00
#endregion
2019-02-08 20:51:44 -08:00
}
2018-02-15 22:06:20 -08:00
}