Add archive.org deserialization test, fix issues

This commit is contained in:
Matt Nadareski
2023-07-12 17:21:19 -04:00
parent 225be86248
commit a351a02ee1
6 changed files with 336 additions and 8 deletions

View File

@@ -49,18 +49,56 @@ namespace SabreTools.Models.ArchiveDotOrg
[XmlElement("rotation")]
public long? Rotation { get; set; }
[XmlElement("hocr_char_to_word_module_version")]
public string? hOCRCharToWordModuleVersion { get; set; }
#region OCR-Related
[XmlElement("cloth_cover_detection_module_version")]
public string? ClothCoverDetectionModuleVersion { get; set; }
[XmlElement("hocr_char_to_word_hocr_version")]
public string? hOCRCharToWordhOCRVersion { get; set; }
[XmlElement("ocr_module_version")]
public string? TesseractOCRModuleVersion { get; set; }
[XmlElement("hocr_char_to_word_module_version")]
public string? hOCRCharToWordModuleVersion { get; set; }
[XmlElement("hocr_fts_text_hocr_version")]
public string? hOCRFtsTexthOCRVersion { get; set; }
[XmlElement("hocr_fts_text_module_version")]
public string? hOCRFtsTextModuleVersion { get; set; }
[XmlElement("hocr_pageindex_hocr_version")]
public string? hOCRPageIndexhOCRVersion { get; set; }
[XmlElement("hocr_pageindex_module_version")]
public string? hOCRPageIndexModuleVersion { get; set; }
[XmlElement("ocr")]
public string? TesseractOCR { get; set; }
[XmlElement("ocr_converted")]
public string? TesseractOCRConverted { get; set; }
[XmlElement("ocr_detected_lang")]
public string? TesseractOCRDetectedLang { get; set; }
[XmlElement("ocr_detected_lang_conf")]
public string? TesseractOCRDetectedLangConf { get; set; }
[XmlElement("ocr_detected_script")]
public string? TesseractOCRDetectedScript { get; set; }
[XmlElement("ocr_detected_script_conf")]
public string? TesseractOCRDetectedScriptConf { get; set; }
[XmlElement("ocr_parameters")]
public string? TesseractOCRParameters { get; set; }
[XmlElement("ocr_module_version")]
public string? TesseractOCRModuleVersion { get; set; }
[XmlElement("pdf_module_version")]
public string? PDFModuleVersion { get; set; }
[XmlElement("word_conf_0_10")]
public long? WordConfidenceInterval0To10 { get; set; }
@@ -90,5 +128,19 @@ namespace SabreTools.Models.ArchiveDotOrg
[XmlElement("word_conf_91_100")]
public long? WordConfidenceInterval91To100 { get; set; }
#endregion
#region DO NOT USE IN PRODUCTION
/// <remarks>Should be empty</remarks>
[XmlAnyAttribute]
public XmlAttribute[]? ADDITIONAL_ATTRIBUTES { get; set; }
/// <remarks>Should be empty</remarks>
[XmlAnyElement]
public object[]? ADDITIONAL_ELEMENTS { get; set; }
#endregion
}
}

View File

@@ -6,7 +6,19 @@ namespace SabreTools.Models.ArchiveDotOrg
/// <summary>
/// Model for the root <c>files</c> element of an archive.org file listing.
/// </summary>
[XmlRoot("files")]
public class Files
{
    /// <summary>
    /// Entries deserialized from the child <c>file</c> elements.
    /// </summary>
    /// <remarks>
    /// The element name is spelled out because an unnamed <c>XmlElement</c> attribute
    /// falls back to the member name (<c>File</c>), and XML element names are
    /// case-sensitive, so the lowercase <c>file</c> elements would not be matched.
    /// </remarks>
    [XmlElement("file")]
    public File[]? File { get; set; }

    #region DO NOT USE IN PRODUCTION

    // The members below collect any attributes or elements that have no
    // dedicated member on this type, so tests can detect unmapped content.

    /// <remarks>Should be empty</remarks>
    [XmlAnyAttribute]
    public XmlAttribute[]? ADDITIONAL_ATTRIBUTES { get; set; }

    /// <remarks>Should be empty</remarks>
    [XmlAnyElement]
    public object[]? ADDITIONAL_ELEMENTS { get; set; }

    #endregion
}
}

View File

@@ -1,8 +1,6 @@
using System;
using System.IO;
using SabreTools.DatFiles;
using SabreTools.DatTools;
using Xunit;
namespace SabreTools.Test.DatTools
@@ -43,7 +41,7 @@ namespace SabreTools.Test.DatTools
if (filename != null)
filename = Path.Combine(Environment.CurrentDirectory, "TestData", filename);
var datFile = Parser.CreateAndParse(filename, throwOnError: true);
var datFile = SabreTools.DatTools.Parser.CreateAndParse(filename, throwOnError: true);
Assert.Equal(datFormat, datFile.Header.DatFormat);
Assert.Equal(totalCount, datFile.Items.TotalCount);
}

View File

@@ -0,0 +1,37 @@
using System;
using System.Xml.Serialization;
using Xunit;
namespace SabreTools.Test.Parser
{
/// <summary>
/// Deserialization checks for the XML-backed models.
/// </summary>
public class SerializationTests
{
    /// <summary>
    /// Reads the archive.org files listing from the TestData folder and verifies that
    /// it deserializes completely, with nothing left over in the catch-all members.
    /// </summary>
    [Fact]
    public void ArchiveDotOrgDeserializeTest()
    {
        // Resolve the fixture relative to the current working directory and open it
        string testDataDirectory = System.IO.Path.Combine(Environment.CurrentDirectory, "TestData");
        string fixturePath = System.IO.Path.Combine(testDataDirectory, "test-archivedotorg-files.xml");
        using var input = System.IO.File.OpenRead(fixturePath);

        // Build the serializer for the root model and read the whole document
        var xmlSerializer = new XmlSerializer(typeof(Models.ArchiveDotOrg.Files));
        var parsed = xmlSerializer.Deserialize(input) as Models.ArchiveDotOrg.Files;

        // The root and its entries must exist, with the expected entry count
        Assert.NotNull(parsed);
        Assert.NotNull(parsed.File);
        var entries = parsed.File;
        Assert.Equal(22, entries.Length);

        // Nothing on the root may have fallen through to the catch-all members
        Assert.Null(parsed.ADDITIONAL_ATTRIBUTES);
        Assert.Null(parsed.ADDITIONAL_ELEMENTS);

        // The same must hold for every individual entry
        for (int index = 0; index < entries.Length; index++)
        {
            var entry = entries[index];
            Assert.Null(entry.ADDITIONAL_ATTRIBUTES);
            Assert.Null(entry.ADDITIONAL_ELEMENTS);
        }
    }
}
}

View File

@@ -13,6 +13,7 @@
<ProjectReference Include="..\SabreTools.FileTypes\SabreTools.FileTypes.csproj" />
<ProjectReference Include="..\SabreTools.Filtering\SabreTools.Filtering.csproj" />
<ProjectReference Include="..\SabreTools.IO\SabreTools.IO.csproj" />
<ProjectReference Include="..\SabreTools.Models\SabreTools.Models.csproj" />
<ProjectReference Include="..\SabreTools.Skippers\SabreTools.Skippers.csproj" />
</ItemGroup>

View File

@@ -0,0 +1,228 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Source: https://ia902607.us.archive.org/4/items/adventuresofsher00doylrich/adventuresofsher00doylrich_files.xml
-->
<files>
<file name="__ia_thumb.jpg" source="original">
<mtime>1657820174</mtime>
<size>23393</size>
<md5>33ca910055358d18e116ea66a0abdf04</md5>
<crc32>aad0f2dc</crc32>
<sha1>fc6d673ef7fbe974bba88b8275c737eeb20b5ba8</sha1>
<format>Item Tile</format>
<rotation>0</rotation>
</file>
<file name="_cloth_detection.log" source="derivative">
<cloth_cover_detection_module_version>1.2</cloth_cover_detection_module_version>
<format>Cloth Cover Detection Log</format>
<original>scandata.zip</original>
<mtime>1682523106</mtime>
<size>717</size>
<md5>3ed8a80448263ebfad242b8fa5a1a18d</md5>
<crc32>d886dce1</crc32>
<sha1>4f8d103cf07c29e1d3bb8273870f1daca4a89f26</sha1>
</file>
<file name="adventuresofsher00doylrich.djvu" source="derivative">
<format>DjVu</format>
<original>adventuresofsher00doylrich_djvu.xml</original>
<mtime>1288609616</mtime>
<size>11176526</size>
<md5>52ccad905110bc7f3c0fe032ef744fa1</md5>
<crc32>91973508</crc32>
<sha1>aa5214400b60e3b68f5eaa9733e10af4bcfe4e8d</sha1>
</file>
<file name="adventuresofsher00doylrich.gif" source="derivative">
<format>Animated GIF</format>
<original>adventuresofsher00doylrich_jp2.zip</original>
<mtime>1288593173</mtime>
<size>334379</size>
<md5>4d56403c12771ed7e396f6187d95502d</md5>
<crc32>1a8585c1</crc32>
<sha1>6062155959dd69f279a489b3761afa0a1638173d</sha1>
</file>
<file name="adventuresofsher00doylrich.pdf" source="derivative">
<pdf_module_version>0.0.22</pdf_module_version>
<format>Text PDF</format>
<original>adventuresofsher00doylrich_page_numbers.json</original>
<mtime>1682525473</mtime>
<size>26126569</size>
<md5>98ad55b9989671f823228055bfbca465</md5>
<crc32>cdc5f40d</crc32>
<sha1>e358dd22e04f7efcd4dfc8a6c2d714a0e0a8297c</sha1>
</file>
<file name="adventuresofsher00doylrich_archive.torrent" source="metadata">
<btih>4b5bbf39f9e489f66e0c70a9ac45af495b31a2ba</btih>
<mtime>1688944719</mtime>
<size>17197</size>
<md5>91aa4822d1552ef8bd08b27b14d8af4d</md5>
<crc32>10a3d397</crc32>
<sha1>9f5d4b652ac6903e218c8ff8e854482ab3d31b99</sha1>
<format>Archive BitTorrent</format>
</file>
<file name="adventuresofsher00doylrich_chocr.html.gz" source="derivative">
<ocr>tesseract 5.3.0-3-g9920</ocr>
<ocr_parameters>lang-eng;two-pass-disabled;pass-dpi-disabled;autonomous-mode-disabled;binarisation-method-otsu</ocr_parameters>
<ocr_module_version>0.0.21</ocr_module_version>
<ocr_detected_script>Latin</ocr_detected_script>
<ocr_detected_script_conf>0.8867</ocr_detected_script_conf>
<ocr_detected_lang>en</ocr_detected_lang>
<ocr_detected_lang_conf>1.0000</ocr_detected_lang_conf>
<format>chOCR</format>
<original>adventuresofsher00doylrich_jp2.zip</original>
<mtime>1682523054</mtime>
<size>8326755</size>
<md5>cda085dadcc74ca4bbeb768b69e7b28b</md5>
<crc32>faa3bbc1</crc32>
<sha1>0ceffd2d58757e30ccfcc84ff6805093e32607b3</sha1>
</file>
<file name="adventuresofsher00doylrich_dc.xml" source="original">
<format>Dublin Core</format>
<mtime>1682518166</mtime>
<size>1712</size>
<md5>bb3294746c4d5bb8102f8426454b26fb</md5>
<crc32>7c20c9c9</crc32>
<sha1>2bc134939b5f044396c56af740ad55029978859e</sha1>
</file>
<file name="adventuresofsher00doylrich_djvu.txt" source="derivative">
<format>DjVuTXT</format>
<original>adventuresofsher00doylrich_djvu.xml</original>
<mtime>1682523254</mtime>
<size>613882</size>
<md5>022b57b4017bbd8e6564c6e0968ad063</md5>
<crc32>2828f839</crc32>
<sha1>1535e2f509fc3a81c377d2b7270d21a95af71d65</sha1>
</file>
<file name="adventuresofsher00doylrich_djvu.xml" source="derivative">
<format>Djvu XML</format>
<original>adventuresofsher00doylrich_hocr.html</original>
<mtime>1682523185</mtime>
<size>8593952</size>
<md5>6faee3cd4ba7b3eaca3a2f5f4e8414f6</md5>
<crc32>51713c61</crc32>
<sha1>16dc73c2bdce4705c37d86a798d83fb780929928</sha1>
</file>
<file name="adventuresofsher00doylrich_files.xml" source="original">
<format>Metadata</format>
<md5>f39ff302a7965a1a077f3c4962f5d2e9</md5>
<summation>md5</summation>
</file>
<file name="adventuresofsher00doylrich_hocr.html" source="derivative">
<hocr_char_to_word_module_version>1.1.0</hocr_char_to_word_module_version>
<hocr_char_to_word_hocr_version>1.1.15</hocr_char_to_word_hocr_version>
<ocr_parameters>lang-eng;two-pass-disabled;pass-dpi-disabled;autonomous-mode-disabled;binarisation-method-otsu</ocr_parameters>
<ocr_module_version>0.0.21</ocr_module_version>
<ocr_detected_script>Latin</ocr_detected_script>
<ocr_detected_script_conf>0.8867</ocr_detected_script_conf>
<ocr_detected_lang>en</ocr_detected_lang>
<ocr_detected_lang_conf>1.0000</ocr_detected_lang_conf>
<format>hOCR</format>
<original>adventuresofsher00doylrich_chocr.html.gz</original>
<mtime>1682523150</mtime>
<size>15785810</size>
<md5>afbb8a9c1e6c25b3755fa6345153b13d</md5>
<crc32>e79c31fd</crc32>
<sha1>8b7cd7fd2010544783277f502c4103310dc2e2c4</sha1>
</file>
<file name="adventuresofsher00doylrich_hocr_pageindex.json.gz" source="derivative">
<hocr_pageindex_module_version>1.0.0</hocr_pageindex_module_version>
<hocr_pageindex_hocr_version>1.1.15</hocr_pageindex_hocr_version>
<format>OCR Page Index</format>
<original>adventuresofsher00doylrich_hocr.html</original>
<mtime>1682523219</mtime>
<size>4232</size>
<md5>aec425c66ef34149cff679b8998c2a36</md5>
<crc32>9a6e2f76</crc32>
<sha1>cc91c1eb055652ab6c9de878c6b54e8b95bddad1</sha1>
</file>
<file name="adventuresofsher00doylrich_hocr_searchtext.txt.gz" source="derivative">
<hocr_fts_text_module_version>1.1.0</hocr_fts_text_module_version>
<hocr_fts_text_hocr_version>1.1.15</hocr_fts_text_hocr_version>
<word_conf_0_10>531</word_conf_0_10>
<word_conf_11_20>242</word_conf_11_20>
<word_conf_21_30>298</word_conf_21_30>
<word_conf_31_40>321</word_conf_31_40>
<word_conf_41_50>389</word_conf_41_50>
<word_conf_51_60>450</word_conf_51_60>
<word_conf_61_70>671</word_conf_61_70>
<word_conf_71_80>1059</word_conf_71_80>
<word_conf_81_90>3102</word_conf_81_90>
<word_conf_91_100>103350</word_conf_91_100>
<format>OCR Search Text</format>
<original>adventuresofsher00doylrich_hocr.html</original>
<mtime>1682523242</mtime>
<size>224594</size>
<md5>2e1011def53909a247f73af66420e0c3</md5>
<crc32>32dddf59</crc32>
<sha1>f303467ce8f109a1d9cf49f2038c9a1d43e3c63e</sha1>
</file>
<file name="adventuresofsher00doylrich_jp2.zip" source="derivative">
<format>Single Page Processed JP2 ZIP</format>
<original>adventuresofsher00doylrich_raw_jp2.zip</original>
<mtime>1288593129</mtime>
<size>203319748</size>
<md5>2acbe6b61d35d8bd093127f0cef724c9</md5>
<crc32>66bcf599</crc32>
<sha1>a8801760eec5442176281584bd6242868f81cde5</sha1>
<filecount>364</filecount>
</file>
<file name="adventuresofsher00doylrich_marc.xml" source="original">
<format>MARC</format>
<md5>e951175fb1ee03c706ab6ee34754a92c</md5>
<mtime>1682518056</mtime>
<size>7679</size>
<crc32>3538b620</crc32>
<sha1>8fd493fa0971c2f7c2fdb1d84e0ec4234b878ff4</sha1>
</file>
<file name="adventuresofsher00doylrich_meta.xml" source="original">
<format>Metadata</format>
<mtime>1682525557</mtime>
<size>3790</size>
<md5>e98cb44acdb4e4f070fddb8b1f328d93</md5>
<crc32>04ee4346</crc32>
<sha1>c687fc00d20b516b0cf19225296e984cbfe7c315</sha1>
</file>
<file name="adventuresofsher00doylrich_metasource.xml" source="original">
<format>MARC Source</format>
<md5>7099c2013db04ef8a7277386ce74de54</md5>
<mtime>1682518056</mtime>
<size>527</size>
<crc32>6fa5d977</crc32>
<sha1>34ed3e366ed809a1a39a265373b9cf5b8cc06f0c</sha1>
</file>
<file name="adventuresofsher00doylrich_page_numbers.json" source="derivative">
<format>Page Numbers JSON</format>
<original>adventuresofsher00doylrich_djvu.xml</original>
<mtime>1682523259</mtime>
<size>67337</size>
<md5>f9179733924fa4566f0e9a05c8c276dd</md5>
<crc32>2bd8369f</crc32>
<sha1>e4104152caa990c72f986eb63c660239f0111feb</sha1>
</file>
<file name="adventuresofsher00doylrich_raw_jp2.zip" source="original">
<format>Single Page Raw JP2 ZIP</format>
<md5>d2a1137d4f8fed61be945f878444923b</md5>
<mtime>1204986411</mtime>
<size>370667543</size>
<crc32>12eda51c</crc32>
<sha1>402b01b5a734ae4f857991c0977ba629a8606932</sha1>
<filecount>364</filecount>
</file>
<file name="adventuresofsher00doylrich_reviews.xml" source="original">
<mtime>1688944715</mtime>
<size>9412</size>
<md5>ff0c9653a7aa865ea01e032e8b657736</md5>
<crc32>7634dbbc</crc32>
<sha1>f93a8206d6836f08c19dd023eb552bed2106cad9</sha1>
<format>Metadata</format>
</file>
<file name="scandata.zip" source="original">
<format>Scribe Scandata ZIP</format>
<md5>506fdf9aa054fbc7e9e2577d05d5e7cf</md5>
<mtime>1682523105</mtime>
<size>55175976</size>
<crc32>232ce248</crc32>
<sha1>9adf53bc1b8e7be5afe3526d2ab64f9696058488</sha1>
<filecount>11</filecount>
</file>
</files>