From 20c2a547d1fc1593eb1b276fe865b937d66dddf0 Mon Sep 17 00:00:00 2001 From: KoalaBear Date: Sun, 12 Sep 2021 15:00:21 +0200 Subject: [PATCH] - Also allow fallback for table - Add CheckParents and checkParents everywhere - Change tests for new behavior - Add support for another format + test --- .../DirectoryParser026_050Tests.cs | 8 +- .../DirectoryParser051_075Tests.cs | 4 +- .../DirectoryParser076_100Tests.cs | 8 +- .../DirectoryParser101_125Tests.cs | 15 + .../DirectoryParserTests.cs | 4 +- .../Samples/DirectoryListing123a.html.dat | 760 ++++++++++++++++++ .../DirectoryParser.cs | 161 ++-- .../OpenDirectoryIndexer.cs | 45 +- 8 files changed, 907 insertions(+), 98 deletions(-) create mode 100644 src/OpenDirectoryDownloader.Tests/Samples/DirectoryListing123a.html.dat diff --git a/src/OpenDirectoryDownloader.Tests/DirectoryParser026_050Tests.cs b/src/OpenDirectoryDownloader.Tests/DirectoryParser026_050Tests.cs index 7bb797b0..46515f16 100644 --- a/src/OpenDirectoryDownloader.Tests/DirectoryParser026_050Tests.cs +++ b/src/OpenDirectoryDownloader.Tests/DirectoryParser026_050Tests.cs @@ -443,9 +443,9 @@ public async Task TestDirectoryListing40aAsync() [Fact] public async Task TestDirectoryListing40bAsync() { - WebDirectory webDirectory = await ParseHtml(GetSample()); + WebDirectory webDirectory = await ParseHtml(GetSample(), "http://www.funreading.com.hk/primary/computer/download/"); - Assert.Equal("ROOT", webDirectory.Name); + Assert.Equal("download", webDirectory.Name); Assert.Empty(webDirectory.Subdirectories); Assert.Empty(webDirectory.Files); } @@ -686,7 +686,7 @@ public async Task TestDirectoryListing47bAsync() [Fact] public async Task TestDirectoryListing48aAsync() { - WebDirectory webDirectory = await ParseHtml(GetSample()); + WebDirectory webDirectory = await ParseHtml(GetSample(), checkParents: false); Assert.Equal("ROOT", webDirectory.Name); Assert.Single(webDirectory.Subdirectories); @@ -702,7 +702,7 @@ public async Task TestDirectoryListing48aAsync() 
[Fact] public async Task TestDirectoryListing48bAsync() { - WebDirectory webDirectory = await ParseHtml(GetSample()); + WebDirectory webDirectory = await ParseHtml(GetSample(), checkParents: false); Assert.Equal("ROOT", webDirectory.Name); Assert.Empty(webDirectory.Subdirectories); diff --git a/src/OpenDirectoryDownloader.Tests/DirectoryParser051_075Tests.cs b/src/OpenDirectoryDownloader.Tests/DirectoryParser051_075Tests.cs index 41187256..6ace2520 100644 --- a/src/OpenDirectoryDownloader.Tests/DirectoryParser051_075Tests.cs +++ b/src/OpenDirectoryDownloader.Tests/DirectoryParser051_075Tests.cs @@ -637,7 +637,7 @@ public async Task TestDirectoryListing71bAsync() [Fact] public async Task TestDirectoryListing72aAsync() { - WebDirectory webDirectory = await ParseHtml(GetSample()); + WebDirectory webDirectory = await ParseHtml(GetSample(), checkParents: false); Assert.Equal("ROOT", webDirectory.Name); Assert.Equal(9, webDirectory.Subdirectories.Count); @@ -653,7 +653,7 @@ public async Task TestDirectoryListing72aAsync() [Fact] public async Task TestDirectoryListing72bAsync() { - WebDirectory webDirectory = await ParseHtml(GetSample()); + WebDirectory webDirectory = await ParseHtml(GetSample(), checkParents: false); Assert.Equal("ROOT", webDirectory.Name); Assert.Single(webDirectory.Subdirectories); diff --git a/src/OpenDirectoryDownloader.Tests/DirectoryParser076_100Tests.cs b/src/OpenDirectoryDownloader.Tests/DirectoryParser076_100Tests.cs index 7ae9c4f2..4726edda 100644 --- a/src/OpenDirectoryDownloader.Tests/DirectoryParser076_100Tests.cs +++ b/src/OpenDirectoryDownloader.Tests/DirectoryParser076_100Tests.cs @@ -45,7 +45,7 @@ public async Task TestDirectoryListing75bAsync() [Fact] public async Task TestDirectoryListing76aAsync() { - WebDirectory webDirectory = await ParseHtml(GetSample()); + WebDirectory webDirectory = await ParseHtml(GetSample(), checkParents: false); Assert.Equal("ROOT", webDirectory.Name); Assert.Empty(webDirectory.Subdirectories); @@ -60,7 +60,7 
@@ public async Task TestDirectoryListing76aAsync() [Fact] public async Task TestDirectoryListing77aAsync() { - WebDirectory webDirectory = await ParseHtml(GetSample()); + WebDirectory webDirectory = await ParseHtml(GetSample(), checkParents: false); Assert.Equal("ROOT", webDirectory.Name); Assert.Empty(webDirectory.Subdirectories); @@ -214,7 +214,7 @@ public async Task TestDirectoryListing82bAsync() [Fact] public async Task TestDirectoryListing83aAsync() { - WebDirectory webDirectory = await ParseHtml(GetSample()); + WebDirectory webDirectory = await ParseHtml(GetSample(), checkParents: false); Assert.Equal("ROOT", webDirectory.Name); Assert.Empty(webDirectory.Subdirectories); @@ -411,7 +411,7 @@ public async Task TestDirectoryListing89cAsync() WebDirectory webDirectory = await ParseHtml(GetSample()); Assert.Equal("ROOT", webDirectory.Name); - Assert.Equal(4, webDirectory.Subdirectories.Count); + Assert.Equal(3, webDirectory.Subdirectories.Count); Assert.Equal("本子", webDirectory.Subdirectories[0].Name); Assert.Empty(webDirectory.Files); } diff --git a/src/OpenDirectoryDownloader.Tests/DirectoryParser101_125Tests.cs b/src/OpenDirectoryDownloader.Tests/DirectoryParser101_125Tests.cs index ba12154a..cd0b30f5 100644 --- a/src/OpenDirectoryDownloader.Tests/DirectoryParser101_125Tests.cs +++ b/src/OpenDirectoryDownloader.Tests/DirectoryParser101_125Tests.cs @@ -626,5 +626,20 @@ public async Task TestDirectoryListing122bAsync() Assert.Equal("acjp_hayden01.ram", webDirectory.Files[0].FileName); Assert.Equal(Constants.NoFileSize, webDirectory.Files[0].FileSize); } + + /// + /// Url: http://www.diggerhistory.info/images/uniforms4/ + /// + [Fact] + public async Task TestDirectoryListing123aAsync() + { + WebDirectory webDirectory = await ParseHtml(GetSample()); + + Assert.Equal("ROOT", webDirectory.Name); + Assert.Empty(webDirectory.Subdirectories); + Assert.Equal(588, webDirectory.Files.Count); + Assert.Equal("101-engr.jpg", webDirectory.Files[0].FileName); + 
Assert.Equal(Constants.NoFileSize, webDirectory.Files[0].FileSize); + } } } \ No newline at end of file diff --git a/src/OpenDirectoryDownloader.Tests/DirectoryParserTests.cs b/src/OpenDirectoryDownloader.Tests/DirectoryParserTests.cs index ea81c991..b9124f7f 100644 --- a/src/OpenDirectoryDownloader.Tests/DirectoryParserTests.cs +++ b/src/OpenDirectoryDownloader.Tests/DirectoryParserTests.cs @@ -46,9 +46,9 @@ public static void CleanWebDirectory(WebDirectory webDirectory, Uri testedUri) }).ToList().ForEach(wd => webDirectory.Files.Remove(wd)); } - public static async Task ParseHtml(string html, string url = "http://localhost/") + public static async Task ParseHtml(string html, string url = "http://localhost/", bool checkParents = true) { - return await DirectoryParser.ParseHtml(new WebDirectory(null) { Url = url }, html); + return await DirectoryParser.ParseHtml(new WebDirectory(null) { Url = url }, html, checkParents: checkParents); } } } diff --git a/src/OpenDirectoryDownloader.Tests/Samples/DirectoryListing123a.html.dat b/src/OpenDirectoryDownloader.Tests/Samples/DirectoryListing123a.html.dat new file mode 100644 index 00000000..8cc6be13 --- /dev/null +++ b/src/OpenDirectoryDownloader.Tests/Samples/DirectoryListing123a.html.dat @@ -0,0 +1,760 @@ + + + + + Index of /images/uniforms4 + + + +
+ + + + + +
+ + + + + + + + +
+ + + + + + +
+

  Unofficial history of the Australian + & New Zealand Armed + Services 

+

+  Search  &  Help Recruits Military History Hall of Heroes Indigenous Slouch hat + ARMY Today Uniforms + + + Badges

+

+  Colours & Flags + + + Weapons Food Equipment Assorted Medals Armour + + + Navy Air Power 

+

+ + Nurses - Medical Tributes Poetry - Music Posters & Signs Leaders The Enemy Humour Links Killing Anzac

+

+
+ +
+

Index of /images/uniforms4

+ +
Apache/2.2.10 (Unix) mod_ssl/2.2.10 OpenSSL/0.9.8e-fips-rhel5 mod_auth_passthrough/2.1 mod_bwlimited/1.4 FrontPage/5.0.2.2635 mod_perl/2.0.4 Perl/v5.8.8 Server at www.diggerhistory.info Port 80
+ + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + + + +
+
+
+

+ + + + + + + + + + + + + + + +
+

+ + Statistics + : Over 35 + million page visitors + since +  11 + Nov 2002  

+

 

+

+ Email  

+  Search  +  Help    + +  Guestbook  +  Get + Updates  +  Last + Post   +  The + Ode    +   + + FAQ    +  Digger Forum + +

+ + Click for news

+ + + + + +
+

+

+
+ + + + +
+

+ Digger History:  + an unofficial history of + the + Australian & + New Zealand + Armed Forces +

+
+ + + diff --git a/src/OpenDirectoryDownloader/DirectoryParser.cs b/src/OpenDirectoryDownloader/DirectoryParser.cs index e6360dd0..3abedb59 100644 --- a/src/OpenDirectoryDownloader/DirectoryParser.cs +++ b/src/OpenDirectoryDownloader/DirectoryParser.cs @@ -30,7 +30,7 @@ public static class DirectoryParser /// Base url /// Html to parse /// WebDirectory object containing current directory index - public static async Task ParseHtml(WebDirectory webDirectory, string html, HttpClient httpClient = null) + public static async Task ParseHtml(WebDirectory webDirectory, string html, HttpClient httpClient = null, bool checkParents = true) { string baseUrl = webDirectory.Url; @@ -51,7 +51,7 @@ public static async Task ParseHtml(WebDirectory webDirectory, stri if (webDirectory.Uri.Host == "ipfs.io" || webDirectory.Uri.Host == "gateway.ipfs.io") { - return ParseIpfsDirectoryListing(baseUrl, parsedWebDirectory, htmlDocument); + return ParseIpfsDirectoryListing(baseUrl, parsedWebDirectory, htmlDocument, checkParents); } if (webDirectory.Uri.Host == Constants.BlitzfilesTechDomain) @@ -111,14 +111,14 @@ public static async Task ParseHtml(WebDirectory webDirectory, stri if (directoryListingDotComlistItems.Any()) { - return ParseDirectoryListingDoctComDirectoryListing(baseUrl, parsedWebDirectory, directoryListingDotComlistItems); + return ParseDirectoryListingDoctComDirectoryListing(baseUrl, parsedWebDirectory, directoryListingDotComlistItems, checkParents); } IHtmlCollection h5aiTableRows = htmlDocument.QuerySelectorAll("#fallback table tr"); if (h5aiTableRows.Any()) { - return ParseH5aiDirectoryListing(baseUrl, parsedWebDirectory, h5aiTableRows); + return ParseH5aiDirectoryListing(baseUrl, parsedWebDirectory, h5aiTableRows, checkParents); } // Snif directory listing @@ -127,7 +127,7 @@ public static async Task ParseHtml(WebDirectory webDirectory, stri if (snifTableRows.Any()) { - return ParseSnifDirectoryListing(baseUrl, parsedWebDirectory, snifTableRows); + return 
ParseSnifDirectoryListing(baseUrl, parsedWebDirectory, snifTableRows, checkParents); } // Godir - https://gitlab.com/Montessquio/godir @@ -135,7 +135,7 @@ public static async Task ParseHtml(WebDirectory webDirectory, stri if (pureTableRows.Any()) { - return ParsePureDirectoryListing(ref baseUrl, parsedWebDirectory, htmlDocument, pureTableRows); + return ParsePureDirectoryListing(ref baseUrl, parsedWebDirectory, htmlDocument, pureTableRows, checkParents); } // Remove it after ParsePureDirectoryListing (.breadcrumb is used in it) @@ -146,7 +146,7 @@ public static async Task ParseHtml(WebDirectory webDirectory, stri if (divElements.Any()) { - return ParseCustomDivListing(ref baseUrl, parsedWebDirectory, htmlDocument, divElements); + return ParseCustomDivListing(ref baseUrl, parsedWebDirectory, htmlDocument, divElements, checkParents); } // Custom directory listing 2 @@ -154,14 +154,14 @@ public static async Task ParseHtml(WebDirectory webDirectory, stri if (divElements2.Any()) { - return ParseCustomDivListing2(ref baseUrl, parsedWebDirectory, htmlDocument, divElements2); + return ParseCustomDivListing2(ref baseUrl, parsedWebDirectory, htmlDocument, divElements2, checkParents); } IHtmlCollection pres = htmlDocument.QuerySelectorAll("pre"); if (pres.Any()) { - WebDirectory result = await ParsePreDirectoryListing(baseUrl, parsedWebDirectory, pres); + WebDirectory result = await ParsePreDirectoryListing(baseUrl, parsedWebDirectory, pres, checkParents); if (result.Files.Any() || result.Subdirectories.Any() || result.Error) { @@ -180,26 +180,31 @@ public static async Task ParseHtml(WebDirectory webDirectory, stri if (tables.Any()) { - return ParseTablesDirectoryListing(baseUrl, parsedWebDirectory, tables); + WebDirectory result = ParseTablesDirectoryListing(baseUrl, parsedWebDirectory, tables, checkParents); + + if (result.Files.Any() || result.Subdirectories.Any() || result.Error) + { + return result; + } } IHtmlCollection materialDesignListItems = 
htmlDocument.QuerySelectorAll("ul.mdui-list li"); if (materialDesignListItems.Any()) { - return ParseMaterialDesignListItemsDirectoryListing(baseUrl, parsedWebDirectory, materialDesignListItems); + return ParseMaterialDesignListItemsDirectoryListing(baseUrl, parsedWebDirectory, materialDesignListItems, checkParents); } if (htmlDocument.QuerySelectorAll("#content ul#file-list li").Length == 2) { - return ParseDirectoryListerDirectoryListing(baseUrl, parsedWebDirectory, htmlDocument); + return ParseDirectoryListerDirectoryListing(baseUrl, parsedWebDirectory, htmlDocument, checkParents); } IHtmlCollection listItems = htmlDocument.QuerySelectorAll(".list-group li"); if (listItems.Any()) { - WebDirectory result = ParseListItemsDirectoryListing(baseUrl, parsedWebDirectory, listItems); + WebDirectory result = ParseListItemsDirectoryListing(baseUrl, parsedWebDirectory, listItems, checkParents); if (result.ParsedSuccessfully || result.Error) { @@ -211,7 +216,7 @@ public static async Task ParseHtml(WebDirectory webDirectory, stri if (listItems.Any()) { - WebDirectory result = ParseListItemsDirectoryListing(baseUrl, parsedWebDirectory, listItems); + WebDirectory result = ParseListItemsDirectoryListing(baseUrl, parsedWebDirectory, listItems, checkParents); if (result.ParsedSuccessfully || result.Error) { @@ -224,12 +229,12 @@ public static async Task ParseHtml(WebDirectory webDirectory, stri if (links.Any()) { - parsedWebDirectory = ParseLinksDirectoryListing(baseUrl, parsedWebDirectory, links); + parsedWebDirectory = ParseLinksDirectoryListing(baseUrl, parsedWebDirectory, links, checkParents); } parsedWebDirectory = await ParseDirectoryListingModel01(baseUrl, parsedWebDirectory, htmlDocument, httpClient); - CheckParsedResults(parsedWebDirectory); + CheckParsedResults(parsedWebDirectory, baseUrl, checkParents); return parsedWebDirectory; } @@ -246,7 +251,7 @@ public static async Task ParseHtml(WebDirectory webDirectory, stri parsedWebDirectory.Error = true; } - 
CheckParsedResults(parsedWebDirectory); + CheckParsedResults(parsedWebDirectory, baseUrl, checkParents); return parsedWebDirectory; } @@ -370,7 +375,7 @@ private static WebDirectory ConvertDirectoryListingModel01(string baseUrl, WebDi return webDirectory; } - private static WebDirectory ParseCustomDivListing(ref string baseUrl, WebDirectory parsedWebDirectory, IHtmlDocument htmlDocument, IHtmlCollection divElements) + private static WebDirectory ParseCustomDivListing(ref string baseUrl, WebDirectory parsedWebDirectory, IHtmlDocument htmlDocument, IHtmlCollection divElements, bool checkParents) { foreach (IElement divElement in divElements) { @@ -413,12 +418,12 @@ private static WebDirectory ParseCustomDivListing(ref string baseUrl, WebDirecto } } - CheckParsedResults(parsedWebDirectory); + CheckParsedResults(parsedWebDirectory, baseUrl, checkParents); return parsedWebDirectory; } - private static WebDirectory ParseCustomDivListing2(ref string baseUrl, WebDirectory parsedWebDirectory, IHtmlDocument htmlDocument, IHtmlCollection divElements) + private static WebDirectory ParseCustomDivListing2(ref string baseUrl, WebDirectory parsedWebDirectory, IHtmlDocument htmlDocument, IHtmlCollection divElements, bool checkParents) { foreach (IElement divElement in divElements) { @@ -461,12 +466,12 @@ private static WebDirectory ParseCustomDivListing2(ref string baseUrl, WebDirect } } - CheckParsedResults(parsedWebDirectory); + CheckParsedResults(parsedWebDirectory, baseUrl, checkParents); return parsedWebDirectory; } - private static WebDirectory ParseIpfsDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlDocument htmlDocument) + private static WebDirectory ParseIpfsDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlDocument htmlDocument, bool checkParents) { foreach (IElement tableRow in htmlDocument.QuerySelectorAll("table tr")) { @@ -500,12 +505,12 @@ private static WebDirectory ParseIpfsDirectoryListing(string baseUrl, WebDirecto } } 
- CheckParsedResults(parsedWebDirectory); + CheckParsedResults(parsedWebDirectory, baseUrl, checkParents); return parsedWebDirectory; } - private static WebDirectory ParseDirectoryListingDoctComDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlCollection listItems) + private static WebDirectory ParseDirectoryListingDoctComDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlCollection listItems, bool checkParents) { foreach (IElement listItem in listItems) { @@ -542,12 +547,12 @@ private static WebDirectory ParseDirectoryListingDoctComDirectoryListing(string } } - CheckParsedResults(parsedWebDirectory); + CheckParsedResults(parsedWebDirectory, baseUrl, checkParents); return parsedWebDirectory; } - private static WebDirectory ParseSnifDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlCollection snifTableRows) + private static WebDirectory ParseSnifDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlCollection snifTableRows, bool checkParents) { IElement table = snifTableRows.First().Parent("table"); @@ -594,12 +599,12 @@ private static WebDirectory ParseSnifDirectoryListing(string baseUrl, WebDirecto } } - CheckParsedResults(parsedWebDirectory); + CheckParsedResults(parsedWebDirectory, baseUrl, checkParents); return parsedWebDirectory; } - private static WebDirectory ParsePureDirectoryListing(ref string baseUrl, WebDirectory parsedWebDirectory, IHtmlDocument htmlDocument, IHtmlCollection pureTableRows) + private static WebDirectory ParsePureDirectoryListing(ref string baseUrl, WebDirectory parsedWebDirectory, IHtmlDocument htmlDocument, IHtmlCollection pureTableRows, bool checkParents) { string urlFromBreadcrumbs = Uri.EscapeUriString(string.Join("/", htmlDocument.QuerySelectorAll(".breadcrumbs_main .breadcrumb").Where(b => !b.ClassList.Contains("smaller")).Select(b => b.TextContent)) + "/"); @@ -669,12 +674,12 @@ private static WebDirectory ParsePureDirectoryListing(ref string baseUrl, 
WebDir Logger.Error($"Directory listing returns different directory than requested! Expected: {urlFromBaseUrl}, Actual: {urlFromBreadcrumbs}"); } - CheckParsedResults(parsedWebDirectory); + CheckParsedResults(parsedWebDirectory, baseUrl, checkParents); return parsedWebDirectory; } - private static WebDirectory ParseH5aiDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlCollection h5aiTableRows) + private static WebDirectory ParseH5aiDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlCollection h5aiTableRows, bool checkParents) { IElement table = h5aiTableRows.First().Parent("table"); @@ -724,12 +729,12 @@ private static WebDirectory ParseH5aiDirectoryListing(string baseUrl, WebDirecto } } - CheckParsedResults(parsedWebDirectory); + CheckParsedResults(parsedWebDirectory, baseUrl, checkParents); return parsedWebDirectory; } - private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlCollection tables) + private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlCollection tables, bool checkParents) { // Dirty solution.. 
bool hasSeperateDirectoryAndFilesTables = false; @@ -776,7 +781,7 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec { if (table.QuerySelector("a") != null) { - webDirectoryCopy = ParseLinksDirectoryListing(baseUrl, webDirectoryCopy, table.QuerySelectorAll("a")); + webDirectoryCopy = ParseLinksDirectoryListing(baseUrl, webDirectoryCopy, table.QuerySelectorAll("a"), checkParents); } } else @@ -946,7 +951,7 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec parsedWebDirectory.Files = new ConcurrentList(results.SelectMany(r => r.Files)); } - CheckParsedResults(parsedWebDirectory); + CheckParsedResults(parsedWebDirectory, baseUrl, checkParents); return parsedWebDirectory; } @@ -1404,7 +1409,7 @@ private static void ProcessUrl(string baseUrl, IElement link, out string linkHre return match.Success; }; - private static async Task ParsePreDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlCollection pres) + private static async Task ParsePreDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlCollection pres, bool checkParents) { List>> regexFuncs = new List>> { @@ -1437,12 +1442,12 @@ private static async Task ParsePreDirectoryListing(string baseUrl, } } - CheckParsedResults(parsedWebDirectory); + CheckParsedResults(parsedWebDirectory, baseUrl, checkParents); return parsedWebDirectory; } - private static WebDirectory ParseMaterialDesignListItemsDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlCollection listItems) + private static WebDirectory ParseMaterialDesignListItemsDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlCollection listItems, bool checkParents) { int nameIndex = -1; int sizeIndex = -1; @@ -1625,12 +1630,12 @@ private static WebDirectory ParseMaterialDesignListItemsDirectoryListing(string } } - CheckParsedResults(parsedWebDirectory); + CheckParsedResults(parsedWebDirectory, baseUrl, checkParents); return 
parsedWebDirectory; } - private static WebDirectory ParseDirectoryListerDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlDocument htmlDocument) + private static WebDirectory ParseDirectoryListerDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlDocument htmlDocument, bool checkParents) { parsedWebDirectory.Parser = "ParseDirectoryListerDirectoryListing"; List tableHeaderInfos = new List(); @@ -1686,12 +1691,12 @@ private static WebDirectory ParseDirectoryListerDirectoryListing(string baseUrl, } } - CheckParsedResults(parsedWebDirectory); + CheckParsedResults(parsedWebDirectory, baseUrl, checkParents); return parsedWebDirectory; } - private static WebDirectory ParseListItemsDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlCollection listItems) + private static WebDirectory ParseListItemsDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlCollection listItems, bool checkParents) { bool firstLink = true; @@ -1715,19 +1720,19 @@ private static WebDirectory ParseListItemsDirectoryListing(string baseUrl, WebDi } } - CheckParsedResults(parsedWebDirectory); + CheckParsedResults(parsedWebDirectory, baseUrl, checkParents); return parsedWebDirectory; } - private static WebDirectory ParseLinksDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlCollection links) + private static WebDirectory ParseLinksDirectoryListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlCollection links, bool checkParents) { foreach (IElement link in links) { ProcessLink(baseUrl, parsedWebDirectory, link, "ParseLinksDirectoryListing"); } - CheckParsedResults(parsedWebDirectory); + CheckParsedResults(parsedWebDirectory, baseUrl, checkParents); return parsedWebDirectory; } @@ -1832,7 +1837,7 @@ private static void ProcessLink(string baseUrl, WebDirectory parsedWebDirectory, } } - public static void CheckParsedResults(WebDirectory webDirectory) + public static void CheckParsedResults(WebDirectory 
webDirectory, string baseUrl, bool checkParents) { if (!webDirectory.Subdirectories.Any() && !webDirectory.Files.Any()) { @@ -1844,11 +1849,43 @@ public static void CheckParsedResults(WebDirectory webDirectory) webDirectorySub.Url = StripUrl(webDirectorySub.Url); } + if (checkParents) + { + CheckParents(webDirectory, baseUrl); + } + CleanFragments(webDirectory); CheckSymlinks(webDirectory); } + private static void CheckParents(WebDirectory webDirectory, string baseUrl) + { + webDirectory.Subdirectories.Where(d => + { + Uri uri = new Uri(d.Url); + + if (uri.Host == Constants.GoogleDriveDomain || uri.Host == Constants.BlitzfilesTechDomain) + { + return false; + } + + return (uri.Scheme != Constants.UriScheme.Https && uri.Scheme != Constants.UriScheme.Http && uri.Scheme != Constants.UriScheme.Ftp && uri.Scheme != Constants.UriScheme.Ftps) || uri.Host != new Uri(baseUrl).Host || !SameHostAndDirectoryDirectory(new Uri(baseUrl), uri); + }).ToList().ForEach(wd => webDirectory.Subdirectories.Remove(wd)); + + webDirectory.Files.Where(f => + { + Uri uri = new Uri(f.Url); + + if (uri.Host == Constants.GoogleDriveDomain || uri.Host == Constants.BlitzfilesTechDomain) + { + return false; + } + + return (uri.Scheme != Constants.UriScheme.Https && uri.Scheme != Constants.UriScheme.Http && uri.Scheme != Constants.UriScheme.Ftp && uri.Scheme != Constants.UriScheme.Ftps) || uri.Host != new Uri(baseUrl).Host || !SameHostAndDirectoryFile(uri, new Uri(baseUrl)); + }).ToList().ForEach(wd => webDirectory.Files.Remove(wd)); + } + private static void CleanFragments(WebDirectory webDirectory) { // Directories @@ -1953,6 +1990,40 @@ private static bool CheckDirectoryTheSame(WebDirectory webDirectory, WebDirector return false; } + public static bool SameHostAndDirectoryFile(Uri baseUri, Uri checkUri) + { + string checkUrlWithoutFileName = checkUri.LocalPath; + checkUrlWithoutFileName = checkUrlWithoutFileName.Replace("index.php", string.Empty); + checkUrlWithoutFileName = 
checkUrlWithoutFileName.Replace("DirectoryList.asp", string.Empty); + string checkUrlFileName = Path.GetFileName(checkUri.ToString()); + + if (!string.IsNullOrWhiteSpace(checkUrlFileName)) + { + checkUrlWithoutFileName = checkUrlWithoutFileName.Replace(checkUrlFileName, string.Empty); + } + + string baseUrlWithoutFileName = baseUri.LocalPath; + string baseUrlFileName = Path.GetFileName(baseUri.ToString()); + + if (!string.IsNullOrWhiteSpace(baseUrlFileName)) + { + baseUrlWithoutFileName = baseUri.LocalPath.Replace(baseUrlFileName, string.Empty); + } + + return baseUri.ToString() == checkUri.ToString() || (baseUri.Host == checkUri.Host && ( + checkUri.LocalPath.StartsWith(baseUri.LocalPath) || + checkUri.LocalPath.StartsWith(baseUrlWithoutFileName) || + baseUri.LocalPath.StartsWith(checkUrlWithoutFileName) + )); + } + + public static bool SameHostAndDirectoryDirectory(Uri baseUri, Uri checkUri) + { + return baseUri.ToString() == checkUri.ToString() || (baseUri.Host == checkUri.Host && + checkUri.LocalPath.StartsWith(baseUri.LocalPath) + ); + } + /// /// Check simple cases of file size /// diff --git a/src/OpenDirectoryDownloader/OpenDirectoryIndexer.cs b/src/OpenDirectoryDownloader/OpenDirectoryIndexer.cs index 075cbc61..38f481e0 100644 --- a/src/OpenDirectoryDownloader/OpenDirectoryIndexer.cs +++ b/src/OpenDirectoryDownloader/OpenDirectoryIndexer.cs @@ -531,7 +531,7 @@ private void TimerStatistics_Elapsed(object sender, System.Timers.ElapsedEventAr if (WebDirectoriesQueue.Any() || RunningWebDirectoryThreads > 0 || WebFilesFileSizeQueue.Any() || RunningWebFileFileSizeThreads > 0) { stringBuilder.AppendLine(Statistics.GetSessionStats(Session)); - stringBuilder.AppendLine($"Queue: {Library.FormatWithThousands(WebDirectoriesQueue.Count)} ({RunningWebDirectoryThreads}), Queue (filesizes): {Library.FormatWithThousands(WebFilesFileSizeQueue.Count)} ({RunningWebFileFileSizeThreads})"); + stringBuilder.AppendLine($"Queue: 
{Library.FormatWithThousands(WebDirectoriesQueue.Count)} ({RunningWebDirectoryThreads} threads), Queue (filesizes): {Library.FormatWithThousands(WebFilesFileSizeQueue.Count)} ({RunningWebFileFileSizeThreads} threads)"); } string statistics = stringBuilder.ToString(); @@ -605,7 +605,7 @@ private async Task WebDirectoryProcessor(ConcurrentQueue queue, st if (parsedWebDirectory != null) { - DirectoryParser.CheckParsedResults(parsedWebDirectory); + DirectoryParser.CheckParsedResults(parsedWebDirectory, Session.Root.Uri.ToString(), true); AddProcessedWebDirectory(webDirectory, parsedWebDirectory); } } @@ -627,7 +627,7 @@ private async Task WebDirectoryProcessor(ConcurrentQueue queue, st } else { - if (Session.Root.Uri.Host == Constants.BlitzfilesTechDomain || SameHostAndDirectory(Session.Root.Uri, webDirectory.Uri)) + if (Session.Root.Uri.Host == Constants.BlitzfilesTechDomain || DirectoryParser.SameHostAndDirectoryFile(Session.Root.Uri, webDirectory.Uri)) { Logger.Debug($"[{name}] Start download '{webDirectory.Url}'"); Session.TotalHttpRequests++; @@ -720,31 +720,6 @@ private async Task WebDirectoryProcessor(ConcurrentQueue queue, st Logger.Debug($"Finished [{name}]"); } - private bool SameHostAndDirectory(Uri baseUri, Uri checkUri) - { - string checkUrlWithoutFileName = checkUri.LocalPath.Replace("index.php", string.Empty); - string checkUrlFileName = Path.GetFileName(checkUri.ToString()); - - if (!string.IsNullOrWhiteSpace(checkUrlFileName)) - { - checkUrlWithoutFileName = checkUrlWithoutFileName.Replace(checkUrlFileName, string.Empty); - } - - string baseUrlWithoutFileName = baseUri.LocalPath; - string baseUrlFileName = Path.GetFileName(baseUri.ToString()); - - if (!string.IsNullOrWhiteSpace(baseUrlFileName)) - { - baseUrlWithoutFileName = baseUri.LocalPath.Replace(baseUrlFileName, string.Empty); - } - - return baseUri.ToString() == checkUri.ToString() || (baseUri.Host == checkUri.Host && ( - checkUri.LocalPath.StartsWith(baseUri.LocalPath) || - 
checkUri.LocalPath.StartsWith(baseUrlWithoutFileName) || - baseUri.LocalPath.StartsWith(checkUrlWithoutFileName) - )); - } - private async Task ProcessWebDirectoryAsync(string name, WebDirectory webDirectory, CancellationTokenSource cancellationTokenSource) { if (Session.Parameters.ContainsKey(Constants.Parameters_GdIndex_RootId)) @@ -1177,7 +1152,7 @@ private void AddProcessedWebDirectory(WebDirectory webDirectory, WebDirectory pa { if (!Session.ProcessedUrls.Contains(subdirectory.Url)) { - if (subdirectory.Uri.Host != Constants.GoogleDriveDomain && subdirectory.Uri.Host != Constants.BlitzfilesTechDomain && !SameHostAndDirectory(Session.Root.Uri, subdirectory.Uri)) + if (subdirectory.Uri.Host != Constants.GoogleDriveDomain && subdirectory.Uri.Host != Constants.BlitzfilesTechDomain && !DirectoryParser.SameHostAndDirectoryFile(Session.Root.Uri, subdirectory.Uri)) { Logger.Debug($"Removed subdirectory {subdirectory.Uri} from parsed webdirectory because it is not the same host"); } @@ -1198,18 +1173,6 @@ private void AddProcessedWebDirectory(WebDirectory webDirectory, WebDirectory pa Session.UrlsWithErrors.Add(webDirectory.Url); } - webDirectory.Files.Where(f => - { - Uri uri = new Uri(f.Url); - - if (uri.Host == Constants.GoogleDriveDomain || uri.Host == Constants.BlitzfilesTechDomain) - { - return false; - } - - return (uri.Scheme != Constants.UriScheme.Https && uri.Scheme != Constants.UriScheme.Http && uri.Scheme != Constants.UriScheme.Ftp && uri.Scheme != Constants.UriScheme.Ftps) || uri.Host != Session.Root.Uri.Host || !SameHostAndDirectory(uri, Session.Root.Uri); - }).ToList().ForEach(wd => webDirectory.Files.Remove(wd)); - if (Session.Root.Uri.Scheme != Constants.UriScheme.Ftp && Session.Root.Uri.Scheme != Constants.UriScheme.Ftps) { foreach (WebFile webFile in webDirectory.Files.Where(f => (f.FileSize == Constants.NoFileSize && !OpenDirectoryIndexerSettings.CommandLineOptions.FastScan) || OpenDirectoryIndexerSettings.CommandLineOptions.ExactFileSizes))