Skip to content

Commit

Permalink
- Replace all duplicate code for processing urls
Browse files Browse the repository at this point in the history
  • Loading branch information
KoalaBear84 committed Sep 6, 2021
1 parent 34cc70d commit c9f58d6
Showing 1 changed file with 49 additions and 60 deletions.
109 changes: 49 additions & 60 deletions src/OpenDirectoryDownloader/DirectoryParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -376,12 +376,11 @@ private static WebDirectory ParseCustomDivListing(ref string baseUrl, WebDirecto
{
string size = divElement.QuerySelector("em").TextContent.Trim();
IElement link = divElement.QuerySelector("a");

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
ProcessUrl(baseUrl, link, out string linkHref, out Uri uri, out string fullUrl);

bool isFile = IsFileSize(size);

if (!isFile)
Expand Down Expand Up @@ -472,12 +471,11 @@ private static WebDirectory ParseIpfsDirectoryListing(string baseUrl, WebDirecto
foreach (IElement tableRow in htmlDocument.QuerySelectorAll("table tr"))
{
IElement link = tableRow.QuerySelector("a");

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
ProcessUrl(baseUrl, link, out string linkHref, out Uri uri, out string fullUrl);

string size = tableRow.QuerySelector("td:nth-child(3)").TextContent.Trim();
bool isFile = IsFileSize(size);

Expand Down Expand Up @@ -515,9 +513,8 @@ private static WebDirectory ParseDirectoryListingDoctComDirectoryListing(string

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
ProcessUrl(baseUrl, link, out string linkHref, out Uri uri, out string fullUrl);

string size = listItem.QuerySelector(".file-size").TextContent.Trim();
bool isFile = IsFileSize(size);

Expand Down Expand Up @@ -570,9 +567,8 @@ private static WebDirectory ParseSnifDirectoryListing(string baseUrl, WebDirecto

if (IsValidLink(link))
{
string linkHref = link?.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
ProcessUrl(baseUrl, link, out string linkHref, out Uri uri, out string fullUrl);

string size = tableRow.QuerySelector($"td:nth-child({fileSizeHeaderColumnIndex})")?.TextContent.Trim();
bool isFile = IsFileSize(size) && !size.Contains("item");

Expand Down Expand Up @@ -694,11 +690,10 @@ private static WebDirectory ParseH5aiDirectoryListing(string baseUrl, WebDirecto
{
IHtmlAnchorElement link = tableRow.QuerySelector($"td:nth-child({nameHeaderColumnIndex})")?.QuerySelector("a") as IHtmlAnchorElement;

if (link != null && IsValidLink(link))
if (IsValidLink(link))
{
string linkHref = link?.Attributes["href"]?.Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
ProcessUrl(baseUrl, link, out string linkHref, out Uri uri, out string fullUrl);

string size = tableRow.QuerySelector($"td:nth-child({fileSizeHeaderColumnIndex})")?.TextContent.Trim();
bool isFile = !string.IsNullOrWhiteSpace(size);
IElement image = tableRow.QuerySelector("img");
Expand Down Expand Up @@ -807,9 +802,7 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec
{
addedEntry = true;

string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
ProcessUrl(baseUrl, link, out string linkHref, out Uri uri, out string fullUrl);

fullUrl = StripUrl(fullUrl);

Expand Down Expand Up @@ -958,6 +951,13 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec
return parsedWebDirectory;
}

/// <summary>
/// Reads the <c>href</c> attribute of <paramref name="link"/> and resolves it
/// against <paramref name="baseUrl"/>.
/// </summary>
/// <param name="baseUrl">URL used as the base when resolving the link's href.</param>
/// <param name="link">Element whose <c>href</c> attribute is read.</param>
/// <param name="linkHref">Receives the raw <c>href</c> value, or <c>null</c> when the attribute lookup yields nothing.</param>
/// <param name="uri">Receives the <see cref="Uri"/> built from the base URL and <paramref name="linkHref"/>.</param>
/// <param name="fullUrl">Receives the string form (<see cref="Uri.ToString"/>) of <paramref name="uri"/>.</param>
private static void ProcessUrl(string baseUrl, IElement link, out string linkHref, out Uri uri, out string fullUrl)
{
    // Read the attribute first so a missing link surfaces before any URL parsing happens.
    linkHref = link.Attributes["href"]?.Value;

    Uri baseUri = new Uri(baseUrl);
    Uri resolved = new Uri(baseUri, linkHref);

    uri = resolved;
    fullUrl = resolved.ToString();
}

private static readonly Func<WebDirectory, string, string, Task<bool>> RegexParser1 = async (webDirectory, baseUrl, line) =>
{
Match match = Regex.Match(line, @"(?:<img.*>\s*)+<a.*?>.*?<\/a>\S*\s*(?<Modified>\d*-(?:[a-zA-Z]*|\d*)-\d*\s*\d*:\d*(:\d*)?)?\s*(?<FileSize>\S+)?(\s*(?<Description>.*))?");
Expand All @@ -974,9 +974,7 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec
IElement link = parsedLine.QuerySelector("a");
if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
ProcessUrl(baseUrl, link, out string linkHref, out Uri uri, out string fullUrl);

bool isFile = IsFileSize(match.Groups["FileSize"].Value.Trim()) && parsedLine.QuerySelector("img[alt=\"[DIR]\"]") == null;

Expand Down Expand Up @@ -1026,9 +1024,8 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
ProcessUrl(baseUrl, link, out string linkHref, out Uri uri, out string fullUrl);

string fileSizeGroup = match.Groups["FileSize"].Value.Trim();
bool isFile = long.TryParse(fileSizeGroup, out long fileSize);

Expand Down Expand Up @@ -1079,9 +1076,8 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
ProcessUrl(baseUrl, link, out string linkHref, out Uri uri, out string fullUrl);

bool isFile = match.Groups["FileSize"].Value.Trim() != "&lt;dir&gt;" && match.Groups["FileSize"].Value.Trim() != "DIR";

if (!isFile)
Expand Down Expand Up @@ -1124,8 +1120,6 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec

if (match.Success)
{
bool isFile = match.Groups["FileSize"].Value.Trim() != "&lt;dir&gt;";

IHtmlDocument parsedLine = await HtmlParser.ParseDocumentAsync(line);

if (parsedLine.QuerySelector("img[alt=\"[ICO]\"]") == null &&
Expand All @@ -1137,9 +1131,9 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
ProcessUrl(baseUrl, link, out string linkHref, out Uri uri, out string fullUrl);

bool isFile = match.Groups["FileSize"].Value.Trim() != "&lt;dir&gt;";

if (!isFile)
{
Expand Down Expand Up @@ -1194,9 +1188,7 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
ProcessUrl(baseUrl, link, out string linkHref, out Uri uri, out string fullUrl);

if (!isFile)
{
Expand Down Expand Up @@ -1238,7 +1230,6 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec

if (match.Success)
{
bool isFile = match.Groups["FileSize"].Value.Trim() != "&lt;dir&gt;";

IHtmlDocument parsedLine = await HtmlParser.ParseDocumentAsync(line);

Expand All @@ -1251,9 +1242,9 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
ProcessUrl(baseUrl, link, out string linkHref, out Uri uri, out string fullUrl);

bool isFile = match.Groups["FileSize"].Value.Trim() != "&lt;dir&gt;";

if (!isFile)
{
Expand Down Expand Up @@ -1295,7 +1286,6 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec

if (match.Success)
{
bool isFile = !match.Groups["FileMode"].Value.ToLower().StartsWith("d");

IHtmlDocument parsedLine = await HtmlParser.ParseDocumentAsync(line);

Expand All @@ -1305,9 +1295,9 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
ProcessUrl(baseUrl, link, out string linkHref, out Uri uri, out string fullUrl);

bool isFile = !match.Groups["FileMode"].Value.ToLower().StartsWith("d");

if (!isFile)
{
Expand Down Expand Up @@ -1356,8 +1346,6 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec

if (match.Success && (match.Groups["IsDirectory"].Success && !string.IsNullOrWhiteSpace(match.Groups["IsDirectory"].Value)) != match.Groups["FileSize"].Success)
{
bool isFile = !string.IsNullOrWhiteSpace(match.Groups["FileSize"].Value) && match.Groups["FileSize"].Value.Trim() != "-";

if (match.Groups["FileSize"].Value.Contains("<"))
{
return false;
Expand All @@ -1371,9 +1359,9 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
ProcessUrl(baseUrl, link, out string linkHref, out Uri uri, out string fullUrl);

bool isFile = !string.IsNullOrWhiteSpace(match.Groups["FileSize"].Value) && match.Groups["FileSize"].Value.Trim() != "-";

if (!isFile)
{
Expand Down Expand Up @@ -1611,9 +1599,8 @@ private static WebDirectory ParseMaterialDesignListItemsDirectoryListing(string
fileSize = listItem.Attributes["data-sort-size"].Value;
}

string linkHref = link.Attributes["href"]?.Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
ProcessUrl(baseUrl, link, out string linkHref, out Uri uri, out string fullUrl);

bool isFile = listItem.ClassList.Contains("file");

if (!isFile)
Expand Down Expand Up @@ -1661,9 +1648,8 @@ private static WebDirectory ParseDirectoryListerDirectoryListing(string baseUrl,
{
if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
ProcessUrl(baseUrl, link, out string linkHref, out Uri uri, out string fullUrl);

bool isFile = !link.QuerySelector("i").ClassList.Contains("fa-folder");
UrlEncodingParser urlEncodingParser = new UrlEncodingParser(fullUrl);

Expand Down Expand Up @@ -1754,9 +1740,7 @@ private static void ProcessLink(string baseUrl, WebDirectory parsedWebDirectory,
{
try
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
ProcessUrl(baseUrl, link, out string linkHref, out Uri uri, out string fullUrl);

fullUrl = StripUrl(fullUrl);

Expand Down Expand Up @@ -2224,6 +2208,11 @@ private static HeaderInfo GetHeaderInfo(IElement header)

private static bool IsValidLink(IElement link)
{
if (link == null)
{
return false;
}

string linkHref = link.Attributes["href"]?.Value;

return
Expand Down

0 comments on commit c9f58d6

Please sign in to comment.