Skip to content

Commit

Permalink
- Small optimisations, check if link is valid first
Browse files Browse the repository at this point in the history
  • Loading branch information
KoalaBear84 committed Sep 6, 2021
1 parent 84b27cb commit 34cc70d
Showing 1 changed file with 46 additions and 66 deletions.
112 changes: 46 additions & 66 deletions src/OpenDirectoryDownloader/DirectoryParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -375,16 +375,14 @@ private static WebDirectory ParseCustomDivListing(ref string baseUrl, WebDirecto
foreach (IElement divElement in divElements)
{
string size = divElement.QuerySelector("em").TextContent.Trim();

bool isFile = IsFileSize(size);

IElement link = divElement.QuerySelector("a");
string linkHref = link.Attributes["href"].Value;


if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
bool isFile = IsFileSize(size);

if (!isFile)
{
Expand Down Expand Up @@ -473,17 +471,15 @@ private static WebDirectory ParseIpfsDirectoryListing(string baseUrl, WebDirecto
{
foreach (IElement tableRow in htmlDocument.QuerySelectorAll("table tr"))
{
string size = tableRow.QuerySelector("td:nth-child(3)").TextContent.Trim();

bool isFile = IsFileSize(size);

IElement link = tableRow.QuerySelector("a");
string linkHref = link.Attributes["href"].Value;


if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
string size = tableRow.QuerySelector("td:nth-child(3)").TextContent.Trim();
bool isFile = IsFileSize(size);

if (!isFile)
{
Expand Down Expand Up @@ -515,17 +511,15 @@ private static WebDirectory ParseDirectoryListingDoctComDirectoryListing(string
{
foreach (IElement listItem in listItems)
{
string size = listItem.QuerySelector(".file-size").TextContent.Trim();

bool isFile = IsFileSize(size);

IElement link = listItem.QuerySelector("a");
string linkHref = link.Attributes["href"].Value;

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
string size = listItem.QuerySelector(".file-size").TextContent.Trim();
bool isFile = IsFileSize(size);

if (!isFile)
{
Expand Down Expand Up @@ -572,17 +566,15 @@ private static WebDirectory ParseSnifDirectoryListing(string baseUrl, WebDirecto
{
if (!tableRow.ClassList.Contains("snHeading") && !tableRow.QuerySelector("td").ClassList.Contains("snDir"))
{
string size = tableRow.QuerySelector($"td:nth-child({fileSizeHeaderColumnIndex})")?.TextContent.Trim();

bool isFile = IsFileSize(size) && !size.Contains("item");

IHtmlAnchorElement link = tableRow.QuerySelector($"td:nth-child({nameHeaderColumnIndex})")?.QuerySelector("a") as IHtmlAnchorElement;
string linkHref = link?.Attributes["href"].Value;

if (IsValidLink(link))
{
string linkHref = link?.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
string size = tableRow.QuerySelector($"td:nth-child({fileSizeHeaderColumnIndex})")?.TextContent.Trim();
bool isFile = IsFileSize(size) && !size.Contains("item");

if (!isFile)
{
Expand Down Expand Up @@ -639,17 +631,15 @@ private static WebDirectory ParsePureDirectoryListing(ref string baseUrl, WebDir

foreach (IElement tableRow in pureTableRows)
{
string size = tableRow.QuerySelector($"td:nth-child({fileSizeHeaderColumnIndex})")?.TextContent.Trim();

bool isFile = !tableRow.ClassList.Contains("dir");

IElement link = tableRow.QuerySelector($"td:nth-child({nameHeaderColumnIndex})")?.QuerySelector("a");
string linkHref = link.TextContent;

if (IsValidLink(link))
{
string linkHref = link.TextContent;
Uri uri = new Uri(new Uri(baseUrl), Uri.EscapeUriString(linkHref));
string fullUrl = uri.ToString();
string size = tableRow.QuerySelector($"td:nth-child({fileSizeHeaderColumnIndex})")?.TextContent.Trim();
bool isFile = !tableRow.ClassList.Contains("dir");

if (!isFile)
{
Expand Down Expand Up @@ -702,23 +692,21 @@ private static WebDirectory ParseH5aiDirectoryListing(string baseUrl, WebDirecto

foreach (IElement tableRow in h5aiTableRows)
{
string size = tableRow.QuerySelector($"td:nth-child({fileSizeHeaderColumnIndex})")?.TextContent.Trim();

bool isFile = !string.IsNullOrWhiteSpace(size);
IElement image = tableRow.QuerySelector("img");

if (isFile && image != null && image.HasAttribute("alt") && image.Attributes["alt"].Value == "folder")
{
isFile = false;
}

IHtmlAnchorElement link = tableRow.QuerySelector($"td:nth-child({nameHeaderColumnIndex})")?.QuerySelector("a") as IHtmlAnchorElement;
string linkHref = link?.Attributes["href"]?.Value;

if (link != null && IsValidLink(link))
{
string linkHref = link?.Attributes["href"]?.Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
string size = tableRow.QuerySelector($"td:nth-child({fileSizeHeaderColumnIndex})")?.TextContent.Trim();
bool isFile = !string.IsNullOrWhiteSpace(size);
IElement image = tableRow.QuerySelector("img");

if (isFile && image != null && image.HasAttribute("alt") && image.Attributes["alt"].Value == "folder")
{
isFile = false;
}

if (!isFile)
{
Expand Down Expand Up @@ -984,10 +972,9 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec
!line.ToLower().Contains("parent directory"))
{
IElement link = parsedLine.QuerySelector("a");
string linkHref = link.Attributes["href"].Value;

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();

Expand Down Expand Up @@ -1033,17 +1020,16 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec

if (match.Success)
{
string fileSizeGroup = match.Groups["FileSize"].Value.Trim();

IHtmlDocument parsedLine = await HtmlParser.ParseDocumentAsync(line);
IElement link = parsedLine.QuerySelector("a");
string linkHref = link.Attributes["href"].Value;

Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
string fileSizeGroup = match.Groups["FileSize"].Value.Trim();
bool isFile = long.TryParse(fileSizeGroup, out long fileSize);

if (!isFile && IsFileSize(fileSizeGroup))
Expand Down Expand Up @@ -1082,8 +1068,6 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec

if (match.Success)
{
bool isFile = match.Groups["FileSize"].Value.Trim() != "<dir>" && match.Groups["FileSize"].Value.Trim() != "DIR";

IHtmlDocument parsedLine = await HtmlParser.ParseDocumentAsync(line);

if (parsedLine.QuerySelector("img[alt=\"[ICO]\"]") == null &&
Expand All @@ -1092,12 +1076,13 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec
!line.ToLower().Contains("parent directory"))
{
IElement link = parsedLine.QuerySelector("a");
string linkHref = link.Attributes["href"].Value;

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
bool isFile = match.Groups["FileSize"].Value.Trim() != "<dir>" && match.Groups["FileSize"].Value.Trim() != "DIR";

if (!isFile)
{
Expand Down Expand Up @@ -1149,10 +1134,10 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec
!line.ToLower().Contains("parent directory"))
{
IElement link = parsedLine.QuerySelector("a");
string linkHref = link.Attributes["href"].Value;

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();

Expand Down Expand Up @@ -1206,10 +1191,10 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec
!line.ToLower().Contains("parent directory"))
{
IElement link = parsedLine.QuerySelector("a");
string linkHref = link.Attributes["href"].Value;

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();

Expand Down Expand Up @@ -1263,10 +1248,10 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec
!line.ToLower().Contains("parent directory"))
{
IElement link = parsedLine.QuerySelector("a");
string linkHref = link.Attributes["href"].Value;

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();

Expand Down Expand Up @@ -1317,10 +1302,10 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec
if (parsedLine.QuerySelector("a") != null)
{
IElement link = parsedLine.QuerySelector("a");
string linkHref = link.Attributes["href"].Value;

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();

Expand Down Expand Up @@ -1383,10 +1368,10 @@ private static WebDirectory ParseTablesDirectoryListing(string baseUrl, WebDirec
if (parsedLine.QuerySelector("a") != null)
{
IElement link = parsedLine.QuerySelector("a");
string linkHref = link.Attributes["href"].Value;

if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();

Expand Down Expand Up @@ -1591,10 +1576,6 @@ private static WebDirectory ParseMaterialDesignListItemsDirectoryListing(string

if (link?.Attributes["href"] != null)
{
string linkHref = link.Attributes["href"]?.Value;

bool isFile = listItem.ClassList.Contains("file");

if (IsValidLink(link))
{
string name = string.Empty;
Expand Down Expand Up @@ -1630,8 +1611,10 @@ private static WebDirectory ParseMaterialDesignListItemsDirectoryListing(string
fileSize = listItem.Attributes["data-sort-size"].Value;
}

string linkHref = link.Attributes["href"]?.Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
bool isFile = listItem.ClassList.Contains("file");

if (!isFile)
{
Expand Down Expand Up @@ -1672,19 +1655,16 @@ private static WebDirectory ParseDirectoryListerDirectoryListing(string baseUrl,
tableHeaderInfos.Add(GetHeaderInfo(headerDiv));
}

IHtmlCollection<IElement> entries = htmlDocument.QuerySelectorAll("#content ul#file-list li").Last().QuerySelectorAll("a");
IHtmlCollection<IElement> links = htmlDocument.QuerySelectorAll("#content ul#file-list li").Last().QuerySelectorAll("a");

foreach (IElement entry in entries)
foreach (IElement link in links)
{
bool isFile = !entry.QuerySelector("i").ClassList.Contains("fa-folder");

//IElement link = entry.QuerySelector("a");
string linkHref = entry.Attributes["href"].Value;

if (IsValidLink(entry))
if (IsValidLink(link))
{
string linkHref = link.Attributes["href"].Value;
Uri uri = new Uri(new Uri(baseUrl), linkHref);
string fullUrl = uri.ToString();
bool isFile = !link.QuerySelector("i").ClassList.Contains("fa-folder");
UrlEncodingParser urlEncodingParser = new UrlEncodingParser(fullUrl);

if (!isFile)
Expand All @@ -1700,10 +1680,10 @@ private static WebDirectory ParseDirectoryListerDirectoryListing(string baseUrl,
{
try
{
List<IElement> divs = entry.QuerySelectorAll("div > div").ToList();
List<IElement> divs = link.QuerySelectorAll("div > div").ToList();
// Remove file info 'column'
divs.RemoveAt(tableHeaderInfos.FindIndex(h => h.Type == HeaderType.FileName) + 1);
string fileSize = entry.QuerySelectorAll("div > div").Skip(2).ToList()[tableHeaderInfos.FindIndex(h => h.Type == HeaderType.FileSize)].TextContent;
string fileSize = link.QuerySelectorAll("div > div").Skip(2).ToList()[tableHeaderInfos.FindIndex(h => h.Type == HeaderType.FileSize)].TextContent;

parsedWebDirectory.Files.Add(new WebFile
{
Expand Down

0 comments on commit 34cc70d

Please sign in to comment.