Skip to content

Commit

Permalink
- Add support for some JavaScript drawn / protected directory listings (
Browse files Browse the repository at this point in the history
  • Loading branch information
KoalaBear84 committed Aug 9, 2022
1 parent b6a4c84 commit 7ef6e16
Show file tree
Hide file tree
Showing 3 changed files with 215 additions and 75 deletions.
206 changes: 139 additions & 67 deletions src/OpenDirectoryDownloader/BrowserContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

namespace OpenDirectoryDownloader;

public class BrowserContext
public class BrowserContext: IDisposable
{
private static readonly Logger Logger = LogManager.GetCurrentClassLogger();

Expand All @@ -21,21 +21,32 @@ public class BrowserContext
private Browser Browser { get; set; }
private Page Page { get; set; }
private CookieContainer CookieContainer { get; }
public bool CloudFlare { get; }
public bool DebugInfo { get; }
public TimeSpan Timeout { get; set; }
private string Url { get; }
private CancellationTokenSource CancellationTokenSource { get; set; } = new CancellationTokenSource();
private bool OK { get; set; }

public BrowserContext(string url, CookieContainer cookieContainer, bool debugInfo = false, TimeSpan timeout = default)
public BrowserContext(CookieContainer cookieContainer, bool cloudFlare = false, bool debugInfo = false, TimeSpan timeout = default)
{
Url = url;
CookieContainer = cookieContainer;
CloudFlare = cloudFlare;
DebugInfo = debugInfo;
Timeout = timeout;
}

public async Task<bool> DoAsync()
/// <summary>
/// Finalizer acting as a safety net when <see cref="Dispose"/> was never called explicitly.
/// </summary>
~BrowserContext()
{
    try
    {
        Dispose();
    }
    catch (Exception)
    {
        // An exception escaping a finalizer terminates the process, so a failure while
        // releasing the page/browser on the finalizer thread is deliberately not propagated.
    }
}

/// <summary>
/// Disposes the page and the browser held by this context (when they have been created).
/// </summary>
public void Dispose()
{
    // Page is disposed before the Browser it was obtained from.
    Page?.Dispose();
    Browser?.Dispose();

    // The resources are released now; without this the finalizer would still run later
    // and call Dispose() a second time on the finalizer thread.
    GC.SuppressFinalize(this);
}

public async Task<bool> DoCloudFlareAsync(string url)
{
try
{
Expand All @@ -46,7 +57,45 @@ public async Task<bool> DoAsync()

CancellationTokenSource.CancelAfter(Timeout);

BrowserFetcher browserFetcher = new BrowserFetcher();
await InitializeAsync();

Stopwatch stopwatch = Stopwatch.StartNew();

Logger.Debug($"Navigating to {url}..");

await Page.GoToAsync(url);
await Task.Delay(TimeSpan.FromSeconds(60), CancellationTokenSource.Token);

Logger.Debug($"Navigation done in {stopwatch.ElapsedMilliseconds}ms");

Logger.Debug("Finished with browser!");
}
catch (OperationCanceledException ex)
{
if (!OK)
{
Logger.Error(ex, "Looks like Cloudflare protection wasn't solved in time.");
}
}
catch (Exception ex)
{
Logger.Error(ex, "Error with browser");
}
finally
{
Logger.Debug("Closing browser");
await Browser.CloseAsync();
Logger.Debug("Closed browser");
}

return OK;
}

public async Task InitializeAsync()
{
try
{
BrowserFetcher browserFetcher = new();

if (!browserFetcher.LocalRevisions().Contains(BrowserFetcher.DefaultChromiumRevision))
{
Expand All @@ -57,87 +106,106 @@ public async Task<bool> DoAsync()

Logger.Debug($"Creating browser...");

PuppeteerExtra puppeteerExtra = new PuppeteerExtra();
PuppeteerExtra puppeteerExtra = new();

// Use stealth plugin (needed for Cloudflare / hCaptcha)
puppeteerExtra.Use(new StealthPlugin());

using (Browser = await puppeteerExtra.LaunchAsync(new LaunchOptions
Browser = await puppeteerExtra.LaunchAsync(new LaunchOptions
{
Headless = false,
Args = new[] { "--no-sandbox", "--disable-setuid-sandbox", $"--user-agent=\"{Constants.UserAgent.Chrome}\"" },
DefaultViewport = null,
IgnoreHTTPSErrors = true
}))
});

Logger.Info($"Started browser with PID {Browser.Process.Id}");

Browser.Closed += Browser_Closed;
Browser.Disconnected += Browser_Disconnected;
Browser.TargetChanged += Browser_TargetChanged;
Browser.TargetCreated += Browser_TargetCreated;
Browser.TargetDestroyed += Browser_TargetDestroyed;

Logger.Debug($"Created browser.");

Logger.Debug($"Creating page...");

Page = (await Browser.PagesAsync())[0];

Page.Close += Page_Close;
Page.Console += Page_Console;
Page.Dialog += Page_Dialog;
Page.DOMContentLoaded += Page_DOMContentLoaded;
Page.Error += Page_Error;
Page.FrameAttached += Page_FrameAttached;
Page.FrameDetached += Page_FrameDetached;
Page.FrameNavigated += Page_FrameNavigated;
Page.Load += Page_Load;
Page.Metrics += Page_Metrics;
Page.PageError += Page_PageError;
Page.Popup += Page_Popup;
Page.Request += Page_Request;
Page.RequestFailed += Page_RequestFailed;
Page.RequestFinished += Page_RequestFinished;
Page.RequestServedFromCache += Page_RequestServedFromCache;
Page.Response += Page_Response;
Page.WorkerCreated += Page_WorkerCreated;
Page.WorkerDestroyed += Page_WorkerDestroyed;

Logger.Debug($"Created page.");
}
catch (Exception ex)
{
Logger.Error(ex, "Error with initializing browser");
throw;
}
}

/// <summary>
/// Retrieves the cookies of the browser page held by this context.
/// </summary>
/// <returns>The cookie array returned by the page's <c>GetCookiesAsync()</c> call.</returns>
public async Task<CookieParam[]> GetCookiesAsync()
{
    CookieParam[] cookies = await Page.GetCookiesAsync();

    return cookies;
}

public async Task<string> GetHtml(string url)
{
try
{
if (Timeout == default)
{
Logger.Info($"Started browser with PID {Browser.Process.Id}");
Timeout = TimeSpan.FromMinutes(1);
}

Browser.Closed += Browser_Closed;
Browser.Disconnected += Browser_Disconnected;
Browser.TargetChanged += Browser_TargetChanged;
Browser.TargetCreated += Browser_TargetCreated;
Browser.TargetDestroyed += Browser_TargetDestroyed;
CancellationTokenSource.CancelAfter(Timeout);

Logger.Debug($"Created browser.");
Stopwatch stopwatch = Stopwatch.StartNew();

Logger.Debug($"Creating page...");
Logger.Debug($"Navigating to {url}..");

using (Page = (await Browser.PagesAsync())[0])
{
Page.Close += Page_Close;
Page.Console += Page_Console;
Page.Dialog += Page_Dialog;
Page.DOMContentLoaded += Page_DOMContentLoaded;
Page.Error += Page_Error;
Page.FrameAttached += Page_FrameAttached;
Page.FrameDetached += Page_FrameDetached;
Page.FrameNavigated += Page_FrameNavigated;
Page.Load += Page_Load;
Page.Metrics += Page_Metrics;
Page.PageError += Page_PageError;
Page.Popup += Page_Popup;
Page.Request += Page_Request;
Page.RequestFailed += Page_RequestFailed;
Page.RequestFinished += Page_RequestFinished;
Page.RequestServedFromCache += Page_RequestServedFromCache;
Page.Response += Page_Response;
Page.WorkerCreated += Page_WorkerCreated;
Page.WorkerDestroyed += Page_WorkerDestroyed;

Logger.Debug($"Created page.");

Stopwatch stopwatch = Stopwatch.StartNew();

Logger.Debug($"Navigating to {Url}..");

await Page.GoToAsync(Url);
await Task.Delay(TimeSpan.FromSeconds(60), CancellationTokenSource.Token);

Logger.Debug($"Navigation done in {stopwatch.ElapsedMilliseconds}ms");
}
}
NavigationOptions navigationOptions = new()
{
Timeout = (int)Timeout.TotalMilliseconds,
WaitUntil = new[] { WaitUntilNavigation.DOMContentLoaded }
};

Logger.Debug("Finished with browser!");
await Page.GoToAsync(url, navigationOptions);
Logger.Debug($"Navigation done in {stopwatch.ElapsedMilliseconds}ms");

string html = await Page.GetContentAsync();

return html;
}
catch (OperationCanceledException ex)
{
if (!OK)
{
Logger.Error(ex, "Looks like Cloudflare protection wasn't solved in time.");
Logger.Error(ex, "Timeout in navigating to URL");
}
}
catch (Exception ex)
{
Logger.Error(ex, "Error with browser");
}
finally
{
Logger.Debug("Closing browser");
await Browser.CloseAsync();
Logger.Debug("Closed browser");
throw;
}

return OK;
return null;
}

private void Browser_Closed(object sender, EventArgs e)
Expand Down Expand Up @@ -332,17 +400,21 @@ private void Page_Response(object sender, ResponseCreatedEventArgs e)
if (theCookie != null)
{
CookieContainer.SetCookies(new Uri(baseUrl), theCookie);
Cookie cloudflareClearance = CookieContainer.GetCookies(new Uri(baseUrl)).FirstOrDefault(c => c.Name == CloudflareClearanceKey);

if (cloudflareClearance != null)
if (CloudFlare)
{
if (DebugInfo)
Cookie cloudflareClearance = CookieContainer.GetCookies(new Uri(baseUrl)).FirstOrDefault(c => c.Name == CloudflareClearanceKey);

if (cloudflareClearance != null)
{
Console.WriteLine($"Cloudflare clearance cookie found: {cloudflareClearance.Value}");
}
if (DebugInfo)
{
Console.WriteLine($"Cloudflare clearance cookie found: {cloudflareClearance.Value}");
}

OK = true;
CancellationTokenSource.Cancel();
OK = true;
CancellationTokenSource.Cancel();
}
}
}
}
Expand Down
52 changes: 48 additions & 4 deletions src/OpenDirectoryDownloader/DirectoryParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
using OpenDirectoryDownloader.Site.GoFileIO;
using OpenDirectoryDownloader.Site.Mediafire;
using OpenDirectoryDownloader.Site.Pixeldrain;
using PuppeteerSharp;
using System;
using System.Collections.Generic;
using System.Diagnostics;
Expand All @@ -44,7 +45,7 @@ public static class DirectoryParser
/// <param name="baseUrl">Base url</param>
/// <param name="html">Html to parse</param>
/// <returns>WebDirectory object containing current directory index</returns>
public static async Task<WebDirectory> ParseHtml(WebDirectory webDirectory, string html, HttpClient httpClient = null, HttpResponseMessage httpResponseMessage = null, bool checkParents = true)
public static async Task<WebDirectory> ParseHtml(WebDirectory webDirectory, string html, HttpClient httpClient = null, HttpClientHandler httpClientHandler = null, HttpResponseMessage httpResponseMessage = null, bool checkParents = true)
{
string baseUrl = webDirectory.Url;

Expand Down Expand Up @@ -152,7 +153,7 @@ public static async Task<WebDirectory> ParseHtml(WebDirectory webDirectory, stri
break;
}
}

if (googleDriveIndexType is not null)
{
break;
Expand All @@ -162,11 +163,11 @@ public static async Task<WebDirectory> ParseHtml(WebDirectory webDirectory, stri
if (googleDriveIndexType is not null)
{
OpenDirectoryIndexer.Session.Parameters[Constants.GoogleDriveIndexType] = googleDriveIndexType;

if (OpenDirectoryIndexer.Session.MaxThreads != 1)
{
OpenDirectoryIndexer.Session.MaxThreads = 1;
Logger.Warn($"Reduce threads to 1 because of Google Drive index");
OpenDirectoryIndexer.Session.MaxThreads = 1;
}
}
}
Expand Down Expand Up @@ -345,6 +346,49 @@ public static async Task<WebDirectory> ParseHtml(WebDirectory webDirectory, stri
if (parsedWebDirectory.Subdirectories.Count == 0 && parsedWebDirectory.Files.Count == 0 && htmlDocument.QuerySelector("noscript") != null)
{
Logger.Warn("No directories and files found, but did find a <noscript> tag, probably a JavaScript challenge in there which is unsupported");

if (!OpenDirectoryIndexer.Session.CommandLineOptions.NoBrowser && httpClient is not null)
{
if (OpenDirectoryIndexer.Session.MaxThreads != 1)
{
Logger.Warn($"Reduce threads to 1 because of possible Browser JavaScript");
OpenDirectoryIndexer.Session.MaxThreads = 1;
}

if (OpenDirectoryIndexer.BrowserContext is null)
{
Logger.Warn($"Starting Browser..");
OpenDirectoryIndexer.BrowserContext = new(httpClientHandler.CookieContainer);
await OpenDirectoryIndexer.BrowserContext.InitializeAsync();
Logger.Warn($"Started Browser");
}

Logger.Warn($"Retrieving HTML through Browser..");
string browserHtml = await OpenDirectoryIndexer.BrowserContext.GetHtml(webDirectory.Url);
Logger.Warn($"Retrieved HTML through Browser");

// Transfer cookies to HttpClient, so hopefully the following requests can be done with the help of cookies
CookieParam[] cookieParams = await OpenDirectoryIndexer.BrowserContext.GetCookiesAsync();

foreach (CookieParam cookieParam in cookieParams)
{
httpClientHandler.CookieContainer.Add(new Cookie
{
Name = cookieParam.Name,
Domain = cookieParam.Domain,
Path = cookieParam.Path,
Expires = Library.UnixTimestampToDateTime((long)cookieParam.Expires),
HttpOnly = cookieParam.HttpOnly ?? false,
Value = cookieParam.Value,
Secure = cookieParam.Secure ?? false
});
}

if (browserHtml != html)
{
return await ParseHtml(webDirectory, browserHtml, httpClient, httpClientHandler);
}
}
}

return parsedWebDirectory;
Expand Down
Loading

0 comments on commit 7ef6e16

Please sign in to comment.