From 7ef6e164e6866e2ed276c8d33ae76d4b8d21170e Mon Sep 17 00:00:00 2001 From: KoalaBear Date: Tue, 9 Aug 2022 17:00:25 +0200 Subject: [PATCH] - Add support for some JavaScript drawn / protected directory listings (#77) --- src/OpenDirectoryDownloader/BrowserContext.cs | 206 ++++++++++++------ .../DirectoryParser.cs | 52 ++++- .../OpenDirectoryIndexer.cs | 32 ++- 3 files changed, 215 insertions(+), 75 deletions(-) diff --git a/src/OpenDirectoryDownloader/BrowserContext.cs b/src/OpenDirectoryDownloader/BrowserContext.cs index 06581aa6..32b49ff9 100644 --- a/src/OpenDirectoryDownloader/BrowserContext.cs +++ b/src/OpenDirectoryDownloader/BrowserContext.cs @@ -11,7 +11,7 @@ namespace OpenDirectoryDownloader; -public class BrowserContext +public class BrowserContext: IDisposable { private static readonly Logger Logger = LogManager.GetCurrentClassLogger(); @@ -21,21 +21,32 @@ public class BrowserContext private Browser Browser { get; set; } private Page Page { get; set; } private CookieContainer CookieContainer { get; } + public bool CloudFlare { get; } public bool DebugInfo { get; } public TimeSpan Timeout { get; set; } - private string Url { get; } private CancellationTokenSource CancellationTokenSource { get; set; } = new CancellationTokenSource(); private bool OK { get; set; } - public BrowserContext(string url, CookieContainer cookieContainer, bool debugInfo = false, TimeSpan timeout = default) + public BrowserContext(CookieContainer cookieContainer, bool cloudFlare = false, bool debugInfo = false, TimeSpan timeout = default) { - Url = url; CookieContainer = cookieContainer; + CloudFlare = cloudFlare; DebugInfo = debugInfo; Timeout = timeout; } - public async Task DoAsync() + ~BrowserContext() + { + Dispose(); + } + + public void Dispose() + { + Page?.Dispose(); + Browser?.Dispose(); + } + + public async Task DoCloudFlareAsync(string url) { try { @@ -46,7 +57,45 @@ public async Task DoAsync() CancellationTokenSource.CancelAfter(Timeout); - BrowserFetcher browserFetcher = new BrowserFetcher(); + await InitializeAsync(); + + Stopwatch stopwatch = Stopwatch.StartNew(); + + Logger.Debug($"Navigating to {url}.."); + + await Page.GoToAsync(url); + await Task.Delay(TimeSpan.FromSeconds(60), CancellationTokenSource.Token); + + Logger.Debug($"Navigation done in {stopwatch.ElapsedMilliseconds}ms"); + + Logger.Debug("Finished with browser!"); + } + catch (OperationCanceledException ex) + { + if (!OK) + { + Logger.Error(ex, "Looks like Cloudflare protection wasn't solved in time."); + } + } + catch (Exception ex) + { + Logger.Error(ex, "Error with browser"); + } + finally + { + Logger.Debug("Closing browser"); + await Browser.CloseAsync(); + Logger.Debug("Closed browser"); + } + + return OK; + } + + public async Task InitializeAsync() + { + try + { + BrowserFetcher browserFetcher = new(); if (!browserFetcher.LocalRevisions().Contains(BrowserFetcher.DefaultChromiumRevision)) { @@ -57,87 +106,106 @@ public async Task DoAsync() Logger.Debug($"Creating browser..."); - PuppeteerExtra puppeteerExtra = new PuppeteerExtra(); + PuppeteerExtra puppeteerExtra = new(); // Use stealth plugin (needed for Cloudflare / hCaptcha) puppeteerExtra.Use(new StealthPlugin()); - using (Browser = await puppeteerExtra.LaunchAsync(new LaunchOptions + Browser = await puppeteerExtra.LaunchAsync(new LaunchOptions { Headless = false, Args = new[] { "--no-sandbox", "--disable-setuid-sandbox", $"--user-agent=\"{Constants.UserAgent.Chrome}\"" }, DefaultViewport = null, IgnoreHTTPSErrors = true - })) + }); + + Logger.Info($"Started browser with PID {Browser.Process.Id}"); + + Browser.Closed += Browser_Closed; + Browser.Disconnected += Browser_Disconnected; + Browser.TargetChanged += Browser_TargetChanged; + Browser.TargetCreated += Browser_TargetCreated; + Browser.TargetDestroyed += Browser_TargetDestroyed; + + Logger.Debug($"Created browser."); + + Logger.Debug($"Creating page..."); + + Page = (await Browser.PagesAsync())[0]; + + Page.Close += Page_Close; + Page.Console += Page_Console; + Page.Dialog += Page_Dialog; + Page.DOMContentLoaded += Page_DOMContentLoaded; + Page.Error += Page_Error; + Page.FrameAttached += Page_FrameAttached; + Page.FrameDetached += Page_FrameDetached; + Page.FrameNavigated += Page_FrameNavigated; + Page.Load += Page_Load; + Page.Metrics += Page_Metrics; + Page.PageError += Page_PageError; + Page.Popup += Page_Popup; + Page.Request += Page_Request; + Page.RequestFailed += Page_RequestFailed; + Page.RequestFinished += Page_RequestFinished; + Page.RequestServedFromCache += Page_RequestServedFromCache; + Page.Response += Page_Response; + Page.WorkerCreated += Page_WorkerCreated; + Page.WorkerDestroyed += Page_WorkerDestroyed; + + Logger.Debug($"Created page."); + } + catch (Exception ex) + { + Logger.Error(ex, "Error with initializing browser"); + throw; + } + } + + public async Task GetCookiesAsync() => await Page.GetCookiesAsync(); + + public async Task GetHtml(string url) + { + try + { + if (Timeout == default) { - Logger.Info($"Started browser with PID {Browser.Process.Id}"); + Timeout = TimeSpan.FromMinutes(1); + } - Browser.Closed += Browser_Closed; - Browser.Disconnected += Browser_Disconnected; - Browser.TargetChanged += Browser_TargetChanged; - Browser.TargetCreated += Browser_TargetCreated; - Browser.TargetDestroyed += Browser_TargetDestroyed; + CancellationTokenSource.CancelAfter(Timeout); - Logger.Debug($"Created browser."); + Stopwatch stopwatch = Stopwatch.StartNew(); - Logger.Debug($"Creating page..."); + Logger.Debug($"Navigating to {url}.."); - using (Page = (await Browser.PagesAsync())[0]) - { - Page.Close += Page_Close; - Page.Console += Page_Console; - Page.Dialog += Page_Dialog; - Page.DOMContentLoaded += Page_DOMContentLoaded; - Page.Error += Page_Error; - Page.FrameAttached += Page_FrameAttached; - Page.FrameDetached += Page_FrameDetached; - Page.FrameNavigated += Page_FrameNavigated; - Page.Load += Page_Load; - Page.Metrics += Page_Metrics; - Page.PageError += Page_PageError; - Page.Popup += Page_Popup; - Page.Request += Page_Request; - Page.RequestFailed += Page_RequestFailed; - Page.RequestFinished += Page_RequestFinished; - Page.RequestServedFromCache += Page_RequestServedFromCache; - Page.Response += Page_Response; - Page.WorkerCreated += Page_WorkerCreated; - Page.WorkerDestroyed += Page_WorkerDestroyed; - - Logger.Debug($"Created page."); - - Stopwatch stopwatch = Stopwatch.StartNew(); - - Logger.Debug($"Navigating to {Url}.."); - - await Page.GoToAsync(Url); - await Task.Delay(TimeSpan.FromSeconds(60), CancellationTokenSource.Token); - - Logger.Debug($"Navigation done in {stopwatch.ElapsedMilliseconds}ms"); - } - } + NavigationOptions navigationOptions = new() + { + Timeout = (int)Timeout.TotalMilliseconds, + WaitUntil = new[] { WaitUntilNavigation.DOMContentLoaded } + }; - Logger.Debug("Finished with browser!"); + await Page.GoToAsync(url, navigationOptions); + Logger.Debug($"Navigation done in {stopwatch.ElapsedMilliseconds}ms"); + + string html = await Page.GetContentAsync(); + + return html; } catch (OperationCanceledException ex) { if (!OK) { - Logger.Error(ex, "Looks like Cloudflare protection wasn't solved in time."); + Logger.Error(ex, "Timeout in navigating to URL"); } } catch (Exception ex) { Logger.Error(ex, "Error with browser"); - } - finally - { - Logger.Debug("Closing browser"); - await Browser.CloseAsync(); - Logger.Debug("Closed browser"); + throw; } - return OK; + return null; } private void Browser_Closed(object sender, EventArgs e) @@ -332,17 +400,21 @@ private void Page_Response(object sender, ResponseCreatedEventArgs e) if (theCookie != null) { CookieContainer.SetCookies(new Uri(baseUrl), theCookie); - Cookie cloudflareClearance = CookieContainer.GetCookies(new Uri(baseUrl)).FirstOrDefault(c => c.Name == CloudflareClearanceKey); - if (cloudflareClearance != null) + if (CloudFlare) { - if (DebugInfo) + Cookie cloudflareClearance = CookieContainer.GetCookies(new Uri(baseUrl)).FirstOrDefault(c => c.Name == CloudflareClearanceKey); + + if (cloudflareClearance != null) { - Console.WriteLine($"Cloudflare clearance cookie found: {cloudflareClearance.Value}"); - } + if (DebugInfo) + { + Console.WriteLine($"Cloudflare clearance cookie found: {cloudflareClearance.Value}"); + } - OK = true; - CancellationTokenSource.Cancel(); + OK = true; + CancellationTokenSource.Cancel(); + } } } } diff --git a/src/OpenDirectoryDownloader/DirectoryParser.cs b/src/OpenDirectoryDownloader/DirectoryParser.cs index 08b85141..fc99da15 100644 --- a/src/OpenDirectoryDownloader/DirectoryParser.cs +++ b/src/OpenDirectoryDownloader/DirectoryParser.cs @@ -20,6 +20,7 @@ using OpenDirectoryDownloader.Site.GoFileIO; using OpenDirectoryDownloader.Site.Mediafire; using OpenDirectoryDownloader.Site.Pixeldrain; +using PuppeteerSharp; using System; using System.Collections.Generic; using System.Diagnostics; @@ -44,7 +45,7 @@ public static class DirectoryParser /// Base url /// Html to parse /// WebDirectory object containing current directory index - public static async Task ParseHtml(WebDirectory webDirectory, string html, HttpClient httpClient = null, HttpResponseMessage httpResponseMessage = null, bool checkParents = true) + public static async Task ParseHtml(WebDirectory webDirectory, string html, HttpClient httpClient = null, HttpClientHandler httpClientHandler = null, HttpResponseMessage httpResponseMessage = null, bool checkParents = true) { string baseUrl = webDirectory.Url; @@ -152,7 +153,7 @@ public static async Task ParseHtml(WebDirectory webDirectory, stri break; } } - + if (googleDriveIndexType is not null) { break; @@ -162,11 +163,11 @@ public static async Task ParseHtml(WebDirectory webDirectory, stri if (googleDriveIndexType is not null) { OpenDirectoryIndexer.Session.Parameters[Constants.GoogleDriveIndexType] = googleDriveIndexType; - + if (OpenDirectoryIndexer.Session.MaxThreads != 1) { - OpenDirectoryIndexer.Session.MaxThreads = 1; Logger.Warn($"Reduce threads to 1 because of Google Drive index"); + OpenDirectoryIndexer.Session.MaxThreads = 1; } } } @@ -345,6 +346,49 @@ public static async Task ParseHtml(WebDirectory webDirectory, stri if (parsedWebDirectory.Subdirectories.Count == 0 && parsedWebDirectory.Files.Count == 0 && htmlDocument.QuerySelector("noscript") != null) { Logger.Warn("No directories and files found, but did find a