Skip to content

Commit

Permalink
- Skip responses larger than 20MB, handle as a file instead of directory
Browse files Browse the repository at this point in the history
- Skip files which do not contain HTML, or very unlikely
- Show warning for very large responses (which might be HTML)
  • Loading branch information
KoalaBear84 committed Aug 5, 2021
1 parent 0f424f7 commit 386b00f
Show file tree
Hide file tree
Showing 2 changed files with 177 additions and 7 deletions.
3 changes: 2 additions & 1 deletion src/OpenDirectoryDownloader/Constants.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ public class Constants
public const long NoFileSize = 0;
public const string Root = "ROOT";
public const string Ftp_Max_Connections = "MAX_CONNECTIONS";
public const int Megabyte = 1024 * 1024;
public const int Kilobyte = 1024;
public const int Megabyte = 1024 * Kilobyte;

public class UserAgent
{
Expand Down
181 changes: 175 additions & 6 deletions src/OpenDirectoryDownloader/OpenDirectoryIndexer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,7 @@ public async void StartIndexingAsync()
Console.WriteLine("Finished indexing!");

Program.SetConsoleTitle($"✔ {Program.ConsoleTitle}");

bool clipboardSuccess = false;

if (OpenDirectoryIndexerSettings.CommandLineOptions.Clipboard)
Expand All @@ -493,7 +493,7 @@ public async void StartIndexingAsync()
Logger.Error($"Error copying stats to clipboard: {ex.Message}");
}
}

if (OpenDirectoryIndexerSettings.CommandLineOptions.Quit)
{
Command.KillApplication();
Expand Down Expand Up @@ -765,13 +765,33 @@ private async Task ProcessWebDirectoryAsync(string name, WebDirectory webDirecto
}
}

if (httpResponseMessage.Content?.Headers.ContentLength > 20 * Constants.Megabyte)
{
ConvertDirectoryToFile(webDirectory, httpResponseMessage);

return;
}

string html = null;

if (httpResponseMessage.IsSuccessStatusCode)
{
SetRootUrl(httpResponseMessage);

html = await GetHtml(httpResponseMessage);
using (Stream htmlStream = await GetHtmlStream(httpResponseMessage))
{
if (htmlStream != null)
{
html = await GetHtml(htmlStream);
}
else
{
Logger.Warn($"Treated {webDirectory.Url} as file instead of directory ({FileSizeHelper.ToHumanReadable(httpResponseMessage.Content.Headers.ContentLength.Value)})");
ConvertDirectoryToFile(webDirectory, httpResponseMessage);

return;
}
}
}

if (FirstRequest && !httpResponseMessage.IsSuccessStatusCode || httpResponseMessage.IsSuccessStatusCode && string.IsNullOrWhiteSpace(html) || html?.Contains("HTTP_USER_AGENT") == true)
Expand All @@ -787,7 +807,19 @@ private async Task ProcessWebDirectoryAsync(string name, WebDirectory webDirecto

SetRootUrl(httpResponseMessage);

html = await GetHtml(httpResponseMessage);
using (Stream htmlStream = await GetHtmlStream(httpResponseMessage))
{
if (htmlStream != null)
{
html = await GetHtml(htmlStream);
}
else
{
ConvertDirectoryToFile(webDirectory, httpResponseMessage);

return;
}
}
}
}

Expand All @@ -804,7 +836,19 @@ private async Task ProcessWebDirectoryAsync(string name, WebDirectory webDirecto

SetRootUrl(httpResponseMessage);

html = await GetHtml(httpResponseMessage);
using (Stream htmlStream = await GetHtmlStream(httpResponseMessage))
{
if (htmlStream != null)
{
html = await GetHtml(htmlStream);
}
else
{
ConvertDirectoryToFile(webDirectory, httpResponseMessage);

return;
}
}
}
}

Expand Down Expand Up @@ -838,7 +882,20 @@ private async Task ProcessWebDirectoryAsync(string name, WebDirectory webDirecto
{
if (html == null)
{
html = await GetHtml(httpResponseMessage);
using (Stream htmlStream = await GetHtmlStream(httpResponseMessage))
{
if (htmlStream != null)
{
html = await GetHtml(htmlStream);
}
else
{
Logger.Warn($"Treated {webDirectory.Url} as file instead of directory ({FileSizeHelper.ToHumanReadable(httpResponseMessage.Content.Headers.ContentLength.Value)})");
ConvertDirectoryToFile(webDirectory, httpResponseMessage);

return;
}
}
}

// UNTESTED (cannot find Calibre with this issue)
Expand Down Expand Up @@ -945,6 +1002,20 @@ private async Task ProcessWebDirectoryAsync(string name, WebDirectory webDirecto
}
}

/// <summary>
/// Re-registers a web directory as a file: removes it from its parent's
/// subdirectories and adds a file entry with the same URL and name.
/// </summary>
/// <param name="webDirectory">The directory entry that turned out not to be a directory listing</param>
/// <param name="httpResponseMessage">The response for that URL, used for the reported file size</param>
private static void ConvertDirectoryToFile(WebDirectory webDirectory, HttpResponseMessage httpResponseMessage)
{
    // Remove it as directory
    webDirectory.ParentDirectory.Subdirectories.Remove(webDirectory);

    // ContentLength is nullable and Content may be absent; casting a missing
    // length straight to long throws. Fall back to 0 (the value used for
    // "no file size") so the entry is still recorded.
    long fileSize = httpResponseMessage.Content?.Headers.ContentLength ?? 0;

    // Add it as a file
    webDirectory.ParentDirectory.Files.Add(new WebFile
    {
        Url = webDirectory.Url,
        FileName = webDirectory.Name,
        FileSize = fileSize
    });
}

private void SetRootUrl(HttpResponseMessage httpResponseMessage)
{
if (FirstRequest)
Expand All @@ -969,6 +1040,104 @@ private static async Task<string> GetHtml(HttpResponseMessage httpResponseMessag
return await httpResponseMessage.Content.ReadAsStringAsync();
}

/// <summary>
/// Reads the remaining content of a stream as text.
/// The reader (and with it the supplied stream) is disposed afterwards.
/// </summary>
/// <param name="stream">Stream to read from</param>
/// <returns>Everything from the current position to the end of the stream</returns>
private static async Task<string> GetHtml(Stream stream)
{
    StreamReader reader = new StreamReader(stream);

    try
    {
        string content = await reader.ReadToEndAsync();

        return content;
    }
    finally
    {
        reader.Dispose();
    }
}

/// <summary>
/// Checks for maximum of 10% control characters, which should not be in HTML.
/// Line feed (10), carriage return (13) and tab (9) are not counted.
/// Only the first <paramref name="length"/> characters are inspected and the
/// percentage is calculated over that same range.
/// </summary>
/// <param name="buffer">Buffer to check</param>
/// <param name="length">Length to check, values larger than the buffer are clamped</param>
/// <returns>True if there is less than 10% control characters, or when there is nothing to check</returns>
private static bool IsHtmlMaybe(char[] buffer, int length)
{
    int checkedLength = Math.Min(length, buffer.Length);

    if (checkedLength <= 0)
    {
        // Nothing to inspect means no control characters were found
        return true;
    }

    int controlChars = 0;

    for (int i = 0; i < checkedLength; i++)
    {
        char c = buffer[i];

        if (char.IsControl(c) && c != 10 && c != 13 && c != 9)
        {
            controlChars++;
        }
    }

    // Divide by the number of inspected characters, not the buffer capacity,
    // otherwise a short response in a large buffer is always seen as "clean"
    return controlChars * 100d / checkedLength < 10;
}

/// <summary>
/// Check HttpResponseMessage for HTML, as good as it can.
/// The first 10kB of the response are inspected: they must contain a '&lt;',
/// pass <see cref="IsHtmlMaybe"/> and contain something that looks like a tag.
/// When they do, the whole response is copied into a memory stream.
/// </summary>
/// <param name="httpResponseMessage">The HttpResponseMessage to read from</param>
/// <returns>A checked stream, positioned at the start, when possible HTML, else null</returns>
private static async Task<Stream> GetHtmlStream(HttpResponseMessage httpResponseMessage)
{
    FixCharSet(httpResponseMessage);

    Encoding encoding = Encoding.ASCII;

    string charSet = httpResponseMessage.Content.Headers.ContentType?.CharSet;

    if (!string.IsNullOrWhiteSpace(charSet))
    {
        encoding = Encoding.GetEncoding(charSet);
    }

    using (Stream stream = await httpResponseMessage.Content.ReadAsStreamAsync())
    using (StreamReader streamReader = new StreamReader(stream, encoding))
    {
        // Check first 10kB for any 'HTML'
        char[] buffer = new char[10 * Constants.Kilobyte];
        int readChars = await streamReader.ReadBlockAsync(buffer, 0, buffer.Length);

        // Only look at the characters that were actually read
        if (Array.IndexOf(buffer, '<', 0, readChars) < 0)
        {
            return null;
        }

        if (!IsHtmlMaybe(buffer, readChars))
        {
            return null;
        }

        Regex htmlRegex = new Regex("<[a-zA-Z0-9] ?([^>]+)>", RegexOptions.IgnoreCase);

        if (!htmlRegex.IsMatch(new string(buffer, 0, readChars)))
        {
            return null;
        }

        // Created only after the checks, so the rejected paths allocate nothing.
        // Don't dispose the writer, it would close the stream for the caller.
        MemoryStream responseStream = new MemoryStream();
        StreamWriter streamWriter = new StreamWriter(responseStream);

        // Write exactly what was read; writing the whole buffer would append
        // the unused (NUL) tail for responses smaller than the buffer
        await streamWriter.WriteAsync(buffer, 0, readChars);

        buffer = new char[Constants.Kilobyte];

        while ((readChars = await streamReader.ReadBlockAsync(buffer, 0, buffer.Length)) > 0)
        {
            // The last block is usually partial; the rest of the buffer still
            // holds characters from the previous block and must not be written
            await streamWriter.WriteAsync(buffer, 0, readChars);
        }

        // The target is an in-memory stream, one flush at the end is enough
        await streamWriter.FlushAsync();
        responseStream.Position = 0;

        return responseStream;
    }
}

private static void FixCharSet(HttpResponseMessage httpResponseMessage)
{
if (httpResponseMessage.Content.Headers.ContentType?.CharSet?.ToLower() == "utf8" || httpResponseMessage.Content.Headers.ContentType?.CharSet == "GB1212")
Expand Down

0 comments on commit 386b00f

Please sign in to comment.