diff --git a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs index f8df995..428d4e4 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs @@ -1,8 +1,12 @@ -using AngleSharp; +using System.Reflection.Metadata; +using System.Text; +using System.Xml.Linq; +using AngleSharp; using AngleSharp.Dom; using AngleSharp.Html.Dom; using AngleSharp.Io; using KoeBook.Core; +using KoeBook.Core.Utilities; using KoeBook.Epub.Contracts.Services; using KoeBook.Epub.Models; using Microsoft.Extensions.DependencyInjection; @@ -15,7 +19,6 @@ public partial class ScrapingAozoraService(ISplitBraceService splitBraceService, private readonly ISplitBraceService _splitBraceService = splitBraceService; private readonly IScrapingClientService _scrapingClientService = scrapingClientService; - public bool IsMatchSite(Uri uri) { return uri.Host == "www.aozora.gr.jp"; @@ -23,11 +26,6 @@ public bool IsMatchSite(Uri uri) public async ValueTask ScrapingAsync(string url, string coverFilePath, string imageDirectory, Guid id, CancellationToken ct) { - var chapterNum = 0; - var sectionNum = 0; - var chapterExist = false; - var sectionExist = false; - var config = Configuration.Default.WithDefaultLoader(); using var context = BrowsingContext.New(config); var doc = await context.OpenAsync(url, ct).ConfigureAwait(false); @@ -41,49 +39,9 @@ public async ValueTask ScrapingAsync(string url, string coverFileP ?? throw new EbookException(ExceptionType.WebScrapingFailed, $"著者の取得に失敗しました。\n以下のリンクから正しい小説のリンクを取得してください。\n{GetCardUrl(url)}"); // EpubDocument の生成 - var document = new EpubDocument(TextReplace(bookTitle.InnerHtml), TextReplace(bookAuther.InnerHtml), coverFilePath, id) - { - // EpubDocument.Chapters の生成 - Chapters = new List() - }; + var document = new EpubDocument(TextReplace(bookTitle.InnerHtml), TextReplace(bookAuther.InnerHtml), coverFilePath, id); - // 目次を取得 - var contents = doc.QuerySelectorAll(".midashi_anchor"); - - // 目次からEpubDocumentを構成 - List contentsIds = new List() { 0 }; - // Chapter, Section が存在するとき、それぞれtrue - chapterExist = false; - sectionExist = false; - if (contents.Length != 0) - { - int previousMidashiId = 0; - foreach (var midashi in contents) - { - if (midashi.Id != null) - { - var MidashiId = int.Parse(midashi.Id.Replace("midashi", "")); - if ((MidashiId - previousMidashiId) == 100) - { - document.Chapters.Add(new Chapter() { Title = TextProcess(midashi) }); - chapterExist = true; - } - if ((MidashiId - previousMidashiId) == 10) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Add(new Section(TextProcess(midashi))); - sectionExist = true; - } - contentsIds.Add(MidashiId); - previousMidashiId = MidashiId; - } - } - } - else - { - document.Chapters.Add(new Chapter() { Title = null }); - document.Chapters[^1].Sections.Add(new Section(bookTitle.InnerHtml)); - } + var (contentsIds, hasChapter, hasSection) = LoadToc(doc, document); // 本文を取得 var mainText = doc.QuerySelector(".main_text")!; @@ -91,494 +49,310 @@ public async ValueTask ScrapingAsync(string url, string coverFileP // 本文を分割しながらEpubDocumntに格納 // 直前のNodeを確認した操作で、その内容をParagraphに追加した場合、true - bool previous = false; + var previous = false; // 各ChapterとSection のインデックス - chapterNum = -1; - sectionNum = -1; + var chapterNum = -1; + var sectionNum = -1; // 直前のimgタグにaltがなかったときtrueになる。 - bool skipCaption = false; + var skipCaption = false; foreach (var element in mainText.Children) { var nextNode = element.NextSibling; - if (element.TagName == "BR") + switch (element.TagName) { - if (previous == true) - { - document.EnsureSection(chapterNum); - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - } - } - else if (element.TagName == "DIV") - { - var midashi = element.QuerySelector(".midashi_anchor"); - if (midashi != null) - { - if (midashi.Id == null) - throw new EbookException(ExceptionType.WebScrapingFailed, "予期しないHTMLの構造です。\nclass=\"midashi_anchor\"ではなくid=\"midashi___\"が存在します。"); - - if (!int.TryParse(midashi.Id.Replace("midashi", ""), out var midashiId)) - throw new EbookException(ExceptionType.WebScrapingFailed, $"予期しないアンカータグが見つかりました。id = {midashi.Id}"); - - if (contentsIds.Contains(midashiId)) + case TagNames.A: + if (previous) { - var contentsId = contentsIds.IndexOf(midashiId); - switch (contentsIds[contentsId] - contentsIds[contentsId - 1]) - { - case 100: - if (chapterNum >= 0 && sectionNum >= 0) - { - document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); - } - chapterNum++; - sectionNum = -1; - break; - case 10: - if (chapterNum == -1) - { - chapterNum++; - sectionNum = -1; - } - if (chapterNum >= 0 && sectionNum >= 0) - { - document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); - } - sectionNum++; - break; - default: - break; - } + document.EnsureSection(chapterNum); + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); } - else //小見出し、行中小見出しの処理 + break; + case TagNames.Div: + var midashi = element.QuerySelector(".midashi_anchor"); + if (midashi != null) { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; - } - document.EnsureParagraph(chapterNum, sectionNum); - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) - { - paragraph.Text += TextProcess(midashi); - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + if (midashi.Id == null) + throw new EbookException(ExceptionType.WebScrapingFailed, "予期しないHTMLの構造です。\nclass=\"midashi_anchor\"ではなくid=\"midashi___\"が存在します。"); - foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(midashi))) - { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1) - { - paragraph1.Text += splitText; - } - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - } - } - } - } - else - { - if (element.ClassName == "caption") - { - // https://www.aozora.gr.jp/annotation/graphics.html#:~:text=%3Cdiv%20class%3D%22caption%22%3E を処理するための部分 - document.EnsureParagraph(chapterNum, sectionNum); - var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements; - if (focusElements[^1] is Paragraph paragraph) - { - var splitted = _splitBraceService.SplitBrace(TextProcess(element)); - var first = true; + if (!int.TryParse(midashi.Id.Replace("midashi", ""), out var midashiId)) + throw new EbookException(ExceptionType.WebScrapingFailed, $"予期しないアンカータグが見つかりました。id = {midashi.Id}"); - foreach (var text in splitted) - { - if (first) - { - paragraph.Text += text; - first = false; - } - else - focusElements.Add(new Paragraph() { Text = text }); - } - } - } - else - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) + if (contentsIds.Contains(midashiId)) { - if (sectionExist) + var contentsId = contentsIds.IndexOf(midashiId); + switch (contentsIds[contentsId] - contentsIds[contentsId - 1]) { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); + case 100: + if (chapterNum >= 0 && sectionNum >= 0) + { + document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(^1); + } + chapterNum++; + sectionNum = -1; + break; + case 10: + if (chapterNum == -1) + { + chapterNum++; + sectionNum = -1; + } + if (chapterNum >= 0 && sectionNum >= 0) + { + document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(^1); + } + sectionNum++; + break; + default: + break; } - sectionNum++; } - document.EnsureParagraph(chapterNum, sectionNum); - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) + else //小見出し、行中小見出しの処理 { - foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element))) - { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1) - { - paragraph1.Text += splitText; - } - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - } + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); } } - } - } - else if (element.TagName == "IMG") - { - if (element is IHtmlImageElement img) - { - if (chapterNum == -1) + else { - if (chapterExist) + if (element.ClassName == "caption") { - document.Chapters.Insert(0, new Chapter()); + // https://www.aozora.gr.jp/annotation/graphics.html#:~:text=%3Cdiv%20class%3D%22caption%22%3E を処理するための部分 + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) + else { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); } - sectionNum++; } - if (element.ClassName != "gaiji") + break; + case TagNames.Img: { + var img = (IHtmlImageElement)element; + + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + + if (element.ClassName == "gaiji") + break; + if (img.Source != null) { - // 画像のダウンロード - var loader = context.GetService(); - if (loader != null) - { - var downloading = loader.FetchAsync(new DocumentRequest(new Url(img.Source))); - ct.Register(() => downloading.Cancel()); - var response = await downloading.Task.ConfigureAwait(false); - using var ms = new MemoryStream(); - await response.Content.CopyToAsync(ms, ct).ConfigureAwait(false); - var filePass = System.IO.Path.Combine(imageDirectory, FileUrlToFileName().Replace(img.Source, "$1")); - File.WriteAllBytes(filePass, ms.ToArray()); - document.EnsureSection(chapterNum); - if (document.Chapters[chapterNum].Sections[sectionNum].Elements.Count > 1) - { - document.Chapters[chapterNum].Sections[sectionNum].Elements.Insert(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1, new Picture(filePass)); - } - } - } - if (img.AlternativeText != null) - { - document.EnsureParagraph(chapterNum, sectionNum); - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) + // 画像のダウンロード + var filePass = Path.Combine(imageDirectory, FileUrlToFileName().Replace(img.Source, "$1")); + await _scrapingClientService.DownloadToFileAsync(img.Source, filePass, ct).ConfigureAwait(false); + document.EnsureSection(chapterNum); + if (document.Chapters[chapterNum].Sections[sectionNum].Elements.Count > 1) { - paragraph.Text += TextReplace(img.AlternativeText); - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Picture(filePass)); } - skipCaption = false; } - else + + if (img.AlternativeText is null) { skipCaption = true; + continue; } - } - } - } - else if (element.TagName == "SPAN") - { - if (element.ClassName == "caption") - { - if (skipCaption) - { - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^2] is Paragraph paragraph)) + + document.EnsureParagraph(chapterNum, sectionNum); + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) { - paragraph.Text = TextProcess(element) + "の画像"; + paragraph.Text += TextReplace(img.AlternativeText); + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); } + skipCaption = false; + break; } - else + case TagNames.Span: + if (element.ClassName == "caption") { - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) - { + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[skipCaption ? ^2 : ^1] is Paragraph paragraph) paragraph.Text = TextProcess(element) + "の画像"; - } } - } - else if (element.ClassName == "notes") - { - switch (element.InnerHtml) + else if (element.ClassName == "notes") { - case "[#改丁]": - break; - case "[#改ページ]": - break; - case "[#改見開き]": - break; - case "[#改段]": - break; - case "[#ページの左右中央]": - break; - default: - document.EnsureParagraph(chapterNum, sectionNum); - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) - { - foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element))) - { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1) - { - paragraph1.Text += splitText; - } - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - } - } - break; - } - } - else - { - if (chapterNum == -1) - { - if (chapterExist) + switch (element.InnerHtml) { - document.Chapters.Insert(0, new Chapter()); + case "[#改丁]": + case "[#改ページ]": + case "[#改見開き]": + case "[#改段]": + case "[#ページの左右中央]": + break; + default: + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); + break; } - chapterNum++; - sectionNum = -1; } - if (sectionNum == -1) + else { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); + // 想定していない構造が見つかったことをログに出力した方が良い? } + break; + default: + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); document.EnsureParagraph(chapterNum, sectionNum); - var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements; - if (focusElements[^1] is Paragraph paragraph) - { - var splitted = _splitBraceService.SplitBrace(TextProcess(element)); - var first = true; - foreach (var text in splitted) - { - if (first) - { - paragraph.Text += text; - first = false; - } - else - focusElements.Add(new Paragraph { Text = text }); - } - } + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); + break; // 想定していない構造が見つかったことをログに出力した方が良い? - } } - else + + if (nextNode is null) + continue; + + if (nextNode.NodeType != NodeType.Text || string.IsNullOrWhiteSpace(nextNode.TextContent)) { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; - } - document.EnsureParagraph(chapterNum, sectionNum); - var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements; - if (focusElements[^1] is Paragraph paragraph) - { - var splitted = _splitBraceService.SplitBrace(TextProcess(element)); - var first = true; - foreach (var text in splitted) - { - if (first) - { - paragraph.Text += text; - first = false; - } - else - focusElements.Add(new Paragraph { Text = text }); - } - } - // 想定していない構造が見つかったことをログに出力した方が良い? + previous = false; + continue; } - if (nextNode != null) - { - if (nextNode.NodeType == NodeType.Text) - { - if (!string.IsNullOrWhiteSpace(nextNode.Text())) - { - previous = true; + previous = true; - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; - } - document.EnsureParagraph(chapterNum, sectionNum); - var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements; - if (focusElements[^1] is Paragraph paragraph) - { - var splitted = _splitBraceService.SplitBrace(TextReplace(nextNode.Text())); - var first = true; - foreach (var text in splitted) - { - if (first) - { - paragraph.Text += text; - first = false; - } - else - focusElements.Add(new Paragraph { Text = text }); - } - } - } - else - { - previous = false; - } - } - else - { - previous = false; - } - } + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, nextNode.TextContent, false); } // 末尾の空のparagraphを削除 - document.Chapters[^1].Sections[^1].Elements.RemoveAt(document.Chapters[^1].Sections[^1].Elements.Count - 1); + document.Chapters[^1].Sections[^1].Elements.RemoveAt(^1); return document; } - private static string TextProcess(IElement element) { - string text = ""; if (element.ChildElementCount == 0) { - text += TextReplace(element.InnerHtml); + return TextReplace(element.InnerHtml); } else { - var rubies = element.QuerySelectorAll("ruby"); + var rubies = element.QuerySelectorAll(TagNames.Ruby); if (rubies.Length > 0) { + var resultBuilder = new StringBuilder(); if (element.Children[0].PreviousSibling is INode node) { if (node.NodeType == NodeType.Text) { if (!string.IsNullOrWhiteSpace(node.Text())) { - text += TextReplace(node.Text()); + resultBuilder.Append(TextReplace(node.Text())); } } } + foreach (var item in element.Children) { - if (item.TagName == "RUBY") + if (item.TagName == TagNames.Ruby) { if (item.QuerySelectorAll("img").Length > 0) { - if (item.QuerySelector("rt") != null) + if (item.QuerySelector("rt") is { TextContent: var text }) { - text += TextReplace(item.QuerySelector("rt")!.TextContent); + resultBuilder.Append(TextReplace(text)); } } else { - text += TextReplace(item.OuterHtml); + resultBuilder.Append(TextReplace(item.OuterHtml)); } } else { if (!string.IsNullOrWhiteSpace(item.TextContent) && (!string.IsNullOrEmpty(item.TextContent))) { - text += TextReplace(item.TextContent); + resultBuilder.Append(TextReplace(item.TextContent)); } } if (item.NextSibling != null) { if (!string.IsNullOrWhiteSpace(item.NextSibling.TextContent) && (!string.IsNullOrEmpty(item.NextSibling.TextContent))) { - text += TextReplace(item.NextSibling.Text()); + resultBuilder.Append(TextReplace(item.NextSibling.Text())); } } } + return resultBuilder.ToString(); } - else if (element.TagName == "RUBY") + else if (element.TagName == TagNames.Ruby) { if (element.QuerySelectorAll("img").Length > 0) { - if (element.QuerySelector("rt") != null) - { - text += TextReplace(element.QuerySelector("rt")!.TextContent); - } + if (element.QuerySelector("rt") is { TextContent: var text }) + return TextReplace(text); + else + return ""; } else { - text += TextReplace(element.OuterHtml); + return TextReplace(element.OuterHtml); } } else { - text += TextReplace(element.TextContent); + return TextReplace(element.TextContent); } } - return text; } + private void AddParagraphs(List focusElements, IElement element, bool lastEmpty) + { + if (focusElements[^1] is Paragraph paragraph) + { + var splitted = _splitBraceService.SplitBrace(TextProcess(element)); + var first = true; + foreach (var text in splitted) + { + if (first) + { + paragraph.Text += text; + first = false; + } + else + focusElements.Add(new Paragraph { Text = text }); + } - // ローマ数字、改行の置換をまとめて行う。 + if (lastEmpty) + focusElements.Add(new Paragraph()); + } + } + + private void AddParagraphs(List focusElements, string input, bool lastEmpty) + { + if (focusElements[^1] is Paragraph paragraph) + { + var splitted = _splitBraceService.SplitBrace(TextReplace(input)); + var first = true; + foreach (var text in splitted) + { + if (first) + { + paragraph.Text += text; + first = false; + } + else + focusElements.Add(new Paragraph { Text = text }); + } + + if (lastEmpty) + focusElements.Add(new Paragraph()); + } + } + + /// + /// ローマ数字、改行の置換をまとめて行う。 + /// private static string TextReplace(string text) { string returnText = text; @@ -589,12 +363,167 @@ private static string TextReplace(string text) return returnText; } + /// + /// 目次からEpubDocuemntを構成します + /// + /// + /// + /// contentsIds: 見出しIDの数字部分。※EpubDocumentのChapter, Sectionとは一致しません + /// Chapterが存在するとき + /// Sectionが存在するとき + /// + /// + private static (List contentsIds, bool hasChapter, bool hasSection) LoadToc(IDocument doc, EpubDocument epubDocument) + { + // 目次を取得 + var contents = doc.QuerySelectorAll(".midashi_anchor"); + + // 目次からEpubDocumentを構成 + var contentsIds = new List() { 0 }; + // Chapter, Section が存在するとき、それぞれtrue + var hasChapter = false; + var hasSection = false; + if (contents.Length != 0) + { + int previousMidashiId = 0; + foreach (var midashi in contents) + { + if (midashi.Id != null) + { + var midashiId = int.Parse(midashi.Id.Replace("midashi", "")); + if ((midashiId - previousMidashiId) == 100) + { + epubDocument.Chapters.Add(new Chapter() { Title = TextProcess(midashi) }); + hasChapter = true; + } + else if ((midashiId - previousMidashiId) == 10) + { + epubDocument.EnsureChapter(); + epubDocument.Chapters[^1].Sections.Add(new Section(TextProcess(midashi))); + hasSection = true; + } + contentsIds.Add(midashiId); + previousMidashiId = midashiId; + } + } + } + else + { + epubDocument.Chapters.Add(new Chapter() + { + Title = null, + Sections = [new Section(epubDocument.Title)] + }); + } + return (contentsIds, hasChapter, hasSection); + } + + /// + /// 新規状態のときに初期設定を行います + /// + private static (int focusChapterIdx, int focusSectionIdx) SetChapterAndSection(EpubDocument document, bool hasChapter, bool hasSection, int chapterNum, int sectionNum) + { + if (chapterNum == -1) + { + if (hasChapter) + { + document.Chapters.Insert(0, new Chapter()); + } + chapterNum++; + sectionNum = -1; + } + if (sectionNum == -1) + { + if (hasSection) + { + document.EnsureChapter(); + document.Chapters[^1].Sections.Insert(0, new Section("___")); + } + sectionNum++; + } + return (chapterNum, sectionNum); + } private static string GetCardUrl(string url) { return UrlBookToCard().Replace(url, "$1card$2$3"); } + /// + /// class="main_text"なdiv要素の内容をに書き込む + /// + /// 書き込むEpubDocument + /// class = "main_text" なdiv要素 + internal void ProcessMainText(EpubDocument document, IHtmlDivElement mainText) + { + // 青空文庫の見出しのaタグのidの数値に対応 + int headingId = 0; + SplittedLineBuilder paragraphLineBuilder = new(); + SplittedLineBuilder scriptLineLineBuilder = new(); + // 作品中で使われるCSSスタイルを実現するために必要なclassの情報を保持する。 + // 例: + // 字下げに使われる class "jisage_1", "jisage_2", ..., "jisage_n"で、 n がいくつになるかは、その作品全体をチェックしないとわからないため、 + Dictionary classes = new(); + + //ProcessChildren(); する。 + } + + /// + /// EpubDocumentに対してある要素に応じた処理を行う。 + /// + /// 処理対象のEpubDocument + /// 処理を行う要素 + /// 適用されるclassのリスト + /// + internal void ProcessChildren(EpubDocument document, IElement element, string appliedClasses, ref int headingId, SplittedLineBuilder paragraphLineBuilder, SplittedLineBuilder scriptLineLineBuilder, Dictionary classes) + { + + } + + /// + /// に基づき、EpubDocument内で使用するクラスを生成する。 + /// + /// を変更するEpubDocument + void AddCssClasses(EpubDocument document, Dictionary classes) + { + (int min, int max) value = (0, 0); + if (classes.TryGetValue("jisage", out value)) + { + for (int i = value.min; i <= value.max; i++) + { + document.CssClasses.Add(new CssClass("jisage", $@" + .jisage_{i} {{ + margin-left: {i}em; + }} + ")); + } + } + if (classes.TryGetValue("text_indent", out value)) + { + for (int i = value.min; i <= value.max; i++) + { + document.CssClasses.Add(new CssClass("text_indent", $@" + .text_indent_{i} {{ + text-indent: {i}em; + }} + ")); + } + } + if (classes.TryGetValue("chitsuki", out value)) + { + for (int i = value.min; i <= value.max; i++) + { + document.CssClasses.Add(new CssClass("chitsuki", $@" + .chitsuki_{i} {{ + text-align: right; + margin-right: {i}em; + }} + ")); + } + } + } + + [System.Text.RegularExpressions.GeneratedRegex(@"(https://www\.aozora\.gr\.jp/cards/\d{6}/)files/(\d{1,})_\d{1,}(\.html)")] private static partial System.Text.RegularExpressions.Regex UrlBookToCard(); diff --git a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs index 2fd47d0..6741549 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs @@ -131,7 +131,7 @@ public async ValueTask ScrapingAsync(string url, string coverFileP { switch (child) { - case { TagName: TagNames.Anchor, Children: [IHtmlImageElement img] } when img.Source is not null: + case { TagName: TagNames.A, Children: [IHtmlImageElement img] } when img.Source is not null: { // 画像のダウンロード var filePath = Path.Combine(imageDirectory, new Uri(img.Source, Options.RawUri).Segments[^1].TrimEnd('/')); @@ -143,7 +143,7 @@ public async ValueTask ScrapingAsync(string url, string coverFileP if (!string.IsNullOrWhiteSpace(item.InnerHtml)) lineBuilder.Append(item.InnerHtml); break; - case { TagName: TagNames.BreakRow }: + case { TagName: TagNames.Br }: foreach (var split in _splitBraceService.SplitBrace(lineBuilder.ToLinesAndClear())) { section.Elements.Add(new Paragraph() { Text = split }); diff --git a/Epub/KoeBook.Epub/TagNames.cs b/Epub/KoeBook.Epub/TagNames.cs index e98e4c0..400d52c 100644 --- a/Epub/KoeBook.Epub/TagNames.cs +++ b/Epub/KoeBook.Epub/TagNames.cs @@ -1,15 +1,12 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace KoeBook.Epub +namespace KoeBook.Epub { internal static class TagNames { - public const string Anchor = "A"; + public const string A = "A"; + public const string Br = "BR"; + public const string Div = "Div"; + public const string Img = "IMG"; public const string Ruby = "RUBY"; - public const string BreakRow = "BR"; + public const string Span = "SPAN"; } } diff --git a/KoeBook.Core/Utilities/EnumerableEx.cs b/KoeBook.Core/Utilities/EnumerableEx.cs index 4b1ce37..eab16f2 100644 --- a/KoeBook.Core/Utilities/EnumerableEx.cs +++ b/KoeBook.Core/Utilities/EnumerableEx.cs @@ -20,4 +20,9 @@ public static class EnumerableEx yield return (current, false, !hasNext); } } + + public static void RemoveAt(this List list, Index index) + { + list.RemoveAt(index.GetOffset(list.Count)); + } } diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs new file mode 100644 index 0000000..18a7b41 --- /dev/null +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -0,0 +1,201 @@ +using System.Runtime.CompilerServices; +using AngleSharp; +using AngleSharp.Dom; +using KoeBook.Epub.Models; +using KoeBook.Epub.Services; +using KoeBook.Core.Models; +using Microsoft.Extensions.DependencyInjection; +using KoeBook.Epub.Contracts.Services; +using AngleSharp.Html.Dom; + +namespace KoeBook.Test.Epub; + +public class ScrapingAozoraServiceTest : DiTestBase +{ + private readonly ScrapingAozoraService _scrapingAozoraService; + + public ScrapingAozoraServiceTest() + { + _scrapingAozoraService = Host.Services + .GetServices() + .OfType() + .Single(); + } + + private static EpubDocument EmptySingleParagraph + => new("", "", "", Guid.NewGuid()) + { + Chapters = [ + new() + { + Sections = [new Section("") { Elements = [new Paragraph()] }] + }] + }; + + + /// + /// (htmlの要素の)テキストを"
"で囲む + ///
+ /// divタグで囲む htmlの要素 + /// divタグで囲まれた + private static string ToMainText(string text) + { + return @$"
{text}
"; + } + + [Theory] + // レイアウト1.1 改丁 + [InlineData(@"
[#改丁]
", "[#改丁]", "")] + // レイアウト1.2 改ページ + [InlineData(@"
[#改ページ]
", "[#改ページ]", "")] + // レイアウト1.3 改見開き + [InlineData(@"
[#改見開き]
", "[#改見開き]", "")] + // レイアウト1.4 改段 + [InlineData(@"
[#改段]
", "[#改段]", "")] + public async void ProcessChildrenLayout1Test(string html, string expectedParagraphText, string expectedScriptText) + { + var config = Configuration.Default.WithDefaultLoader(); + using var context = BrowsingContext.New(config); + var doc = await context.OpenAsync(request => request.Content(html)); + var mainText = doc.DocumentElement.LastElementChild?.LastElementChild as IHtmlDivElement; + if (mainText == null) + Assert.Fail(); + var document = EmptySingleParagraph; + + _scrapingAozoraService.ProcessMainText(document, mainText); + + var chapter = Assert.Single(document.Chapters); + var section = Assert.Single(chapter.Sections); + var paragraph = Assert.IsType(section.Elements[^1]); + Assert.Equal(expectedParagraphText, paragraph.Text); + Assert.Equal(string.Empty, paragraph.ClassName); + Assert.NotNull(paragraph.ScriptLine); + Assert.Equal(expectedScriptText, paragraph.ScriptLine.Text); + } + + // Classes の各 value は、対応するclass で、ソースに出てきたものの内、最大のものの値をほじするようにする。 + public static object[][] ProcessChildrenlayout2TestCases() + { + (string, Paragraph[], (string, (int, int))[])[] cases = [ + // レイアウト2.1 1行だけの字下げ + (@"
text

", [new Paragraph() { Text = "text", ClassName = "jisage_3", ScriptLine = new ScriptLine("text", "", "") }], [("jisage", (1, 3))]), + // レイアウト2.2 ブロックでの字下げ + (@"
text1
text2

", [new Paragraph() { Text = "text1", ClassName = "jisage_3", ScriptLine = new ScriptLine("text1", "", "") }, new Paragraph() { Text = "text2", ClassName = "jisage_3", ScriptLine = new ScriptLine("text2", "", "") },], [("jisage", (1, 3))]), + // レイアウト2.3 凹凸の複雑な字下げ + (@"
Long Text
", [new Paragraph() { Text = "Long Text", ClassName = "jisage_3 text_indent_-1" }], [("jisage", (1, 3)), ("text_indent", (-1, 0))]), + // レイアウト2.4 は特定の書き方について述べていないので省略。 + // レイアウト2.5 地付き + (@"
text
", [new Paragraph() { Text = "text", ClassName = "chitsuki_0", ScriptLine = new ScriptLine("text", "", "") }], [("chitsuki", (0, 0))]), + + + //
の後の
がないパターン + (@"
text
", [new Paragraph() { Text = "text", ClassName = "jisage_3", ScriptLine = new ScriptLine("text", "", "") }], [("jisage", (1, 3))]), + //
の前の
がないパターン + (@"
text
", [new Paragraph() { Text = "text", ClassName = "jisage_3 text_indent_-1", ScriptLine = new ScriptLine("text", "", "") }], [("jisage", (1, 3)), ("text_indent", (-1, 0))]), + + ]; + return cases.Select(c => new object[] { ToMainText(c.Item1), c.Item2, c.Item3 }).ToArray(); + } + + [Theory] + [MemberData(nameof(ProcessChildrenlayout2TestCases))] + public async void ProcessChildrenLayout2Test(string html, IReadOnlyCollection expectedParagraphs, IEnumerable<(string key, (int min, int max) value)> expectedDictionary) + { + var config = Configuration.Default.WithDefaultLoader(); + using var context = BrowsingContext.New(config); + var doc = await context.OpenAsync(request => request.Content(html)); + var mainText = doc.QuerySelector(".main_text") as IHtmlDivElement; + if (mainText == null) + Assert.Fail(); + var document = EmptySingleParagraph; + _scrapingAozoraService._Classes().Clear(); + + _scrapingAozoraService.ProcessMainText(document, mainText); + + var chapter = Assert.Single(document.Chapters); + var section = Assert.Single(chapter.Sections); + Assert.Equal(expectedParagraphs.Count, document.Chapters[^1].Sections[^1].Elements.Count); + Assert.All(expectedParagraphs.Zip(document.Chapters[^1].Sections[^1].Elements), v => + { + var actualParagraph = Assert.IsType(v.Second); + Assert.Equal(v.First.Text, actualParagraph.Text); + Assert.Equal(v.First.ClassName, actualParagraph.ClassName); + Assert.NotNull(actualParagraph.ScriptLine); + Assert.Equal(v.First.ScriptLine?.Text, actualParagraph.ScriptLine.Text); + }); + // ScrapingAozoraService.Classes の確認 + Assert.All(expectedDictionary, expectedKeyValuePair => + { + Assert.True(_scrapingAozoraService._Classes().TryGetValue(expectedKeyValuePair.key, out var actualValue)); + Assert.True(actualValue.min <= expectedKeyValuePair.value.min); + Assert.True(actualValue.max >= expectedKeyValuePair.value.max); + }); + } + + internal class httpClientFactory : IHttpClientFactory + { + public HttpClient CreateClient(string name) + { + return httpClient; + } + + private static readonly HttpClient httpClient = new HttpClient(); + + } + + + [Theory] + [InlineData("", "")] + public async Task TextProcess(string input, string expected) + { + using var context = BrowsingContext.New(Configuration.Default); + using var doc = await context.OpenAsync(req => req.Content(input)); + Assert.NotNull(doc.ParentElement); + + var result = ScrapingAozora.TextProcess(null, doc.ParentElement!); + + Assert.Equal(expected, result); + } + + [Theory] + [InlineData("", new[] { "" })] + public async Task AddParagraphs1(string input, string[] expected) + { + using var context = BrowsingContext.New(Configuration.Default); + using var doc = await context.OpenAsync(req => req.Content(input)); + Assert.NotNull(doc.ParentElement); + var epubDocument = new EpubDocument("title", "author", "", default) + { + Chapters = [new() { Sections = [new("section title") { Elements = [new Paragraph() { Text = "test" }] }] }] + }; + + Assert.Equal(expected.Length, epubDocument.Chapters[0].Sections[0].Elements.Count); + Assert.All(epubDocument.Chapters[0].Sections[0].Elements.Zip(expected), v => + { + var (element, expected) = v; + var paragraph = Assert.IsType(element); + Assert.Equal(expected, paragraph.Text); + }); + } +} + +file static class ScrapingAozora +{ + [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)] + public static extern string TextProcess(ScrapingAozoraService? _, IElement element); + + [UnsafeAccessor(UnsafeAccessorKind.Method)] + public static extern void AddParagraphs(ScrapingAozoraService service, List focusElements, IElement element, bool lastEmpty); + + [UnsafeAccessor(UnsafeAccessorKind.Method)] + public static extern void AddParagraphs(ScrapingAozoraService service, List focusElements, string input, bool lastEmpty); + + [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)] + public static extern string TextReplace(ScrapingAozoraService? _, string text); + + [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)] + public static extern (List contentsIds, bool hasChapter, bool hasSection) LoadToc(ScrapingAozoraService? _, IDocument doc, EpubDocument epubDocument); + + [UnsafeAccessor(UnsafeAccessorKind.Field, Name = "Classes")] + public static extern Dictionary _Classes(this ScrapingAozoraService scraper); +}