diff --git a/plugins/org.obeonetwork.m2doc.html/src/org/obeonetwork/m2doc/html/services/M2DocHTMLParser.java b/plugins/org.obeonetwork.m2doc.html/src/org/obeonetwork/m2doc/html/services/M2DocHTMLParser.java index 1d72dcc22..b4a45f780 100644 --- a/plugins/org.obeonetwork.m2doc.html/src/org/obeonetwork/m2doc/html/services/M2DocHTMLParser.java +++ b/plugins/org.obeonetwork.m2doc.html/src/org/obeonetwork/m2doc/html/services/M2DocHTMLParser.java @@ -21,9 +21,11 @@ import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.Stack; import org.apache.poi.xwpf.usermodel.XWPFDocument; @@ -96,6 +98,108 @@ public class M2DocHTMLParser extends Parser { */ private static final String UL_TAG = "ul"; + /** + * The br HTML tag. + */ + private static final String BR_TAG = "br"; + + private static final Set BR_CONTINUE_TAGS = initializeBrContinueTags(); + + /** + * The small HTML tag. + */ + private static final String SMALL_TAG = "small"; + + /** + * The big HTML tag. + */ + private static final String BIG_TAG = "big"; + + /** + * The a HTML tag. + */ + private static final String A_TAG = "a"; + + /** + * The font HTML tag. + */ + private static final String FONT_TAG = "font"; + + /** + * The sup HTML tag. + */ + private static final String SUP_TAG = "sup"; + + /** + * The sub HTML tag. + */ + private static final String SUB_TAG = "sub"; + + /** + * The ins HTML tag. + */ + private static final String INS_TAG = "ins"; + + /** + * The u HTML tag. + */ + private static final String U_TAG = "u"; + + /** + * The del HTML tag. + */ + private static final String DEL_TAG = "del"; + + /** + * The strike HTML tag. + */ + private static final String STRIKE_TAG = "strike"; + + /** + * The s HTML tag. + */ + private static final String S_TAG = "s"; + + /** + * The cite HTML tag. 
+ */ + private static final String CITE_TAG = "cite"; + + /** + * The var HTML tag. + */ + private static final String VAR_TAG = "var"; + + /** + * The i HTML tag. + */ + private static final String I_TAG = "i"; + + /** + * The em HTML tag. + */ + private static final String EM_TAG = "em"; + + /** + * The b HTML tag. + */ + private static final String B_TAG = "b"; + + /** + * The strong HTML tag. + */ + private static final String STRONG_TAG = "strong"; + + /** + * The span HTML tag. + */ + private static final String SPAN_TAG = "span"; + + /** + * The p HTML tag. + */ + private static final String P_TAG = "p"; + /** * The svg HTML tag. */ @@ -644,6 +748,37 @@ public M2DocHTMLParser(URIConverter uriConverter, XWPFDocument destinationDocume this.destinationDocument = destinationDocument; } + /** + * Initializes the {@link Set} of tags that can continue a list for {@link #BR_TAG} removal. + * + * @return the {@link Set} of tags that can continue a list for {@link #BR_TAG} removal + */ + private static Set initializeBrContinueTags() { + final Set res = new HashSet(); + + res.add(BR_TAG); + res.add(SMALL_TAG); + res.add(BIG_TAG); + res.add(A_TAG); + res.add(FONT_TAG); + res.add(SUP_TAG); + res.add(SUB_TAG); + res.add(INS_TAG); + res.add(U_TAG); + res.add(DEL_TAG); + res.add(STRIKE_TAG); + res.add(S_TAG); + res.add(CITE_TAG); + res.add(VAR_TAG); + res.add(I_TAG); + res.add(EM_TAG); + res.add(B_TAG); + res.add(STRONG_TAG); + res.add(SPAN_TAG); + + return res; + } + /** * Parses the given HTML {@link String} with the given base URI. 
* @@ -670,6 +805,7 @@ public List parse(URI baseURI, String htmlString) { parseHead(baseURI, headElement); final Element bodyElement = htmlElement.getElementsByTag("body").get(0); + cleanHTML(bodyElement); final MStyle defaultStyle = new MStyleImpl(null, -1, null, null, -1); if (document.body().hasAttr("bgcolor")) { defaultStyle.setBackgroundColor(htmlToColor(bodyElement.attr("bgcolor").toLowerCase())); @@ -693,6 +829,85 @@ public List parse(URI baseURI, String htmlString) { return res; } + /** + * Cleans the HTML code. + * + * @param node + * the Node to clean + */ + private void cleanHTML(Node node) { + final List toRemove = new ArrayList(); + + if (node instanceof Element) { + if (isHidden(node)) { + toRemove.add(node); + } else { + List nodeList = new ArrayList(); + for (Node child : node.childNodes()) { + cleanHTML(child); + if (continueBrList(child)) { + nodeList.add(child); + } else if (!nodeList.isEmpty()) { + removeBrTags(toRemove, nodeList); + nodeList = new ArrayList(); + } + } + if (!nodeList.isEmpty()) { + removeBrTags(toRemove, nodeList); + } + } + } + + for (Node remove : toRemove) { + remove.remove(); + } + } + + /** + * Tells if the {@link #BR_TAG} removal {@link List} should be continued with the given {@link Node}. + * + * @param node + * the {@link Node} + * @return true if the {@link #BR_TAG} removal {@link List} should be continued, false otherwise + */ + private boolean continueBrList(Node node) { + final boolean res; + + if (node instanceof TextNode) { + res = true; + } else { + res = BR_CONTINUE_TAGS.contains(node.nodeName()); + } + + return res; + } + + /** + * Add irrelevant {@link #BR_TAG} from the given {@link Node} {@link List} to the given to remove {@link List}. 
+ * + * @param toRemove + * the to remove {@link List} + * @param nodeList + * the {@link Node} {@link List} + */ + private void removeBrTags(final List toRemove, List nodeList) { + boolean onlyBrTag = true; + Node lastBrTag = null; + for (Node child : nodeList) { + final boolean isEmptyTextChild = child instanceof TextNode && text((TextNode) child).trim().isEmpty(); + final boolean isBrChild = BR_TAG.equals(child.nodeName()); + onlyBrTag = onlyBrTag && (isBrChild || isEmptyTextChild); + if (isBrChild) { + lastBrTag = child; + } else if (lastBrTag != null && !isEmptyTextChild) { + lastBrTag = null; + } + } + if (!onlyBrTag && lastBrTag != null) { + toRemove.add(lastBrTag); + } + } + /** * Initializes SVG to prevent JSoup's HTML parser from breaking case sensitive SVG. * @@ -758,7 +973,7 @@ private void parseHead(URI baseURI, Element headElement) { */ private void walkNodeTree(MParagraph parent, Context context, Node node, Element lastElement) { final Context contextCopy = context.copy(); - if (node instanceof Element && !isHidden(node)) { + if (node instanceof Element) { if ("table".equals(node.nodeName())) { Node tHeader = null; for (Node child : node.childNodes()) { @@ -1332,7 +1547,7 @@ private MParagraph startElement(MParagraph parent, Context context, Element elem final String nodeName = element.nodeName(); boolean isNumbering = false; - if ("p".equals(nodeName)) { + if (P_TAG.equals(nodeName)) { res = createMParagraph(context, parent, element, null, null); } else if (BLOCKQUOTE_TAG.equals(nodeName)) { if (element.childNodeSize() > 0 && element.childNode(0) instanceof TextNode) { @@ -1347,91 +1562,92 @@ private MParagraph startElement(MParagraph parent, Context context, Element elem } else { res = parent; } - } else if ("strong".equals(nodeName) || "b".equals(nodeName)) { + } else if (STRONG_TAG.equals(nodeName) || B_TAG.equals(nodeName)) { setModifiers(context.style, MStyle.FONT_BOLD); res = parent; - } else if ("em".equals(nodeName) || "i".equals(nodeName) 
|| "var".equals(nodeName) || "cite".equals(nodeName)) { - setModifiers(context.style, MStyle.FONT_ITALIC); - res = parent; - } else if ("s".equals(nodeName) || "strike".equals(nodeName) || "del".equals(nodeName)) { - setModifiers(context.style, MStyle.FONT_STRIKE_THROUGH); - res = parent; - } else if ("u".equals(nodeName) || "ins".equals(nodeName)) { - setModifiers(context.style, MStyle.FONT_UNDERLINE); - res = parent; - } else if ("sub".equals(nodeName)) { - setModifiers(context.style, MStyle.SUBSCRIPT); - res = parent; - } else if ("sup".equals(nodeName)) { - setModifiers(context.style, MStyle.SUPERSCRIPT); - res = parent; - } else if ("font".equals(nodeName)) { - if (element.hasAttr("color")) { - context.style.setForegroundColor(htmlToColor(element.attr("color").toLowerCase())); - } - if (element.hasAttr("face")) { - // TODO double check this - context.style.setFontName(element.attr("face")); - } - if (element.hasAttr(SIZE_ATTR)) { - context.style.setFontSize(fontSizeToPoint(element.attr(SIZE_ATTR))); - } - res = parent; - } else if ("a".equals(nodeName)) { - context.linkTargetURI = toURI(context.baseURI, element.attr(HREF_ATTR)); - if (element.hasAttr("title")) { - context.linkTitle = element.attr("title"); - } - res = parent; - } else if ("br".equals(nodeName)) { - final MList parentContents = (MList) parent.getContents(); - parentContents.add(MPagination.ligneBreak); - res = parent; - } else if ("li".equals(nodeName)) { - res = createMParagraph(context, parent, element, context.numberingID.longValue(), - context.numberingLevel - 1); - isNumbering = true; - } else if (OL_TAG.equals(nodeName)) { - setOrderedListNumbering(context, element); - isNumbering = true; - res = parent; - } else if (UL_TAG.equals(nodeName)) { - setUnorderedListNumbering(context, element); - isNumbering = true; - res = parent; - } else if ("img".equals(nodeName)) { - final MImage mImage = createMImage(context, element); - final MList parentContents = (MList) parent.getContents(); - 
parentContents.add(mImage); - res = parent; - } else if ("big".equals(nodeName)) { - setBigFont(context); - res = parent; - } else if ("small".equals(nodeName)) { - setSmallFont(context); - res = parent; - } else - if ("tt".equals(nodeName) || "code".equals(nodeName) || "samp".equals(nodeName) || "kbd".equals(nodeName)) { - context.style.setFontName(COURIER_NEW_FONT); + } else if (EM_TAG.equals(nodeName) || I_TAG.equals(nodeName) || VAR_TAG.equals(nodeName) + || CITE_TAG.equals(nodeName)) { + setModifiers(context.style, MStyle.FONT_ITALIC); res = parent; - } else if ("h1".equals(nodeName)) { - res = createHeading(parent, context, element, H1_FONT_SIZE); - } else if ("h2".equals(nodeName)) { - res = createHeading(parent, context, element, H2_FONT_SIZE); - } else if ("h3".equals(nodeName)) { - res = createHeading(parent, context, element, H3_FONT_SIZE); - } else if ("h4".equals(nodeName)) { - res = createHeading(parent, context, element, H4_FONT_SIZE); - } else if ("h5".equals(nodeName)) { - res = createHeading(parent, context, element, H5_FONT_SIZE); - } else if ("h6".equals(nodeName)) { - res = createHeading(parent, context, element, H6_FONT_SIZE); - } else if (CENTER_TAG.equals(nodeName)) { - res = createMParagraph(context, parent, element, null, null); - res.setHAlignment(HAlignment.CENTER); - } else { + } else if (S_TAG.equals(nodeName) || STRIKE_TAG.equals(nodeName) || DEL_TAG.equals(nodeName)) { + setModifiers(context.style, MStyle.FONT_STRIKE_THROUGH); res = parent; - } + } else if (U_TAG.equals(nodeName) || INS_TAG.equals(nodeName)) { + setModifiers(context.style, MStyle.FONT_UNDERLINE); + res = parent; + } else if (SUB_TAG.equals(nodeName)) { + setModifiers(context.style, MStyle.SUBSCRIPT); + res = parent; + } else if (SUP_TAG.equals(nodeName)) { + setModifiers(context.style, MStyle.SUPERSCRIPT); + res = parent; + } else if (FONT_TAG.equals(nodeName)) { + if (element.hasAttr("color")) { + 
context.style.setForegroundColor(htmlToColor(element.attr("color").toLowerCase())); + } + if (element.hasAttr("face")) { + // TODO double check this + context.style.setFontName(element.attr("face")); + } + if (element.hasAttr(SIZE_ATTR)) { + context.style.setFontSize(fontSizeToPoint(element.attr(SIZE_ATTR))); + } + res = parent; + } else if (A_TAG.equals(nodeName)) { + context.linkTargetURI = toURI(context.baseURI, element.attr(HREF_ATTR)); + if (element.hasAttr("title")) { + context.linkTitle = element.attr("title"); + } + res = parent; + } else if (BR_TAG.equals(nodeName)) { + final MList parentContents = (MList) parent.getContents(); + parentContents.add(MPagination.ligneBreak); + res = parent; + } else if ("li".equals(nodeName)) { + res = createMParagraph(context, parent, element, context.numberingID.longValue(), + context.numberingLevel - 1); + isNumbering = true; + } else if (OL_TAG.equals(nodeName)) { + setOrderedListNumbering(context, element); + isNumbering = true; + res = parent; + } else if (UL_TAG.equals(nodeName)) { + setUnorderedListNumbering(context, element); + isNumbering = true; + res = parent; + } else if ("img".equals(nodeName)) { + final MImage mImage = createMImage(context, element); + final MList parentContents = (MList) parent.getContents(); + parentContents.add(mImage); + res = parent; + } else if (BIG_TAG.equals(nodeName)) { + setBigFont(context); + res = parent; + } else if (SMALL_TAG.equals(nodeName)) { + setSmallFont(context); + res = parent; + } else if ("tt".equals(nodeName) || "code".equals(nodeName) || "samp".equals(nodeName) + || "kbd".equals(nodeName)) { + context.style.setFontName(COURIER_NEW_FONT); + res = parent; + } else if ("h1".equals(nodeName)) { + res = createHeading(parent, context, element, H1_FONT_SIZE); + } else if ("h2".equals(nodeName)) { + res = createHeading(parent, context, element, H2_FONT_SIZE); + } else if ("h3".equals(nodeName)) { + res = createHeading(parent, context, element, H3_FONT_SIZE); + } else if 
("h4".equals(nodeName)) { + res = createHeading(parent, context, element, H4_FONT_SIZE); + } else if ("h5".equals(nodeName)) { + res = createHeading(parent, context, element, H5_FONT_SIZE); + } else if ("h6".equals(nodeName)) { + res = createHeading(parent, context, element, H6_FONT_SIZE); + } else if (CENTER_TAG.equals(nodeName)) { + res = createMParagraph(context, parent, element, null, null); + res.setHAlignment(HAlignment.CENTER); + } else { + res = parent; + } if (!isNumbering) { context.numbering = null; @@ -1688,13 +1904,13 @@ private void setOrderedListNumbering(Context context, Element element) { } else if ("A".equals(typeStr) || CSS_PARSER.hasCSS(context.cssProperties, M2DocCSSParser.CSS_LIST_STYLE_TYPE, "upper-alpha")) { type = STNumberFormat.UPPER_LETTER; - } else if ("a".equals(typeStr) + } else if (A_TAG.equals(typeStr) || CSS_PARSER.hasCSS(context.cssProperties, M2DocCSSParser.CSS_LIST_STYLE_TYPE, "lower-alpha")) { type = STNumberFormat.LOWER_LETTER; } else if ("I".equals(typeStr) || CSS_PARSER.hasCSS(context.cssProperties, M2DocCSSParser.CSS_LIST_STYLE_TYPE, "upper-roman")) { type = STNumberFormat.UPPER_ROMAN; - } else if ("i".equals(typeStr) || CSS_PARSER.hasCSS(context.cssProperties, + } else if (I_TAG.equals(typeStr) || CSS_PARSER.hasCSS(context.cssProperties, M2DocCSSParser.CSS_LIST_STYLE_TYPE, "lower-roman")) { type = STNumberFormat.LOWER_ROMAN; } else diff --git a/tests/org.obeonetwork.m2doc.html.tests/resources/html/br/br-expected-generation.OK.docx b/tests/org.obeonetwork.m2doc.html.tests/resources/html/br/br-expected-generation.OK.docx new file mode 100644 index 000000000..4aa7a930a Binary files /dev/null and b/tests/org.obeonetwork.m2doc.html.tests/resources/html/br/br-expected-generation.OK.docx differ diff --git a/tests/org.obeonetwork.m2doc.html.tests/resources/html/br/br-expected-generation.docx b/tests/org.obeonetwork.m2doc.html.tests/resources/html/br/br-expected-generation.docx index 912d6f006..806562ce8 100644 Binary files 
a/tests/org.obeonetwork.m2doc.html.tests/resources/html/br/br-expected-generation.docx and b/tests/org.obeonetwork.m2doc.html.tests/resources/html/br/br-expected-generation.docx differ diff --git a/tests/org.obeonetwork.m2doc.html.tests/resources/html/br/doc.html b/tests/org.obeonetwork.m2doc.html.tests/resources/html/br/doc.html index 7c7165279..87e9988ed 100644 --- a/tests/org.obeonetwork.m2doc.html.tests/resources/html/br/doc.html +++ b/tests/org.obeonetwork.m2doc.html.tests/resources/html/br/doc.html @@ -2,9 +2,69 @@ -

-To break lines
in a text,
use the br element. -

+ To break lines
in a text,
use the br element. +

********************

+
+

********************

+
+
+

********************

+
+
+
+

********************

+ A Line
+

********************

+
A Line +

********************

+
A Line
+

********************

+ A Line
+
    +
  • Item1
  • +
  • Item2
  • +
  • Item3
  • +
+

********************

+

+ To break lines
in a text,
use the br element. +

+

********************

+

+
+

+

********************

+

+
+
+

+

********************

+

+
+
+
+

+

********************

+

+ A Line
+

+

********************

+

+
A Line +

+

********************

+

+
A Line
+

+

********************

+

+ A Line
+

    +
  • Item1
  • +
  • Item2
  • +
  • Item3
  • +
+

diff --git a/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug449_2/bug449_2-expected-generation.docx b/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug449_2/bug449_2-expected-generation.docx index d286cd900..f8cf20b05 100644 Binary files a/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug449_2/bug449_2-expected-generation.docx and b/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug449_2/bug449_2-expected-generation.docx differ diff --git a/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug486/bug486-expected-generation.docx b/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug486/bug486-expected-generation.docx index bab2aa3dd..d68a1a9b2 100644 Binary files a/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug486/bug486-expected-generation.docx and b/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug486/bug486-expected-generation.docx differ diff --git a/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug487/bug487-expected-generation.docx b/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug487/bug487-expected-generation.docx index 936734657..67bf457ec 100644 Binary files a/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug487/bug487-expected-generation.docx and b/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug487/bug487-expected-generation.docx differ diff --git a/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug499/bug499-expected-generation.docx b/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug499/bug499-expected-generation.docx index 05a46ae51..a833672b9 100644 Binary files a/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug499/bug499-expected-generation.docx and 
b/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug499/bug499-expected-generation.docx differ diff --git a/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug514/bug514-expected-generation.docx b/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug514/bug514-expected-generation.docx index 28402237c..44d36848c 100644 Binary files a/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug514/bug514-expected-generation.docx and b/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug514/bug514-expected-generation.docx differ diff --git a/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug515/bug515-expected-generation.docx b/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug515/bug515-expected-generation.docx index 45b6efdb0..2964f66a1 100644 Binary files a/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug515/bug515-expected-generation.docx and b/tests/org.obeonetwork.m2doc.html.tests/resources/m2DocHtmlServices/bug515/bug515-expected-generation.docx differ