diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c10f7e..6fe2bb6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,14 @@ understanding of patterns often seen in commit messages. in their entirety, and allow them to follow a preceding paragraph without the empty line that is otherwise usually required. +- #7: Recognize fenced code blocks with backtick code fences (` ``` `) and + preserve them in their entirety. Do not recognize tilde code fences (`~~~`), + which are virtually never used in practice and which would interfere with + many other uses. Per CommonMark 0.30 a code fence must be at least three + characters long and may optionally be indented up to three spaces, and the + closing code fence must be at least as long as the opening code fence + ignoring whitespace. + - If `--width` is specified multiple times, ignore all but the last occurrence. ## 1.5.0 - 2022-07-30 diff --git a/doc/commitmsgfmt.1.adoc b/doc/commitmsgfmt.1.adoc index 6eec6e5..43acaa7 100644 --- a/doc/commitmsgfmt.1.adoc +++ b/doc/commitmsgfmt.1.adoc @@ -58,6 +58,10 @@ The exact formatting behavior is implementation defined. This section attempts to describe the rules but deviation may be either an inexpediency in the implementation or an error in the description. +{self} does not attempt to recognize AsciiDoc, CommonMark, Markdown, +reStructuredText, or any other markup language. Formally, {self} recognizes +only plain text, and similarity to any markup language is incidental. + NOTE: Depending on your expectations of merge commit messages you may find {self} unsuitable for use in such messages. If you write them precisely like you write non-merge commit messages, go ahead and format them with {self}. @@ -95,9 +99,9 @@ those cases and avoid them by preventing wrapping: it will references both preserve their context and don't degenerate into _list items_. -_Block quotes_ are exempt from the requirement of surrounding blank lines and -will never be considered to belong to a paragraph. 
A block quote embedded -inside a paragraph has the same effect on that paragraph as an empty line has. +A paragraph may be interrupted by _block quotes_ and _fenced code blocks_, +meaning these are exempt from the requirement of surrounding blank lines and +will never be considered to belong to a paragraph. === Subject line @@ -196,8 +200,64 @@ literals. === Literal -A line starting with one tab or four spaces is considered a _literal_. Literals -are printed verbatim, making them suitable for listings and tables. +A line starting with one tab or four spaces is considered a _literal_: + +---- +paragraph + + literal + +paragraph +---- + +Literals are printed verbatim, making them suitable for listings and tables. + +See also _fenced code block_. + +=== Code fence + +Outside of a _fenced code block_ a line starting with up to 3 spaces followed +by at least 3 consecutive backticks (*`*) is considered an _opening code +fence_: + +---- +```opening +---- + +Within a fenced code block a line starting with up to 3 spaces followed by at +least as many consecutive backticks as the preceding opening code fence is +considered a _closing code fence_; any sequence of fewer backticks is ignored: + +---- + ````opening +``` + ````` +---- + +NOTE: For the sake of compatibility, tilde (*~*) cannot be used in place of +backtick. + +=== Fenced code block + +A _fenced code block_ begins with an _opening code fence_ and ends with the +first following _closing code fence_: + +---- +Compare the previous version of origin/topic with the current version: +```sh +$ git range-diff origin/main origin/topic@{1} origin/topic +``` +---- + +The fenced code block includes both code fences and all contents in between the +code fences. + +Fenced code blocks are printed verbatim, making them suitable for listings. +Fenced code blocks are more flexible in their use than _literals_ are but +otherwise solve the same problem. 
+ +A fenced code block may interrupt a _paragraph_; it needs no preceding or +following blank line. === Block quote @@ -226,11 +286,9 @@ vip:!fmt -w72 -p'>' ---- ==== -Unlike other constructs a block quote may be embedded inside a _paragraph_ with -no preceding or following blank line; the block quote will not be folded into -the paragraph and the paragraph will otherwise observe standard behavior. This -enables a common pattern of immediately preceding the block quote with an -author attribution, illustrated above. +A block quote may interrupt a _paragraph_; it needs no preceding or following +blank line. This enables a common pattern of immediately preceding the block +quote with an author attribution, illustrated above. === Comment @@ -309,6 +367,11 @@ foo baar -- baz qux wupwupwup [1][2] [wup] hex: > 0 1 2 3 4 5 6 7 8 9 a b c d e f +chicken: +```chicken +chicken chicken +``` + - foo 1. foo bar baz @@ -332,6 +395,11 @@ wupwupwup [1][2] [wup] hex: > 0 1 2 3 4 5 6 7 8 9 a b c d e f +chicken: +```chicken +chicken chicken +``` + - foo 1. 
foo bar baz diff --git a/src/commitmsgfmt.rs b/src/commitmsgfmt.rs index e862fc1..98ae48c 100644 --- a/src/commitmsgfmt.rs +++ b/src/commitmsgfmt.rs @@ -37,7 +37,8 @@ impl CommitMsgFmt { fn reflow_into(&self, buf: &mut String, msg: &[Token]) { for tok in msg { match *tok { - BlockQuote(s) | Comment(s) | Literal(s) | Scissored(s) | Trailer(s) => { + BlockQuote(s) | Comment(s) | FencedCodeBlock(s) | Literal(s) | Scissored(s) + | Trailer(s) => { buf.push_str(s); } ListItem(ref indent, ref li, ref s) => { @@ -214,6 +215,82 @@ foo assert_eq!(filter(72, &input), expected); } + #[test] + fn preserves_fenced_code_block() { + let input = " +foo + +``` +backtick +fenced +code +block +``` +"; + + let expected = " +foo + +``` +backtick +fenced +code +block +``` +"; + + assert_eq!(filter(72, &input), expected); + } + + #[test] + fn preserves_fenced_code_block_interrupting_paragraph() { + let input = " +foo + +a +``` +backtick +``` +b +"; + + let expected = " +foo + +a +``` +backtick +``` +b +"; + + assert_eq!(filter(72, &input), expected); + } + + #[test] + fn ignores_fenced_code_block_with_tilde() { + let input = " +foo + +~~~ +tilde +fenced +code +block +not +supported +~~~ +"; + + let expected = " +foo + +~~~ tilde fenced code block not supported ~~~ +"; + + assert_eq!(filter(72, &input), expected); + } + #[test] fn preserves_block_quote() { let input = " diff --git a/src/parser.rs b/src/parser.rs index 304fbfa..f1b4909 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -12,6 +12,7 @@ type ReflowableStrBuf<'input> = Cow<'input, str>; #[derive(Debug, PartialEq, Eq)] pub enum Token<'input> { Comment(&'input str), + FencedCodeBlock(&'input str), Footnote(&'input str, ReflowableStrBuf<'input>), ListItem( ListIndent<'input>, @@ -27,6 +28,24 @@ pub enum Token<'input> { VerticalSpace, } +// Token::FencedCodeBlock is the first block construct that can interrupt another block construct; +// that cannot be identified purely from either the current line or the previous token; and that 
+// may not be wrapped. That means there are valid situations we cannot represent without more +// sophisticated state management. We also don't want to needlessly extend a String. We'll just +// track the opening code fence, match it against closing fences, and treat the entire block as a +// glorified Token::Literal sequence. We accept the loss of composability. +struct CodeFence<'input>(&'input str); + +impl CodeFence<'_> { + fn is_closed_by(&self, line: &str) -> bool { + // "until a closing code fence of the same type as the code block began with (backticks or + // tildes), and with at least as many backticks or tildes as the opening code fence" + line_as_code_fence(line) + .map(|fence| fence.0.starts_with(self.0)) + .unwrap_or(false) + } +} + pub fn parse(input: &str, comment_char: char) -> Vec<Token> { let mut toks = Vec::new(); @@ -34,9 +53,18 @@ pub fn parse(input: &str, comment_char: char) -> Vec<Token> { let mut has_scissors = false; let lines = input.lines(); let mut px = false; + let mut in_code_fence: Option<CodeFence> = None; for line in lines { if has_scissors { toks.push(Token::Scissored(line)); + } else if let Some(ref fence) = in_code_fence { + toks.push(Token::FencedCodeBlock(line)); + if fence.is_closed_by(line) { + in_code_fence = None; + } + } else if let Some(fence) = line_as_code_fence(line) { + toks.push(Token::FencedCodeBlock(line)); + in_code_fence = Some(fence); } else if line.starts_with(comment_char) { let t = if &line[1..] 
== " ------------------------ >8 ------------------------" { has_scissors = true; @@ -317,6 +345,64 @@ fn line_as_list_item(line: &str) -> Option { }) } +fn line_as_code_fence(line: &'_ str) -> Option<CodeFence> { + enum FenceState { + New, + IndentSp1, + IndentSp2, + IndentSp3, + Backtick, + } + + let mut fence_state = FenceState::New; + let mut ix_fence_start = 0; + let mut fence_length = 0; + let mut tally = || fence_length += 1; + // https://spec.commonmark.org/0.30/#fenced-code-blocks + // Backtick fenced code blocks appear relatively safe to support. Tilde fenced code blocks, on + // the other hand, are unsafe: tildes are often used for emphasizing compilation error output + // or underlining headers. + for (ix, c) in line.char_indices() { + match fence_state { + FenceState::New + | FenceState::IndentSp1 + | FenceState::IndentSp2 + | FenceState::IndentSp3 => { + ix_fence_start = ix; + // "preceded by up to three spaces of indentation" + fence_state = match c { + ' ' => match fence_state { + FenceState::New => FenceState::IndentSp1, + FenceState::IndentSp1 => FenceState::IndentSp2, + FenceState::IndentSp2 => FenceState::IndentSp3, + _ => break, + }, + '`' => { + tally(); + FenceState::Backtick + } + _ => break, + }; + } + // "Tildes and backticks cannot be mixed." 
+ FenceState::Backtick => match c { + '`' => tally(), + _ => break, + }, + } + } + + // "at least three consecutive backtick characters (`) or tildes (~)" + if fence_length >= 3 { + let ix_end = ix_fence_start + fence_length; + debug_assert!(ix_end <= line.len()); + let fence = &line[ix_fence_start..ix_end]; + Some(CodeFence(fence)) + } else { + None + } +} + fn line_as_line_block_quote(line: &str) -> Option { if line.starts_with('>') { Some(Token::BlockQuote(line)) @@ -677,6 +763,438 @@ some other paragraph ); } + #[test] + fn parses_codefence_backtick_verbatim() { + let input = " +subject + +``` +backtick +``` + + ``` + backtick + ``` + + ``` + backtick + ``` + + ``` + backtick + ``` +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + FencedCodeBlock("```"), + FencedCodeBlock("backtick"), + FencedCodeBlock("```"), + VerticalSpace, + FencedCodeBlock(" ```"), + FencedCodeBlock(" backtick"), + FencedCodeBlock(" ```"), + VerticalSpace, + FencedCodeBlock(" ```"), + FencedCodeBlock(" backtick"), + FencedCodeBlock(" ```"), + VerticalSpace, + FencedCodeBlock(" ```"), + FencedCodeBlock(" backtick"), + FencedCodeBlock(" ```"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_tilde_not_fenced_code_block() { + let input = " +subject + +~~~ +tilde +~~~ +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + Paragraph("~~~ tilde ~~~".into()), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_backtick_indented_aligned_4sp_not_fenced_code_block() { + let input = " +subject + + ``` + backtick + ``` +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + Literal(" ```"), + Literal(" backtick"), + Literal(" ```"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_backtick_indented_unaligned() { + let input = " 
+subject + + ``` +backtick 1 0 +``` + ``` +backtick 2 0 +``` + ``` +backtick 3 0 +``` +``` +backtick 0 1 + ``` +``` +backtick 0 2 + ``` +``` +backtick 0 3 + ``` + ``` +backtick 2 1 + ``` + ``` +backtick 3 2 + ``` +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + FencedCodeBlock(" ```"), + FencedCodeBlock("backtick 1 0"), + FencedCodeBlock("```"), + FencedCodeBlock(" ```"), + FencedCodeBlock("backtick 2 0"), + FencedCodeBlock("```"), + FencedCodeBlock(" ```"), + FencedCodeBlock("backtick 3 0"), + FencedCodeBlock("```"), + FencedCodeBlock("```"), + FencedCodeBlock("backtick 0 1"), + FencedCodeBlock(" ```"), + FencedCodeBlock("```"), + FencedCodeBlock("backtick 0 2"), + FencedCodeBlock(" ```"), + FencedCodeBlock("```"), + FencedCodeBlock("backtick 0 3"), + FencedCodeBlock(" ```"), + FencedCodeBlock(" ```"), + FencedCodeBlock("backtick 2 1"), + FencedCodeBlock(" ```"), + FencedCodeBlock(" ```"), + FencedCodeBlock("backtick 3 2"), + FencedCodeBlock(" ```"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_backtick_fence_extra_long() { + let input = " +subject + +``` +backtick 3 4 +```` +```` +backtick 4 5 +````` +````` +backtick 5 6 +`````` +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + FencedCodeBlock("```"), + FencedCodeBlock("backtick 3 4"), + FencedCodeBlock("````"), + FencedCodeBlock("````"), + FencedCodeBlock("backtick 4 5"), + FencedCodeBlock("`````"), + FencedCodeBlock("`````"), + FencedCodeBlock("backtick 5 6"), + FencedCodeBlock("``````"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_backtick_fence_too_short_not_fenced_code_block() { + let input = " +subject + +`` +backtick +`` +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + Paragraph("`` backtick ``".into()), + ]; + + let actual = parse(&input); + + assert_eq!(expected, 
actual); + } + + #[test] + fn parses_codefence_backtick_with_infostring() { + let input = " +subject + +```info +backtick info no leading ws +``` + +``` info +backtick info leading sp +``` + +``` info +backtick info leading tab +``` + +```info` +backtick info accept illegal info with backtick +``` + +```info~ +backtick info accept legal info with tilde +``` +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + FencedCodeBlock("```info"), + FencedCodeBlock("backtick info no leading ws"), + FencedCodeBlock("```"), + VerticalSpace, + FencedCodeBlock("``` info"), + FencedCodeBlock("backtick info leading sp"), + FencedCodeBlock("```"), + VerticalSpace, + FencedCodeBlock("```\tinfo"), + FencedCodeBlock("backtick info leading tab"), + FencedCodeBlock("```"), + VerticalSpace, + FencedCodeBlock("```info`"), + FencedCodeBlock("backtick info accept illegal info with backtick"), + FencedCodeBlock("```"), + VerticalSpace, + FencedCodeBlock("```info~"), + FencedCodeBlock("backtick info accept legal info with tilde"), + FencedCodeBlock("```"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_backtick_can_interrupt_paragraph() { + let input = " +subject + +a +``` +backtick +``` +b +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + Paragraph("a".into()), + FencedCodeBlock("```"), + FencedCodeBlock("backtick"), + FencedCodeBlock("```"), + Paragraph("b".into()), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_backtick_fence_unmatched_length() { + let input = " +subject + +```` +backtick +``` + +backtick + +`` +backtick +```` +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + FencedCodeBlock("````"), + FencedCodeBlock("backtick"), + FencedCodeBlock("```"), + FencedCodeBlock(""), + FencedCodeBlock("backtick"), + FencedCodeBlock(""), + FencedCodeBlock("``"), + 
FencedCodeBlock("backtick"), + FencedCodeBlock("````"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_backtick_unterminated() { + let input = " +subject + +``` +backtick +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + FencedCodeBlock("```"), + FencedCodeBlock("backtick"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn bug_parses_codefence_backtick_enclosed_in_block_quote() { + let input = " +subject + +> a +> ``` +> backtick +> ``` +> b + +> c +> ``` +> backtick +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + BlockQuote("> a"), + BlockQuote("> ```"), + BlockQuote("> backtick"), + BlockQuote("> ```"), + BlockQuote("> b"), + VerticalSpace, + BlockQuote("> c"), + BlockQuote("> ```"), + BlockQuote("> backtick"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn bug_parses_codefence_backtick_enclosed_in_list_item() { + let input = " +subject + +- a + ``` + backtick + ``` + b + +- c + ``` + backtick +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + ListItem(ListIndent(""), ListType("- "), "a".into()), + FencedCodeBlock(" ```"), + FencedCodeBlock(" backtick"), + FencedCodeBlock(" ```"), + Paragraph("b".into()), + VerticalSpace, + ListItem(ListIndent(""), ListType("- "), "c".into()), + FencedCodeBlock(" ```"), + FencedCodeBlock(" backtick"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + #[test] fn parses_block_quote_verbatim() { assert_eq!(