// Copyright 2021 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package markdown import ( "bytes" "fmt" "strings" "unicode" "unicode/utf8" ) /* text node can be - other literal text - run of * or _ characters - [ - ![ keep delimiter stack pointing at non-other literal text each node contains - type of delimiter [ ![ _ * - number of delimiters - active or not - potential opener, potential closer, or obth when a ] is hit, call look for link or image when end is hit, call process emphasis look for link or image: find topmost [ or ![ if none, emit literal ] if its inactive, remove and emit literal ] parse ahead to look for rest of link; if none, remove and emit literal ] run process emphasis on the interior, remove opener if this was a link (not an image), set all [ before opener to inactive, to avoid links inside links process emphasis walk forward in list to find a closer. walk back to find first potential matching opener. if found: strong for length >= 2 insert node drop delimiters between opener and closer remove 1 or 2 from open/close count, removing if now empty if closing has some left, go around again on this node if not: set openers bottom for this kind of element to before current_position if the closer at current pos is not an opener, remove it seems needlessly complex. two passes scan and find ` ` first. pass 1. scan and find [ and ]() and leave the rest alone. each completed one invokes emphasis on inner text and then on the overall list. */ type Inline interface { PrintHTML(*bytes.Buffer) PrintText(*bytes.Buffer) printMarkdown(*bytes.Buffer) } type Plain struct { Text string } func (*Plain) Inline() {} func (x *Plain) PrintHTML(buf *bytes.Buffer) { htmlEscaper.WriteString(buf, x.Text) } func (x *Plain) printMarkdown(buf *bytes.Buffer) { buf.WriteString(x.Text) } func (x *Plain) PrintText(buf *bytes.Buffer) { htmlEscaper.WriteString(buf, x.Text) } type openPlain struct { Plain i int // position in input where bracket is } type emphPlain struct { Plain canOpen bool canClose bool i int // position in output where emph is n int // length of original span } type Escaped struct { Plain } func (x *Escaped) printMarkdown(buf *bytes.Buffer) { buf.WriteByte('\\') x.Plain.printMarkdown(buf) } type Code struct { Text string } func (*Code) Inline() {} func (x *Code) PrintHTML(buf *bytes.Buffer) { fmt.Fprintf(buf, "%s", htmlEscaper.Replace(x.Text)) } func (x *Code) printMarkdown(buf *bytes.Buffer) { if len(x.Text) == 0 { return } // Use the fewest backticks we can, and add spaces as needed. ticks := strings.Repeat("`", longestSequence(x.Text, '`')+1) buf.WriteString(ticks) if x.Text[0] == '`' { buf.WriteByte(' ') } buf.WriteString(x.Text) if x.Text[len(x.Text)-1] == '`' { buf.WriteByte(' ') } buf.WriteString(ticks) } // longestSequence returns the length of the longest sequence of consecutive bytes b in s. func longestSequence(s string, b byte) int { max := 0 cur := 0 for i := range s { if s[i] == b { cur++ } else { if cur > max { max = cur } cur = 0 } } if cur > max { max = cur } return max } func (x *Code) PrintText(buf *bytes.Buffer) { htmlEscaper.WriteString(buf, x.Text) } type Strong struct { Marker string Inner []Inline } func (x *Strong) Inline() { } func (x *Strong) PrintHTML(buf *bytes.Buffer) { buf.WriteString("") for _, c := range x.Inner { c.PrintHTML(buf) } buf.WriteString("") } func (x *Strong) printMarkdown(buf *bytes.Buffer) { buf.WriteString(x.Marker) for _, c := range x.Inner { c.printMarkdown(buf) } buf.WriteString(x.Marker) } func (x *Strong) PrintText(buf *bytes.Buffer) { for _, c := range x.Inner { c.PrintText(buf) } } type Del struct { Marker string Inner []Inline } func (x *Del) Inline() { } func (x *Del) PrintHTML(buf *bytes.Buffer) { buf.WriteString("") for _, c := range x.Inner { c.PrintHTML(buf) } buf.WriteString("") } func (x *Del) printMarkdown(buf *bytes.Buffer) { buf.WriteString(x.Marker) for _, c := range x.Inner { c.printMarkdown(buf) } buf.WriteString(x.Marker) } func (x *Del) PrintText(buf *bytes.Buffer) { for _, c := range x.Inner { c.PrintText(buf) } } type Emph struct { Marker string Inner []Inline } func (*Emph) Inline() {} func (x *Emph) PrintHTML(buf *bytes.Buffer) { buf.WriteString("") for _, c := range x.Inner { c.PrintHTML(buf) } buf.WriteString("") } func (x *Emph) printMarkdown(buf *bytes.Buffer) { buf.WriteString(x.Marker) for _, c := range x.Inner { c.printMarkdown(buf) } buf.WriteString(x.Marker) } func (x *Emph) PrintText(buf *bytes.Buffer) { for _, c := range x.Inner { c.PrintText(buf) } } func (p *parseState) emit(i int) { if p.emitted < i { p.list = append(p.list, &Plain{p.s[p.emitted:i]}) p.emitted = i } } func (p *parseState) skip(i int) { p.emitted = i } func (p *parseState) inline(s string) []Inline { s = trimSpaceTab(s) // Scan text looking for inlines. // Leaf inlines are converted immediately. // Non-leaf inlines have potential starts pushed on a stack while we await completion. // Links take priority over other emphasis, so the emphasis must be delayed. p.s = s p.list = nil p.emitted = 0 var opens []int // indexes of open ![ and [ Plains in p.list var lastLinkOpen int backticks := false i := 0 for i < len(s) { var parser func(*parseState, string, int) (Inline, int, int, bool) switch s[i] { case '\\': parser = parseEscape case '`': if !backticks { backticks = true p.backticks.reset() } parser = p.backticks.parseCodeSpan case '<': parser = parseAutoLinkOrHTML case '[': parser = parseLinkOpen case '!': parser = parseImageOpen case '_', '*': parser = parseEmph case '.': if p.SmartDot { parser = parseDot } case '-': if p.SmartDash { parser = parseDash } case '"', '\'': if p.SmartQuote { parser = parseEmph } case '~': if p.Strikethrough { parser = parseEmph } case '\n': // TODO what about eof parser = parseBreak case '&': parser = parseHTMLEntity case ':': if p.Emoji { parser = parseEmoji } } if parser != nil { if x, start, end, ok := parser(p, s, i); ok { p.emit(start) if _, ok := x.(*openPlain); ok { opens = append(opens, len(p.list)) } p.list = append(p.list, x) i = end p.skip(i) continue } } if s[i] == ']' && len(opens) > 0 { oi := opens[len(opens)-1] open := p.list[oi].(*openPlain) opens = opens[:len(opens)-1] if open.Text[0] == '!' || lastLinkOpen <= open.i { if x, end, ok := p.parseLinkClose(s, i, open); ok { p.corner = p.corner || x.corner || linkCorner(x.URL) p.emit(i) x.Inner = p.emph(nil, p.list[oi+1:]) if open.Text[0] == '!' { p.list[oi] = (*Image)(x) } else { p.list[oi] = x } p.list = p.list[:oi+1] p.skip(end) i = end if open.Text[0] == '[' { // No links around links. lastLinkOpen = open.i } continue } } } i++ } p.emit(len(s)) p.list = p.emph(p.list[:0], p.list) p.list = p.mergePlain(p.list) p.list = p.autoLinkText(p.list) return p.list } func (ps *parseState) emph(dst, src []Inline) []Inline { const chars = "_*~\"'" var stack [len(chars)][]*emphPlain stackOf := func(c byte) int { return strings.IndexByte(chars, c) } trimStack := func() { for i := range stack { stk := &stack[i] for len(*stk) > 0 && (*stk)[len(*stk)-1].i >= len(dst) { *stk = (*stk)[:len(*stk)-1] } } } Src: for i := 0; i < len(src); i++ { if open, ok := src[i].(*openPlain); ok { // Convert unused link/image open marker to plain text. dst = append(dst, &open.Plain) continue } p, ok := src[i].(*emphPlain) if !ok { dst = append(dst, src[i]) continue } if p.canClose { stk := &stack[stackOf(p.Text[0])] Loop: for p.Text != "" { // Looking for same symbol and compatible with p.Text. for i := len(*stk) - 1; i >= 0; i-- { start := (*stk)[i] if (p.Text[0] == '*' || p.Text[0] == '_') && (p.canOpen && p.canClose || start.canOpen && start.canClose) && (p.n+start.n)%3 == 0 && (p.n%3 != 0 || start.n%3 != 0) { continue } if p.Text[0] == '~' && len(p.Text) != len(start.Text) { // ~ matches ~, ~~ matches ~~ continue } if p.Text[0] == '"' { dst[start.i].(*emphPlain).Text = "“" p.Text = "”" dst = append(dst, p) *stk = (*stk)[:i] // no trimStack continue Src } if p.Text[0] == '\'' { dst[start.i].(*emphPlain).Text = "‘" p.Text = "’" dst = append(dst, p) *stk = (*stk)[:i] // no trimStack continue Src } var d int if len(p.Text) >= 2 && len(start.Text) >= 2 { // strong d = 2 } else { // emph d = 1 } del := p.Text[0] == '~' x := &Emph{Marker: p.Text[:d], Inner: append([]Inline(nil), dst[start.i+1:]...)} start.Text = start.Text[:len(start.Text)-d] p.Text = p.Text[d:] if start.Text == "" { dst = dst[:start.i] } else { dst = dst[:start.i+1] } trimStack() if del { dst = append(dst, (*Del)(x)) } else if d == 2 { dst = append(dst, (*Strong)(x)) } else { dst = append(dst, x) } continue Loop } break } } if p.Text != "" { stk := &stack[stackOf(p.Text[0])] if p.Text == "'" { p.Text = "’" } if p.Text == "\"" { if p.canClose { p.Text = "”" } else { p.Text = "“" } } if p.canOpen { p.i = len(dst) dst = append(dst, p) *stk = append(*stk, p) } else { dst = append(dst, &p.Plain) } } } return dst } func mdUnescape(s string) string { if !strings.Contains(s, `\`) && !strings.Contains(s, `&`) { return s } return mdUnescaper.Replace(s) } var mdUnescaper = func() *strings.Replacer { var list = []string{ `\!`, `!`, `\"`, `"`, `\#`, `#`, `\$`, `$`, `\%`, `%`, `\&`, `&`, `\'`, `'`, `\(`, `(`, `\)`, `)`, `\*`, `*`, `\+`, `+`, `\,`, `,`, `\-`, `-`, `\.`, `.`, `\/`, `/`, `\:`, `:`, `\;`, `;`, `\<`, `<`, `\=`, `=`, `\>`, `>`, `\?`, `?`, `\@`, `@`, `\[`, `[`, `\\`, `\`, `\]`, `]`, `\^`, `^`, `\_`, `_`, "\\`", "`", `\{`, `{`, `\|`, `|`, `\}`, `}`, `\~`, `~`, } for name, repl := range htmlEntity { list = append(list, name, repl) } return strings.NewReplacer(list...) }() func isPunct(c byte) bool { return '!' <= c && c <= '/' || ':' <= c && c <= '@' || '[' <= c && c <= '`' || '{' <= c && c <= '~' } func parseEscape(p *parseState, s string, i int) (Inline, int, int, bool) { if i+1 < len(s) { c := s[i+1] if isPunct(c) { return &Escaped{Plain{s[i+1 : i+2]}}, i, i + 2, true } if c == '\n' { // TODO what about eof if i > 0 && s[i-1] == '\\' { p.corner = true // goldmark mishandles \\\ newline } end := i + 2 for end < len(s) && (s[end] == ' ' || s[end] == '\t') { end++ } return &HardBreak{}, i, end, true } } return nil, 0, 0, false } func parseDot(p *parseState, s string, i int) (Inline, int, int, bool) { if i+2 < len(s) && s[i+1] == '.' && s[i+2] == '.' { return &Plain{"…"}, i, i + 3, true } return nil, 0, 0, false } func parseDash(p *parseState, s string, i int) (Inline, int, int, bool) { if i+1 >= len(s) || s[i+1] != '-' { return nil, 0, 0, false } n := 2 for i+n < len(s) && s[i+n] == '-' { n++ } // Mimic cmark-gfm. Can't make this stuff up. em, en := 0, 0 switch { case n%3 == 0: em = n / 3 case n%2 == 0: en = n / 2 case n%3 == 2: em = (n - 2) / 3 en = 1 case n%3 == 1: em = (n - 4) / 3 en = 2 } return &Plain{strings.Repeat("—", em) + strings.Repeat("–", en)}, i, i + n, true } // Inline code span markers must fit on punched cards, to match cmark-gfm. const maxBackticks = 80 type backtickParser struct { last [maxBackticks]int scanned bool } func (b *backtickParser) reset() { *b = backtickParser{} } func (b *backtickParser) parseCodeSpan(p *parseState, s string, i int) (Inline, int, int, bool) { start := i // Count leading backticks. Need to find that many again. n := 1 for i+n < len(s) && s[i+n] == '`' { n++ } // If we've already scanned the whole string (for a different count), // we can skip a failed scan by checking whether we saw this count. // To enable this optimization, following cmark-gfm, we declare by fiat // that more than maxBackticks backquotes is too many. if n > len(b.last) || b.scanned && b.last[n-1] < i+n { goto NoMatch } for end := i + n; end < len(s); { if s[end] != '`' { end++ continue } estart := end for end < len(s) && s[end] == '`' { end++ } m := end - estart if !b.scanned && m < len(b.last) { b.last[m-1] = estart } if m == n { // Match. // Line endings are converted to single spaces. text := s[i+n : estart] text = strings.ReplaceAll(text, "\n", " ") // If enclosed text starts and ends with a space and is not all spaces, // one space is removed from start and end, to allow `` ` `` to quote a single backquote. if len(text) >= 2 && text[0] == ' ' && text[len(text)-1] == ' ' && trimSpace(text) != "" { text = text[1 : len(text)-1] } return &Code{text}, start, end, true } } b.scanned = true NoMatch: // No match, so none of these backticks count: skip them all. // For example ``x` is not a single backtick followed by a code span. // Returning nil, 0, false would advance to the second backtick and try again. return &Plain{s[i : i+n]}, start, i + n, true } func parseAutoLinkOrHTML(p *parseState, s string, i int) (Inline, int, int, bool) { if x, end, ok := parseAutoLinkURI(s, i); ok { return x, i, end, true } if x, end, ok := parseAutoLinkEmail(s, i); ok { return x, i, end, true } if x, end, ok := parseHTMLTag(p, s, i); ok { return x, i, end, true } return nil, 0, 0, false } func isLetter(c byte) bool { return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' } func isLDH(c byte) bool { return isLetterDigit(c) || c == '-' } func isLetterDigit(c byte) bool { return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' } func parseLinkOpen(_ *parseState, s string, i int) (Inline, int, int, bool) { return &openPlain{Plain{s[i : i+1]}, i + 1}, i, i + 1, true } func parseImageOpen(_ *parseState, s string, i int) (Inline, int, int, bool) { if i+1 < len(s) && s[i+1] == '[' { return &openPlain{Plain{s[i : i+2]}, i + 2}, i, i + 2, true } return nil, 0, 0, false } func parseEmph(p *parseState, s string, i int) (Inline, int, int, bool) { c := s[i] j := i + 1 if c == '*' || c == '~' || c == '_' { for j < len(s) && s[j] == c { j++ } } if c == '~' && j-i != 2 { // Goldmark does not accept ~text~ // and incorrectly accepts ~~~text~~~. // Only ~~ is correct. p.corner = true } if c == '~' && j-i > 2 { return &Plain{s[i:j]}, i, j, true } var before, after rune if i == 0 { before = ' ' } else { before, _ = utf8.DecodeLastRuneInString(s[:i]) } if j >= len(s) { after = ' ' } else { after, _ = utf8.DecodeRuneInString(s[j:]) } // “A left-flanking delimiter run is a delimiter run that is // (1) not followed by Unicode whitespace, and either // (2a) not followed by a Unicode punctuation character, or // (2b) followed by a Unicode punctuation character // and preceded by Unicode whitespace or a Unicode punctuation character. // For purposes of this definition, the beginning and the end // of the line count as Unicode whitespace.” leftFlank := !isUnicodeSpace(after) && (!isUnicodePunct(after) || isUnicodeSpace(before) || isUnicodePunct(before)) // “A right-flanking delimiter run is a delimiter run that is // (1) not preceded by Unicode whitespace, and either // (2a) not preceded by a Unicode punctuation character, or // (2b) preceded by a Unicode punctuation character // and followed by Unicode whitespace or a Unicode punctuation character. // For purposes of this definition, the beginning and the end // of the line count as Unicode whitespace.” rightFlank := !isUnicodeSpace(before) && (!isUnicodePunct(before) || isUnicodeSpace(after) || isUnicodePunct(after)) var canOpen, canClose bool switch c { case '\'', '"': canOpen = leftFlank && !rightFlank && before != ']' && before != ')' canClose = rightFlank case '*', '~': // “A single * character can open emphasis iff // it is part of a left-flanking delimiter run.” // “A double ** can open strong emphasis iff // it is part of a left-flanking delimiter run.” canOpen = leftFlank // “A single * character can close emphasis iff // it is part of a right-flanking delimiter run.” // “A double ** can close strong emphasis iff // it is part of a right-flanking delimiter run.” canClose = rightFlank case '_': // “A single _ character can open emphasis iff // it is part of a left-flanking delimiter run and either // (a) not part of a right-flanking delimiter run or // (b) part of a right-flanking delimiter run preceded by a Unicode punctuation character.” // “A double __ can open strong emphasis iff // it is part of a left-flanking delimiter run and either // (a) not part of a right-flanking delimiter run or // (b) part of a right-flanking delimiter run preceded by a Unicode punctuation character.” canOpen = leftFlank && (!rightFlank || isUnicodePunct(before)) // “A single _ character can close emphasis iff // it is part of a right-flanking delimiter run and either // (a) not part of a left-flanking delimiter run or // (b) part of a left-flanking delimiter run followed by a Unicode punctuation character.” // “A double __ can close strong emphasis iff // it is part of a right-flanking delimiter run and either // (a) not part of a left-flanking delimiter run or // (b) part of a left-flanking delimiter run followed by a Unicode punctuation character.” canClose = rightFlank && (!leftFlank || isUnicodePunct(after)) } return &emphPlain{Plain: Plain{s[i:j]}, canOpen: canOpen, canClose: canClose, n: j - i}, i, j, true } func isUnicodeSpace(r rune) bool { if r < 0x80 { return r == ' ' || r == '\t' || r == '\f' || r == '\n' } return unicode.In(r, unicode.Zs) } func isUnicodePunct(r rune) bool { if r < 0x80 { return isPunct(byte(r)) } return unicode.In(r, unicode.Punct) } func (p *parseState) parseLinkClose(s string, i int, open *openPlain) (*Link, int, bool) { if i+1 < len(s) { switch s[i+1] { case '(': // Inline link - [Text](Dest Title), with Title omitted or both Dest and Title omitted. i := skipSpace(s, i+2) var dest, title string var titleChar byte var corner bool if i < len(s) && s[i] != ')' { var ok bool dest, i, ok = parseLinkDest(s, i) if !ok { break } i = skipSpace(s, i) if i < len(s) && s[i] != ')' { title, titleChar, i, ok = parseLinkTitle(s, i) if title == "" { corner = true } if !ok { break } i = skipSpace(s, i) } } if i < len(s) && s[i] == ')' { return &Link{URL: dest, Title: title, TitleChar: titleChar, corner: corner}, i + 1, true } // NOTE: Test malformed ( ) with shortcut reference // TODO fall back on syntax error? case '[': // Full reference link - [Text][Label] label, i, ok := parseLinkLabel(p, s, i+1) if !ok { break } if link, ok := p.links[normalizeLabel(label)]; ok { return &Link{URL: link.URL, Title: link.Title, corner: link.corner}, i, true } // Note: Could break here, but CommonMark dingus does not // fall back to trying Text for [Text][Label] when Label is unknown. // Unclear from spec what the correct answer is. return nil, 0, false } } // Collapsed or shortcut reference link: [Text][] or [Text]. end := i + 1 if strings.HasPrefix(s[end:], "[]") { end += 2 } if link, ok := p.links[normalizeLabel(s[open.i:i])]; ok { return &Link{URL: link.URL, Title: link.Title, corner: link.corner}, end, true } return nil, 0, false } func skipSpace(s string, i int) int { // Note: Blank lines have already been removed. for i < len(s) && (s[i] == ' ' || s[i] == '\t' || s[i] == '\n') { i++ } return i } func linkCorner(url string) bool { for i := 0; i < len(url); i++ { if url[i] == '%' { if i+2 >= len(url) || !isHexDigit(url[i+1]) || !isHexDigit(url[i+2]) { // Goldmark and the Dingus re-escape such percents as %25, // but the spec does not seem to require this behavior. return true } } } return false } func (p *parseState) mergePlain(list []Inline) []Inline { out := list[:0] start := 0 for i := 0; ; i++ { if i < len(list) && toPlain(list[i]) != nil { continue } // Non-Plain or end of list. if start < i { out = append(out, mergePlain1(list[start:i])) } if i >= len(list) { break } out = append(out, list[i]) start = i + 1 } return out } func toPlain(x Inline) *Plain { // TODO what about Escaped? switch x := x.(type) { case *Plain: return x case *emphPlain: return &x.Plain case *openPlain: return &x.Plain } return nil } func mergePlain1(list []Inline) *Plain { if len(list) == 1 { return toPlain(list[0]) } var all []string for _, pl := range list { all = append(all, toPlain(pl).Text) } return &Plain{Text: strings.Join(all, "")} } func parseEmoji(p *parseState, s string, i int) (Inline, int, int, bool) { for j := i + 1; ; j++ { if j >= len(s) || j-i > 2+maxEmojiLen { break } if s[j] == ':' { name := s[i+1 : j] if utf, ok := emoji[name]; ok { return &Emoji{s[i : j+1], utf}, i, j + 1, true } break } } return nil, 0, 0, false } type Emoji struct { Name string // emoji :name:, including colons Text string // Unicode for emoji sequence } func (*Emoji) Inline() {} func (x *Emoji) PrintHTML(buf *bytes.Buffer) { htmlEscaper.WriteString(buf, x.Text) } func (x *Emoji) printMarkdown(buf *bytes.Buffer) { buf.WriteString(x.Text) } func (x *Emoji) PrintText(buf *bytes.Buffer) { htmlEscaper.WriteString(buf, x.Text) }