src/cmd/vendor/rsc.io/markdown/link.go

// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package markdown

import (
	"bytes"
	"fmt"
	"strings"
	"unicode/utf8"

	"golang.org/x/text/cases"
)

// parseLinkRefDef attempts to parse a link reference definition
// at the start of s. On success it records the definition with p,
// unless p already reports a link for the same normalized label,
// and returns the index in s just past the definition (including the
// line ending that terminates it, if there is one) and true.
// On failure it returns 0, false.
//
// p is type-asserted to *parseState; any other dynamic type panics.
func parseLinkRefDef(p buildState, s string) (int, bool) {
	// “A link reference definition consists of a link label,
	// optionally preceded by up to three spaces of indentation,
	// followed by a colon (:),
	// optional spaces or tabs (including up to one line ending),
	// a link destination,
	// optional spaces or tabs (including up to one line ending),
	// and an optional link title,
	// which if it is present must be separated from the link destination
	// by spaces or tabs. No further character may occur.”
	i := skipSpace(s, 0)
	label, i, ok := parseLinkLabel(p.(*parseState), s, i)
	if !ok || i >= len(s) || s[i] != ':' {
		return 0, false
	}
	i = skipSpace(s, i+1)
	// Remember the text at the destination so that a failed parse
	// can still be classified below.
	suf := s[i:]
	dest, i, ok := parseLinkDest(s, i)
	if !ok {
		if suf != "" && suf[0] == '<' {
			// Goldmark treats <<> as a link definition.
			p.(*parseState).corner = true
		}
		return 0, false
	}
	// moved records whether anything (spaces, tabs, or a line ending)
	// separates the destination from a possible title.
	moved := false
	for i < len(s) && (s[i] == ' ' || s[i] == '\t') {
		moved = true
		i++
	}

	// Take title if present and doesn't break parse.
	// j scans ahead speculatively; i is only advanced to j
	// once a complete title that ends its line has been found.
	j := i
	if j >= len(s) || s[j] == '\n' {
		moved = true
		if j < len(s) {
			j++
		}
	}

	var title string
	var titleChar byte
	var corner bool
	if moved {
		for j < len(s) && (s[j] == ' ' || s[j] == '\t') {
			j++
		}
		// Note: the j declared here shadows the outer j for the
		// duration of the if statement.
		if t, c, j, ok := parseLinkTitle(s, j); ok {
			for j < len(s) && (s[j] == ' ' || s[j] == '\t') {
				j++
			}
			if j >= len(s) || s[j] == '\n' {
				// The title is followed only by trailing blanks: accept it.
				i = j
				if t == "" {
					// Goldmark adds title="" in this case.
					// We do not, nor does the Dingus.
					corner = true
				}
				title = t
				titleChar = c
			}
		}
	}

	// Must end line. Already trimmed spaces.
	if i < len(s) && s[i] != '\n' {
		return 0, false
	}
	if i < len(s) {
		i++
	}

	// Only define the link if no link is already registered for this label.
	label = normalizeLabel(label)
	if p.link(label) == nil {
		p.defineLink(label, &Link{URL: dest, Title: title, TitleChar: titleChar, corner: corner})
	}
	return i, true
}

// parseLinkTitle parses a link title starting at s[i].
// A title is delimited by "...", '...', or (...).
// On success it returns the title with mdUnescaper applied,
// the closing delimiter character, the index just past that delimiter,
// and true. Otherwise it returns "", 0, 0, false.
func parseLinkTitle(s string, i int) (title string, char byte, next int, found bool) {
	if i >= len(s) {
		return "", 0, 0, false
	}

	// Determine the closing delimiter from the opening one.
	var closer byte
	switch s[i] {
	case '"', '\'':
		closer = s[i]
	case '(':
		closer = ')'
	default:
		return "", 0, 0, false
	}

	for k := i + 1; k < len(s); k++ {
		switch c := s[k]; {
		case c == closer:
			// TODO: Validate title?
			return mdUnescaper.Replace(s[i+1 : k]), closer, k + 1, true
		case c == '(' && closer == ')':
			// An unescaped ( is not allowed inside a parenthesized title.
			return "", 0, 0, false
		case c == '\\' && k+1 < len(s):
			// Skip over the escaped character.
			k++
		}
	}
	return "", 0, 0, false
}

// parseLinkLabel parses a link label starting at s[i].
// On success it returns the label text between the brackets,
// trimmed by trimSpaceTabNewline, the index just past the closing
// bracket, and true. Otherwise it returns "", 0, false.
//
// If the closing bracket is found but more than 999 bytes lie between
// the brackets, the label is rejected and p.corner is set.
func parseLinkLabel(p *parseState, s string, i int) (string, int, bool) {
	// “A link label begins with a left bracket ([) and ends with
	// the first right bracket (]) that is not backslash-escaped.
	// Between these brackets there must be at least one character
	// that is not a space, tab, or line ending.
	// Unescaped square bracket characters are not allowed
	// inside the opening and closing square brackets of link labels.
	// A link label can have at most 999 characters inside the square brackets.”
	if i >= len(s) || s[i] != '[' {
		return "", 0, false
	}

	inner := i + 1 // index of the first byte inside the brackets
	for k := inner; k < len(s); k++ {
		switch s[k] {
		case ']':
			if k-inner > 999 {
				// Goldmark does not apply 999 limit.
				p.corner = true
				return "", 0, false
			}
			label := trimSpaceTabNewline(s[inner:k])
			if label == "" {
				return "", 0, false
			}
			// Note: CommonMark Dingus does not escape.
			return label, k + 1, true
		case '[':
			// Unescaped [ is not allowed inside a label.
			return "", 0, false
		case '\\':
			// Skip over the escaped character, if there is one.
			if k+1 < len(s) {
				k++
			}
		}
	}
	return "", 0, false
}

// normalizeLabel returns the normalized form of the link label s,
// used as the key when defining and looking up link references.
// It returns "" for any s containing a square bracket.
func normalizeLabel(s string) string {
	if strings.ContainsAny(s, "[]") {
		// Labels cannot have [ ] so avoid the work of translating.
		// This is especially important for pathological cases like
		// [[[[[[[[[[a]]]]]]]]]] which would otherwise generate quadratic
		// amounts of garbage.
		return ""
	}

	// “To normalize a label, strip off the opening and closing brackets,
	// perform the Unicode case fold, strip leading and trailing spaces, tabs, and line endings,
	// and collapse consecutive internal spaces, tabs, and line endings to a single space.”
	s = trimSpaceTabNewline(s)

	var out strings.Builder
	pendingSpace := false // a run of blanks is waiting to be emitted as one space
	nonASCII := false     // saw a byte >= 0x80, so full Unicode folding is needed
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c == ' ' || c == '\t' || c == '\n' {
			pendingSpace = true
			continue
		}
		if pendingSpace {
			out.WriteByte(' ')
			pendingSpace = false
		}
		switch {
		case 'A' <= c && c <= 'Z':
			// ASCII letters are lowered by hand.
			c += 'a' - 'A'
		case c >= 0x80:
			nonASCII = true
		}
		out.WriteByte(c)
	}

	folded := out.String()
	if nonASCII {
		folded = cases.Fold().String(folded)
	}
	return folded
}

// parseLinkDest parses a link destination starting at s[i].
// On success it returns the destination passed through mdUnescape,
// the index just past the destination, and true.
// Otherwise it returns "", 0, false.
func parseLinkDest(s string, i int) (string, int, bool) {
	if i >= len(s) {
		return "", 0, false
	}

	// “A sequence of zero or more characters between an opening < and a closing >
	// that contains no line endings or unescaped < or > characters,”
	if s[i] == '<' {
		for k := i + 1; k < len(s); k++ {
			switch s[k] {
			case '\n', '<':
				return "", 0, false
			case '>':
				// TODO unescape?
				return mdUnescape(s[i+1 : k]), k + 1, true
			case '\\':
				// Skip over the escaped character.
				k++
			}
		}
		// Ran off the end without finding the closing >.
		return "", 0, false
	}

	// “or a nonempty sequence of characters that does not start with <,
	// does not include ASCII control characters or space character,
	// and includes parentheses only if (a) they are backslash-escaped
	// or (b) they are part of a balanced pair of unescaped parentheses.”
	nesting := 0
	end := i
Scan:
	for end < len(s) {
		c := s[end]
		switch {
		case c == '(':
			nesting++
			if nesting > 32 {
				// Avoid quadratic inputs by stopping if too deep.
				// This is the same depth that cmark-gfm uses.
				return "", 0, false
			}
		case c == ')':
			if nesting == 0 {
				// Unmatched ) ends the destination.
				break Scan
			}
			nesting--
		case c == '\\':
			if end+1 < len(s) {
				if next := s[end+1]; next == ' ' || next == '\t' {
					return "", 0, false
				}
				// Skip over the escaped character.
				end++
			}
		case c == ' ' || c == '\t' || c == '\n':
			break Scan
		}
		end++
	}

	// TODO: Validate dest?
	// TODO: Unescape?
	// NOTE: CommonMark Dingus does not reject control characters.
	return mdUnescape(s[i:end]), end, true
}

// parseAutoLinkURI parses a CommonMark URI autolink of the form
// <scheme:rest> starting at s[i]. On success it returns an *AutoLink
// whose Text and URL are both the text between the angle brackets,
// the index just past the closing >, and true.
// Otherwise it returns nil, 0, false.
func parseAutoLinkURI(s string, i int) (Inline, int, bool) {
	// CommonMark 0.30:
	//
	//	For purposes of this spec, a scheme is any sequence of 2–32 characters
	//	beginning with an ASCII letter and followed by any combination of
	//	ASCII letters, digits, or the symbols plus ("+"), period ("."), or
	//	hyphen ("-").
	//
	//	An absolute URI, for these purposes, consists of a scheme followed by
	//	a colon (:) followed by zero or more characters other than ASCII control
	//	characters, space, <, and >. If the URI includes these characters,
	//	they must be percent-encoded (e.g. %20 for a space).

	if i+1 >= len(s) || s[i] != '<' || !isLetter(s[i+1]) {
		return nil, 0, false
	}

	// Scan the scheme, giving up once it is certain to be too long.
	schemeStart := i + 1
	j := schemeStart
	for j < len(s) && isScheme(s[j]) && j-schemeStart <= 32 {
		j++
	}
	if n := j - schemeStart; n < 2 || n > 32 || j >= len(s) || s[j] != ':' {
		return nil, 0, false
	}

	// Scan the rest of the URI up to the closing >.
	for j++; j < len(s) && isURL(s[j]); j++ {
	}
	if j >= len(s) || s[j] != '>' {
		return nil, 0, false
	}

	uri := s[schemeStart:j]
	// uri = mdUnescaper.Replace(uri)
	return &AutoLink{Text: uri, URL: uri}, j + 1, true
}

// parseAutoLinkEmail parses a CommonMark email autolink of the form
// <user@domain> starting at s[i]. On success it returns an *AutoLink
// whose Text is the address and whose URL is "mailto:" followed by the
// address, the index just past the closing >, and true.
// Otherwise it returns nil, 0, false.
func parseAutoLinkEmail(s string, i int) (Inline, int, bool) {
	// CommonMark 0.30:
	//
	//	An email address, for these purposes, is anything that matches
	//	the non-normative regex from the HTML5 spec:
	//
	//	/^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/

	if i+1 >= len(s) || s[i] != '<' || !isUser(s[i+1]) {
		return nil, 0, false
	}

	// User part, up to the @.
	end := i + 1
	for end < len(s) && isUser(s[end]) {
		end++
	}
	if end >= len(s) || s[end] != '@' {
		return nil, 0, false
	}

	// Domain elements, each introduced by the separator at s[end]
	// (the @ first, then a . before every further element).
	for s[end] != '>' {
		n, ok := skipDomainElem(s[end+1:])
		if !ok {
			return nil, 0, false
		}
		end += 1 + n
		if end >= len(s) {
			return nil, 0, false
		}
		if c := s[end]; c != '.' && c != '>' {
			return nil, 0, false
		}
	}

	addr := s[i+1 : end]
	return &AutoLink{Text: addr, URL: "mailto:" + addr}, end + 1, true
}

// isUser reports whether c may appear in the user part
// (before the @) of an email autolink: a letter or digit
// as decided by isLetterDigit, or one of .!#$%&'*+/=?^_`{|}~-
func isUser(c byte) bool {
	const punct = ".!#$%&'*+/=?^_`{|}~-"
	return isLetterDigit(c) || strings.IndexByte(punct, c) >= 0
}

// isHexDigit reports whether c is an ASCII hexadecimal digit:
// 0-9, a-f, or A-F.
func isHexDigit(c byte) bool {
	switch {
	case '0' <= c && c <= '9',
		'a' <= c && c <= 'f',
		'A' <= c && c <= 'F':
		return true
	}
	return false
}

// isDigit reports whether c is an ASCII decimal digit, 0 through 9.
func isDigit(c byte) bool {
	if c < '0' {
		return false
	}
	return c <= '9'
}

// skipDomainElem scans one domain name element at the start of s.
// If s starts with a valid element, skipDomainElem returns its length
// and true; otherwise it returns 0, false.
func skipDomainElem(s string) (int, bool) {
	// String of LDH, up to 63 in length, with LetterDigit
	// at both ends (1-letter/digit names are OK).
	// Aka /[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?/.
	if s == "" || !isLetterDigit(s[0]) {
		return 0, false
	}

	// Scan at most one byte past the limit, just enough to detect overflow.
	n := 1
	for n <= 63 && n < len(s) && isLDH(s[n]) {
		n++
	}
	if n > 63 {
		return 0, false
	}
	if !isLetterDigit(s[n-1]) {
		// The element must not end in a hyphen.
		return 0, false
	}
	return n, true
}

// isScheme reports whether c may appear in a URI scheme:
// a letter or digit as decided by isLetterDigit, or one of + . -
func isScheme(c byte) bool {
	switch c {
	case '+', '.', '-':
		return true
	}
	return isLetterDigit(c)
}

// isURL reports whether c may appear in the body of a URI autolink:
// any byte greater than space other than < and >.
func isURL(c byte) bool {
	if c <= ' ' {
		return false
	}
	return c != '<' && c != '>'
}

// An AutoLink is an inline node for an autolink written
// between angle brackets. parseAutoLinkURI creates it with
// identical Text and URL; parseAutoLinkEmail creates it with
// URL set to "mailto:" followed by Text.
type AutoLink struct {
	Text string // text shown for the link; printed between < and > in Markdown
	URL  string // link target, written to the href attribute in HTML
}

// Inline is an empty marker method.
func (*AutoLink) Inline() {}

// PrintHTML writes x to buf as an HTML <a> element, with the URL
// passed through htmlLinkEscaper and the text through htmlEscaper.
func (x *AutoLink) PrintHTML(buf *bytes.Buffer) {
	href := htmlLinkEscaper.Replace(x.URL)
	text := htmlEscaper.Replace(x.Text)
	fmt.Fprintf(buf, "<a href=\"%s\">%s</a>", href, text)
}

// printMarkdown writes x to buf in Markdown form:
// the link text enclosed in angle brackets.
func (x *AutoLink) printMarkdown(buf *bytes.Buffer) {
	text := x.Text
	fmt.Fprintf(buf, "<%s>", text)
}

// PrintText writes the link text of x to buf,
// passed through htmlEscaper.
func (x *AutoLink) PrintText(buf *bytes.Buffer) {
	escaped := htmlEscaper.Replace(x.Text)
	fmt.Fprintf(buf, "%s", escaped)
}

// A Link is an inline node for a hyperlink.
// It is also the value stored for a link reference definition
// (see parseLinkRefDef), in which case Inner is left unset.
type Link struct {
	Inner     []Inline // link text, printed between the brackets / inside the <a> element
	URL       string   // link destination
	Title     string   // optional title; omitted from output when empty
	TitleChar byte     // ', " or )
	corner    bool     // set by parseLinkRefDef for an explicitly empty title, where other implementations differ
}

// Inline is an empty marker method.
func (*Link) Inline() {}

// PrintHTML writes x to buf as an HTML <a> element wrapping the
// HTML of its inner nodes. The title attribute is written only
// when Title is non-empty.
func (x *Link) PrintHTML(buf *bytes.Buffer) {
	href := htmlLinkEscaper.Replace(x.URL)
	fmt.Fprintf(buf, "<a href=\"%s\"", href)
	if x.Title != "" {
		title := htmlQuoteEscaper.Replace(x.Title)
		fmt.Fprintf(buf, " title=\"%s\"", title)
	}
	buf.WriteString(">")

	inner := x.Inner
	for i := range inner {
		inner[i].PrintHTML(buf)
	}
	buf.WriteString("</a>")
}

// printMarkdown writes x to buf in Markdown inline-link form:
// an opening bracket followed by everything printRemainingMarkdown writes.
func (x *Link) printMarkdown(buf *bytes.Buffer) {
	buf.WriteString("[")
	x.printRemainingMarkdown(buf)
}

// printRemainingMarkdown writes to buf everything of an inline link
// that follows the opening bracket: the inner nodes, "](", the URL,
// the optional title, and the closing parenthesis.
// It is shared by the Link and Image printers.
func (x *Link) printRemainingMarkdown(buf *bytes.Buffer) {
	inner := x.Inner
	for i := range inner {
		inner[i].printMarkdown(buf)
	}
	buf.WriteString("](")
	buf.WriteString(x.URL)
	printLinkTitleMarkdown(buf, x.Title, x.TitleChar)
	buf.WriteString(")")
}

// printLinkTitleMarkdown writes the Markdown form of a link title to buf:
// a space followed by the title wrapped in its delimiters.
// It writes nothing if title is empty.
//
// titleChar is the closing delimiter: ", ', or ). Any other value,
// including the zero value of a Link built without TitleChar,
// falls back to a double quote so that the output is always a
// well-formed title.
//
// Inside the title, delimiter characters are backslash-escaped,
// as is any backslash that would otherwise be read as the start of
// an escape (one followed by ASCII punctuation, or one ending the title,
// which would escape the closing delimiter). Other backslashes are
// literal in Markdown and are written unchanged.
// Text that merely looks like an entity reference is not escaped.
func printLinkTitleMarkdown(buf *bytes.Buffer, title string, titleChar byte) {
	if title == "" {
		return
	}

	closeChar := titleChar
	switch closeChar {
	case '"', '\'', ')':
		// Valid delimiter; use as is.
	default:
		closeChar = '"'
	}
	openChar := closeChar
	if closeChar == ')' {
		openChar = '('
	}

	// punct reports whether c is ASCII punctuation,
	// the set of characters a backslash can escape.
	punct := func(c byte) bool {
		return '!' <= c && c <= '/' || ':' <= c && c <= '@' || '[' <= c && c <= '`' || '{' <= c && c <= '~'
	}

	buf.WriteByte(' ')
	buf.WriteByte(openChar)
	for i := 0; i < len(title); i++ {
		c := title[i]
		switch {
		case c == openChar || c == closeChar:
			// An unescaped delimiter would end (or, for parentheses, break) the title.
			buf.WriteByte('\\')
		case c == '\\':
			if i+1 == len(title) || punct(title[i+1]) {
				buf.WriteByte('\\')
			}
		}
		buf.WriteByte(c)
	}
	buf.WriteByte(closeChar)
}

// PrintText writes the text form of each inner node of x to buf,
// in order. The URL and title are not written.
func (x *Link) PrintText(buf *bytes.Buffer) {
	inner := x.Inner
	for i := range inner {
		inner[i].PrintText(buf)
	}
}

// An Image is an inline node for an image.
// Its fields mirror those of Link exactly, which is what allows
// printMarkdown to convert an *Image to a *Link.
type Image struct {
	Inner     []Inline // image description; its text form becomes the alt attribute
	URL       string   // image source
	Title     string   // optional title; omitted from output when empty
	TitleChar byte     // closing delimiter of the title, as in Link
	corner    bool     // NOTE(review): presumably the same corner-case marker as Link.corner; not set in this file
}

// Inline is an empty marker method.
func (*Image) Inline() {}

// PrintHTML writes x to buf as an HTML <img /> element.
// The alt attribute is the text form of the inner nodes with every
// newline replaced by a space. The title attribute is written only
// when Title is non-empty.
func (x *Image) PrintHTML(buf *bytes.Buffer) {
	src := htmlLinkEscaper.Replace(x.URL)
	fmt.Fprintf(buf, "<img src=\"%s\"", src)
	buf.WriteString(" alt=\"")

	altStart := buf.Len()
	inner := x.Inner
	for k := range inner {
		inner[k].PrintText(buf)
	}

	// GitHub and Goldmark both rewrite \n to space
	// but the Dingus does not.
	// The spec says the description can be split across lines but not
	// what happens at that point.
	// The rewrite is done in place on the bytes just written.
	alt := buf.Bytes()[altStart:]
	for k, c := range alt {
		if c == '\n' {
			alt[k] = ' '
		}
	}

	buf.WriteString("\"")
	if x.Title != "" {
		title := htmlQuoteEscaper.Replace(x.Title)
		fmt.Fprintf(buf, " title=\"%s\"", title)
	}
	buf.WriteString(" />")
}

// printMarkdown writes x to buf in Markdown image form: "![" followed
// by the same remainder that a Link with identical fields would print.
func (x *Image) printMarkdown(buf *bytes.Buffer) {
	asLink := (*Link)(x)
	buf.WriteString("![")
	asLink.printRemainingMarkdown(buf)
}

// PrintText writes the text form of each inner node of x to buf,
// in order. The URL and title are not written.
func (x *Image) PrintText(buf *bytes.Buffer) {
	inner := x.Inner
	for i := range inner {
		inner[i].PrintText(buf)
	}
}

// GitHub Flavored Markdown autolinks extension
// https://github.github.com/gfm/#autolinks-extension-

// autoLinkText rewrites any extended autolinks in list
// and returns the result. If p.AutoLinkText is false, or nothing
// in the top level of list is rewritten, it returns list itself.
// The Inner lists of Strong, Del, and Emph nodes are rewritten
// recursively, in place.
//
// list holds Plain, Emph, Strong, and Del nodes.
// Two Plains only appear consecutively when one is a
// potential emphasis marker that ended up being plain after all, like "_" or "**".
// There are no Link nodes.
//
// The GitHub “spec” declares that “autolinks can only come at the
// beginning of a line, after whitespace, or any of the delimiting
// characters *, _, ~, and (”. However, the GitHub web site does not
// enforce this rule: text like "$abc@def.ghi is my email" links the
// text following the $ as an email address. It appears the actual rule
// is that autolinks cannot come after ASCII letters, although they can
// come after numbers or Unicode letters.
// Since the only point of implementing GitHub Flavored Markdown
// is to match GitHub's behavior, we do what they do, not what they say,
// at least for now.
func (p *parseState) autoLinkText(list []Inline) []Inline {
	if !p.AutoLinkText {
		return list
	}

	// rewritten stays nil until the first Plain is actually replaced,
	// so the common no-autolink case allocates nothing.
	var rewritten []Inline
	for i, elem := range list {
		if plain, isPlain := elem.(*Plain); isPlain {
			if links := p.autoLinkPlain(plain.Text); links != nil {
				if rewritten == nil {
					// First change: carry over the untouched prefix.
					rewritten = append(rewritten, list[:i]...)
				}
				rewritten = append(rewritten, links...)
				continue
			}
		} else {
			switch node := elem.(type) {
			case *Strong:
				node.Inner = p.autoLinkText(node.Inner)
			case *Del:
				node.Inner = p.autoLinkText(node.Inner)
			case *Emph:
				node.Inner = p.autoLinkText(node.Inner)
			}
		}
		if rewritten != nil {
			rewritten = append(rewritten, elem)
		}
	}

	if rewritten != nil {
		return rewritten
	}
	return list
}

// autoLinkPlain scans the plain text s for extended autolinks.
// If it finds none it returns nil. Otherwise it returns the text
// split into Plain and Link nodes; the final node is always a Plain
// holding whatever text follows the last link (possibly empty).
func (p *parseState) autoLinkPlain(s string) []Inline {
	// vd tracks the same text as s: every time s is cut down to
	// the text after a link, vd is advanced by the same amount.
	vd := &validDomainChecker{s: s}
	var out []Inline
Restart:
	// Each goto Restart begins a fresh scan at the start of the
	// (now shorter) s.
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c == '@' {
			if before, link, after, ok := p.parseAutoEmail(s, i); ok {
				if before != "" {
					out = append(out, &Plain{Text: before})
				}
				out = append(out, link)
				vd.skip(len(s) - len(after))
				s = after
				goto Restart
			}
		}

		// Possible start of http(s)://, mailto:, xmpp:, or www.,
		// allowed only when not preceded by a letter.
		if (c == 'h' || c == 'm' || c == 'x' || c == 'w') && (i == 0 || !isLetter(s[i-1])) {
			if link, after, ok := p.parseAutoProto(s, i, vd); ok {
				if i > 0 {
					out = append(out, &Plain{Text: s[:i]})
				}
				out = append(out, link)
				vd.skip(len(s) - len(after))
				s = after
				goto Restart
			}
		}
	}
	if out == nil {
		return nil
	}
	out = append(out, &Plain{Text: s})
	return out
}

// parseAutoProto tries to parse an extended autolink that starts at s[i]
// with one of the prefixes https://, http://, www., mailto:, or xmpp:,
// dispatching to the matching parser. It returns that parser's link,
// the text remaining after the link, and true, or nil, "", false
// if s is empty or no supported prefix starts at s[i].
func (p *parseState) parseAutoProto(s string, i int, vd *validDomainChecker) (link *Link, after string, found bool) {
	if s == "" {
		return nil, "", false
	}

	switch s[i] {
	case 'h':
		for _, scheme := range [...]string{"https://", "http://"} {
			if strings.HasPrefix(s[i:], scheme) {
				domain := i + len(scheme)
				return p.parseAutoHTTP(s[i:domain], s, i, domain, domain+1, vd)
			}
		}

	case 'w':
		if strings.HasPrefix(s[i:], "www.") {
			// GitHub Flavored Markdown says to use http://,
			// but it's not 1985 anymore. We live in the https:// future
			// (unless the parser is explicitly configured otherwise).
			// People who really care in their docs can write http:// themselves.
			scheme := "https://"
			if p.AutoLinkAssumeHTTP {
				scheme = "http://"
			}
			return p.parseAutoHTTP(scheme, s, i, i, i+3, vd)
		}

	case 'm':
		if strings.HasPrefix(s[i:], "mailto:") {
			return p.parseAutoMailto(s, i)
		}

	case 'x':
		if strings.HasPrefix(s[i:], "xmpp:") {
			return p.parseAutoXmpp(s, i)
		}
	}
	return nil, "", false
}

// parseAutoHTTP parses an extended www or URL autolink.
// https://github.github.com/gfm/#extended-www-autolink
//
// scheme is the text prepended to the link target, textstart is the
// index in s where the displayed link text begins, start is the index
// where the domain begins, and min is the smallest acceptable end index:
// trailing-punctuation trimming never goes below it, and a link
// ending before it is rejected.
// The domain check is delegated to vd at index start
// (NOTE(review): this presumes vd.s is the same text as s — confirm against callers).
//
// On success it returns the link, the text of s after the link, and true.
// On failure it returns the zero values nil, "", false.
func (p *parseState) parseAutoHTTP(scheme, s string, textstart, start, min int, vd *validDomainChecker) (link *Link, after string, found bool) {
	n, ok := vd.parseValidDomain(start)
	if !ok {
		return
	}
	i := start + n
	domEnd := i

	// “After a valid domain, zero or more non-space non-< characters may follow.”
	// paren counts ( minus ) over the scanned text; a negative value
	// means there are more closing than opening parentheses.
	paren := 0
	for i < len(s) {
		r, n := utf8.DecodeRuneInString(s[i:])
		if isUnicodeSpace(r) || r == '<' {
			break
		}
		if r == '(' {
			paren++
		}
		if r == ')' {
			paren--
		}
		i += n
	}

	// https://github.github.com/gfm/#extended-autolink-path-validation
Trim:
	for i > min {
		switch s[i-1] {
		case '?', '!', '.', ',', ':', '@', '_', '~':
			// Trim certain trailing punctuation.
			i--
			continue Trim

		case ')':
			// Trim trailing unmatched (by count only) parens.
			if paren < 0 {
				for s[i-1] == ')' && paren < 0 {
					paren++
					i--
				}
				continue Trim
			}

		case ';':
			// Trim entity reference.
			// After doing the work of the scan, we either cut that part off the string
			// or we stop the trimming entirely, so there's no chance of repeating
			// the scan on a future iteration and going accidentally quadratic.
			// Even though the Markdown spec already requires having a complete
			// list of all the HTML entities, the GitHub definition here just requires
			// "looks like" an entity, meaning it's an ampersand, letters/digits, and semicolon.
			for j := i - 2; j > start; j-- {
				if j < i-2 && s[j] == '&' {
					i = j
					continue Trim
				}
				if !isLetterDigit(s[j]) {
					break Trim
				}
			}
		}
		// Nothing (more) to trim.
		break Trim
	}

	// According to the literal text of the GitHub Flavored Markdown spec
	// and the actual behavior on GitHub,
	// www.example.com$foo turns into <a href="https://www.example.com$foo">,
	// but that makes the character restrictions in the valid-domain check
	// almost meaningless. So we insist that when all is said and done,
	// if the domain is followed by anything, that thing must be a slash,
	// even though GitHub is not that picky.
	// People might complain about www.example.com:1234 not working,
	// but if you want to get fancy with that kind of thing, just write http:// in front.
	// (textstart == start holds only when no scheme was written in the text.)
	if textstart == start && i > domEnd && s[domEnd] != '/' {
		i = domEnd
	}

	if i < min {
		return
	}

	link = &Link{
		Inner: []Inline{&Plain{Text: s[textstart:i]}},
		URL:   scheme + s[start:i],
	}
	return link, s[i:], true
}

// A validDomainChecker checks for valid domains in s,
// remembering in cut how much of s has already been found
// not to contain one so that the text is not rescanned.
type validDomainChecker struct {
	s   string
	cut int // before this index, no valid domains
}

// skip discards the first i bytes of v.s and shifts v.cut
// by the same amount so that it keeps referring to the same text.
func (v *validDomainChecker) skip(i int) {
	v.s = v.s[i:]
	v.cut -= i
}

// parseValidDomain parses a valid domain.
// https://github.github.com/gfm/#valid-domain
//
// If v.s has a valid domain starting at index start, parseValidDomain
// returns the length of that domain and true. Otherwise it returns
// 0, false, and records in v.cut the index where the scan stopped,
// so that later calls with a start before that index fail immediately.
//
// “A valid domain consists of segments of alphanumeric characters,
// underscores (_) and hyphens (-) separated by periods (.).
// There must be at least one period, and no underscores may be
// present in the last two segments of the domain.”
//
// The spec does not spell out whether segments can be empty.
// Empirically, in GitHub's implementation they can.
func (v *validDomainChecker) parseValidDomain(start int) (n int, found bool) {
	if start < v.cut {
		return 0, false
	}

	// dots counts periods seen; an underscore resets it to -2, so it
	// is non-negative again only after two more periods have followed.
	dots := 0
	end := start
Scan:
	for end < len(v.s) {
		switch c := v.s[end]; {
		case c == '_':
			dots = -2
		case c == '.':
			dots++
		case !isLDH(c):
			break Scan
		}
		end++
	}

	if end > start && dots >= 0 {
		return end - start, true
	}
	v.cut = end
	return 0, false
}

// parseAutoEmail tries to parse an extended email autolink around
// the '@' at s[i], scanning backward for the user part and forward
// for the domain. On success it returns the text of s before the
// address, a Link whose text is the address and whose URL is
// "mailto:" followed by the address, the text of s after the address,
// and true. On failure it returns the zero values.
func (p *parseState) parseAutoEmail(s string, i int) (before string, link *Link, after string, ok bool) {
	if s[i] != '@' {
		return
	}

	// “One or more characters which are alphanumeric, or ., -, _, or +.”
	// j moves backward to the start of the user part.
	j := i
	for j > 0 && (isLDH(s[j-1]) || s[j-1] == '_' || s[j-1] == '+' || s[j-1] == '.') {
		j--
	}
	if i-j < 1 {
		return
	}

	// “One or more characters which are alphanumeric, or - or _, separated by periods (.).
	// There must be at least one period. The last character must not be one of - or _.”
	// k moves forward to the end of the domain.
	dots := 0
	k := i + 1
	for k < len(s) && (isLDH(s[k]) || s[k] == '_' || s[k] == '.') {
		if s[k] == '.' {
			if s[k-1] == '.' {
				// Empirically, .. stops the scan but foo@.bar is fine.
				break
			}
			dots++
		}
		k++
	}

	// “., -, and _ can occur on both sides of the @, but only . may occur at the end
	// of the email address, in which case it will not be considered part of the address”
	if s[k-1] == '.' {
		dots--
		k--
	}
	if s[k-1] == '-' || s[k-1] == '_' {
		return
	}
	// Require at least one period and at least two domain bytes
	// that are not periods.
	if k-(i+1)-dots < 2 || dots < 1 {
		return
	}

	link = &Link{
		Inner: []Inline{&Plain{Text: s[j:k]}},
		URL:   "mailto:" + s[j:k],
	}
	return s[:j], link, s[k:], true
}

// parseAutoMailto tries to parse an extended autolink of the form
// mailto:user@domain, where s[i:] is known to start with "mailto:".
// On success it returns a Link whose text includes the mailto: prefix,
// the text of s after the link, and true.
// Otherwise it returns nil, "", false.
func (p *parseState) parseAutoMailto(s string, i int) (link *Link, after string, ok bool) {
	const prefix = "mailto:"

	// Find the @ that must end the user part.
	at := i + len(prefix)
	for at < len(s) && (isLDH(s[at]) || s[at] == '_' || s[at] == '+' || s[at] == '.') {
		at++
	}
	if at >= len(s) || s[at] != '@' {
		return nil, "", false
	}

	// The address must begin immediately after the prefix,
	// so the text before it must be exactly the prefix.
	before, email, rest, found := p.parseAutoEmail(s[i:], at-i)
	if !found || before != prefix {
		return nil, "", false
	}
	email.Inner[0] = &Plain{Text: s[i : len(s)-len(rest)]}
	return email, rest, true
}

// parseAutoXmpp tries to parse an extended autolink of the form
// xmpp:user@domain, optionally followed by a /resource made of
// letters, digits, @, and periods, where s[i:] is known to start
// with "xmpp:". On success it returns a Link whose text and URL are
// both the full matched text, the text of s after the link, and true.
// Otherwise it returns nil, "", false.
func (p *parseState) parseAutoXmpp(s string, i int) (link *Link, after string, ok bool) {
	const prefix = "xmpp:"

	// Find the @ that must end the user part.
	at := i + len(prefix)
	for at < len(s) && (isLDH(s[at]) || s[at] == '_' || s[at] == '+' || s[at] == '.') {
		at++
	}
	if at >= len(s) || s[at] != '@' {
		return nil, "", false
	}

	// The address must begin immediately after the prefix,
	// so the text before it must be exactly the prefix.
	before, jid, rest, found := p.parseAutoEmail(s[i:], at-i)
	if !found || before != prefix {
		return nil, "", false
	}

	// Absorb an optional /resource suffix into the link.
	if rest != "" && rest[0] == '/' {
		n := 1
		for n < len(rest) && (isLetterDigit(rest[n]) || rest[n] == '@' || rest[n] == '.') {
			n++
		}
		rest = rest[n:]
	}

	full := s[i : len(s)-len(rest)]
	jid.Inner[0] = &Plain{Text: full}
	jid.URL = full
	return jid, rest, true
}