From 6e4a912b1196de10c1516ffd6279476f7fb4de81 Mon Sep 17 00:00:00 2001
From: Bernd Ahlers <bernd@graylog.com>
Date: Fri, 31 Mar 2023 10:51:50 +0200
Subject: [PATCH 1/2] Implement a custom parser for <ac:*/> tags

Replace the workaround that substituted a magic string for the colons in
<ac:*/> tags with a custom parser that parses these tags as
ast.KindRawHTML.

The custom parser is a stripped down version of goldmark's rawHTMLParser.
---
 pkg/mark/ac_tag_parser.go            | 112 +++++++++++++++++++++++++++
 pkg/mark/markdown.go                 |  22 ++----
 pkg/mark/testdata/macro-include.html |   4 +-
 3 files changed, 121 insertions(+), 17 deletions(-)
 create mode 100644 pkg/mark/ac_tag_parser.go

diff --git a/pkg/mark/ac_tag_parser.go b/pkg/mark/ac_tag_parser.go
new file mode 100644
index 0000000..763a1ca
--- /dev/null
+++ b/pkg/mark/ac_tag_parser.go
@@ -0,0 +1,112 @@
+package mark
+
+import (
+	"bytes"
+	"github.com/yuin/goldmark/ast"
+	"github.com/yuin/goldmark/parser"
+	"github.com/yuin/goldmark/text"
+	"github.com/yuin/goldmark/util"
+	"regexp"
+)
+
+// NewACTagParser returns an inline parser that turns Confluence-specific <ac:* /> tags into raw HTML nodes
+// (ast.KindRawHTML) so they are not escaped at render time. The parser must be registered with a higher priority
+// than goldmark's linkParser. Otherwise, the linkParser would parse the <ac:* /> tags.
+func NewACTagParser() parser.InlineParser {
+	return &acTagParser{}
+}
+
+var _ parser.InlineParser = (*acTagParser)(nil) // compile-time check that *acTagParser satisfies parser.InlineParser
+
+// acTagParser is a stripped down version of goldmark's rawHTMLParser whose tag patterns require the "ac:" prefix.
+// See: https://github.com/yuin/goldmark/blob/master/parser/raw_html.go
+type acTagParser struct {
+}
+
+func (s *acTagParser) Trigger() []byte {
+	return []byte{'<'} // '<' is the only trigger byte this parser registers
+}
+
+func (s *acTagParser) Parse(_ ast.Node, block text.Reader, pc parser.Context) ast.Node {
+	line, _ := block.PeekLine()
+	if len(line) > 1 && util.IsAlphaNumeric(line[1]) { // "<x...": try an opening or self-closing <ac:...> tag
+		return s.parseMultiLineRegexp(openTagRegexp, block, pc)
+	}
+	if len(line) > 2 && line[1] == '/' && util.IsAlphaNumeric(line[2]) { // "</x...": try a closing </ac:...> tag
+		return s.parseMultiLineRegexp(closeTagRegexp, block, pc)
+	}
+	if len(line) > 2 && line[1] == '!' && line[2] >= 'A' && line[2] <= 'Z' { // "<!X...": consumed up to the next ">", not limited to ac: tags
+		return s.parseUntil(block, closeDecl, pc)
+	}
+	if bytes.HasPrefix(line, openCDATA) { // "<![CDATA[": consumed up to the next "]]>", not limited to ac: tags
+		return s.parseUntil(block, closeCDATA, pc)
+	}
+	return nil // none of the recognized prefixes matched
+}
+
+var tagnamePattern = `([A-Za-z][A-Za-z0-9-]*)` // tag name that follows the "ac:" prefix in the regexps below
+
+var attributePattern = `(?:[\r\n \t]+[a-zA-Z_:][a-zA-Z0-9:._-]*(?:[\r\n \t]*=[\r\n \t]*(?:[^\"'=<>` + "`" + `\x00-\x20]+|'[^']*'|"[^"]*"))?)`
+
+// Both tag patterns are anchored with ^ and require the "ac:" prefix, so only <ac:*> and </ac:*> tags match.
+var openTagRegexp = regexp.MustCompile("^<ac:" + tagnamePattern + attributePattern + `*[ \t]*/?>`)
+var closeTagRegexp = regexp.MustCompile("^</ac:" + tagnamePattern + `\s*>`)
+
+var openCDATA = []byte("<![CDATA[")
+var closeCDATA = []byte("]]>")
+var closeDecl = []byte(">")
+
+func (s *acTagParser) parseUntil(block text.Reader, closer []byte, _ parser.Context) ast.Node {
+	savedLine, savedSegment := block.Position() // remembered so the reader can be rewound if closer is never found
+	node := ast.NewRawHTML()
+	for {
+		line, segment := block.PeekLine()
+		if line == nil { // nothing left to read: closer was not found
+			break
+		}
+		index := bytes.Index(line, closer)
+		if index > -1 {
+			node.Segments.Append(segment.WithStop(segment.Start + index + len(closer))) // keep this line up to and including closer
+			block.Advance(index + len(closer))
+			return node
+		}
+		node.Segments.Append(segment) // closer is not on this line: keep the whole line and try the next one
+		block.AdvanceLine()
+	}
+	block.SetPosition(savedLine, savedSegment)
+	return nil // the collected node is discarded; the reader is back where it started
+}
+
+func (s *acTagParser) parseMultiLineRegexp(reg *regexp.Regexp, block text.Reader, _ parser.Context) ast.Node {
+	sline, ssegment := block.Position() // position where the candidate tag starts
+	if block.Match(reg) {
+		node := ast.NewRawHTML()
+		eline, esegment := block.Position() // position after the match; presumably Match advanced the reader (confirm)
+		block.SetPosition(sline, ssegment)
+		for { // replay from the start position, one line at a time, collecting a segment per line
+			line, segment := block.PeekLine()
+			if line == nil {
+				break
+			}
+			l, _ := block.Position()
+			start := segment.Start
+			if l == sline { // on the first line the segment begins at the saved start position
+				start = ssegment.Start
+			}
+			end := segment.Stop
+			if l == eline { // on the last line the segment stops where the match ended
+				end = esegment.Start
+			}
+
+			node.Segments.Append(text.NewSegment(start, end))
+			if l == eline {
+				block.Advance(end - start) // leave the reader just past the matched text
+				break
+			} else {
+				block.AdvanceLine()
+			}
+		}
+		return node
+	}
+	return nil // reg did not match at the current position
+}
diff --git a/pkg/mark/markdown.go b/pkg/mark/markdown.go
index 2ff2b71..3c44ab8 100644
--- a/pkg/mark/markdown.go
+++ b/pkg/mark/markdown.go
@@ -430,22 +430,9 @@ func (r *ConfluenceRenderer) renderCodeBlock(writer util.BufWriter, source []byt
 	return ast.WalkContinue, nil
 }
 
-// compileMarkdown will replace tags like <ac:rich-tech-body> with escaped
-// equivalent, because goldmark markdown parser replaces that tags with
-// <a href="ac:rich-text-body">ac:rich-text-body</a> because of the autolink
-// rule.
 func CompileMarkdown(markdown []byte, stdlib *stdlib.Lib) string {
 	log.Tracef(nil, "rendering markdown:\n%s", string(markdown))
 
-	colon := []byte("---bf-COLON---")
-
-	tags := regexp.MustCompile(`</?ac:[^>]+>`)
-
-	for _, match := range tags.FindAll(markdown, -1) {
-		// Replace the colon in all "<ac:*>" tags with the colon bytes to avoid having Goldmark escape the HTML output.
-		markdown = bytes.ReplaceAll(markdown, match, bytes.ReplaceAll(match, []byte(":"), colon))
-	}
-
 	converter := goldmark.New(
 		goldmark.WithExtensions(
 			extension.GFM,
@@ -461,6 +448,12 @@ func CompileMarkdown(markdown []byte, stdlib *stdlib.Lib) string {
 			html.WithUnsafe(),
 		))
 
+	converter.Parser().AddOptions(parser.WithInlineParsers(
+		// Must be registered with a higher priority than goldmark's linkParser to make sure goldmark doesn't parse
+		// the <ac:*/> tags.
+		util.Prioritized(NewACTagParser(), 199),
+	))
+
 	converter.Renderer().AddOptions(renderer.WithNodeRenderers(
 		util.Prioritized(NewConfluenceRenderer(stdlib), 100),
 	))
@@ -472,8 +465,7 @@ func CompileMarkdown(markdown []byte, stdlib *stdlib.Lib) string {
 		panic(err)
 	}
 
-	// Restore all the colons we previously replaced.
-	html := bytes.ReplaceAll(buf.Bytes(), colon, []byte(":"))
+	html := buf.Bytes()
 
 	log.Tracef(nil, "rendered markdown to html:\n%s", string(html))
 
diff --git a/pkg/mark/testdata/macro-include.html b/pkg/mark/testdata/macro-include.html
index 55f1496..8d8ba05 100644
--- a/pkg/mark/testdata/macro-include.html
+++ b/pkg/mark/testdata/macro-include.html
@@ -1,6 +1,6 @@
 <p><foo>bar</foo></p>
-<ac:structured-macro ac:name="info">
+<p><ac:structured-macro ac:name="info">
 <ac:parameter ac:name="icon">true</ac:parameter>
 <ac:parameter ac:name="title">Attention</ac:parameter>
 <ac:rich-text-body>This is an info!</ac:rich-text-body>
-</ac:structured-macro>
\ No newline at end of file
+</ac:structured-macro></p>

From 80d906417c230c86e539e3765b36df6a6ae37af8 Mon Sep 17 00:00:00 2001
From: Bernd Ahlers <bernd@graylog.com>
Date: Fri, 31 Mar 2023 11:09:58 +0200
Subject: [PATCH 2/2] Fix custom link renderer and add tests for Confluence
 links

Since we now have a custom parser for <ac:*/> tags, the custom link
renderer added an additional </a> tag at the end of each internal
Confluence link.

Add tests for internal links and add an example for internal links with
spaces in page titles to the README file.
---
 README.md                    | 2 ++
 pkg/mark/markdown.go         | 3 +--
 pkg/mark/testdata/links.html | 4 ++++
 pkg/mark/testdata/links.md   | 8 ++++++++
 4 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 78cb49a..c8f3b8f 100644
--- a/README.md
+++ b/README.md
@@ -517,6 +517,8 @@ See task MYJIRA-123.
 This is a [link to an existing confluence page](ac:Pagetitle)
 
 And this is how to link when the linktext is the same as the [Pagetitle](ac:)
+
+Link to a [page title with space](<ac:With Space>)
 ```
 
 ### Add width for an image
diff --git a/pkg/mark/markdown.go b/pkg/mark/markdown.go
index 3c44ab8..680375d 100644
--- a/pkg/mark/markdown.go
+++ b/pkg/mark/markdown.go
@@ -276,9 +276,8 @@ func (r *ConfluenceRenderer) renderLink(writer util.BufWriter, source []byte, no
 			if err != nil {
 				return ast.WalkStop, err
 			}
-
-			return ast.WalkSkipChildren, nil
 		}
+		return ast.WalkSkipChildren, nil
 	}
 	return r.goldmarkRenderLink(writer, source, node, entering)
 }
diff --git a/pkg/mark/testdata/links.html b/pkg/mark/testdata/links.html
index 3b1f468..625f00e 100644
--- a/pkg/mark/testdata/links.html
+++ b/pkg/mark/testdata/links.html
@@ -1,5 +1,9 @@
 <p>Use <a href="https://example.com">https://example.com</a></p>
 <p>Use <ac:rich-text-body>aaa</ac:rich-text-body></p>
+<p>Use <ac:link><ri:page ri:content-title="Page"/><ac:plain-text-link-body><![CDATA[page link]]></ac:plain-text-link-body></ac:link></p>
+<p>Use <ac:link><ri:page ri:content-title="AnotherPage"/><ac:plain-text-link-body><![CDATA[AnotherPage]]></ac:plain-text-link-body></ac:link></p>
+<p>Use <ac:link><ri:page ri:content-title="Another Page"/><ac:plain-text-link-body><![CDATA[Another Page]]></ac:plain-text-link-body></ac:link></p>
+<p>Use <ac:link><ri:page ri:content-title="Page With Space"/><ac:plain-text-link-body><![CDATA[page link with spaces]]></ac:plain-text-link-body></ac:link></p>
 <p>Use footnotes link <sup id="fnref:1"><a href="#fn:1" class="footnote-ref" role="doc-noteref">1</a></sup></p>
 <div class="footnotes" role="doc-endnotes">
 <hr />
diff --git a/pkg/mark/testdata/links.md b/pkg/mark/testdata/links.md
index f814795..ce4a282 100644
--- a/pkg/mark/testdata/links.md
+++ b/pkg/mark/testdata/links.md
@@ -2,5 +2,13 @@ Use <https://example.com>
 
 Use <ac:rich-text-body>aaa</ac:rich-text-body>
 
+Use [page link](ac:Page)
+
+Use [AnotherPage](ac:)
+
+Use [Another Page](ac:)
+
+Use [page link with spaces](<ac:Page With Space>)
+
 Use footnotes link [^1]  
 [^1]: a footnote link
\ No newline at end of file