mark/page/link.go

package page

import (
	"bytes"
	"fmt"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"strings"

	"github.com/kovetskiy/mark/confluence"
	"github.com/kovetskiy/mark/metadata"
	"github.com/reconquest/karma-go"
	"github.com/reconquest/pkg/log"
)

// LinkSubstitution describes one link rewrite: every Markdown link whose
// target is From is rewritten by SubstituteLinks to point at To.
type LinkSubstitution struct {
	From string // link target exactly as written in the Markdown source
	To   string // target that replaces From
}

// markdownLink is the target of a single inline Markdown link as extracted by
// parseLinks, split at the '#' that separates the file from the anchor.
type markdownLink struct {
	full     string // whole target as written between the parentheses
	filename string // part of the target before '#'; empty for pure anchor links
	hash     string // part of the target after '#'; empty when there is no anchor
}

// ResolveRelativeLinks scans markdown for inline links and tries to resolve
// each of them, relative to the base directory, to a Confluence link.
//
// The space handed to the resolver is spaceFromCli when it is set; otherwise
// it is inherited from meta (when meta is non-nil) so that links are looked up
// in the space of the document being processed. The remaining parameters are
// forwarded unchanged to resolveLink.
//
// It returns one LinkSubstitution per link that resolved to a non-empty
// target. Links that could not be resolved are omitted. The first resolver
// error aborts the scan and is returned wrapped with the offending link.
func ResolveRelativeLinks(
	api *confluence.API,
	meta *metadata.Meta,
	markdown []byte,
	base string,
	spaceFromCli string,
	titleFromH1 bool,
	titleFromFilename bool,
	parents []string,
	titleAppendGeneratedHash bool,
) ([]LinkSubstitution, error) {
	// Without an explicit --space, fall back to the space of the current
	// document.
	space := spaceFromCli
	if space == "" && meta != nil {
		space = meta.Space
	}

	substitutions := []LinkSubstitution{}
	for _, candidate := range parseLinks(string(markdown)) {
		log.Tracef(
			nil,
			"found a relative link: full=%s filename=%s hash=%s",
			candidate.full,
			candidate.filename,
			candidate.hash,
		)

		target, err := resolveLink(
			api,
			base,
			candidate,
			space,
			titleFromH1,
			titleFromFilename,
			parents,
			titleAppendGeneratedHash,
		)
		if err != nil {
			return nil, karma.Format(err, "resolve link: %q", candidate.full)
		}

		// An empty target means "leave this link as it is".
		if target != "" {
			substitutions = append(substitutions, LinkSubstitution{
				From: candidate.full,
				To:   target,
			})
		}
	}

	return substitutions, nil
}

// resolveLink turns a single parsed Markdown link into the target it should
// be rewritten to.
//
// When link.filename is set, the file is looked up relative to base. The link
// is left alone (empty result, nil error) when the path does not exist, is a
// directory, does not look like a text file, carries no usable metadata, or
// when no matching Confluence page is found. Otherwise the result is the
// Confluence link of the page identified by the file's metadata. When
// link.hash is set, "#"+link.hash is appended to the result; for a link
// without a filename the result is therefore just the anchor.
//
// An error is returned only when the file exists but cannot be read, or when
// the Confluence lookup itself fails.
func resolveLink(
	api *confluence.API,
	base string,
	link markdownLink,
	spaceForLinks string,
	titleFromH1 bool,
	titleFromFilename bool,
	parents []string,
	titleAppendGeneratedHash bool,
) (string, error) {
	var result string

	if len(link.filename) > 0 {
		// Named "target" rather than "filepath" so the path/filepath package
		// is not shadowed for the rest of the block.
		target := filepath.Join(base, link.filename)

		log.Tracef(nil, "filepath: %s", target)

		// Anything that is not an existing regular path (external URLs,
		// missing files, directories) is not a relative link to a page.
		stat, err := os.Stat(target)
		if err != nil || stat.IsDir() {
			return "", nil
		}

		// Check the read error before looking at the bytes: os.ReadFile may
		// return partial data together with an error, and sniffing that data
		// first could silently drop the failure.
		linkContents, err := os.ReadFile(target)
		if err != nil {
			return "", karma.Format(err, "read file: %s", target)
		}

		// Only text files can be Markdown pages; skip everything else.
		contentType := http.DetectContentType(linkContents)
		if !strings.HasPrefix(contentType, "text/") {
			log.Debugf(nil, "Ignoring link to file %q: detected content type %v", target, contentType)
			return "", nil
		}

		// Normalize Windows line endings before extracting metadata.
		linkContents = bytes.ReplaceAll(
			linkContents,
			[]byte("\r\n"),
			[]byte("\n"),
		)

		// This helps to determine whether the link points to a file that is
		// not Markdown or lacks the metadata mark requires.
		linkMeta, _, err := metadata.ExtractMeta(linkContents, spaceForLinks, titleFromH1, titleFromFilename, target, parents, titleAppendGeneratedHash)
		if err != nil {
			// Best effort: a broken target must not fail the whole document.
			log.Errorf(
				err,
				"unable to extract metadata from %q; ignoring the relative link",
				target,
			)

			return "", nil
		}

		if linkMeta == nil {
			return "", nil
		}

		log.Tracef(
			nil,
			"extracted metadata: space=%s title=%s",
			linkMeta.Space,
			linkMeta.Title,
		)

		result, err = getConfluenceLink(api, linkMeta.Space, linkMeta.Title)
		if err != nil {
			return "", karma.Format(
				err,
				"find confluence page: %s / %s / %s",
				target,
				linkMeta.Space,
				linkMeta.Title,
			)
		}

		// No page to link to: keep the original link.
		if result == "" {
			return "", nil
		}
	}

	if len(link.hash) > 0 {
		result = result + "#" + link.hash
	}

	return result, nil
}

// SubstituteLinks applies every substitution in links to markdown and returns
// the rewritten document. A link is matched by its target including the
// surrounding "](" and ")" so only link targets are touched, and all
// occurrences of a target are replaced. Substitutions whose From and To are
// equal are skipped.
func SubstituteLinks(markdown []byte, links []LinkSubstitution) []byte {
	for i := range links {
		from, to := links[i].From, links[i].To
		if from == to {
			continue
		}

		log.Tracef(nil, "substitute link: %q -> %q", from, to)

		needle := []byte(fmt.Sprintf("](%s)", from))
		replacement := []byte(fmt.Sprintf("](%s)", to))
		markdown = bytes.ReplaceAll(markdown, needle, replacement)
	}

	return markdown
}

// markdownLinkPattern matches one inline Markdown link "[text](target)".
//
// The link text may contain backslash-escaped characters and one level of
// nested square brackets (for example "[see [1]](doc.md)" or an image wrapped
// in a link), but it never spans a line break. Bounding the text this way
// keeps several links on the same line from being merged into one match.
//
// Capture groups:
//
//	1: the whole target between the parentheses
//	2: the part of the target before the first '#', if any
//	3: the part of the target after the first '#', if any
//
// The pattern itself does not tell links from inline images; parseLinks
// checks the preceding byte instead, because RE2 has no look-behind.
var markdownLinkPattern = regexp.MustCompile(
	`\[(?:\\.|[^\[\]\\\n]|\[(?:\\.|[^\[\]\\\n])*\])+\]` +
		`\((([^\)#]+)?#?([^\)]+)?)\)`,
)

// parseLinks returns the targets of all inline links found in markdown, in
// document order. Inline images ("![alt](src)") are skipped. Link text nested
// more than one bracket level deep is not recognized.
func parseLinks(markdown string) []markdownLink {
	locs := markdownLinkPattern.FindAllStringSubmatchIndex(markdown, -1)

	links := make([]markdownLink, 0, len(locs))
	for _, loc := range locs {
		// loc[0] is the offset of the opening '['. A '!' right before it
		// makes this an inline image rather than a link.
		if loc[0] > 0 && markdown[loc[0]-1] == '!' {
			continue
		}

		// group returns the text of capture group n, or "" when that group
		// did not take part in the match (its offsets are then -1).
		group := func(n int) string {
			start, end := loc[2*n], loc[2*n+1]
			if start < 0 {
				return ""
			}

			return markdown[start:end]
		}

		links = append(links, markdownLink{
			full:     group(1),
			filename: group(2),
			hash:     group(3),
		})
	}

	return links
}

// getConfluenceLink looks up the page with the given title in the given space
// through the API and returns an absolute link to it.
//
// It returns an empty string and a nil error when the API reports no such
// page, since no link can be produced without an existing page. Lookup and
// URL construction failures are returned wrapped with context.
func getConfluenceLink(
	api *confluence.API,
	space, title string,
) (string, error) {
	found, err := api.FindPage(space, title, "page")
	switch {
	case err != nil:
		return "", karma.Format(err, "api: find page")
	case found == nil:
		// Without a page ID there is no stable way to produce
		// /wiki/spaces/<space>/pages/<id>/<name>.
		return "", nil
	}

	// The web UI link may come back as a path ("/wiki/..." or
	// "/ex/confluence/<cloudId>/wiki/...") or as a full absolute URL; the
	// helper below handles all of these.
	link, err := makeAbsoluteConfluenceWebUIURL(api.BaseURL, found.Links.Full)
	if err != nil {
		return "", karma.Format(err, "build confluence webui URL")
	}

	return link, nil
}

// makeAbsoluteConfluenceWebUIURL turns the web UI link of a page into an
// absolute URL.
//
// webui may be a path, with or without a leading slash, or a full URL. When
// it is a full URL its own scheme and host are used; otherwise the result is
// rooted at baseURL. The path is passed through
// normalizeConfluenceWebUIPath, and the query string and fragment of webui
// are preserved.
//
// It returns an empty string and a nil error when webui is empty or has no
// path, and an error when webui cannot be parsed as a URL.
func makeAbsoluteConfluenceWebUIURL(baseURL string, webui string) (string, error) {
	if webui == "" {
		return "", nil
	}

	u, err := url.Parse(webui)
	if err != nil {
		return "", err
	}

	// Use the escaped path: u.Path is stored decoded, so rebuilding the URL
	// from it would turn "%3F", "%23", "%2F" or "%20" into raw '?', '#', '/'
	// or ' ' and change (or break) the resulting link.
	path := normalizeConfluenceWebUIPath(u.EscapedPath())
	if path == "" {
		return "", nil
	}

	// If Confluence returns an absolute URL, trust its host/scheme.
	if u.Scheme != "" && u.Host != "" {
		baseURL = u.Scheme + "://" + u.Host
	}

	// Join base and path with exactly one slash between them.
	baseURL = strings.TrimSuffix(baseURL, "/")
	if !strings.HasPrefix(path, "/") {
		path = "/" + path
	}

	result := baseURL + path
	if u.RawQuery != "" {
		result += "?" + u.RawQuery
	}
	// The fragment is kept in its escaped form for the same reason as the
	// path.
	if fragment := u.EscapedFragment(); fragment != "" {
		result += "#" + fragment
	}

	return result, nil
}

// confluenceExperiencePathPattern matches Confluence Cloud "experience" paths
// of the form "/ex/confluence/<cloudId>/wiki/..." and captures the trailing
// "/wiki/..." part. It is compiled once at package initialization rather than
// on every call.
var confluenceExperiencePathPattern = regexp.MustCompile(`^/ex/confluence/[^/]+(/wiki/.*)$`)

// normalizeConfluenceWebUIPath rewrites Confluence Cloud "experience" paths
// ("/ex/confluence/<cloudId>/wiki/...") to canonical wiki paths
// ("/wiki/..."). Any other path, including the empty string, is returned
// unchanged.
func normalizeConfluenceWebUIPath(path string) string {
	match := confluenceExperiencePathPattern.FindStringSubmatch(path)
	if match == nil {
		return path
	}

	// match[1] is the "/wiki/..." tail captured by the pattern.
	return match[1]
}
*: Reorganize code 2024-09-26 15:24:39 +02:00			`package page`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00
			`import (`
			`"bytes"`
			`"fmt"`
Use mimesniffing to detect text files 2025-12-08 21:32:28 +01:00			`"net/http"`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00			`"net/url"`
			`"os"`
			`"path/filepath"`
			`"regexp"`
Use mimesniffing to detect text files 2025-12-08 21:32:28 +01:00			`"strings"`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00
*: Reorganize code 2024-09-26 15:24:39 +02:00			`"github.com/kovetskiy/mark/confluence"`
			`"github.com/kovetskiy/mark/metadata"`
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`"github.com/reconquest/karma-go"`
			`"github.com/reconquest/pkg/log"`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00			`)`

Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`type LinkSubstitution struct {`
			`From string`
			`To string`
			`}`

			`type markdownLink struct {`
			`full string`
			`filename string`
			`hash string`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00			`}`

			`func ResolveRelativeLinks(`
			`api *confluence.API,`
*: Reorganize code 2024-09-26 15:24:39 +02:00			`meta *metadata.Meta,`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00			`markdown []byte,`
			`base string,`
Include space parameter as well 2023-03-20 22:54:11 +01:00			`spaceFromCli string,`
fix: Support relative links with titleFromH1 2023-03-20 19:19:31 +01:00			`titleFromH1 bool,`
Add support for using the filename as the page title 2025-08-29 14:37:59 +02:00			`titleFromFilename bool,`
Define parent pages from CLI 2023-08-09 13:06:31 +02:00			`parents []string,`
feat: add flag to append hash to pages to ensure unique titles 2024-09-30 21:00:49 -04:00			`titleAppendGeneratedHash bool,`
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`) ([]LinkSubstitution, error) {`
			`matches := parseLinks(string(markdown))`

fix: resolve link space inheritance and enhance Confluence URL normalization tests Signed-off-by: Nikolai Emil Damm <ndam@tv2.dk> 2025-12-19 12:34:29 +01:00			`// If the user didn't provide --space, inherit the current document's space so`
			`// relative links can be resolved within the same space.`
			`spaceForLinks := spaceFromCli`
			`if spaceForLinks == "" && meta != nil {`
			`spaceForLinks = meta.Space`
			`}`

Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`links := []LinkSubstitution{}`
			`for _, match := range matches {`
			`log.Tracef(`
			`nil,`
			`"found a relative link: full=%s filename=%s hash=%s",`
			`match.full,`
			`match.filename,`
			`match.hash,`
			`)`
fix: resolve link space inheritance and enhance Confluence URL normalization tests Signed-off-by: Nikolai Emil Damm <ndam@tv2.dk> 2025-12-19 12:34:29 +01:00			`resolved, err := resolveLink(api, base, match, spaceForLinks, titleFromH1, titleFromFilename, parents, titleAppendGeneratedHash)`
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`if err != nil {`
			`return nil, karma.Format(err, "resolve link: %q", match.full)`
			`}`

			`if resolved == "" {`
			`continue`
			`}`

			`links = append(links, LinkSubstitution{`
			`From: match.full,`
			`To: resolved,`
			`})`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00			`}`

Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`return links, nil`
			`}`

			`func resolveLink(`
			`api *confluence.API,`
			`base string,`
			`link markdownLink,`
fix: resolve link space inheritance and enhance Confluence URL normalization tests Signed-off-by: Nikolai Emil Damm <ndam@tv2.dk> 2025-12-19 12:34:29 +01:00			`spaceForLinks string,`
fix: Support relative links with titleFromH1 2023-03-20 19:19:31 +01:00			`titleFromH1 bool,`
Add support for using the filename as the page title 2025-08-29 14:37:59 +02:00			`titleFromFilename bool,`
Define parent pages from CLI 2023-08-09 13:06:31 +02:00			`parents []string,`
feat: add flag to append hash to pages to ensure unique titles 2024-09-30 21:00:49 -04:00			`titleAppendGeneratedHash bool,`
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`) (string, error) {`
			`var result string`

			`if len(link.filename) > 0 {`
feat: moving to IsTextFile for file type detection 2023-03-22 19:10:19 -04:00			`filepath := filepath.Join(base, link.filename)`
fix #114 do not crash when resolving links on directories 2021-09-11 14:37:45 +03:00
feat: moving to IsTextFile for file type detection 2023-03-22 19:10:19 -04:00			`log.Tracef(nil, "filepath: %s", filepath)`
			`stat, err := os.Stat(filepath)`
fix #114 do not crash when resolving links on directories 2021-09-11 14:37:45 +03:00			`if err != nil {`
			`return "", nil`
			`}`

			`if stat.IsDir() {`
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`return "", nil`
			`}`

feat: moving to IsTextFile for file type detection 2023-03-22 19:10:19 -04:00			`linkContents, err := os.ReadFile(filepath)`

Use mimesniffing to detect text files 2025-12-08 21:32:28 +01:00			`contentType := http.DetectContentType(linkContents)`
			`// Check if the MIME type starts with "text/"`
			`if !strings.HasPrefix(contentType, "text/") {`
			`log.Debugf(nil, "Ignoring link to file %q: detected content type %v", filepath, contentType)`
feat: moving to IsTextFile for file type detection 2023-03-22 19:10:19 -04:00			`return "", nil`
			`}`

Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`if err != nil {`
feat: moving to IsTextFile for file type detection 2023-03-22 19:10:19 -04:00			`return "", karma.Format(err, "read file: %s", filepath)`
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`}`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00
replace \r\n with \n, fix #42 2021-11-08 20:15:59 +06:00			`linkContents = bytes.ReplaceAll(`
			`linkContents,`
			`[]byte("\r\n"),`
			`[]byte("\n"),`
			`)`

Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`// This helps to determine if found link points to file that's`
			`// not markdown or have mark required metadata`
fix: resolve link space inheritance and enhance Confluence URL normalization tests Signed-off-by: Nikolai Emil Damm <ndam@tv2.dk> 2025-12-19 12:34:29 +01:00			`linkMeta, _, err := metadata.ExtractMeta(linkContents, spaceForLinks, titleFromH1, titleFromFilename, filepath, parents, titleAppendGeneratedHash)`
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`if err != nil {`
			`log.Errorf(`
			`err,`
			`"unable to extract metadata from %q; ignoring the relative link",`
feat: moving to IsTextFile for file type detection 2023-03-22 19:10:19 -04:00			`filepath,`
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`)`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`return "", nil`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00			`}`
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00
			`if linkMeta == nil {`
			`return "", nil`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00			`}`

fix: Support relative links with titleFromH1 2023-03-20 19:19:31 +01:00			`log.Tracef(`
			`nil,`
			`"extracted metadata: space=%s title=%s",`
			`linkMeta.Space,`
			`linkMeta.Title,`
			`)`

Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`result, err = getConfluenceLink(api, linkMeta.Space, linkMeta.Title)`
			`if err != nil {`
			`return "", karma.Format(`
			`err,`
			`"find confluence page: %s / %s / %s",`
feat: moving to IsTextFile for file type detection 2023-03-22 19:10:19 -04:00			`filepath,`
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`linkMeta.Space,`
			`linkMeta.Title,`
			`)`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00			`}`

Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`if result == "" {`
			`return "", nil`
			`}`
			`}`

			`if len(link.hash) > 0 {`
			`result = result + "#" + link.hash`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00			`}`
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00
			`return result, nil`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00			`}`

Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`func SubstituteLinks(markdown []byte, links []LinkSubstitution) []byte {`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00			`for _, link := range links {`
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`if link.From == link.To {`
			`continue`
			`}`

			`log.Tracef(nil, "substitute link: %q -> %q", link.From, link.To)`

Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00			`markdown = bytes.ReplaceAll(`
			`markdown,`
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`[]byte(fmt.Sprintf("](%s)", link.From)),`
			`[]byte(fmt.Sprintf("](%s)", link.To)),`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00			`)`
			`}`
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00			`return markdown`
			`}`

Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`func parseLinks(markdown string) []markdownLink {`
Support inline images 2023-04-26 08:02:35 +02:00			`// Matches links but not inline images`
Fix Regex for links that contain square brackets in the text 2024-07-25 23:03:21 +02:00			re := regexp.MustCompile(`[^\!]\[.+\]\((([^\)#]+)?#?([^\)]+)?)\)`)
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`matches := re.FindAllStringSubmatch(markdown, -1)`

			`links := make([]markdownLink, len(matches))`
			`for i, match := range matches {`
			`links[i] = markdownLink{`
			`full: match[1],`
			`filename: match[2],`
			`hash: match[3],`
			`}`
			`}`

			`return links`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00			`}`

Simplify config handling * Switch to urfave/cli/v2 * Add more environment variables 2023-04-18 15:06:16 +02:00			`// getConfluenceLink build (to be) link for Confluence, and tries to verify from`
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`// API if there's real link available`
fix #114 do not crash when resolving links on directories 2021-09-11 14:37:45 +03:00			`func getConfluenceLink(`
			`api *confluence.API,`
			`space, title string,`
			`) (string, error) {`
Add Type metadata header to enable support for Confluence Blog Posts 2021-03-31 17:49:01 +01:00			`page, err := api.FindPage(space, title, "page")`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00			`if err != nil {`
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00			`return "", karma.Format(err, "api: find page")`
			`}`
fix: resolve link space inheritance and enhance Confluence URL normalization tests Signed-off-by: Nikolai Emil Damm <ndam@tv2.dk> 2025-12-19 12:34:29 +01:00			`if page == nil {`
			`// Without a page ID there is no stable way to produce`
			`// /wiki/spaces/<space>/pages/<id>/<name>.`
			`return "", nil`
			`}`
Fix replacing relative links, fix #43 2020-12-04 00:28:52 +03:00
fix: resolve link space inheritance and enhance Confluence URL normalization tests Signed-off-by: Nikolai Emil Damm <ndam@tv2.dk> 2025-12-19 12:34:29 +01:00			`// Confluence Cloud web UI URLs can be returned either as a path ("/wiki/..." or`
			`// "/ex/confluence/<cloudId>/wiki/...") or as a full absolute URL.`
			`absolute, err := makeAbsoluteConfluenceWebUIURL(api.BaseURL, page.Links.Full)`
			`if err != nil {`
			`return "", karma.Format(err, "build confluence webui URL")`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00			`}`

fix: resolve link space inheritance and enhance Confluence URL normalization tests Signed-off-by: Nikolai Emil Damm <ndam@tv2.dk> 2025-12-19 12:34:29 +01:00			`return absolute, nil`
			`}`

			`func makeAbsoluteConfluenceWebUIURL(baseURL string, webui string) (string, error) {`
			`if webui == "" {`
			`return "", nil`
			`}`

			`u, err := url.Parse(webui)`
Use relative links for objects within the same Confluence instance 2024-07-27 19:52:32 +02:00			`if err != nil {`
fix: resolve link space inheritance and enhance Confluence URL normalization tests Signed-off-by: Nikolai Emil Damm <ndam@tv2.dk> 2025-12-19 12:34:29 +01:00			`return "", err`
			`}`

			`path := normalizeConfluenceWebUIPath(u.Path)`
			`if path == "" {`
			`return "", nil`
			`}`

			`// If Confluence returns an absolute URL, trust its host/scheme.`
			`if u.Scheme != "" && u.Host != "" {`
			`baseURL = u.Scheme + "://" + u.Host`
			`}`

			`baseURL = strings.TrimSuffix(baseURL, "/")`
			`if !strings.HasPrefix(path, "/") {`
			`path = "/" + path`
Use relative links for objects within the same Confluence instance 2024-07-27 19:52:32 +02:00			`}`
fix: resolve link space inheritance and enhance Confluence URL normalization tests Signed-off-by: Nikolai Emil Damm <ndam@tv2.dk> 2025-12-19 12:34:29 +01:00
			`result := baseURL + path`
			`if u.RawQuery != "" {`
			`result += "?" + u.RawQuery`
			`}`
			`if u.Fragment != "" {`
			`result += "#" + u.Fragment`
			`}`

			`return result, nil`
Support for relative links (#33) * Support for relative links Fixes #25 * Error logging fixes * Better regexp 2020-11-30 09:47:46 +02:00			`}`
feat: add normalizeConfluenceWebUIPath function and tests for URL rewriting Signed-off-by: Nikolai Emil Damm <ndam@tv2.dk> 2025-12-18 15:55:39 +01:00
			`// normalizeConfluenceWebUIPath rewrites Confluence Cloud "experience" URLs`
			`// ("/ex/confluence/<cloudId>/wiki/..."), to canonical wiki paths ("/wiki/...").`
			`func normalizeConfluenceWebUIPath(path string) string {`
			`if path == "" {`
			`return path`
			`}`

			re := regexp.MustCompile(`^/ex/confluence/[^/]+(/wiki/.*)$`)
			`match := re.FindStringSubmatch(path)`
			`if len(match) == 2 {`
			`return match[1]`
			`}`

			`return path`
			`}`