mark/mark.go

package mark

import (
	"bytes"
	"crypto/sha1"
	"encoding/hex"
	"errors"
	"fmt"
	stdhtml "html"
	"io"
	"os"
	"path/filepath"
	"regexp"
	"slices"
	"strings"
	"time"
	"unicode/utf8"

	"github.com/bmatcuk/doublestar/v4"
	"github.com/kovetskiy/mark/v16/attachment"
	"github.com/kovetskiy/mark/v16/confluence"
	"github.com/kovetskiy/mark/v16/includes"
	"github.com/kovetskiy/mark/v16/macro"
	markmd "github.com/kovetskiy/mark/v16/markdown"
	"github.com/kovetskiy/mark/v16/metadata"
	"github.com/kovetskiy/mark/v16/page"
	"github.com/kovetskiy/mark/v16/stdlib"
	"github.com/kovetskiy/mark/v16/types"
	"github.com/kovetskiy/mark/v16/vfs"
	"github.com/rs/zerolog/log"
)

// markerRegex matches one <ac:inline-comment-marker> element in Confluence
// storage HTML. Capture group 1 is the value of the ac:ref attribute and
// capture group 2 is the marker body. The (?s) flag lets "." match newlines and
// the lazy ".*?" stops at the first closing tag, so the body may span lines and
// contain nested inline tags.
var markerRegex = regexp.MustCompile(`(?s)<ac:inline-comment-marker ac:ref="([^"]+)">(.*?)</ac:inline-comment-marker>`)

// Config holds all configuration options for running Mark.
//
// A Config is passed by value to Run and ProcessFile; the zero value of every
// field is accepted by the type itself, although ProcessFile rejects files for
// which neither PageID nor usable metadata (space and title) is available.
type Config struct {
	// Connection settings.
	//
	// BaseURL, Username, Password and InsecureSkipTLSVerify are handed to
	// confluence.NewAPI by Run. When PageID is non-empty, ProcessFile ignores
	// any metadata found in the file (logging a warning) and targets the page
	// with that ID instead.
	BaseURL               string
	Username              string
	Password              string
	PageID                string
	InsecureSkipTLSVerify bool

	// File selection.
	//
	// Files is the glob pattern that Run expands with
	// doublestar.FilepathGlob to find the files to process.
	Files string

	// Behaviour.
	//
	// CompileOnly and DryRun both make ProcessFile write the compiled HTML to
	// Output and return without publishing; DryRun additionally resolves the
	// target page first. ContinueOnError makes Run log a per-file failure and
	// move on to the next file (Run still returns an error at the end). CI
	// turns the "no files matched" condition in Run into a warning instead of
	// an error.
	CompileOnly     bool
	DryRun          bool
	ContinueOnError bool
	CI              bool

	// Page content.
	//
	// These values are passed to metadata.ExtractMeta when the file header is
	// parsed; all except ContentAppearance are also passed to
	// page.ResolveRelativeLinks.
	Space                    string
	Parents                  []string
	TitleFromH1              bool
	TitleFromFilename        bool
	TitleAppendGeneratedHash bool
	ContentAppearance        string

	// Page updates.
	//
	// MinorEdit and VersionMessage are passed to the page update call.
	// EditLock makes ProcessFile restrict page updates to Username after
	// publishing. ChangesOnly appends a "[v<sha1>]" content hash to the
	// version message and skips the update when the hash stored in the
	// current version message is identical. PreserveComments makes
	// ProcessFile re-embed existing inline comment markers into the new body
	// before an existing page is updated.
	MinorEdit        bool
	VersionMessage   string
	EditLock         bool
	ChangesOnly      bool
	PreserveComments bool

	// Rendering.
	//
	// DropH1, StripLinebreaks, MermaidScale, D2Scale and Features are copied
	// into the types.MarkConfig used to compile the markdown. ImageAlign must
	// be empty or one of left, center, right (case-insensitive, surrounding
	// whitespace ignored); a non-empty image-align value in the file metadata
	// takes precedence over it. IncludePath is passed to include and macro
	// processing.
	DropH1          bool
	StripLinebreaks bool
	MermaidScale    float64
	D2Scale         float64
	Features        []string
	ImageAlign      string
	IncludePath     string

	// Output is the writer used for result output (e.g. published page URLs,
	// compiled HTML). If nil, output is discarded; the CLI sets this to
	// os.Stdout.
	Output io.Writer
}

// output picks the destination for result output: the caller-supplied
// Config.Output when one is set, and io.Discard otherwise, so that nothing is
// written to stdout unless a writer was configured explicitly.
func (c Config) output() io.Writer {
	w := c.Output
	if w == nil {
		w = io.Discard
	}
	return w
}

// Run processes all files matching Config.Files and publishes them to Confluence.
//
// Config.Files is expanded with doublestar.FilepathGlob. When nothing matches,
// Run returns an error unless Config.CI is set, in which case it only logs a
// warning. Each matched file is handed to ProcessFile; for every page that was
// published, its full URL is written as one line to the configured output.
//
// With Config.ContinueOnError a failing file is logged and skipped, and Run
// returns a generic error after all files have been attempted. Without it, the
// first ProcessFile error is returned immediately.
func Run(config Config) error {
	api := confluence.NewAPI(config.BaseURL, config.Username, config.Password, config.InsecureSkipTLSVerify)

	files, err := doublestar.FilepathGlob(config.Files)
	if err != nil {
		return fmt.Errorf("unable to expand file pattern %q: %w", config.Files, err)
	}

	if len(files) == 0 {
		const msg = "no files matched"
		if !config.CI {
			return errors.New(msg)
		}
		log.Warn().Msg(msg)
	}

	var hasErrors bool
	for _, file := range files {
		log.Info().Msgf("processing %s", file)

		target, err := ProcessFile(file, api, config)
		if err != nil {
			if config.ContinueOnError {
				log.Error().Err(err).Msgf("processing %s", file)
				hasErrors = true
				continue
			}
			return err
		}

		// ProcessFile returns a nil page in compile-only / dry-run mode;
		// there is no URL to report in that case.
		if target == nil {
			continue
		}

		url := api.BaseURL + target.Links.Full
		log.Info().Msgf("page successfully updated: %s", url)
		if _, err := fmt.Fprintln(config.output(), url); err != nil {
			return fmt.Errorf("unable to write output: %w", err)
		}
	}

	if hasErrors {
		return errors.New("one or more files failed to process")
	}

	return nil
}

// versionHashRegex extracts the 40-hex-digit content hash that ProcessFile
// appends to the version message as a trailing "[v<sha1>]" when
// Config.ChangesOnly is set. It is compiled once at package scope instead of
// on every ProcessFile call.
var versionHashRegex = regexp.MustCompile(`\[v([a-f0-9]{40})]$`)

// ProcessFile processes a single markdown file and publishes it to Confluence.
//
// The file is read, its metadata header is extracted, includes and macros are
// expanded and relative links are substituted. In compile-only or dry-run mode
// the compiled HTML is written to the configured output and (nil, nil) is
// returned; dry-run additionally resolves the target page first. Otherwise the
// target page is resolved (or created), attachments are uploaded, the body is
// updated, labels are synchronised with the metadata and, when Config.EditLock
// is set, page updates are restricted to Config.Username.
//
// Rendering options (including image alignment) are validated before the
// publish path resolves or creates the target page, so an invalid image-align
// value fails without creating a page or uploading attachments.
//
// It returns the published page, or a nil page together with a nil error in
// compile-only / dry-run mode.
func ProcessFile(file string, api *confluence.API, config Config) (*confluence.PageInfo, error) {
	markdown, err := os.ReadFile(file)
	if err != nil {
		return nil, fmt.Errorf("unable to read file %q: %w", file, err)
	}

	// Normalise CRLF line endings so all later stages only see "\n".
	markdown = bytes.ReplaceAll(markdown, []byte("\r\n"), []byte("\n"))

	dir := filepath.Dir(file)

	meta, markdown, err := metadata.ExtractMeta(
		markdown,
		config.Space,
		config.TitleFromH1,
		config.TitleFromFilename,
		file,
		config.Parents,
		config.TitleAppendGeneratedHash,
		config.ContentAppearance,
	)
	if err != nil {
		return nil, fmt.Errorf("unable to extract metadata from file %q: %w", file, err)
	}

	// An explicit page ID wins over any metadata found in the file.
	if config.PageID != "" && meta != nil {
		log.Warn().Msg(
			`specified file contains metadata, ` +
				`but it will be ignored due specified command line URL`,
		)
		meta = nil
	}

	if config.PageID == "" && meta == nil {
		return nil, errors.New(
			"specified file doesn't contain metadata and URL is not specified " +
				"via command line or doesn't contain pageId GET-parameter",
		)
	}

	if meta != nil {
		if meta.Space == "" {
			return nil, errors.New(
				"space is not set ('Space' header is not set and '--space' option is not set)",
			)
		}
		if meta.Title == "" {
			return nil, errors.New(
				"page title is not set: use the 'Title' header, " +
					"or the --title-from-h1 / --title-from-filename flags",
			)
		}
	}

	std, err := stdlib.New(api)
	if err != nil {
		return nil, fmt.Errorf("unable to retrieve standard library: %w", err)
	}

	templates := std.Templates

	// Includes may themselves contain includes; keep processing until
	// ProcessIncludes reports that no further pass is required.
	var recurse bool
	for {
		templates, markdown, recurse, err = includes.ProcessIncludes(
			dir,
			config.IncludePath,
			markdown,
			templates,
		)
		if err != nil {
			return nil, fmt.Errorf("unable to process includes: %w", err)
		}
		if !recurse {
			break
		}
	}

	macros, markdown, err := macro.ExtractMacros(
		dir,
		config.IncludePath,
		markdown,
		templates,
	)
	if err != nil {
		return nil, fmt.Errorf("unable to extract macros: %w", err)
	}

	for _, m := range macros {
		markdown, err = m.Apply(markdown)
		if err != nil {
			return nil, fmt.Errorf("unable to apply macro: %w", err)
		}
	}

	links, err := page.ResolveRelativeLinks(
		api,
		meta,
		markdown,
		dir,
		config.Space,
		config.TitleFromH1,
		config.TitleFromFilename,
		config.Parents,
		config.TitleAppendGeneratedHash,
	)
	if err != nil {
		return nil, fmt.Errorf("unable to resolve relative links: %w", err)
	}

	markdown = page.SubstituteLinks(markdown, links)

	if config.DryRun {
		if meta != nil {
			if _, _, err := page.ResolvePage(true, api, meta); err != nil {
				return nil, fmt.Errorf("unable to resolve page location: %w", err)
			}
		} else {
			// meta == nil implies config.PageID != "" (checked above).
			pg, err := api.GetPageByID(config.PageID)
			if err != nil {
				return nil, fmt.Errorf("unable to resolve page by ID: %w", err)
			}
			if pg == nil {
				return nil, fmt.Errorf("page with id %q not found", config.PageID)
			}
		}
	}

	if config.DropH1 {
		log.Info().Msg("the leading H1 heading will be excluded from the Confluence output")
	}

	// Validate rendering options once, before the publish path creates a
	// page or uploads attachments.
	imageAlign, err := getImageAlign(config.ImageAlign, meta)
	if err != nil {
		return nil, fmt.Errorf("unable to determine image-align: %w", err)
	}

	cfg := types.MarkConfig{
		MermaidScale:  config.MermaidScale,
		D2Scale:       config.D2Scale,
		DropFirstH1:   config.DropH1,
		StripNewlines: config.StripLinebreaks,
		Features:      config.Features,
		ImageAlign:    imageAlign,
	}

	if config.CompileOnly || config.DryRun {
		html, _, err := markmd.CompileMarkdown(markdown, std, file, cfg)
		if err != nil {
			return nil, fmt.Errorf("unable to compile markdown: %w", err)
		}
		if _, err := fmt.Fprintln(config.output(), html); err != nil {
			return nil, err
		}
		return nil, nil
	}

	var target *confluence.PageInfo
	var pageCreated bool

	if meta != nil {
		parent, pg, err := page.ResolvePage(false, api, meta)
		if err != nil {
			return nil, fmt.Errorf("error resolving page %q: %w", meta.Title, err)
		}

		if pg == nil {
			pg, err = api.CreatePage(meta.Space, meta.Type, parent, meta.Title, ``)
			if err != nil {
				return nil, fmt.Errorf("can't create %s %q: %w", meta.Type, meta.Title, err)
			}
			// A delay between the create and update call helps mitigate a 409
			// conflict that can occur when attempting to update a page just
			// after it was created. See issues/139.
			time.Sleep(1 * time.Second)
			pageCreated = true
		}

		target = pg
	} else {
		pg, err := api.GetPageByID(config.PageID)
		if err != nil {
			return nil, fmt.Errorf("unable to retrieve page by id: %w", err)
		}
		if pg == nil {
			return nil, fmt.Errorf("page with id %q not found", config.PageID)
		}
		target = pg
	}

	// Collect attachments declared via <!-- Attachment: --> directives.
	var declaredAttachments []string
	if meta != nil {
		declaredAttachments = meta.Attachments
	}

	localAttachments, err := attachment.ResolveLocalAttachments(
		vfs.LocalOS,
		dir,
		declaredAttachments,
	)
	if err != nil {
		return nil, fmt.Errorf("unable to locate attachments: %w", err)
	}

	attaches, err := attachment.ResolveAttachments(api, target, localAttachments)
	if err != nil {
		return nil, fmt.Errorf("unable to create/update attachments: %w", err)
	}

	markdown = attachment.CompileAttachmentLinks(markdown, attaches)

	html, inlineAttachments, err := markmd.CompileMarkdown(markdown, std, file, cfg)
	if err != nil {
		return nil, fmt.Errorf("unable to compile markdown: %w", err)
	}

	if _, err = attachment.ResolveAttachments(api, target, inlineAttachments); err != nil {
		return nil, fmt.Errorf("unable to create/update attachments: %w", err)
	}

	// Metadata-derived page properties; all stay empty when the page is
	// addressed by ID and there is no metadata.
	var layout, sidebar string
	var labels []string
	var contentAppearance, emoji string

	if meta != nil {
		layout = meta.Layout
		sidebar = meta.Sidebar
		labels = meta.Labels
		contentAppearance = meta.ContentAppearance
		emoji = meta.Emoji
	}

	{
		var buffer bytes.Buffer
		err := std.Templates.ExecuteTemplate(
			&buffer,
			"ac:layout",
			struct {
				Layout  string
				Sidebar string
				Body    string
			}{
				Layout:  layout,
				Sidebar: sidebar,
				Body:    html,
			},
		)
		if err != nil {
			return nil, fmt.Errorf("unable to execute layout template: %w", err)
		}
		html = buffer.String()
	}

	finalVersionMessage := config.VersionMessage
	shouldUpdatePage := true

	if config.ChangesOnly {
		// The hash is taken before inline comment markers are merged in, so
		// it reflects only the content generated from the markdown.
		contentHash := sha1Hash(html)
		log.Debug().Msgf("content hash: %s", contentHash)

		if matches := versionHashRegex.FindStringSubmatch(target.Version.Message); len(matches) > 1 {
			log.Debug().Msgf("previous content hash: %s", matches[1])
			if matches[1] == contentHash {
				log.Info().Msgf("page %q is already up to date", target.Title)
				shouldUpdatePage = false
			}
		}

		finalVersionMessage = fmt.Sprintf("%s [v%s]", config.VersionMessage, contentHash)
	}

	// Only fetch the old body and inline comments when we know the page will
	// actually be updated. This avoids unnecessary API round-trips for no-op
	// runs (e.g. when --changes-only determines the content is unchanged).
	if shouldUpdatePage && config.PreserveComments && !pageCreated {
		pg, err := api.GetPageByIDExpanded(target.ID, "ancestors,version,body.storage")
		if err != nil {
			return nil, fmt.Errorf("unable to retrieve page body for comments: %w", err)
		}
		if pg == nil {
			return nil, fmt.Errorf("page with id %q not found", target.ID)
		}
		target = pg

		comments, err := api.GetInlineComments(target.ID)
		if err != nil {
			return nil, fmt.Errorf("unable to retrieve inline comments: %w", err)
		}

		html, err = mergeComments(html, target.Body.Storage.Value, comments)
		if err != nil {
			return nil, fmt.Errorf("unable to merge inline comments: %w", err)
		}
	}

	if shouldUpdatePage {
		err = api.UpdatePage(
			target,
			html,
			config.MinorEdit,
			finalVersionMessage,
			contentAppearance,
			emoji,
		)
		if err != nil {
			return nil, fmt.Errorf("unable to update page: %w", err)
		}
	}

	// Labels are only managed when the page is described by metadata.
	if meta != nil {
		if err := updateLabels(api, target, labels); err != nil {
			return nil, err
		}
	}

	if config.EditLock {
		log.Info().Msgf(
			`edit locked on page %q by user %q to prevent manual edits`,
			target.Title,
			config.Username,
		)
		if err := api.RestrictPageUpdates(target, config.Username); err != nil {
			return nil, fmt.Errorf("unable to restrict page updates: %w", err)
		}
	}

	return target, nil
}

// updateLabels synchronises the "global" labels of target with metaLabels:
// labels present in metaLabels but not on the page are added in one call, and
// labels on the page that are absent from metaLabels are deleted one by one.
// The current, desired, to-delete and to-add label sets are logged at debug
// level. The first API error aborts the synchronisation and is returned.
func updateLabels(api *confluence.API, target *confluence.PageInfo, metaLabels []string) error {
	current, err := api.GetPageLabels(target, "global")
	if err != nil {
		return err
	}

	// debugDump emits a heading line followed by the label set itself.
	debugDump := func(heading string, labels any) {
		log.Debug().Msg(heading)
		log.Debug().Interface("labels", labels).Send()
	}

	debugDump("Page Labels:", current.Labels)
	debugDump("Meta Labels:", metaLabels)

	stale := determineLabelsToRemove(current, metaLabels)
	debugDump("Del Labels:", stale)

	missing := determineLabelsToAdd(metaLabels, current)
	debugDump("Add Labels:", missing)

	if len(missing) != 0 {
		_, err := api.AddPageLabels(target, missing)
		if err != nil {
			return fmt.Errorf("error adding labels: %w", err)
		}
	}

	for i := range stale {
		_, err := api.DeletePageLabel(target, stale[i])
		if err != nil {
			return fmt.Errorf("error deleting label %q: %w", stale[i], err)
		}
	}

	return nil
}

// determineLabelsToRemove returns the names of the labels in labelInfo that
// have no case-insensitive match in metaLabels, in the order they appear in
// labelInfo. The result is nil when nothing has to be removed.
func determineLabelsToRemove(labelInfo *confluence.LabelInfo, metaLabels []string) []string {
	var stale []string
	for _, existing := range labelInfo.Labels {
		wanted := false
		for _, metaLabel := range metaLabels {
			if strings.EqualFold(metaLabel, existing.Name) {
				wanted = true
				break
			}
		}
		if wanted {
			continue
		}
		stale = append(stale, existing.Name)
	}
	return stale
}

// determineLabelsToAdd returns the entries of metaLabels that have no
// case-insensitive match among the labels in labelInfo, in the order they
// appear in metaLabels. The result is nil when nothing has to be added.
func determineLabelsToAdd(metaLabels []string, labelInfo *confluence.LabelInfo) []string {
	var missing []string
	for _, metaLabel := range metaLabels {
		present := false
		for _, existing := range labelInfo.Labels {
			if strings.EqualFold(existing.Name, metaLabel) {
				present = true
				break
			}
		}
		if present {
			continue
		}
		missing = append(missing, metaLabel)
	}
	return missing
}

// getImageAlign determines the effective image alignment. A non-empty
// ImageAlign in meta overrides the align argument. An empty result means no
// alignment was requested. Otherwise the value is trimmed, lower-cased and
// must be one of "left", "center" or "right"; anything else is an error.
func getImageAlign(align string, meta *metadata.Meta) (string, error) {
	if meta != nil && meta.ImageAlign != "" {
		align = meta.ImageAlign
	}

	if align == "" {
		return "", nil
	}

	normalized := strings.ToLower(strings.TrimSpace(align))
	switch normalized {
	case "left", "center", "right":
		return normalized, nil
	default:
		return "", fmt.Errorf(
			`unknown image-align %q, expected one of: left, center, right`,
			normalized,
		)
	}
}

// sha1Hash returns the SHA-1 digest of input as a lower-case hexadecimal
// string (40 characters).
func sha1Hash(input string) string {
	digest := sha1.Sum([]byte(input))
	return hex.EncodeToString(digest[:])
}

// htmlTextReplacer performs the text-node escaping used by htmlEscapeText:
// "&", "<" and ">" are replaced by their entity forms; every other character,
// including both kinds of quote, is left alone.
var htmlTextReplacer = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
)

// htmlEscapeText escapes only the characters that Confluence storage HTML
// always encodes in text nodes (&, <, >). Unlike html.EscapeString it leaves
// single and double quotes untouched: those are frequently stored unescaped in
// text nodes by the Confluence editor and by mark's own renderer, so escaping
// them would stop the selection search from finding a valid match.
func htmlEscapeText(s string) string {
	escaped := htmlTextReplacer.Replace(s)
	return escaped
}

// truncateSelection returns a truncated preview of s for use in log messages,
// capped at maxRunes runes, with an ellipsis appended when trimmed.
//
// s is returned unchanged when it holds at most maxRunes runes. A negative
// maxRunes is treated as zero instead of panicking. Only the retained prefix
// is converted, so the cost is bounded by maxRunes rather than by len(s).
func truncateSelection(s string, maxRunes int) string {
	if maxRunes < 0 {
		maxRunes = 0
	}

	seen := 0
	for i := range s {
		if seen == maxRunes {
			// s has more than maxRunes runes; i is the byte offset of the
			// first rune that does not fit. The []rune round-trip replaces
			// invalid bytes in the prefix with U+FFFD, as converting the
			// whole string would.
			return string([]rune(s[:i])) + "…"
		}
		seen++
	}
	return s
}

// contextBefore extracts the text immediately preceding byte offset byteEnd in
// s, limited to maxBytes bytes. If the window would begin inside a multi-byte
// UTF-8 sequence, leading continuation bytes are dropped so that the result
// starts on a rune boundary.
func contextBefore(s string, byteEnd, maxBytes int) string {
	from := byteEnd - maxBytes
	if from < 0 {
		from = 0
	}

	window := s[from:byteEnd]
	for len(window) > 0 && !utf8.RuneStart(window[0]) {
		window = window[1:]
	}
	return window
}

// contextAfter extracts the text that follows byte offset byteStart in s,
// limited to maxBytes bytes. If the window would end inside a multi-byte UTF-8
// sequence, it is shortened to the preceding rune boundary so that the result
// never contains a partial rune.
func contextAfter(s string, byteStart, maxBytes int) string {
	tail := s[byteStart:]
	if maxBytes >= len(tail) {
		return tail
	}

	cut := maxBytes
	for cut > 0 && !utf8.RuneStart(tail[cut]) {
		cut--
	}
	return tail[:cut]
}

// levenshteinDistance returns the edit distance between s1 and s2: the minimum
// number of single-rune insertions, deletions and substitutions needed to turn
// one string into the other. Both strings are compared rune by rune, not byte
// by byte.
func levenshteinDistance(s1, s2 string) int {
	long, short := []rune(s1), []rune(s2)
	if len(long) < len(short) {
		long, short = short, long
	}
	if len(short) == 0 {
		return len(long)
	}

	// A single row, sized by the shorter string, is updated in place. Before
	// the inner loop touches row[j+1] it still holds the previous row's
	// value; diag carries the previous row's value one column to the left.
	row := make([]int, len(short)+1)
	for j := range row {
		row[j] = j
	}

	for i, a := range long {
		diag := row[0]
		row[0] = i + 1
		for j, b := range short {
			above := row[j+1]
			substitution := diag
			if a != b {
				substitution++
			}
			row[j+1] = min(
				above+1,      // deletion
				row[j]+1,     // insertion
				substitution, // substitution
			)
			diag = above
		}
	}
	return row[len(short)]
}

// commentContext holds the text surrounding one inline-comment marker in the
// old page body. mergeComments compares it against the text around each
// candidate position in the new body to decide where the marker belongs.
type commentContext struct {
	// before is the text immediately preceding the marker's opening tag.
	before string
	// after is the text immediately following the marker's closing tag.
	after string
}

// maxCandidates bounds how many occurrences of a single comment's selection
// are considered in the new body. At most maxCandidates occurrences are
// evaluated with Levenshtein distance; further occurrences are ignored to
// bound CPU cost on pages where a selection is short or very common.
const maxCandidates = 100

// contextWindowBytes is the number of bytes of surrounding text captured as
// context around each inline-comment marker. It is used both when extracting
// context from oldBody and when scoring candidates in newBody.
const contextWindowBytes = 100

// mergeComments re-embeds inline comment markers from the Confluence API into
// newBody (the updated storage HTML about to be uploaded). It extracts context
// from each existing marker in oldBody and uses Levenshtein distance to
// relocate each marker to the best-matching position in newBody, so comment
// threads survive page edits even when the surrounding text has shifted.
//
// Only comments whose location is "inline" are considered, and each marker ref
// is handled once even if several results share it. A comment is dropped (with
// a warning) when its original selection is empty, when the selection cannot
// be found in newBody, or when its chosen position overlaps a marker that has
// already been placed.
//
// It returns newBody with the markers inserted. When comments is nil, newBody
// is returned unchanged. The returned error is always nil in the current
// implementation.
func mergeComments(newBody string, oldBody string, comments *confluence.InlineComments) (string, error) {
	if comments == nil {
		return newBody, nil
	}
	// 1. Extract context for each comment from oldBody
	contexts := make(map[string]commentContext)
	matches := markerRegex.FindAllStringSubmatchIndex(oldBody, -1)
	for _, match := range matches {
		// match[2]:match[3] is capture group 1 (the ac:ref value);
		// match[0] and match[1] delimit the whole marker element.
		ref := oldBody[match[2]:match[3]]
		// context around the tag
		before := contextBefore(oldBody, match[0], contextWindowBytes)
		after := contextAfter(oldBody, match[1], contextWindowBytes)
		contexts[ref] = commentContext{
			before: before,
			after:  after,
		}
	}

	// replacement records where in newBody (byte offsets [start, end)) the
	// marker for ref is to be wrapped around the matched text.
	type replacement struct {
		start     int
		end       int
		ref       string
		selection string
	}
	var replacements []replacement
	seenRefs := make(map[string]bool)

	// 2. Find the best position in newBody for each inline comment.
	for _, comment := range comments.Results {
		if comment.Extensions.Location != "inline" {
			log.Debug().
				Str("location", comment.Extensions.Location).
				Str("ref", comment.Extensions.InlineProperties.MarkerRef).
				Msg("comment ignored during inline marker merge: not an inline comment")
			continue
		}

		ref := comment.Extensions.InlineProperties.MarkerRef
		selection := comment.Extensions.InlineProperties.OriginalSelection

		if seenRefs[ref] {
			// Multiple results share the same MarkerRef (e.g. threaded replies).
			// The marker only needs to be inserted once; skip duplicates.
			continue
		}
		// Mark ref as seen immediately so subsequent results for the same ref
		// (threaded replies) are always deduplicated, even if this one is dropped.
		seenRefs[ref] = true

		// An empty selection can never be located: strings.Index would match
		// it at every position.
		if selection == "" {
			log.Warn().
				Str("ref", ref).
				Msg("inline comment skipped: original selection is empty; comment will be lost")
			continue
		}

		ctx, hasCtx := contexts[ref]

		// Build the list of forms to search for in newBody. The escaped form
		// is tried first (normal XML text nodes). The raw form is appended as a
		// fallback for text inside CDATA-backed macro bodies (e.g. ac:code),
		// where < and > are stored unescaped inside <![CDATA[...]]>.
		escapedSelection := htmlEscapeText(selection)
		searchForms := []string{escapedSelection}
		if selection != escapedSelection {
			searchForms = append(searchForms, selection)
		}

		// bestStart/bestEnd stay at -1 until a usable occurrence is found;
		// minDistance starts at a large sentinel so the first scored
		// candidate always wins.
		var bestStart = -1
		var bestEnd = -1
		var minDistance = 1000000

		// Iterate over search forms; stop as soon as we have a definitive best.
		// The candidate counter is shared across all search forms.
		candidates := 0
		stopSearch := false
		for _, form := range searchForms {
			if stopSearch {
				break
			}
			currentPos := 0
			for {
				index := strings.Index(newBody[currentPos:], form)
				if index == -1 {
					break
				}
				start := currentPos + index
				end := start + len(form)

				// Skip candidates that start or end in the middle of a multi-byte
				// UTF-8 rune; such a match would produce invalid UTF-8 output.
				if !utf8.RuneStart(newBody[start]) || (end < len(newBody) && !utf8.RuneStart(newBody[end])) {
					currentPos = start + 1
					continue
				}

				candidates++
				if candidates > maxCandidates {
					stopSearch = true
					break
				}

				if !hasCtx {
					// No context available; use the first occurrence.
					bestStart = start
					bestEnd = end
					stopSearch = true
					break
				}

				newBefore := contextBefore(newBody, start, contextWindowBytes)
				newAfter := contextAfter(newBody, end, contextWindowBytes)

				// Fast path: exact context match is the best possible result.
				if newBefore == ctx.before && newAfter == ctx.after {
					bestStart = start
					bestEnd = end
					stopSearch = true
					break
				}

				// Lower-bound pruning: Levenshtein distance is at least the
				// absolute difference in rune counts. Use rune counts (not byte
				// lengths) to match the unit levenshteinDistance operates on,
				// avoiding false skips for multibyte UTF-8 content.
				lbBefore := utf8.RuneCountInString(ctx.before) - utf8.RuneCountInString(newBefore)
				if lbBefore < 0 {
					lbBefore = -lbBefore
				}
				lbAfter := utf8.RuneCountInString(ctx.after) - utf8.RuneCountInString(newAfter)
				if lbAfter < 0 {
					lbAfter = -lbAfter
				}
				if lbBefore+lbAfter >= minDistance {
					currentPos = start + 1
					continue
				}

				distance := levenshteinDistance(ctx.before, newBefore) + levenshteinDistance(ctx.after, newAfter)

				// Strict "<" keeps the earliest candidate when distances tie.
				if distance < minDistance {
					minDistance = distance
					bestStart = start
					bestEnd = end
				}

				// Advance by one byte (not by len(form)) so overlapping
				// occurrences of the selection are also considered.
				currentPos = start + 1
			}
		}

		if bestStart != -1 {
			replacements = append(replacements, replacement{
				start:     bestStart,
				end:       bestEnd,
				ref:       ref,
				selection: selection,
			})
		} else {
			log.Warn().
				Str("ref", ref).
				Str("selection_preview", truncateSelection(selection, 50)).
				Msg("inline comment dropped: selected text not found in new body; comment will be lost")
		}
	}

	// Sort replacements from back to front to avoid offset issues.
	// Use a stable sort with ref as a tie-breaker so the ordering is
	// deterministic when two markers resolve to the same start offset.
	slices.SortStableFunc(replacements, func(a, b replacement) int {
		if a.start != b.start {
			return b.start - a.start
		}
		if a.ref < b.ref {
			return -1
		}
		if a.ref > b.ref {
			return 1
		}
		return 0
	})

	// Apply replacements back-to-front. Track the minimum start of any
	// applied replacement so that overlapping candidates (whose end exceeds
	// that boundary) are dropped rather than producing nested or malformed
	// <ac:inline-comment-marker> tags.
	minAppliedStart := len(newBody)
	for _, r := range replacements {
		if r.end > minAppliedStart {
			// This replacement overlaps with an already-applied one.
			// Drop it and warn so the user knows the comment was skipped.
			log.Warn().
				Str("ref", r.ref).
				Str("selection_preview", truncateSelection(r.selection, 50)).
				Int("start", r.start).
				Int("end", r.end).
				Int("conflicting_start", minAppliedStart).
				Msg("inline comment marker dropped: selection overlaps an already-placed marker")
			continue
		}
		minAppliedStart = r.start
		// Wrap the text as it appears in newBody (escaped or raw form), not
		// the API's original selection string.
		selection := newBody[r.start:r.end]
		withComment := fmt.Sprintf(
			`<ac:inline-comment-marker ac:ref="%s">%s</ac:inline-comment-marker>`,
			stdhtml.EscapeString(r.ref),
			selection,
		)
		newBody = newBody[:r.start] + withComment + newBody[r.end:]
	}

	return newBody, nil
}
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+								package mark
 								import (
 									"bytes"
 									"crypto/sha1"
 									"encoding/hex"
-												refactor: modernize Go primitives

- Replace interface{} with any (Go 1.18) across confluence/api.go,
  macro/macro.go, util/cli.go, util/error_handler.go, includes/templates.go
- Replace sort.SliceStable with slices.SortStableFunc + cmp.Compare (Go 1.21)
  in attachment/attachment.go, consistent with existing slices usage
- Replace fmt.Errorf("%s", msg) with errors.New(msg) in mark.go

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-04-08 01:34:06 +02:00
+									"errors"
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+									"fmt"
-												Feature/robust comment preservation (#768)

This is based on guoweis-work PR https://github.com/kovetskiy/mark/pull/145

* feat(confluence): add support for fetching page body and inline comments

* feat(cmd): add --preserve-comments flag to preserve inline comments

* feat(mark): implement context-aware inline comment preservation

* test(mark): add tests for context-aware MergeComments logic

* fix: remove empty else branch in MergeComments to fix SA9003

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* perf: compile markerRegex once as package-level variable

Avoids recompiling the inline comment marker regex on every call to
MergeComments, which matters for pages with many comment markers.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: guard against nil comments pointer in MergeComments

Prevents a panic when GetInlineComments returns nil (e.g. on pages
where the inline comments feature is not enabled).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* test: add edge-case tests for MergeComments; fix overlapping replacement

Four new test cases:
- SelectionMissing: comment dropped gracefully when text is gone from new body
- OverlappingSelections: overlapping comments no longer corrupt the body;
  the later match (by position) wins and the earlier overlapping one is dropped
- NilComments: nil pointer returns new body unchanged
- HTMLEntities: &lt;, &gt;, &#39; selections match correctly

Also fixes the overlapping replacement bug: apply back-to-front and skip any
replacement whose end exceeds the start of an already-applied one.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: escape ref attribute value in inline comment marker XML

Use html.EscapeString on r.ref before interpolating it into the
ac:ref attribute to prevent malformed XML if the value ever contains
quotes or other special characters.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: use first occurrence when no context is available in MergeComments

Without context the old code left distance=0 for every match and
updated bestStart on each iteration, so the final result depended on
whichever occurrence was visited last (non-deterministic with respect
to the search order).

Restructure the loop to break immediately on the first match when
hasCtx is false, making the behaviour explicit and deterministic.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: log warning when overlapping inline comment marker is dropped

Previously the overlap was silently skipped. Now a zerolog Warn message
is emitted with the ref, the conflicting byte offsets, and the ref of
the already-placed marker, so users can see which comment was lost
rather than silently getting incomplete output.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: warn when inline comments are silently dropped in MergeComments

Three cases now emit a zerolog Warn instead of silently discarding:

1. Comment location != "inline": logs ref and actual location.
2. Selected text not found in new body: logs ref and selection text.
3. Overlapping replacement (existing): adds selection text to the
   already-present overlap warning for easier diagnosis.

Also adds a selection field to the replacement struct so the overlap
warning can report the dropped text.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: update markerRegex to match markers with nested tags

Replace ([^<]*) with (?s)(.*?) so the pattern:
- Matches marker content that contains nested inline tags (e.g. <strong>)
- Matches across newlines ((?s) / DOTALL mode)

The old character class [^<]* stopped at the first < inside the
marker body, causing the context-extraction step to miss any comment
whose original selection spanned formatted text.

Add TestMergeComments_NestedTags to cover this path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: guard against empty OriginalSelection in MergeComments

strings.Index(s, "") always returns 0, so an empty escapedSelection
would spin the search loop indefinitely (or panic when currentPos
advances past len(newBody)).

Skip comments with an empty selection early, emit a Warn log, and
add TestMergeComments_EmptySelection to cover the path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: paginate GetInlineComments to avoid silently truncating results

The Confluence child/comment endpoint is paginated. The previous
single-request implementation silently dropped any comments beyond
the server's default page size.

Changes:
- Add Links (context, next) to InlineComments struct so the _links
  field from each page response is decoded.
- Rewrite GetInlineComments to loop with limit/start parameters
  (pageSize=100), accumulating all results, following the same pattern
  used by GetAttachments and label fetching.
- Add TestMergeComments_DuplicateMarkerRef to cover the deduplication
  guard added in the previous commit.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix UTF-8 safety, API compat, log verbosity

- levenshteinDistance: convert to []rune before empty-string checks so
  rune counts (not byte counts) are returned for strings with multi-byte
  characters

- Add contextBefore/contextAfter helpers that use utf8.RuneStart to
  avoid slicing in the middle of a multi-byte UTF-8 sequence when
  extracting 100-byte context windows from oldBody and newBody

- Add truncateSelection helper (50 runes + ellipsis) and apply it in all
  Warn log messages that include the selected text, preventing large or
  sensitive page content from appearing in logs

- Downgrade non-inline comment log from Warn to Debug with message
  'comment ignored during inline marker merge: not an inline comment';
  page-level comments are not inline markers and are not 'lost'

- Restore original one-argument GetPageByID (expand='ancestors,version')
  and add GetPageByIDExpanded for the one caller that needs a custom
  expand value, preserving backward compatibility for API consumers

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Address new PR review comments

- Remove custom min() function: shadows the Go 1.21+ built-in min for
  the entire package; the built-in handles the 3-arg call in
  levenshteinDistance identically

- Validate rune boundaries on strings.Index candidates: skip any match
  where start or end falls in the middle of a multi-byte UTF-8 rune
  to prevent corrupt UTF-8 output

- Defer preserve-comments API calls until after shouldUpdatePage is
  determined: avoids unnecessary GetPageByIDExpanded + GetInlineComments
  round-trips on no-op --changes-only runs

- Capitalize Usage string for --preserve-comments flag (util/flags.go)
  and matching README.md entry to match sentence case of surrounding flags

- Run gofmt on util/cli.go to fix struct literal field alignment

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* docs: document --preserve-comments feature in README

Add a dedicated 'Preserving Inline Comments' section under Tricks with:
- Usage examples (CLI flag and env var)
- Step-by-step explanation of the Levenshtein-based relocation algorithm
- Limitations (deleted text, overlapping selections, new pages,
  changes-only interaction)

Also add a cross-reference NOTE near the --preserve-comments flag entry
in the Usage section.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* docs: fix markdownlint errors in README

- Change unordered list markers from dashes to asterisks (MD004)
- Remove extra blank line before Issues section (MD012)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Extract named types for InlineComments; optimize Levenshtein search

- Introduce InlineCommentProperties, InlineCommentExtensions, and
  InlineCommentResult named types in confluence/api.go, replacing the
  anonymous nested struct in InlineComments.Results. Callers and tests
  can now construct/inspect comment objects without repeating the JSON
  shape.

- Simplify makeComments helper in mark_test.go to use the new named
  types directly, eliminating the verbose anonymous struct literal.

- Add two Levenshtein candidate-search optimisations in MergeComments:
  * Exact-context fast path: if both the before and after windows match
    exactly, take that occurrence immediately without computing distance.
  * Lower-bound pruning: skip the full O(m*n) Levenshtein computation
    for a candidate when the absolute difference in window lengths alone
    already meets or exceeds the current best distance.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Use stable sort with ref tie-breaker; fix README overlap description

- Replace slices.SortFunc with slices.SortStableFunc for the
  replacements slice, adding ref as a lexicographic tie-breaker when
  two markers resolve to the same start offset. This makes overlap
  resolution fully deterministic across runs.

- Correct the README limitation note: the *earlier* overlapping
  match (lower byte offset) is what gets dropped; the later one
  (higher byte offset, applied first in the back-to-front pass) is
  kept. The previous wording said 'the second one is dropped' which
  was ambiguous and inaccurate.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix rune-based lower-bound pruning; clarify test comment

- Use utf8.RuneCountInString instead of len() for the Levenshtein
  lower-bound pruning computation. The levenshteinDistance function
  operates on rune slices, so byte-length differences can exceed the
  true rune-length difference for multibyte UTF-8 content, causing
  valid candidates to be incorrectly skipped.

- Update TestMergeComments_SelectionMissing comment to say the comment
  is 'dropped with a warning' rather than 'silently dropped', matching
  the actual behavior.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Add missing unit tests for helpers and MergeComments scenarios

Helper function tests:
- TestTruncateSelection: short/exact/long strings and multibyte runes
- TestLevenshteinDistance: empty strings, identical, insertions,
  deletions, substitutions, 'kitten/sitting', and a multibyte UTF-8
  case to exercise rune-based counting
- TestContextBefore / TestContextAfter: basic windowing, window larger
  than string, and a case where the raw byte offset lands mid-rune (é)
  to verify the rune-boundary correction logic

MergeComments scenario tests:
- TestMergeComments_MultipleComments: two non-overlapping comments both
  correctly applied via back-to-front replacement
- TestMergeComments_EmptyResults: non-nil InlineComments with zero
  results returns body unchanged
- TestMergeComments_NonInlineLocation: page-level comments (location
  != 'inline') are skipped; body unchanged
- TestMergeComments_NoContext: when a ref has no marker in oldBody the
  first occurrence of the selection in newBody is used
- TestMergeComments_UTF8: multibyte (Japanese) characters in both body
  and selection are handled correctly

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix three correctness issues in MergeComments

- Fix html import shadowing: alias the 'html' import as 'stdhtml' to
  avoid shadowing by the local 'html' variable used throughout
  ProcessFile. Both callers updated: stdhtml.EscapeString for the
  ref attribute, htmlEscapeText for the selection search.

- Fix selection search with quotes/apostrophes: replace
  html.EscapeString for the selection with a new htmlEscapeText helper
  that only escapes &, <, > — not ' or ". Confluence storage HTML
  often leaves quotes and apostrophes unescaped in text nodes, so
  fully-escaped selections would fail to match and inline comments
  would be silently dropped. Add TestMergeComments_SelectionWithQuotes.

- Fix duplicate-ref warnings: move seenRefs[ref]=true to immediately
  after the duplicate-check, before the search loop. Previously seenRefs
  was only set on a successful match, so multiple results for the same
  MarkerRef with no match in the new body would each emit a 'dropped'
  warning. Add TestMergeComments_DuplicateMarkerRefDropped.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Optimize levenshteinDistance to use two rolling rows instead of full matrix

Reduces memory allocation from O(m×n) to O(n) by keeping only the
previous and current rows. Also swaps r1/r2 so the shorter string is
used for column width, minimizing row allocation size.

This matters in MergeComments where levenshteinDistance is called for
every candidate match of every comment's selection in newBody — on
pages with many comments or short/common selections the number of
calls can be high.

Addresses thread [40] from PR review.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix test description and README algorithm doc

mark_test.go (thread [43]):
- TestMergeComments_HTMLEntities: the description incorrectly claimed
  &#39; (apostrophe) was tested; the selection '<world>' contains no
  apostrophe. Updated comment to accurately describe what is covered
  (&lt;/&gt; entity matching) and note the &#39; limitation.
- Add TestMergeComments_ApostropheSelection: verifies a selection with
  a literal apostrophe is found when the new body also has a literal
  apostrophe (the common case from mark's renderer). This exercises
  the htmlEscapeText path which intentionally does not encode ' or ".

README.md (thread [42]):
- Step 2 of the algorithm description said context was recorded
  'immediately before and after the commented selection' which is
  ambiguous. Clarified that context windows are taken around the
  <ac:inline-comment-marker> tag boundaries in the old body (not
  around the raw selection text), so the context is stable even when
  the marker wraps additional inline markup such as <strong>.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Unexport mergeComments and cap candidate evaluation

Thread [44]: MergeComments was exported but is internal-only — only
called within the mark package and tested from the same package.
Unexport it to mergeComments to avoid expanding the public API surface
unnecessarily. Add a Go doc comment describing the function contract,
HTML expectations, and the candidate cap.

Thread [45]: The candidate-scoring loop had no upper bound. For short
or common selections (e.g. 'a', 'the') on large pages the loop could
invoke levenshteinDistance thousands of times, each allocating rune
and int slices. Add a maxCandidates=100 constant and break once that
many on-rune-boundary occurrences have been evaluated. The exact-context
fast-path and lower-bound pruning already skip many candidates before
Levenshtein is called, so in practice the cap is only reached for very
common selections where the 100th candidate is unlikely to be
meaningfully better than an earlier one anyway.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* test: fix HTMLEntities description and add ApostropheEncoded limitation test

Thread #43: TestMergeComments_HTMLEntities had a misleading note claiming it
covered the &#39; apostrophe case, but the selection under test ('<world>') did
not include an apostrophe. Remove that note and add a dedicated
TestMergeComments_ApostropheEncoded test that explicitly documents the known
limitation: when a Confluence body stores an apostrophe as the numeric entity
&#39;, mergeComments cannot locate the selection (htmlEscapeText does not
encode ' to &#39;), so the comment is dropped with a warning.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix CDATA selection fallback and extract contextWindowBytes constant

Thread #46: mergeComments only searched for htmlEscapeText(selection) and
would fail for selections inside CDATA-backed macro bodies (e.g. ac:code),
where < and > are stored as raw characters rather than HTML entities. Restructure
the search loop to build a searchForms slice: the escaped form is tried first
(covers normal XML text nodes), and the raw unescaped form is appended as a
fallback when they differ. A stopSearch flag exits early on an exact context
match or when maxCandidates is reached, preserving the same performance
guarantees as before. Add TestMergeComments_CDATASelection to cover this path.

Thread #47: The context-window size 100 was repeated in four places across
mergeComments (two in the context-extraction loop and two in the scoring loop).
Extract it to const contextWindowBytes = 100 so it is easy to tune and stays
consistent everywhere.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-04-08 15:44:21 +02:00
+									stdhtml "html"
-												fix: route result output through Config.Output, not os.Stdout

mark.Run and ProcessFile were writing directly to os.Stdout via
fmt.Println, which is a surprising side-effect for library callers.

Add Config.Output io.Writer for callers to provide their own sink.
When nil the helper falls back to io.Discard, so library embedders
that do not set Output receive no implicit stdout writes. The CLI
layer sets Output: os.Stdout to preserve existing behaviour.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 23:05:28 +01:00
+									"io"
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+									"os"
 									"path/filepath"
 									"regexp"
 									"slices"
 									"strings"
 									"time"
-												Feature/robust comment preservation (#768)

This is based on guoweis-work's PR: https://github.com/kovetskiy/mark/pull/145

* feat(confluence): add support for fetching page body and inline comments

* feat(cmd): add --preserve-comments flag to preserve inline comments

* feat(mark): implement context-aware inline comment preservation

* test(mark): add tests for context-aware MergeComments logic

* fix: remove empty else branch in MergeComments to fix SA9003

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* perf: compile markerRegex once as package-level variable

Avoids recompiling the inline comment marker regex on every call to
MergeComments, which matters for pages with many comment markers.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: guard against nil comments pointer in MergeComments

Prevents a panic when GetInlineComments returns nil (e.g. on pages
where the inline comments feature is not enabled).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* test: add edge-case tests for MergeComments; fix overlapping replacement

Four new test cases:
- SelectionMissing: comment dropped gracefully when text is gone from new body
- OverlappingSelections: overlapping comments no longer corrupt the body;
  the later match (by position) wins and the earlier overlapping one is dropped
- NilComments: nil pointer returns new body unchanged
- HTMLEntities: &lt;, &gt;, &#39; selections match correctly

Also fixes the overlapping replacement bug: apply back-to-front and skip any
replacement whose end exceeds the start of an already-applied one.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: escape ref attribute value in inline comment marker XML

Use html.EscapeString on r.ref before interpolating it into the
ac:ref attribute to prevent malformed XML if the value ever contains
quotes or other special characters.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: use first occurrence when no context is available in MergeComments

Without context the old code left distance=0 for every match and
updated bestStart on each iteration, so the final result was simply
whichever occurrence was visited last (an accident of the search
order rather than a deliberate choice).

Restructure the loop to break immediately on the first match when
hasCtx is false, making the behaviour explicit and deterministic.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: log warning when overlapping inline comment marker is dropped

Previously the overlap was silently skipped. Now a zerolog Warn message
is emitted with the ref, the conflicting byte offsets, and the ref of
the already-placed marker, so users can see which comment was lost
rather than silently getting incomplete output.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: warn when inline comments are silently dropped in MergeComments

Three cases now emit a zerolog Warn instead of silently discarding:

1. Comment location != "inline": logs ref and actual location.
2. Selected text not found in new body: logs ref and selection text.
3. Overlapping replacement (existing): adds selection text to the
   already-present overlap warning for easier diagnosis.

Also adds a selection field to the replacement struct so the overlap
warning can report the dropped text.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: update markerRegex to match markers with nested tags

Replace ([^<]*) with (?s)(.*?) so the pattern:
- Matches marker content that contains nested inline tags (e.g. <strong>)
- Matches across newlines ((?s) / DOTALL mode)

The old character class [^<]* stopped at the first < inside the
marker body, causing the context-extraction step to miss any comment
whose original selection spanned formatted text.

Add TestMergeComments_NestedTags to cover this path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: guard against empty OriginalSelection in MergeComments

strings.Index(s, "") always returns 0, so an empty escapedSelection
would spin the search loop indefinitely (or panic when currentPos
advances past len(newBody)).

Skip comments with an empty selection early, emit a Warn log, and
add TestMergeComments_EmptySelection to cover the path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: paginate GetInlineComments to avoid silently truncating results

The Confluence child/comment endpoint is paginated. The previous
single-request implementation silently dropped any comments beyond
the server's default page size.

Changes:
- Add Links (context, next) to InlineComments struct so the _links
  field from each page response is decoded.
- Rewrite GetInlineComments to loop with limit/start parameters
  (pageSize=100), accumulating all results, following the same pattern
  used by GetAttachments and label fetching.
- Add TestMergeComments_DuplicateMarkerRef to cover the deduplication
  guard added in the previous commit.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix UTF-8 safety, API compat, log verbosity

- levenshteinDistance: convert to []rune before empty-string checks so
  rune counts (not byte counts) are returned for strings with multi-byte
  characters

- Add contextBefore/contextAfter helpers that use utf8.RuneStart to
  avoid slicing in the middle of a multi-byte UTF-8 sequence when
  extracting 100-byte context windows from oldBody and newBody

- Add truncateSelection helper (50 runes + ellipsis) and apply it in all
  Warn log messages that include the selected text, preventing large or
  sensitive page content from appearing in logs

- Downgrade non-inline comment log from Warn to Debug with message
  'comment ignored during inline marker merge: not an inline comment';
  page-level comments are not inline markers and are not 'lost'

- Restore original one-argument GetPageByID (expand='ancestors,version')
  and add GetPageByIDExpanded for the one caller that needs a custom
  expand value, preserving backward compatibility for API consumers

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Address new PR review comments

- Remove custom min() function: shadows the Go 1.21+ built-in min for
  the entire package; the built-in handles the 3-arg call in
  levenshteinDistance identically

- Validate rune boundaries on strings.Index candidates: skip any match
  where start or end falls in the middle of a multi-byte UTF-8 rune
  to prevent corrupt UTF-8 output

- Defer preserve-comments API calls until after shouldUpdatePage is
  determined: avoids unnecessary GetPageByIDExpanded + GetInlineComments
  round-trips on no-op --changes-only runs

- Capitalize Usage string for --preserve-comments flag (util/flags.go)
  and matching README.md entry to match sentence case of surrounding flags

- Run gofmt on util/cli.go to fix struct literal field alignment

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* docs: document --preserve-comments feature in README

Add a dedicated 'Preserving Inline Comments' section under Tricks with:
- Usage examples (CLI flag and env var)
- Step-by-step explanation of the Levenshtein-based relocation algorithm
- Limitations (deleted text, overlapping selections, new pages,
  changes-only interaction)

Also add a cross-reference NOTE near the --preserve-comments flag entry
in the Usage section.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* docs: fix markdownlint errors in README

- Change unordered list markers from dashes to asterisks (MD004)
- Remove extra blank line before Issues section (MD012)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Extract named types for InlineComments; optimize Levenshtein search

- Introduce InlineCommentProperties, InlineCommentExtensions, and
  InlineCommentResult named types in confluence/api.go, replacing the
  anonymous nested struct in InlineComments.Results. Callers and tests
  can now construct/inspect comment objects without repeating the JSON
  shape.

- Simplify makeComments helper in mark_test.go to use the new named
  types directly, eliminating the verbose anonymous struct literal.

- Add two Levenshtein candidate-search optimisations in MergeComments:
  * Exact-context fast path: if both the before and after windows match
    exactly, take that occurrence immediately without computing distance.
  * Lower-bound pruning: skip the full O(m*n) Levenshtein computation
    for a candidate when the absolute difference in window lengths alone
    already meets or exceeds the current best distance.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Use stable sort with ref tie-breaker; fix README overlap description

- Replace slices.SortFunc with slices.SortStableFunc for the
  replacements slice, adding ref as a lexicographic tie-breaker when
  two markers resolve to the same start offset. This makes overlap
  resolution fully deterministic across runs.

- Correct the README limitation note: the *earlier* overlapping
  match (lower byte offset) is what gets dropped; the later one
  (higher byte offset, applied first in the back-to-front pass) is
  kept. The previous wording said 'the second one is dropped' which
  was ambiguous and inaccurate.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix rune-based lower-bound pruning; clarify test comment

- Use utf8.RuneCountInString instead of len() for the Levenshtein
  lower-bound pruning computation. The levenshteinDistance function
  operates on rune slices, so byte-length differences can exceed the
  true rune-length difference for multibyte UTF-8 content, causing
  valid candidates to be incorrectly skipped.

- Update TestMergeComments_SelectionMissing comment to say the comment
  is 'dropped with a warning' rather than 'silently dropped', matching
  the actual behavior.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Add missing unit tests for helpers and MergeComments scenarios

Helper function tests:
- TestTruncateSelection: short/exact/long strings and multibyte runes
- TestLevenshteinDistance: empty strings, identical, insertions,
  deletions, substitutions, 'kitten/sitting', and a multibyte UTF-8
  case to exercise rune-based counting
- TestContextBefore / TestContextAfter: basic windowing, window larger
  than string, and a case where the raw byte offset lands mid-rune (é)
  to verify the rune-boundary correction logic

MergeComments scenario tests:
- TestMergeComments_MultipleComments: two non-overlapping comments both
  correctly applied via back-to-front replacement
- TestMergeComments_EmptyResults: non-nil InlineComments with zero
  results returns body unchanged
- TestMergeComments_NonInlineLocation: page-level comments (location
  != 'inline') are skipped; body unchanged
- TestMergeComments_NoContext: when a ref has no marker in oldBody the
  first occurrence of the selection in newBody is used
- TestMergeComments_UTF8: multibyte (Japanese) characters in both body
  and selection are handled correctly

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix three correctness issues in MergeComments

- Fix html import shadowing: alias the 'html' import as 'stdhtml' to
  avoid shadowing by the local 'html' variable used throughout
  ProcessFile. Both callers updated: stdhtml.EscapeString for the
  ref attribute, htmlEscapeText for the selection search.

- Fix selection search with quotes/apostrophes: replace
  html.EscapeString for the selection with a new htmlEscapeText helper
  that only escapes &, <, > — not ' or ". Confluence storage HTML
  often leaves quotes and apostrophes unescaped in text nodes, so
  fully-escaped selections would fail to match and inline comments
  would be silently dropped. Add TestMergeComments_SelectionWithQuotes.

- Fix duplicate-ref warnings: move seenRefs[ref]=true to immediately
  after the duplicate-check, before the search loop. Previously seenRefs
  was only set on a successful match, so multiple results for the same
  MarkerRef with no match in the new body would each emit a 'dropped'
  warning. Add TestMergeComments_DuplicateMarkerRefDropped.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Optimize levenshteinDistance to use two rolling rows instead of full matrix

Reduces memory allocation from O(m×n) to O(n) by keeping only the
previous and current rows. Also swaps r1/r2 so the shorter string is
used for column width, minimizing row allocation size.

This matters in MergeComments where levenshteinDistance is called for
every candidate match of every comment's selection in newBody — on
pages with many comments or short/common selections the number of
calls can be high.

Addresses thread [40] from PR review.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix test description and README algorithm doc

mark_test.go (thread [43]):
- TestMergeComments_HTMLEntities: the description incorrectly claimed
  &#39; (apostrophe) was tested; the selection '<world>' contains no
  apostrophe. Updated comment to accurately describe what is covered
  (&lt;/&gt; entity matching) and note the &#39; limitation.
- Add TestMergeComments_ApostropheSelection: verifies a selection with
  a literal apostrophe is found when the new body also has a literal
  apostrophe (the common case from mark's renderer). This exercises
  the htmlEscapeText path which intentionally does not encode ' or ".

README.md (thread [42]):
- Step 2 of the algorithm description said context was recorded
  'immediately before and after the commented selection' which is
  ambiguous. Clarified that context windows are taken around the
  <ac:inline-comment-marker> tag boundaries in the old body (not
  around the raw selection text), so the context is stable even when
  the marker wraps additional inline markup such as <strong>.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Unexport mergeComments and cap candidate evaluation

Thread [44]: MergeComments was exported but is internal-only — only
called within the mark package and tested from the same package.
Unexport it to mergeComments to avoid expanding the public API surface
unnecessarily. Add a Go doc comment describing the function contract,
HTML expectations, and the candidate cap.

Thread [45]: The candidate-scoring loop had no upper bound. For short
or common selections (e.g. 'a', 'the') on large pages the loop could
invoke levenshteinDistance thousands of times, each allocating rune
and int slices. Add a maxCandidates=100 constant and break once that
many on-rune-boundary occurrences have been evaluated. The exact-context
fast-path and lower-bound pruning already skip many candidates before
Levenshtein is called, so in practice the cap is only reached for very
common selections where the 100th candidate is unlikely to be
meaningfully better than an earlier one anyway.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* test: fix HTMLEntities description and add ApostropheEncoded limitation test

Thread #43: TestMergeComments_HTMLEntities had a misleading note claiming it
covered the &#39; apostrophe case, but the selection under test ('<world>') did
not include an apostrophe. Remove that note and add a dedicated
TestMergeComments_ApostropheEncoded test that explicitly documents the known
limitation: when a Confluence body stores an apostrophe as the numeric entity
&#39;, mergeComments cannot locate the selection (htmlEscapeText does not
encode ' to &#39;), so the comment is dropped with a warning.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix CDATA selection fallback and extract contextWindowBytes constant

Thread #46: mergeComments only searched for htmlEscapeText(selection) and
would fail for selections inside CDATA-backed macro bodies (e.g. ac:code),
where < and > are stored as raw characters rather than HTML entities. Restructure
the search loop to build a searchForms slice: the escaped form is tried first
(covers normal XML text nodes), and the raw unescaped form is appended as a
fallback when they differ. A stopSearch flag exits early on an exact context
match or when maxCandidates is reached, preserving the same performance
guarantees as before. Add TestMergeComments_CDATASelection to cover this path.

Thread #47: The context-window size 100 was repeated in four places across
mergeComments (two in the context-extraction loop and two in the scoring loop).
Extract it to const contextWindowBytes = 100 so it is easy to tune and stays
consistent everywhere.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-04-08 15:44:21 +02:00
+									"unicode/utf8"
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
 									"github.com/bmatcuk/doublestar/v4"
-												chore: bump module path to v16

Update Go module path from github.com/kovetskiy/mark to
github.com/kovetskiy/mark/v16 across all packages and imports,
following Go module versioning conventions for major versions >= 2.

Also update README installation instructions and version string.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-20 23:35:27 +01:00
+									"github.com/kovetskiy/mark/v16/attachment"
 									"github.com/kovetskiy/mark/v16/confluence"
 									"github.com/kovetskiy/mark/v16/includes"
 									"github.com/kovetskiy/mark/v16/macro"
 									markmd "github.com/kovetskiy/mark/v16/markdown"
 									"github.com/kovetskiy/mark/v16/metadata"
 									"github.com/kovetskiy/mark/v16/page"
 									"github.com/kovetskiy/mark/v16/stdlib"
 									"github.com/kovetskiy/mark/v16/types"
 									"github.com/kovetskiy/mark/v16/vfs"
-												feat: replace logging with zerolog

											
										
										
											2026-03-28 09:55:58 +01:00
+									"github.com/rs/zerolog/log"
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+								)
-												Feature/robust comment preservation (#768)

This is based on guoweis-work's PR https://github.com/kovetskiy/mark/pull/145

* feat(confluence): add support for fetching page body and inline comments

* feat(cmd): add --preserve-comments flag to preserve inline comments

* feat(mark): implement context-aware inline comment preservation

* test(mark): add tests for context-aware MergeComments logic

* fix: remove empty else branch in MergeComments to fix SA9003

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* perf: compile markerRegex once as package-level variable

Avoids recompiling the inline comment marker regex on every call to
MergeComments, which matters for pages with many comment markers.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: guard against nil comments pointer in MergeComments

Prevents a panic when GetInlineComments returns nil (e.g. on pages
where the inline comments feature is not enabled).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* test: add edge-case tests for MergeComments; fix overlapping replacement

Four new test cases:
- SelectionMissing: comment dropped gracefully when text is gone from new body
- OverlappingSelections: overlapping comments no longer corrupt the body;
  the later match (by position) wins and the earlier overlapping one is dropped
- NilComments: nil pointer returns new body unchanged
- HTMLEntities: &lt;, &gt;, &#39; selections match correctly

Also fixes the overlapping replacement bug: apply back-to-front and skip any
replacement whose end exceeds the start of an already-applied one.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: escape ref attribute value in inline comment marker XML

Use html.EscapeString on r.ref before interpolating it into the
ac:ref attribute to prevent malformed XML if the value ever contains
quotes or other special characters.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: use first occurrence when no context is available in MergeComments

Without context the old code left distance=0 for every match and
updated bestStart on each iteration, so the final result was simply
whichever occurrence was visited last (an accident of the search
order rather than a deliberate choice).

Restructure the loop to break immediately on the first match when
hasCtx is false, making the behaviour explicit and deterministic.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: log warning when overlapping inline comment marker is dropped

Previously the overlap was silently skipped. Now a zerolog Warn message
is emitted with the ref, the conflicting byte offsets, and the ref of
the already-placed marker, so users can see which comment was lost
rather than silently getting incomplete output.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: warn when inline comments are silently dropped in MergeComments

Three cases now emit a zerolog Warn instead of silently discarding:

1. Comment location != "inline": logs ref and actual location.
2. Selected text not found in new body: logs ref and selection text.
3. Overlapping replacement (existing): adds selection text to the
   already-present overlap warning for easier diagnosis.

Also adds a selection field to the replacement struct so the overlap
warning can report the dropped text.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: update markerRegex to match markers with nested tags

Replace ([^<]*) with (?s)(.*?) so the pattern:
- Matches marker content that contains nested inline tags (e.g. <strong>)
- Matches across newlines ((?s) / DOTALL mode)

The old character class [^<]* stopped at the first < inside the
marker body, causing the context-extraction step to miss any comment
whose original selection spanned formatted text.

Add TestMergeComments_NestedTags to cover this path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: guard against empty OriginalSelection in MergeComments

strings.Index(s, "") always returns 0, so an empty escapedSelection
would spin the search loop indefinitely (or panic when currentPos
advances past len(newBody)).

Skip comments with an empty selection early, emit a Warn log, and
add TestMergeComments_EmptySelection to cover the path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: paginate GetInlineComments to avoid silently truncating results

The Confluence child/comment endpoint is paginated. The previous
single-request implementation silently dropped any comments beyond
the server's default page size.

Changes:
- Add Links (context, next) to InlineComments struct so the _links
  field from each page response is decoded.
- Rewrite GetInlineComments to loop with limit/start parameters
  (pageSize=100), accumulating all results, following the same pattern
  used by GetAttachments and label fetching.
- Add TestMergeComments_DuplicateMarkerRef to cover the deduplication
  guard added in the previous commit.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix UTF-8 safety, API compat, log verbosity

- levenshteinDistance: convert to []rune before empty-string checks so
  rune counts (not byte counts) are returned for strings with multi-byte
  characters

- Add contextBefore/contextAfter helpers that use utf8.RuneStart to
  avoid slicing in the middle of a multi-byte UTF-8 sequence when
  extracting 100-byte context windows from oldBody and newBody

- Add truncateSelection helper (50 runes + ellipsis) and apply it in all
  Warn log messages that include the selected text, preventing large or
  sensitive page content from appearing in logs

- Downgrade non-inline comment log from Warn to Debug with message
  'comment ignored during inline marker merge: not an inline comment';
  page-level comments are not inline markers and are not 'lost'

- Restore original one-argument GetPageByID (expand='ancestors,version')
  and add GetPageByIDExpanded for the one caller that needs a custom
  expand value, preserving backward compatibility for API consumers

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Address new PR review comments

- Remove custom min() function: shadows the Go 1.21+ built-in min for
  the entire package; the built-in handles the 3-arg call in
  levenshteinDistance identically

- Validate rune boundaries on strings.Index candidates: skip any match
  where start or end falls in the middle of a multi-byte UTF-8 rune
  to prevent corrupt UTF-8 output

- Defer preserve-comments API calls until after shouldUpdatePage is
  determined: avoids unnecessary GetPageByIDExpanded + GetInlineComments
  round-trips on no-op --changes-only runs

- Capitalize Usage string for --preserve-comments flag (util/flags.go)
  and matching README.md entry to match sentence case of surrounding flags

- Run gofmt on util/cli.go to fix struct literal field alignment

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* docs: document --preserve-comments feature in README

Add a dedicated 'Preserving Inline Comments' section under Tricks with:
- Usage examples (CLI flag and env var)
- Step-by-step explanation of the Levenshtein-based relocation algorithm
- Limitations (deleted text, overlapping selections, new pages,
  changes-only interaction)

Also add a cross-reference NOTE near the --preserve-comments flag entry
in the Usage section.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* docs: fix markdownlint errors in README

- Change unordered list markers from dashes to asterisks (MD004)
- Remove extra blank line before Issues section (MD012)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Extract named types for InlineComments; optimize Levenshtein search

- Introduce InlineCommentProperties, InlineCommentExtensions, and
  InlineCommentResult named types in confluence/api.go, replacing the
  anonymous nested struct in InlineComments.Results. Callers and tests
  can now construct/inspect comment objects without repeating the JSON
  shape.

- Simplify makeComments helper in mark_test.go to use the new named
  types directly, eliminating the verbose anonymous struct literal.

- Add two Levenshtein candidate-search optimisations in MergeComments:
  * Exact-context fast path: if both the before and after windows match
    exactly, take that occurrence immediately without computing distance.
  * Lower-bound pruning: skip the full O(m*n) Levenshtein computation
    for a candidate when the absolute difference in window lengths alone
    already meets or exceeds the current best distance.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Use stable sort with ref tie-breaker; fix README overlap description

- Replace slices.SortFunc with slices.SortStableFunc for the
  replacements slice, adding ref as a lexicographic tie-breaker when
  two markers resolve to the same start offset. This makes overlap
  resolution fully deterministic across runs.

- Correct the README limitation note: the *earlier* overlapping
  match (lower byte offset) is what gets dropped; the later one
  (higher byte offset, applied first in the back-to-front pass) is
  kept. The previous wording said 'the second one is dropped' which
  was ambiguous and inaccurate.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix rune-based lower-bound pruning; clarify test comment

- Use utf8.RuneCountInString instead of len() for the Levenshtein
  lower-bound pruning computation. The levenshteinDistance function
  operates on rune slices, so byte-length differences can exceed the
  true rune-length difference for multibyte UTF-8 content, causing
  valid candidates to be incorrectly skipped.

- Update TestMergeComments_SelectionMissing comment to say the comment
  is 'dropped with a warning' rather than 'silently dropped', matching
  the actual behavior.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Add missing unit tests for helpers and MergeComments scenarios

Helper function tests:
- TestTruncateSelection: short/exact/long strings and multibyte runes
- TestLevenshteinDistance: empty strings, identical, insertions,
  deletions, substitutions, 'kitten/sitting', and a multibyte UTF-8
  case to exercise rune-based counting
- TestContextBefore / TestContextAfter: basic windowing, window larger
  than string, and a case where the raw byte offset lands mid-rune (é)
  to verify the rune-boundary correction logic

MergeComments scenario tests:
- TestMergeComments_MultipleComments: two non-overlapping comments both
  correctly applied via back-to-front replacement
- TestMergeComments_EmptyResults: non-nil InlineComments with zero
  results returns body unchanged
- TestMergeComments_NonInlineLocation: page-level comments (location
  != 'inline') are skipped; body unchanged
- TestMergeComments_NoContext: when a ref has no marker in oldBody the
  first occurrence of the selection in newBody is used
- TestMergeComments_UTF8: multibyte (Japanese) characters in both body
  and selection are handled correctly

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix three correctness issues in MergeComments

- Fix html import shadowing: alias the 'html' import as 'stdhtml' to
  avoid shadowing by the local 'html' variable used throughout
  ProcessFile. Both callers updated: stdhtml.EscapeString for the
  ref attribute, htmlEscapeText for the selection search.

- Fix selection search with quotes/apostrophes: replace
  html.EscapeString for the selection with a new htmlEscapeText helper
  that only escapes &, <, > — not ' or ". Confluence storage HTML
  often leaves quotes and apostrophes unescaped in text nodes, so
  fully-escaped selections would fail to match and inline comments
  would be silently dropped. Add TestMergeComments_SelectionWithQuotes.

- Fix duplicate-ref warnings: move seenRefs[ref]=true to immediately
  after the duplicate-check, before the search loop. Previously seenRefs
  was only set on a successful match, so multiple results for the same
  MarkerRef with no match in the new body would each emit a 'dropped'
  warning. Add TestMergeComments_DuplicateMarkerRefDropped.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Optimize levenshteinDistance to use two rolling rows instead of full matrix

Reduces memory allocation from O(m×n) to O(n) by keeping only the
previous and current rows. Also swaps r1/r2 so the shorter string is
used for column width, minimizing row allocation size.

This matters in MergeComments where levenshteinDistance is called for
every candidate match of every comment's selection in newBody — on
pages with many comments or short/common selections the number of
calls can be high.

Addresses thread [40] from PR review.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix test description and README algorithm doc

mark_test.go (thread [43]):
- TestMergeComments_HTMLEntities: the description incorrectly claimed
  &#39; (apostrophe) was tested; the selection '<world>' contains no
  apostrophe. Updated comment to accurately describe what is covered
  (&lt;/&gt; entity matching) and note the &#39; limitation.
- Add TestMergeComments_ApostropheSelection: verifies a selection with
  a literal apostrophe is found when the new body also has a literal
  apostrophe (the common case from mark's renderer). This exercises
  the htmlEscapeText path which intentionally does not encode ' or ".

README.md (thread [42]):
- Step 2 of the algorithm description said context was recorded
  'immediately before and after the commented selection' which is
  ambiguous. Clarified that context windows are taken around the
  <ac:inline-comment-marker> tag boundaries in the old body (not
  around the raw selection text), so the context is stable even when
  the marker wraps additional inline markup such as <strong>.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Unexport mergeComments and cap candidate evaluation

Thread [44]: MergeComments was exported but is internal-only — only
called within the mark package and tested from the same package.
Unexport it to mergeComments to avoid expanding the public API surface
unnecessarily. Add a Go doc comment describing the function contract,
HTML expectations, and the candidate cap.

Thread [45]: The candidate-scoring loop had no upper bound. For short
or common selections (e.g. 'a', 'the') on large pages the loop could
invoke levenshteinDistance thousands of times, each allocating rune
and int slices. Add a maxCandidates=100 constant and break once that
many on-rune-boundary occurrences have been evaluated. The exact-context
fast-path and lower-bound pruning already skip many candidates before
Levenshtein is called, so in practice the cap is only reached for very
common selections where the 100th candidate is unlikely to be
meaningfully better than an earlier one anyway.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* test: fix HTMLEntities description and add ApostropheEncoded limitation test

Thread #43: TestMergeComments_HTMLEntities had a misleading note claiming it
covered the &#39; apostrophe case, but the selection under test ('<world>') did
not include an apostrophe. Remove that note and add a dedicated
TestMergeComments_ApostropheEncoded test that explicitly documents the known
limitation: when a Confluence body stores an apostrophe as the numeric entity
&#39;, mergeComments cannot locate the selection (htmlEscapeText does not
encode ' to &#39;), so the comment is dropped with a warning.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix CDATA selection fallback and extract contextWindowBytes constant

Thread #46: mergeComments only searched for htmlEscapeText(selection) and
would fail for selections inside CDATA-backed macro bodies (e.g. ac:code),
where < and > are stored as raw characters rather than HTML entities. Restructure
the search loop to build a searchForms slice: the escaped form is tried first
(covers normal XML text nodes), and the raw unescaped form is appended as a
fallback when they differ. A stopSearch flag exits early on an exact context
match or when maxCandidates is reached, preserving the same performance
guarantees as before. Add TestMergeComments_CDATASelection to cover this path.

Thread #47: The context-window size 100 was repeated in four places across
mergeComments (two in the context-extraction loop and two in the scoring loop).
Extract it to const contextWindowBytes = 100 so it is easy to tune and stays
consistent everywhere.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-04-08 15:44:21 +02:00
+								var markerRegex = regexp.MustCompile(`(?s)<ac:inline-comment-marker ac:ref="([^"]+)">(.*?)</ac:inline-comment-marker>`)
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+								// Config holds all configuration options for running Mark.
 								type Config struct {
 									// Connection settings
 									BaseURL               string
 									Username              string
 									Password              string
 									PageID                string
 									InsecureSkipTLSVerify bool
 									// File selection
 									Files string
 									// Behaviour
 									CompileOnly     bool
 									DryRun          bool
 									ContinueOnError bool
 									CI              bool
 									// Page content
 									Space                    string
 									Parents                  []string
 									TitleFromH1              bool
 									TitleFromFilename        bool
 									TitleAppendGeneratedHash bool
 									ContentAppearance        string
 									// Page updates
-												Feature/robust comment preservation (#768)

This is based on guoweis-work's PR https://github.com/kovetskiy/mark/pull/145

* feat(confluence): add support for fetching page body and inline comments

* feat(cmd): add --preserve-comments flag to preserve inline comments

* feat(mark): implement context-aware inline comment preservation

* test(mark): add tests for context-aware MergeComments logic

* fix: remove empty else branch in MergeComments to fix SA9003

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* perf: compile markerRegex once as package-level variable

Avoids recompiling the inline comment marker regex on every call to
MergeComments, which matters for pages with many comment markers.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: guard against nil comments pointer in MergeComments

Prevents a panic when GetInlineComments returns nil (e.g. on pages
where the inline comments feature is not enabled).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* test: add edge-case tests for MergeComments; fix overlapping replacement

Four new test cases:
- SelectionMissing: comment dropped gracefully when text is gone from new body
- OverlappingSelections: overlapping comments no longer corrupt the body;
  the later match (by position) wins and the earlier overlapping one is dropped
- NilComments: nil pointer returns new body unchanged
- HTMLEntities: &lt;, &gt;, &#39; selections match correctly

Also fixes the overlapping replacement bug: apply back-to-front and skip any
replacement whose end exceeds the start of an already-applied one.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: escape ref attribute value in inline comment marker XML

Use html.EscapeString on r.ref before interpolating it into the
ac:ref attribute to prevent malformed XML if the value ever contains
quotes or other special characters.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: use first occurrence when no context is available in MergeComments

Without context the old code left distance=0 for every match and
updated bestStart on each iteration, so the final result was simply
whichever occurrence was visited last (an accident of the search
order rather than a deliberate choice).

Restructure the loop to break immediately on the first match when
hasCtx is false, making the behaviour explicit and deterministic.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: log warning when overlapping inline comment marker is dropped

Previously the overlap was silently skipped. Now a zerolog Warn message
is emitted with the ref, the conflicting byte offsets, and the ref of
the already-placed marker, so users can see which comment was lost
rather than silently getting incomplete output.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: warn when inline comments are silently dropped in MergeComments

Three cases now emit a zerolog Warn instead of silently discarding:

1. Comment location != "inline": logs ref and actual location.
2. Selected text not found in new body: logs ref and selection text.
3. Overlapping replacement (existing): adds selection text to the
   already-present overlap warning for easier diagnosis.

Also adds a selection field to the replacement struct so the overlap
warning can report the dropped text.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: update markerRegex to match markers with nested tags

Replace ([^<]*) with (?s)(.*?) so the pattern:
- Matches marker content that contains nested inline tags (e.g. <strong>)
- Matches across newlines ((?s) / DOTALL mode)

The old character class [^<]* stopped at the first < inside the
marker body, causing the context-extraction step to miss any comment
whose original selection spanned formatted text.

Add TestMergeComments_NestedTags to cover this path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: guard against empty OriginalSelection in MergeComments

strings.Index(s, "") always returns 0, so an empty escapedSelection
would spin the search loop indefinitely (or panic when currentPos
advances past len(newBody)).

Skip comments with an empty selection early, emit a Warn log, and
add TestMergeComments_EmptySelection to cover the path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: paginate GetInlineComments to avoid silently truncating results

The Confluence child/comment endpoint is paginated. The previous
single-request implementation silently dropped any comments beyond
the server's default page size.

Changes:
- Add Links (context, next) to InlineComments struct so the _links
  field from each page response is decoded.
- Rewrite GetInlineComments to loop with limit/start parameters
  (pageSize=100), accumulating all results, following the same pattern
  used by GetAttachments and label fetching.
- Add TestMergeComments_DuplicateMarkerRef to cover the deduplication
  guard added in the previous commit.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix UTF-8 safety, API compat, log verbosity

- levenshteinDistance: convert to []rune before empty-string checks so
  rune counts (not byte counts) are returned for strings with multi-byte
  characters

- Add contextBefore/contextAfter helpers that use utf8.RuneStart to
  avoid slicing in the middle of a multi-byte UTF-8 sequence when
  extracting 100-byte context windows from oldBody and newBody

- Add truncateSelection helper (50 runes + ellipsis) and apply it in all
  Warn log messages that include the selected text, preventing large or
  sensitive page content from appearing in logs

- Downgrade non-inline comment log from Warn to Debug with message
  'comment ignored during inline marker merge: not an inline comment';
  page-level comments are not inline markers and are not 'lost'

- Restore original one-argument GetPageByID (expand='ancestors,version')
  and add GetPageByIDExpanded for the one caller that needs a custom
  expand value, preserving backward compatibility for API consumers

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Address new PR review comments

- Remove custom min() function: shadows the Go 1.21+ built-in min for
  the entire package; the built-in handles the 3-arg call in
  levenshteinDistance identically

- Validate rune boundaries on strings.Index candidates: skip any match
  where start or end falls in the middle of a multi-byte UTF-8 rune
  to prevent corrupt UTF-8 output

- Defer preserve-comments API calls until after shouldUpdatePage is
  determined: avoids unnecessary GetPageByIDExpanded + GetInlineComments
  round-trips on no-op --changes-only runs

- Capitalize Usage string for --preserve-comments flag (util/flags.go)
  and matching README.md entry to match sentence case of surrounding flags

- Run gofmt on util/cli.go to fix struct literal field alignment

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* docs: document --preserve-comments feature in README

Add a dedicated 'Preserving Inline Comments' section under Tricks with:
- Usage examples (CLI flag and env var)
- Step-by-step explanation of the Levenshtein-based relocation algorithm
- Limitations (deleted text, overlapping selections, new pages,
  changes-only interaction)

Also add a cross-reference NOTE near the --preserve-comments flag entry
in the Usage section.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* docs: fix markdownlint errors in README

- Change unordered list markers from dashes to asterisks (MD004)
- Remove extra blank line before Issues section (MD012)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Extract named types for InlineComments; optimize Levenshtein search

- Introduce InlineCommentProperties, InlineCommentExtensions, and
  InlineCommentResult named types in confluence/api.go, replacing the
  anonymous nested struct in InlineComments.Results. Callers and tests
  can now construct/inspect comment objects without repeating the JSON
  shape.

- Simplify makeComments helper in mark_test.go to use the new named
  types directly, eliminating the verbose anonymous struct literal.

- Add two Levenshtein candidate-search optimisations in MergeComments:
  * Exact-context fast path: if both the before and after windows match
    exactly, take that occurrence immediately without computing distance.
  * Lower-bound pruning: skip the full O(m*n) Levenshtein computation
    for a candidate when the absolute difference in window lengths alone
    already meets or exceeds the current best distance.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Use stable sort with ref tie-breaker; fix README overlap description

- Replace slices.SortFunc with slices.SortStableFunc for the
  replacements slice, adding ref as a lexicographic tie-breaker when
  two markers resolve to the same start offset. This makes overlap
  resolution fully deterministic across runs.

- Correct the README limitation note: the *earlier* overlapping
  match (lower byte offset) is what gets dropped; the later one
  (higher byte offset, applied first in the back-to-front pass) is
  kept. The previous wording said 'the second one is dropped' which
  was ambiguous and inaccurate.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix rune-based lower-bound pruning; clarify test comment

- Use utf8.RuneCountInString instead of len() for the Levenshtein
  lower-bound pruning computation. The levenshteinDistance function
  operates on rune slices, so byte-length differences can exceed the
  true rune-length difference for multibyte UTF-8 content, causing
  valid candidates to be incorrectly skipped.

- Update TestMergeComments_SelectionMissing comment to say the comment
  is 'dropped with a warning' rather than 'silently dropped', matching
  the actual behavior.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Add missing unit tests for helpers and MergeComments scenarios

Helper function tests:
- TestTruncateSelection: short/exact/long strings and multibyte runes
- TestLevenshteinDistance: empty strings, identical, insertions,
  deletions, substitutions, 'kitten/sitting', and a multibyte UTF-8
  case to exercise rune-based counting
- TestContextBefore / TestContextAfter: basic windowing, window larger
  than string, and a case where the raw byte offset lands mid-rune (é)
  to verify the rune-boundary correction logic

MergeComments scenario tests:
- TestMergeComments_MultipleComments: two non-overlapping comments both
  correctly applied via back-to-front replacement
- TestMergeComments_EmptyResults: non-nil InlineComments with zero
  results returns body unchanged
- TestMergeComments_NonInlineLocation: page-level comments (location
  != 'inline') are skipped; body unchanged
- TestMergeComments_NoContext: when a ref has no marker in oldBody the
  first occurrence of the selection in newBody is used
- TestMergeComments_UTF8: multibyte (Japanese) characters in both body
  and selection are handled correctly

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix three correctness issues in MergeComments

- Fix html import shadowing: alias the 'html' import as 'stdhtml' so
  that it is not shadowed by the local 'html' variable used throughout
  ProcessFile. Both call sites are updated: stdhtml.EscapeString for
  the ref attribute, htmlEscapeText for the selection search.

- Fix selection search with quotes/apostrophes: replace
  html.EscapeString for the selection with a new htmlEscapeText helper
  that only escapes &, <, > — not ' or ". Confluence storage HTML
  often leaves quotes and apostrophes unescaped in text nodes, so
  fully-escaped selections would fail to match and inline comments
  would be silently dropped. Add TestMergeComments_SelectionWithQuotes.

- Fix duplicate-ref warnings: move seenRefs[ref]=true to immediately
  after the duplicate-check, before the search loop. Previously seenRefs
  was only set on a successful match, so multiple results for the same
  MarkerRef with no match in the new body would each emit a 'dropped'
  warning. Add TestMergeComments_DuplicateMarkerRefDropped.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Optimize levenshteinDistance to use two rolling rows instead of full matrix

Reduces memory allocation from O(m×n) to O(n) by keeping only the
previous and current rows. Also swaps r1/r2 so the shorter string is
used for column width, minimizing row allocation size.

This matters in MergeComments where levenshteinDistance is called for
every candidate match of every comment's selection in newBody — on
pages with many comments or short/common selections the number of
calls can be high.

Addresses thread [40] from PR review.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix test description and README algorithm doc

mark_test.go (thread [43]):
- TestMergeComments_HTMLEntities: the description incorrectly claimed
  &#39; (apostrophe) was tested; the selection '<world>' contains no
  apostrophe. Updated comment to accurately describe what is covered
  (&lt;/&gt; entity matching) and note the &#39; limitation.
- Add TestMergeComments_ApostropheSelection: verifies a selection with
  a literal apostrophe is found when the new body also has a literal
  apostrophe (the common case from mark's renderer). This exercises
  the htmlEscapeText path which intentionally does not encode ' or ".

README.md (thread [42]):
- Step 2 of the algorithm description said context was recorded
  'immediately before and after the commented selection' which is
  ambiguous. Clarified that context windows are taken around the
  <ac:inline-comment-marker> tag boundaries in the old body (not
  around the raw selection text), so the context is stable even when
  the marker wraps additional inline markup such as <strong>.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Unexport mergeComments and cap candidate evaluation

Thread [44]: MergeComments was exported but is internal-only — only
called within the mark package and tested from the same package.
Unexport it to mergeComments to avoid expanding the public API surface
unnecessarily. Add a Go doc comment describing the function contract,
HTML expectations, and the candidate cap.

Thread [45]: The candidate-scoring loop had no upper bound. For short
or common selections (e.g. 'a', 'the') on large pages the loop could
invoke levenshteinDistance thousands of times, each allocating rune
and int slices. Add a maxCandidates=100 constant and break once that
many on-rune-boundary occurrences have been evaluated. The exact-context
fast-path and lower-bound pruning already skip many candidates before
Levenshtein is called, so in practice the cap is only reached for very
common selections where the 100th candidate is unlikely to be
meaningfully better than an earlier one anyway.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* test: fix HTMLEntities description and add ApostropheEncoded limitation test

Thread #43: TestMergeComments_HTMLEntities had a misleading note claiming it
covered the &#39; apostrophe case, but the selection under test ('<world>') did
not include an apostrophe. Remove that note and add a dedicated
TestMergeComments_ApostropheEncoded test that explicitly documents the known
limitation: when a Confluence body stores an apostrophe as the numeric entity
&#39;, mergeComments cannot locate the selection (htmlEscapeText does not
encode ' to &#39;), so the comment is dropped with a warning.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix CDATA selection fallback and extract contextWindowBytes constant

Thread #46: mergeComments only searched for htmlEscapeText(selection) and
would fail for selections inside CDATA-backed macro bodies (e.g. ac:code),
where < and > are stored as raw characters rather than HTML entities. Restructure
the search loop to build a searchForms slice: the escaped form is tried first
(covers normal XML text nodes), and the raw unescaped form is appended as a
fallback when they differ. A stopSearch flag exits early on an exact context
match or when maxCandidates is reached, preserving the same performance
guarantees as before. Add TestMergeComments_CDATASelection to cover this path.

Thread #47: The context-window size 100 was repeated in four places across
mergeComments (two in the context-extraction loop and two in the scoring loop).
Extract it to const contextWindowBytes = 100 so it is easy to tune and stays
consistent everywhere.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-04-08 15:44:21 +02:00
+									MinorEdit        bool
 									VersionMessage   string
 									EditLock         bool
 									ChangesOnly      bool
 									PreserveComments bool
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
 									// Rendering
 									DropH1          bool
 									StripLinebreaks bool
 									MermaidScale    float64
 									D2Scale         float64
 									Features        []string
 									ImageAlign      string
 									IncludePath     string
-												fix: route result output through Config.Output, not os.Stdout

mark.Run and ProcessFile were writing directly to os.Stdout via
fmt.Println, which is a surprising side-effect for library callers.

Add Config.Output io.Writer for callers to provide their own sink.
When nil the helper falls back to io.Discard, so library embedders
that do not set Output receive no implicit stdout writes. The CLI
layer sets Output: os.Stdout to preserve existing behaviour.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 23:05:28 +01:00
 									// Output is the writer used for result output (e.g. published page URLs,
 									// compiled HTML). If nil, output is discarded; the CLI sets this to
 									// os.Stdout.
 									Output io.Writer
 								}
 								// output returns the configured writer, falling back to io.Discard so that
 								// library callers that do not set Output receive no implicit stdout writes.
 								func (c Config) output() io.Writer {
 									if c.Output != nil {
 										return c.Output
 									}
 									return io.Discard
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+								}
 								// Run processes all files matching Config.Files and publishes them to Confluence.
 								func Run(config Config) error {
 									api := confluence.NewAPI(config.BaseURL, config.Username, config.Password, config.InsecureSkipTLSVerify)
 									files, err := doublestar.FilepathGlob(config.Files)
 									if err != nil {
 										return err
 									}
 									if len(files) == 0 {
 										msg := "no files matched"
 										if config.CI {
-												feat: replace logging with zerolog

											
										
										
											2026-03-28 09:55:58 +01:00
+											log.Warn().Msg(msg)
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+										} else {
-												refactor: modernize Go primitives

- Replace interface{} with any (Go 1.18) across confluence/api.go,
  macro/macro.go, util/cli.go, util/error_handler.go, includes/templates.go
- Replace sort.SliceStable with slices.SortStableFunc + cmp.Compare (Go 1.21)
  in attachment/attachment.go, consistent with existing slices usage
- Replace fmt.Errorf("%s", msg) with errors.New(msg) in mark.go

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-04-08 01:34:06 +02:00
+											return errors.New(msg)
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+										}
 									}
-												fix: return error from Run() when ContinueOnError files fail

When --continue-on-error was set and one or more files failed to
process, Run() logged each failure but returned nil, making it
impossible for callers or CI systems to detect partial failures.

Track whether any file failed with a hasErrors flag and return a
descriptive error after all files have been attempted.

Update TestContinueOnError to reflect the corrected behaviour: the
test now asserts that an error IS returned (partial failure is
surfaced) while still verifying that all files in the batch are
attempted (not just the first one).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-13 02:11:18 +01:00
+									var hasErrors bool
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+									for _, file := range files {
-												feat: replace logging with zerolog

											
										
										
											2026-03-28 09:55:58 +01:00
+										log.Info().Msgf("processing %s", file)
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
 										target, err := ProcessFile(file, api, config)
 										if err != nil {
 											if config.ContinueOnError {
-												feat: replace logging with zerolog

											
										
										
											2026-03-28 09:55:58 +01:00
+												log.Error().Err(err).Msgf("processing %s", file)
-												fix: return error from Run() when ContinueOnError files fail

When --continue-on-error was set and one or more files failed to
process, Run() logged each failure but returned nil, making it
impossible for callers or CI systems to detect partial failures.

Track whether any file failed with a hasErrors flag and return a
descriptive error after all files have been attempted.

Update TestContinueOnError to reflect the corrected behaviour: the
test now asserts that an error IS returned (partial failure is
surfaced) while still verifying that all files in the batch are
attempted (not just the first one).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-13 02:11:18 +01:00
+												hasErrors = true
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+												continue
 											}
 											return err
 										}
 										if target != nil {
-												feat: replace logging with zerolog

											
										
										
											2026-03-28 09:55:58 +01:00
+											log.Info().Msgf("page successfully updated: %s", api.BaseURL+target.Links.Full)
-												fix: use api.BaseURL instead of config.BaseURL for page URL output

confluence.NewAPI trims trailing slashes from the base URL and stores
the result in api.BaseURL. Using config.BaseURL directly could produce
double slashes in the logged/printed URL when the caller passes a
BaseURL with a trailing slash.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 23:15:58 +01:00
+											if _, err := fmt.Fprintln(config.output(), api.BaseURL+target.Links.Full); err != nil {
-												fix: check error return from fmt.Fprintln

errcheck lint requires all error return values to be handled.
Propagate write errors from both Fprintln call sites.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 23:07:13 +01:00
+												return err
 											}
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+										}
 									}
-												fix: return error from Run() when ContinueOnError files fail

When --continue-on-error was set and one or more files failed to
process, Run() logged each failure but returned nil, making it
impossible for callers or CI systems to detect partial failures.

Track whether any file failed with a hasErrors flag and return a
descriptive error after all files have been attempted.

Update TestContinueOnError to reflect the corrected behaviour: the
test now asserts that an error IS returned (partial failure is
surfaced) while still verifying that all files in the batch are
attempted (not just the first one).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-13 02:11:18 +01:00
+									if hasErrors {
 										return fmt.Errorf("one or more files failed to process")
 									}
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+									return nil
 								}
 								// ProcessFile processes a single markdown file and publishes it to Confluence.
 								// Returns nil for the page info when compile-only or dry-run mode is active.
 								func ProcessFile(file string, api *confluence.API, config Config) (*confluence.PageInfo, error) {
 									markdown, err := os.ReadFile(file)
 									if err != nil {
 										return nil, fmt.Errorf("unable to read file %q: %w", file, err)
 									}
 									markdown = bytes.ReplaceAll(markdown, []byte("\r\n"), []byte("\n"))
 									meta, markdown, err := metadata.ExtractMeta(
 										markdown,
 										config.Space,
 										config.TitleFromH1,
 										config.TitleFromFilename,
 										file,
 										config.Parents,
 										config.TitleAppendGeneratedHash,
 										config.ContentAppearance,
 									)
 									if err != nil {
 										return nil, fmt.Errorf("unable to extract metadata from file %q: %w", file, err)
 									}
 									if config.PageID != "" && meta != nil {
-												feat: replace logging with zerolog

											
										
										
											2026-03-28 09:55:58 +01:00
+										log.Warn().Msg(
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+											`specified file contains metadata, ` +
 												`but it will be ignored due specified command line URL`,
 										)
 										meta = nil
 									}
 									if config.PageID == "" && meta == nil {
 										return nil, fmt.Errorf(
 											"specified file doesn't contain metadata and URL is not specified " +
 												"via command line or doesn't contain pageId GET-parameter",
 										)
 									}
 									if meta != nil {
 										if meta.Space == "" {
 											return nil, fmt.Errorf(
 												"space is not set ('Space' header is not set and '--space' option is not set)",
 											)
 										}
 										if meta.Title == "" {
 											return nil, fmt.Errorf(
 												"page title is not set: use the 'Title' header, " +
 													"or the --title-from-h1 / --title-from-filename flags",
 											)
 										}
 									}
 									std, err := stdlib.New(api)
 									if err != nil {
 										return nil, fmt.Errorf("unable to retrieve standard library: %w", err)
 									}
 									templates := std.Templates
 									var recurse bool
 									for {
 										templates, markdown, recurse, err = includes.ProcessIncludes(
 											filepath.Dir(file),
 											config.IncludePath,
 											markdown,
 											templates,
 										)
 										if err != nil {
 											return nil, fmt.Errorf("unable to process includes: %w", err)
 										}
 										if !recurse {
 											break
 										}
 									}
 									macros, markdown, err := macro.ExtractMacros(
 										filepath.Dir(file),
 										config.IncludePath,
 										markdown,
 										templates,
 									)
 									if err != nil {
 										return nil, fmt.Errorf("unable to extract macros: %w", err)
 									}
 									for _, m := range macros {
 										markdown, err = m.Apply(markdown)
 										if err != nil {
 											return nil, fmt.Errorf("unable to apply macro: %w", err)
 										}
 									}
 									links, err := page.ResolveRelativeLinks(
 										api,
 										meta,
 										markdown,
 										filepath.Dir(file),
 										config.Space,
 										config.TitleFromH1,
 										config.TitleFromFilename,
 										config.Parents,
 										config.TitleAppendGeneratedHash,
 									)
 									if err != nil {
 										return nil, fmt.Errorf("unable to resolve relative links: %w", err)
 									}
 									markdown = page.SubstituteLinks(markdown, links)
 									if config.DryRun {
-												fix: handle nil meta in dry-run mode when PageID is set

page.ResolvePage requires non-nil metadata and would error immediately
when called with meta == nil (e.g. when --page-id is used and the file
has no metadata header, or when metadata is intentionally suppressed).

Guard the call: when meta != nil, use ResolvePage as before; when meta
is nil but PageID is provided, validate that the page exists via
api.GetPageByID instead; when neither is set, the earlier
mandatory-field check already returns an error, so no further action
is needed.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 23:05:55 +01:00
+										if meta != nil {
 											if _, _, err := page.ResolvePage(true, api, meta); err != nil {
 												return nil, fmt.Errorf("unable to resolve page location: %w", err)
 											}
 										} else if config.PageID != "" {
 											if _, err := api.GetPageByID(config.PageID); err != nil {
 												return nil, fmt.Errorf("unable to resolve page by ID: %w", err)
 											}
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+										}
 									}
 									if config.CompileOnly || config.DryRun {
 										if config.DropH1 {
-												feat: replace logging with zerolog

											
										
										
											2026-03-28 09:55:58 +01:00
+											log.Info().Msg("the leading H1 heading will be excluded from the Confluence output")
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+										}
 										imageAlign, err := getImageAlign(config.ImageAlign, meta)
 										if err != nil {
 											return nil, fmt.Errorf("unable to determine image-align: %w", err)
 										}
 										cfg := types.MarkConfig{
 											MermaidScale:  config.MermaidScale,
 											D2Scale:       config.D2Scale,
 											DropFirstH1:   config.DropH1,
 											StripNewlines: config.StripLinebreaks,
 											Features:      config.Features,
 											ImageAlign:    imageAlign,
 										}
-												fix: return error instead of panic from CompileMarkdown

Markdown conversion failures called panic(err), crashing the process
rather than allowing graceful error handling. Change the return type
to (string, []attachment.Attachment, error) and propagate the error.
Update all callers (mark.go, markdown_test.go) accordingly.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-13 01:13:51 +01:00
+										html, _, err := markmd.CompileMarkdown(markdown, std, file, cfg)
 										if err != nil {
 											return nil, fmt.Errorf("unable to compile markdown: %w", err)
 										}
-												fix: check error return from fmt.Fprintln

errcheck lint requires all error return values to be handled.
Propagate write errors from both Fprintln call sites.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 23:07:13 +01:00
+										if _, err := fmt.Fprintln(config.output(), html); err != nil {
 											return nil, err
 										}
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+										return nil, nil
 									}
 									var target *confluence.PageInfo
-												Feature/robust comment preservation (#768)

This is based on guoweis-work's PR https://github.com/kovetskiy/mark/pull/145

* feat(confluence): add support for fetching page body and inline comments

* feat(cmd): add --preserve-comments flag to preserve inline comments

* feat(mark): implement context-aware inline comment preservation

* test(mark): add tests for context-aware MergeComments logic

* fix: remove empty else branch in MergeComments to fix SA9003

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* perf: compile markerRegex once as package-level variable

Avoids recompiling the inline comment marker regex on every call to
MergeComments, which matters for pages with many comment markers.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: guard against nil comments pointer in MergeComments

Prevents a panic when GetInlineComments returns nil (e.g. on pages
where the inline comments feature is not enabled).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* test: add edge-case tests for MergeComments; fix overlapping replacement

Four new test cases:
- SelectionMissing: comment dropped gracefully when text is gone from new body
- OverlappingSelections: overlapping comments no longer corrupt the body;
  the later match (by position) wins and the earlier overlapping one is dropped
- NilComments: nil pointer returns new body unchanged
- HTMLEntities: &lt;, &gt;, &#39; selections match correctly

Also fixes the overlapping replacement bug: apply back-to-front and skip any
replacement whose end exceeds the start of an already-applied one.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: escape ref attribute value in inline comment marker XML

Use html.EscapeString on r.ref before interpolating it into the
ac:ref attribute to prevent malformed XML if the value ever contains
quotes or other special characters.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: use first occurrence when no context is available in MergeComments

Without context the old code left distance=0 for every match and
updated bestStart on each iteration, so the final result depended on
whichever occurrence was visited last (non-deterministic with respect
to the search order).

Restructure the loop to break immediately on the first match when
hasCtx is false, making the behaviour explicit and deterministic.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: log warning when overlapping inline comment marker is dropped

Previously the overlap was silently skipped. Now a zerolog Warn message
is emitted with the ref, the conflicting byte offsets, and the ref of
the already-placed marker, so users can see which comment was lost
rather than silently getting incomplete output.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: warn when inline comments are silently dropped in MergeComments

Three cases now emit a zerolog Warn instead of silently discarding:

1. Comment location != "inline": logs ref and actual location.
2. Selected text not found in new body: logs ref and selection text.
3. Overlapping replacement (existing): adds selection text to the
   already-present overlap warning for easier diagnosis.

Also adds a selection field to the replacement struct so the overlap
warning can report the dropped text.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: update markerRegex to match markers with nested tags

Replace ([^<]*) with (?s)(.*?) so the pattern:
- Matches marker content that contains nested inline tags (e.g. <strong>)
- Matches across newlines ((?s) / DOTALL mode)

The old character class [^<]* stopped at the first < inside the
marker body, causing the context-extraction step to miss any comment
whose original selection spanned formatted text.

Add TestMergeComments_NestedTags to cover this path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: guard against empty OriginalSelection in MergeComments

strings.Index(s, "") always returns 0, so an empty escapedSelection
would spin the search loop indefinitely (or panic when currentPos
advances past len(newBody)).

Skip comments with an empty selection early, emit a Warn log, and
add TestMergeComments_EmptySelection to cover the path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: paginate GetInlineComments to avoid silently truncating results

The Confluence child/comment endpoint is paginated. The previous
single-request implementation silently dropped any comments beyond
the server's default page size.

Changes:
- Add Links (context, next) to InlineComments struct so the _links
  field from each page response is decoded.
- Rewrite GetInlineComments to loop with limit/start parameters
  (pageSize=100), accumulating all results, following the same pattern
  used by GetAttachments and label fetching.
- Add TestMergeComments_DuplicateMarkerRef to cover the deduplication
  guard added in the previous commit.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix UTF-8 safety, API compat, log verbosity

- levenshteinDistance: convert to []rune before empty-string checks so
  rune counts (not byte counts) are returned for strings with multi-byte
  characters

- Add contextBefore/contextAfter helpers that use utf8.RuneStart to
  avoid slicing in the middle of a multi-byte UTF-8 sequence when
  extracting 100-char context windows from oldBody and newBody

- Add truncateSelection helper (50 runes + ellipsis) and apply it in all
  Warn log messages that include the selected text, preventing large or
  sensitive page content from appearing in logs

- Downgrade non-inline comment log from Warn to Debug with message
  'comment ignored during inline marker merge: not an inline comment';
  page-level comments are not inline markers and are not 'lost'

- Restore original one-argument GetPageByID (expand='ancestors,version')
  and add GetPageByIDExpanded for the one caller that needs a custom
  expand value, preserving backward compatibility for API consumers

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Address new PR review comments

- Remove custom min() function: shadows the Go 1.21+ built-in min for
  the entire package; the built-in handles the 3-arg call in
  levenshteinDistance identically

- Validate rune boundaries on strings.Index candidates: skip any match
  where start or end falls in the middle of a multi-byte UTF-8 rune
  to prevent corrupt UTF-8 output

- Defer preserve-comments API calls until after shouldUpdatePage is
  determined: avoids unnecessary GetPageByIDExpanded + GetInlineComments
  round-trips on no-op --changes-only runs

- Capitalize Usage string for --preserve-comments flag (util/flags.go)
  and matching README.md entry to match sentence case of surrounding flags

- Run gofmt on util/cli.go to fix struct literal field alignment

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* docs: document --preserve-comments feature in README

Add a dedicated 'Preserving Inline Comments' section under Tricks with:
- Usage examples (CLI flag and env var)
- Step-by-step explanation of the Levenshtein-based relocation algorithm
- Limitations (deleted text, overlapping selections, new pages,
  changes-only interaction)

Also add a cross-reference NOTE near the --preserve-comments flag entry
in the Usage section.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* docs: fix markdownlint errors in README

- Change unordered list markers from dashes to asterisks (MD004)
- Remove extra blank line before Issues section (MD012)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Extract named types for InlineComments; optimize Levenshtein search

- Introduce InlineCommentProperties, InlineCommentExtensions, and
  InlineCommentResult named types in confluence/api.go, replacing the
  anonymous nested struct in InlineComments.Results. Callers and tests
  can now construct/inspect comment objects without repeating the JSON
  shape.

- Simplify makeComments helper in mark_test.go to use the new named
  types directly, eliminating the verbose anonymous struct literal.

- Add two Levenshtein candidate-search optimisations in MergeComments:
  * Exact-context fast path: if both the before and after windows match
    exactly, take that occurrence immediately without computing distance.
  * Lower-bound pruning: skip the full O(m*n) Levenshtein computation
    for a candidate when the absolute difference in window lengths alone
    already meets or exceeds the current best distance.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Use stable sort with ref tie-breaker; fix README overlap description

- Replace slices.SortFunc with slices.SortStableFunc for the
  replacements slice, adding ref as a lexicographic tie-breaker when
  two markers resolve to the same start offset. This makes overlap
  resolution fully deterministic across runs.

- Correct the README limitation note: the *earlier* overlapping
  match (lower byte offset) is what gets dropped; the later one
  (higher byte offset, applied first in the back-to-front pass) is
  kept. The previous wording said 'the second one is dropped' which
  was ambiguous and inaccurate.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix rune-based lower-bound pruning; clarify test comment

- Use utf8.RuneCountInString instead of len() for the Levenshtein
  lower-bound pruning computation. The levenshteinDistance function
  operates on rune slices, so byte-length differences can exceed the
  true rune-length difference for multibyte UTF-8 content, causing
  valid candidates to be incorrectly skipped.

- Update TestMergeComments_SelectionMissing comment to say the comment
  is 'dropped with a warning' rather than 'silently dropped', matching
  the actual behavior.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Add missing unit tests for helpers and MergeComments scenarios

Helper function tests:
- TestTruncateSelection: short/exact/long strings and multibyte runes
- TestLevenshteinDistance: empty strings, identical, insertions,
  deletions, substitutions, 'kitten/sitting', and a multibyte UTF-8
  case to exercise rune-based counting
- TestContextBefore / TestContextAfter: basic windowing, window larger
  than string, and a case where the raw byte offset lands mid-rune (é)
  to verify the rune-boundary correction logic

MergeComments scenario tests:
- TestMergeComments_MultipleComments: two non-overlapping comments both
  correctly applied via back-to-front replacement
- TestMergeComments_EmptyResults: non-nil InlineComments with zero
  results returns body unchanged
- TestMergeComments_NonInlineLocation: page-level comments (location
  != 'inline') are skipped; body unchanged
- TestMergeComments_NoContext: when a ref has no marker in oldBody the
  first occurrence of the selection in newBody is used
- TestMergeComments_UTF8: multibyte (Japanese) characters in both body
  and selection are handled correctly

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix three correctness issues in MergeComments

- Fix html import shadowing: alias the 'html' import as 'stdhtml' to
  avoid shadowing by the local 'html' variable used throughout
  ProcessFile. Both callers updated: stdhtml.EscapeString for the
  ref attribute, htmlEscapeText for the selection search.

- Fix selection search with quotes/apostrophes: replace
  html.EscapeString for the selection with a new htmlEscapeText helper
  that only escapes &, <, > — not ' or ". Confluence storage HTML
  often leaves quotes and apostrophes unescaped in text nodes, so
  fully-escaped selections would fail to match and inline comments
  would be silently dropped. Add TestMergeComments_SelectionWithQuotes.

- Fix duplicate-ref warnings: move seenRefs[ref]=true to immediately
  after the duplicate-check, before the search loop. Previously seenRefs
  was only set on a successful match, so multiple results for the same
  MarkerRef with no match in the new body would each emit a 'dropped'
  warning. Add TestMergeComments_DuplicateMarkerRefDropped.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Optimize levenshteinDistance to use two rolling rows instead of full matrix

Reduces memory allocation from O(m×n) to O(n) by keeping only the
previous and current rows. Also swaps r1/r2 so the shorter string is
used for column width, minimizing row allocation size.

This matters in MergeComments where levenshteinDistance is called for
every candidate match of every comment's selection in newBody — on
pages with many comments or short/common selections the number of
calls can be high.

Addresses thread [40] from PR review.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix test description and README algorithm doc

mark_test.go (thread [43]):
- TestMergeComments_HTMLEntities: the description incorrectly claimed
  &#39; (apostrophe) was tested; the selection '<world>' contains no
  apostrophe. Updated comment to accurately describe what is covered
  (&lt;/&gt; entity matching) and note the &#39; limitation.
- Add TestMergeComments_ApostropheSelection: verifies a selection with
  a literal apostrophe is found when the new body also has a literal
  apostrophe (the common case from mark's renderer). This exercises
  the htmlEscapeText path which intentionally does not encode ' or ".

README.md (thread [42]):
- Step 2 of the algorithm description said context was recorded
  'immediately before and after the commented selection' which is
  ambiguous. Clarified that context windows are taken around the
  <ac:inline-comment-marker> tag boundaries in the old body (not
  around the raw selection text), so the context is stable even when
  the marker wraps additional inline markup such as <strong>.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Unexport mergeComments and cap candidate evaluation

Thread [44]: MergeComments was exported but is internal-only — only
called within the mark package and tested from the same package.
Unexport it to mergeComments to avoid expanding the public API surface
unnecessarily. Add a Go doc comment describing the function contract,
HTML expectations, and the candidate cap.

Thread [45]: The candidate-scoring loop had no upper bound. For short
or common selections (e.g. 'a', 'the') on large pages the loop could
invoke levenshteinDistance thousands of times, each allocating rune
and int slices. Add a maxCandidates=100 constant and break once that
many on-rune-boundary occurrences have been evaluated. The exact-context
fast-path and lower-bound pruning already skip many candidates before
Levenshtein is called, so in practice the cap is only reached for very
common selections where the 100th candidate is unlikely to be
meaningfully better than an earlier one anyway.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* test: fix HTMLEntities description and add ApostropheEncoded limitation test

Thread #43: TestMergeComments_HTMLEntities had a misleading note claiming it
covered the &#39; apostrophe case, but the selection under test ('<world>') did
not include an apostrophe. Remove that note and add a dedicated
TestMergeComments_ApostropheEncoded test that explicitly documents the known
limitation: when a Confluence body stores an apostrophe as the numeric entity
&#39;, mergeComments cannot locate the selection (htmlEscapeText does not
encode ' to &#39;), so the comment is dropped with a warning.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix CDATA selection fallback and extract contextWindowBytes constant

Thread #46: mergeComments only searched for htmlEscapeText(selection) and
would fail for selections inside CDATA-backed macro bodies (e.g. ac:code),
where < and > are stored as raw characters rather than HTML entities. Restructure
the search loop to build a searchForms slice: the escaped form is tried first
(covers normal XML text nodes), and the raw unescaped form is appended as a
fallback when they differ. A stopSearch flag exits early on an exact context
match or when maxCandidates is reached, preserving the same performance
guarantees as before. Add TestMergeComments_CDATASelection to cover this path.

Thread #47: The context-window size 100 was repeated in four places across
mergeComments (two in the context-extraction loop and two in the scoring loop).
Extract it to const contextWindowBytes = 100 so it is easy to tune and stays
consistent everywhere.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-04-08 15:44:21 +02:00
+									var pageCreated bool
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
 									if meta != nil {
 										parent, pg, err := page.ResolvePage(false, api, meta)
 										if err != nil {
-												refactor: replace karma-go with standard error handling

											
										
										
											2026-03-28 10:16:29 +01:00
+											return nil, fmt.Errorf("error resolving page %q: %w", meta.Title, err)
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+										}
 										if pg == nil {
 											pg, err = api.CreatePage(meta.Space, meta.Type, parent, meta.Title, ``)
 											if err != nil {
 												return nil, fmt.Errorf("can't create %s %q: %w", meta.Type, meta.Title, err)
 											}
 											// A delay between the create and update call helps mitigate a 409
 											// conflict that can occur when attempting to update a page just
 											// after it was created. See issues/139.
 											time.Sleep(1 * time.Second)
-												Feature/robust comment preservation (#768)

This is based on guoweis-work's PR https://github.com/kovetskiy/mark/pull/145

* feat(confluence): add support for fetching page body and inline comments

* feat(cmd): add --preserve-comments flag to preserve inline comments

* feat(mark): implement context-aware inline comment preservation

* test(mark): add tests for context-aware MergeComments logic

* fix: remove empty else branch in MergeComments to fix SA9003

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* perf: compile markerRegex once as package-level variable

Avoids recompiling the inline comment marker regex on every call to
MergeComments, which matters for pages with many comment markers.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: guard against nil comments pointer in MergeComments

Prevents a panic when GetInlineComments returns nil (e.g. on pages
where the inline comments feature is not enabled).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* test: add edge-case tests for MergeComments; fix overlapping replacement

Four new test cases:
- SelectionMissing: comment dropped gracefully when text is gone from new body
- OverlappingSelections: overlapping comments no longer corrupt the body;
  the later match (by position) wins and the earlier overlapping one is dropped
- NilComments: nil pointer returns new body unchanged
- HTMLEntities: &lt;, &gt;, &#39; selections match correctly

Also fixes the overlapping replacement bug: apply back-to-front and skip any
replacement whose end exceeds the start of an already-applied one.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: escape ref attribute value in inline comment marker XML

Use html.EscapeString on r.ref before interpolating it into the
ac:ref attribute to prevent malformed XML if the value ever contains
quotes or other special characters.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: use first occurrence when no context is available in MergeComments

Without context the old code left distance=0 for every match and
updated bestStart on each iteration, so the final result depended on
whichever occurrence was visited last (non-deterministic with respect
to the search order).

Restructure the loop to break immediately on the first match when
hasCtx is false, making the behaviour explicit and deterministic.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: log warning when overlapping inline comment marker is dropped

Previously the overlap was silently skipped. Now a zerolog Warn message
is emitted with the ref, the conflicting byte offsets, and the ref of
the already-placed marker, so users can see which comment was lost
rather than silently getting incomplete output.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: warn when inline comments are silently dropped in MergeComments

Three cases now emit a zerolog Warn instead of silently discarding:

1. Comment location != "inline": logs ref and actual location.
2. Selected text not found in new body: logs ref and selection text.
3. Overlapping replacement (existing): adds selection text to the
   already-present overlap warning for easier diagnosis.

Also adds a selection field to the replacement struct so the overlap
warning can report the dropped text.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: update markerRegex to match markers with nested tags

Replace ([^<]*) with (?s)(.*?) so the pattern:
- Matches marker content that contains nested inline tags (e.g. <strong>)
- Matches across newlines ((?s) / DOTALL mode)

The old character class [^<]* stopped at the first < inside the
marker body, causing the context-extraction step to miss any comment
whose original selection spanned formatted text.

Add TestMergeComments_NestedTags to cover this path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: guard against empty OriginalSelection in MergeComments

strings.Index(s, "") always returns 0, so an empty escapedSelection
would spin the search loop indefinitely (or panic when currentPos
advances past len(newBody)).

Skip comments with an empty selection early, emit a Warn log, and
add TestMergeComments_EmptySelection to cover the path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: paginate GetInlineComments to avoid silently truncating results

The Confluence child/comment endpoint is paginated. The previous
single-request implementation silently dropped any comments beyond
the server's default page size.

Changes:
- Add Links (context, next) to InlineComments struct so the _links
  field from each page response is decoded.
- Rewrite GetInlineComments to loop with limit/start parameters
  (pageSize=100), accumulating all results, following the same pattern
  used by GetAttachments and label fetching.
- Add TestMergeComments_DuplicateMarkerRef to cover the deduplication
  guard added in the previous commit.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix UTF-8 safety, API compat, log verbosity

- levenshteinDistance: convert to []rune before empty-string checks so
  rune counts (not byte counts) are returned for strings with multi-byte
  characters

- Add contextBefore/contextAfter helpers that use utf8.RuneStart to
  avoid slicing in the middle of a multi-byte UTF-8 sequence when
  extracting 100-char context windows from oldBody and newBody

- Add truncateSelection helper (50 runes + ellipsis) and apply it in all
  Warn log messages that include the selected text, preventing large or
  sensitive page content from appearing in logs

- Downgrade non-inline comment log from Warn to Debug with message
  'comment ignored during inline marker merge: not an inline comment';
  page-level comments are not inline markers and are not 'lost'

- Restore original one-argument GetPageByID (expand='ancestors,version')
  and add GetPageByIDExpanded for the one caller that needs a custom
  expand value, preserving backward compatibility for API consumers

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Address new PR review comments

- Remove custom min() function: shadows the Go 1.21+ built-in min for
  the entire package; the built-in handles the 3-arg call in
  levenshteinDistance identically

- Validate rune boundaries on strings.Index candidates: skip any match
  where start or end falls in the middle of a multi-byte UTF-8 rune
  to prevent corrupt UTF-8 output

- Defer preserve-comments API calls until after shouldUpdatePage is
  determined: avoids unnecessary GetPageByIDExpanded + GetInlineComments
  round-trips on no-op --changes-only runs

- Capitalize Usage string for --preserve-comments flag (util/flags.go)
  and matching README.md entry to match sentence case of surrounding flags

- Run gofmt on util/cli.go to fix struct literal field alignment

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* docs: document --preserve-comments feature in README

Add a dedicated 'Preserving Inline Comments' section under Tricks with:
- Usage examples (CLI flag and env var)
- Step-by-step explanation of the Levenshtein-based relocation algorithm
- Limitations (deleted text, overlapping selections, new pages,
  changes-only interaction)

Also add a cross-reference NOTE near the --preserve-comments flag entry
in the Usage section.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* docs: fix markdownlint errors in README

- Change unordered list markers from dashes to asterisks (MD004)
- Remove extra blank line before Issues section (MD012)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Extract named types for InlineComments; optimize Levenshtein search

- Introduce InlineCommentProperties, InlineCommentExtensions, and
  InlineCommentResult named types in confluence/api.go, replacing the
  anonymous nested struct in InlineComments.Results. Callers and tests
  can now construct/inspect comment objects without repeating the JSON
  shape.

- Simplify makeComments helper in mark_test.go to use the new named
  types directly, eliminating the verbose anonymous struct literal.

- Add two Levenshtein candidate-search optimisations in MergeComments:
  * Exact-context fast path: if both the before and after windows match
    exactly, take that occurrence immediately without computing distance.
  * Lower-bound pruning: skip the full O(m*n) Levenshtein computation
    for a candidate when the absolute difference in window lengths alone
    already meets or exceeds the current best distance.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Use stable sort with ref tie-breaker; fix README overlap description

- Replace slices.SortFunc with slices.SortStableFunc for the
  replacements slice, adding ref as a lexicographic tie-breaker when
  two markers resolve to the same start offset. This makes overlap
  resolution fully deterministic across runs.

- Correct the README limitation note: the *earlier* overlapping
  match (lower byte offset) is what gets dropped; the later one
  (higher byte offset, applied first in the back-to-front pass) is
  kept. The previous wording said 'the second one is dropped' which
  was ambiguous and inaccurate.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix rune-based lower-bound pruning; clarify test comment

- Use utf8.RuneCountInString instead of len() for the Levenshtein
  lower-bound pruning computation. The levenshteinDistance function
  operates on rune slices, so byte-length differences can exceed the
  true rune-length difference for multibyte UTF-8 content, causing
  valid candidates to be incorrectly skipped.

- Update TestMergeComments_SelectionMissing comment to say the comment
  is 'dropped with a warning' rather than 'silently dropped', matching
  the actual behavior.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Add missing unit tests for helpers and MergeComments scenarios

Helper function tests:
- TestTruncateSelection: short/exact/long strings and multibyte runes
- TestLevenshteinDistance: empty strings, identical, insertions,
  deletions, substitutions, 'kitten/sitting', and a multibyte UTF-8
  case to exercise rune-based counting
- TestContextBefore / TestContextAfter: basic windowing, window larger
  than string, and a case where the raw byte offset lands mid-rune (é)
  to verify the rune-boundary correction logic

MergeComments scenario tests:
- TestMergeComments_MultipleComments: two non-overlapping comments both
  correctly applied via back-to-front replacement
- TestMergeComments_EmptyResults: non-nil InlineComments with zero
  results returns body unchanged
- TestMergeComments_NonInlineLocation: page-level comments (location
  != 'inline') are skipped; body unchanged
- TestMergeComments_NoContext: when a ref has no marker in oldBody the
  first occurrence of the selection in newBody is used
- TestMergeComments_UTF8: multibyte (Japanese) characters in both body
  and selection are handled correctly

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix three correctness issues in MergeComments

- Fix html import shadowing: alias the 'html' import as 'stdhtml' to
  avoid shadowing by the local 'html' variable used throughout
  ProcessFile. Both callers updated: stdhtml.EscapeString for the
  ref attribute, htmlEscapeText for the selection search.

- Fix selection search with quotes/apostrophes: replace
  html.EscapeString for the selection with a new htmlEscapeText helper
  that only escapes &, <, > — not ' or ". Confluence storage HTML
  often leaves quotes and apostrophes unescaped in text nodes, so
  fully-escaped selections would fail to match and inline comments
  would be silently dropped. Add TestMergeComments_SelectionWithQuotes.

- Fix duplicate-ref warnings: move seenRefs[ref]=true to immediately
  after the duplicate-check, before the search loop. Previously seenRefs
  was only set on a successful match, so multiple results for the same
  MarkerRef with no match in the new body would each emit a 'dropped'
  warning. Add TestMergeComments_DuplicateMarkerRefDropped.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Optimize levenshteinDistance to use two rolling rows instead of full matrix

Reduces memory allocation from O(m×n) to O(n) by keeping only the
previous and current rows. Also swaps r1/r2 so the shorter string is
used for column width, minimizing row allocation size.

This matters in MergeComments where levenshteinDistance is called for
every candidate match of every comment's selection in newBody — on
pages with many comments or short/common selections the number of
calls can be high.

Addresses thread [40] from PR review.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix test description and README algorithm doc

mark_test.go (thread [43]):
- TestMergeComments_HTMLEntities: the description incorrectly claimed
  &#39; (apostrophe) was tested; the selection '<world>' contains no
  apostrophe. Updated comment to accurately describe what is covered
  (&lt;/&gt; entity matching) and note the &#39; limitation.
- Add TestMergeComments_ApostropheSelection: verifies a selection with
  a literal apostrophe is found when the new body also has a literal
  apostrophe (the common case from mark's renderer). This exercises
  the htmlEscapeText path which intentionally does not encode ' or ".

README.md (thread [42]):
- Step 2 of the algorithm description said context was recorded
  'immediately before and after the commented selection' which is
  ambiguous. Clarified that context windows are taken around the
  <ac:inline-comment-marker> tag boundaries in the old body (not
  around the raw selection text), so the context is stable even when
  the marker wraps additional inline markup such as <strong>.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Unexport mergeComments and cap candidate evaluation

Thread [44]: MergeComments was exported but is internal-only — only
called within the mark package and tested from the same package.
Unexport it to mergeComments to avoid expanding the public API surface
unnecessarily. Add a Go doc comment describing the function contract,
HTML expectations, and the candidate cap.

Thread [45]: The candidate-scoring loop had no upper bound. For short
or common selections (e.g. 'a', 'the') on large pages the loop could
invoke levenshteinDistance thousands of times, each allocating rune
and int slices. Add a maxCandidates=100 constant and break once that
many on-rune-boundary occurrences have been evaluated. The exact-context
fast-path and lower-bound pruning already skip many candidates before
Levenshtein is called, so in practice the cap is only reached for very
common selections where the 100th candidate is unlikely to be
meaningfully better than an earlier one anyway.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* test: fix HTMLEntities description and add ApostropheEncoded limitation test

Thread #43: TestMergeComments_HTMLEntities had a misleading note claiming it
covered the &#39; apostrophe case, but the selection under test ('<world>') did
not include an apostrophe. Remove that note and add a dedicated
TestMergeComments_ApostropheEncoded test that explicitly documents the known
limitation: when a Confluence body stores an apostrophe as the numeric entity
&#39;, mergeComments cannot locate the selection (htmlEscapeText does not
encode ' to &#39;), so the comment is dropped with a warning.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix CDATA selection fallback and extract contextWindowBytes constant

Thread #46: mergeComments only searched for htmlEscapeText(selection) and
would fail for selections inside CDATA-backed macro bodies (e.g. ac:code),
where < and > are stored as raw characters rather than HTML entities. Restructure
the search loop to build a searchForms slice: the escaped form is tried first
(covers normal XML text nodes), and the raw unescaped form is appended as a
fallback when they differ. A stopSearch flag exits early on an exact context
match or when maxCandidates is reached, preserving the same performance
guarantees as before. Add TestMergeComments_CDATASelection to cover this path.

Thread #47: The context-window size 100 was repeated in four places across
mergeComments (two in the context-extraction loop and two in the scoring loop).
Extract it to const contextWindowBytes = 100 so it is easy to tune and stays
consistent everywhere.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-04-08 15:44:21 +02:00
+											pageCreated = true
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+										}
 										target = pg
 									} else {
 										pg, err := api.GetPageByID(config.PageID)
 										if err != nil {
 											return nil, fmt.Errorf("unable to retrieve page by id: %w", err)
 										}
-												fix: return error from Run() when ContinueOnError files fail

When --continue-on-error was set and one or more files failed to
process, Run() logged each failure but returned nil, making it
impossible for callers or CI systems to detect partial failures.

Track whether any file failed with a hasErrors flag and return a
descriptive error after all files have been attempted.

Update TestContinueOnError to reflect the corrected behaviour: the
test now asserts that an error IS returned (partial failure is
surfaced) while still verifying that all files in the batch are
attempted (not just the first one).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-13 02:11:18 +01:00
+										if pg == nil {
 											return nil, fmt.Errorf("page with id %q not found", config.PageID)
 										}
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+										target = pg
 									}
 									// Collect attachments declared via <!-- Attachment: --> directives.
 									var declaredAttachments []string
 									if meta != nil {
 										declaredAttachments = meta.Attachments
 									}
 									localAttachments, err := attachment.ResolveLocalAttachments(
 										vfs.LocalOS,
 										filepath.Dir(file),
 										declaredAttachments,
 									)
 									if err != nil {
 										return nil, fmt.Errorf("unable to locate attachments: %w", err)
 									}
 									attaches, err := attachment.ResolveAttachments(api, target, localAttachments)
 									if err != nil {
 										return nil, fmt.Errorf("unable to create/update attachments: %w", err)
 									}
 									markdown = attachment.CompileAttachmentLinks(markdown, attaches)
 									if config.DropH1 {
-												feat: replace logging with zerolog

											
										
										
											2026-03-28 09:55:58 +01:00
+										log.Info().Msg("the leading H1 heading will be excluded from the Confluence output")
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+									}
 									imageAlign, err := getImageAlign(config.ImageAlign, meta)
 									if err != nil {
 										return nil, fmt.Errorf("unable to determine image-align: %w", err)
 									}
 									cfg := types.MarkConfig{
 										MermaidScale:  config.MermaidScale,
 										D2Scale:       config.D2Scale,
 										DropFirstH1:   config.DropH1,
 										StripNewlines: config.StripLinebreaks,
 										Features:      config.Features,
 										ImageAlign:    imageAlign,
 									}
-												fix: return error instead of panic from CompileMarkdown

Markdown conversion failures called panic(err), crashing the process
rather than allowing graceful error handling. Change the return type
to (string, []attachment.Attachment, error) and propagate the error.
Update all callers (mark.go, markdown_test.go) accordingly.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-13 01:13:51 +01:00
+									html, inlineAttachments, err := markmd.CompileMarkdown(markdown, std, file, cfg)
 									if err != nil {
 										return nil, fmt.Errorf("unable to compile markdown: %w", err)
 									}
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
 									if _, err = attachment.ResolveAttachments(api, target, inlineAttachments); err != nil {
 										return nil, fmt.Errorf("unable to create/update attachments: %w", err)
 									}
 									var layout, sidebar string
 									var labels []string
 									var contentAppearance, emoji string
 									if meta != nil {
 										layout = meta.Layout
 										sidebar = meta.Sidebar
 										labels = meta.Labels
 										contentAppearance = meta.ContentAppearance
 										emoji = meta.Emoji
 									}
 									{
 										var buffer bytes.Buffer
 										err := std.Templates.ExecuteTemplate(
 											&buffer,
 											"ac:layout",
 											struct {
 												Layout  string
 												Sidebar string
 												Body    string
 											}{
 												Layout:  layout,
 												Sidebar: sidebar,
 												Body:    html,
 											},
 										)
 										if err != nil {
 											return nil, fmt.Errorf("unable to execute layout template: %w", err)
 										}
 										html = buffer.String()
 									}
 									var finalVersionMessage string
 									shouldUpdatePage := true
 									if config.ChangesOnly {
 										contentHash := sha1Hash(html)
-												feat: replace logging with zerolog

											
										
										
											2026-03-28 09:55:58 +01:00
+										log.Debug().Msgf("content hash: %s", contentHash)
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
 										re := regexp.MustCompile(`\[v([a-f0-9]{40})]$`)
 										if matches := re.FindStringSubmatch(target.Version.Message); len(matches) > 1 {
-												feat: replace logging with zerolog

											
										
										
											2026-03-28 09:55:58 +01:00
+											log.Debug().Msgf("previous content hash: %s", matches[1])
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+											if matches[1] == contentHash {
-												feat: replace logging with zerolog

											
										
										
											2026-03-28 09:55:58 +01:00
+												log.Info().Msgf("page %q is already up to date", target.Title)
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+												shouldUpdatePage = false
 											}
 										}
 										finalVersionMessage = fmt.Sprintf("%s [v%s]", config.VersionMessage, contentHash)
 									} else {
 										finalVersionMessage = config.VersionMessage
 									}
-												Feature/robust comment preservation (#768)

This is based on guoweis-work's PR https://github.com/kovetskiy/mark/pull/145

* feat(confluence): add support for fetching page body and inline comments

* feat(cmd): add --preserve-comments flag to preserve inline comments

* feat(mark): implement context-aware inline comment preservation

* test(mark): add tests for context-aware MergeComments logic

* fix: remove empty else branch in MergeComments to fix SA9003

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* perf: compile markerRegex once as package-level variable

Avoids recompiling the inline comment marker regex on every call to
MergeComments, which matters for pages with many comment markers.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: guard against nil comments pointer in MergeComments

Prevents a panic when GetInlineComments returns nil (e.g. on pages
where the inline comments feature is not enabled).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* test: add edge-case tests for MergeComments; fix overlapping replacement

Four new test cases:
- SelectionMissing: comment dropped gracefully when text is gone from new body
- OverlappingSelections: overlapping comments no longer corrupt the body;
  the later match (by position) wins and the earlier overlapping one is dropped
- NilComments: nil pointer returns new body unchanged
- HTMLEntities: &lt;, &gt;, &#39; selections match correctly

Also fixes the overlapping replacement bug: apply back-to-front and skip any
replacement whose end exceeds the start of an already-applied one.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: escape ref attribute value in inline comment marker XML

Use html.EscapeString on r.ref before interpolating it into the
ac:ref attribute to prevent malformed XML if the value ever contains
quotes or other special characters.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: use first occurrence when no context is available in MergeComments

Without context the old code left distance=0 for every match and
updated bestStart on each iteration, so the final result was simply
whichever occurrence was visited last (an implicit side effect of the
search order rather than a deliberate choice).

Restructure the loop to break immediately on the first match when
hasCtx is false, making the behaviour explicit and deterministic.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: log warning when overlapping inline comment marker is dropped

Previously the overlap was silently skipped. Now a zerolog Warn message
is emitted with the ref, the conflicting byte offsets, and the ref of
the already-placed marker, so users can see which comment was lost
rather than silently getting incomplete output.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: warn when inline comments are silently dropped in MergeComments

Three cases now emit a zerolog Warn instead of silently discarding:

1. Comment location != "inline": logs ref and actual location.
2. Selected text not found in new body: logs ref and selection text.
3. Overlapping replacement (existing): adds selection text to the
   already-present overlap warning for easier diagnosis.

Also adds a selection field to the replacement struct so the overlap
warning can report the dropped text.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: update markerRegex to match markers with nested tags

Replace ([^<]*) with (?s)(.*?) so the pattern:
- Matches marker content that contains nested inline tags (e.g. <strong>)
- Matches across newlines ((?s) / DOTALL mode)

The old character class [^<]* stopped at the first < inside the
marker body, causing the context-extraction step to miss any comment
whose original selection spanned formatted text.

Add TestMergeComments_NestedTags to cover this path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: guard against empty OriginalSelection in MergeComments

strings.Index(s, "") always returns 0, so an empty escapedSelection
would spin the search loop indefinitely (or panic when currentPos
advances past len(newBody)).

Skip comments with an empty selection early, emit a Warn log, and
add TestMergeComments_EmptySelection to cover the path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: paginate GetInlineComments to avoid silently truncating results

The Confluence child/comment endpoint is paginated. The previous
single-request implementation silently dropped any comments beyond
the server's default page size.

Changes:
- Add Links (context, next) to InlineComments struct so the _links
  field from each page response is decoded.
- Rewrite GetInlineComments to loop with limit/start parameters
  (pageSize=100), accumulating all results, following the same pattern
  used by GetAttachments and label fetching.
- Add TestMergeComments_DuplicateMarkerRef to cover the deduplication
  guard added in the previous commit.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix UTF-8 safety, API compat, log verbosity

- levenshteinDistance: convert to []rune before empty-string checks so
  rune counts (not byte counts) are returned for strings with multi-byte
  characters

- Add contextBefore/contextAfter helpers that use utf8.RuneStart to
  avoid slicing in the middle of a multi-byte UTF-8 sequence when
  extracting 100-char context windows from oldBody and newBody

- Add truncateSelection helper (50 runes + ellipsis) and apply it in all
  Warn log messages that include the selected text, preventing large or
  sensitive page content from appearing in logs

- Downgrade non-inline comment log from Warn to Debug with message
  'comment ignored during inline marker merge: not an inline comment';
  page-level comments are not inline markers and are not 'lost'

- Restore original one-argument GetPageByID (expand='ancestors,version')
  and add GetPageByIDExpanded for the one caller that needs a custom
  expand value, preserving backward compatibility for API consumers

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Address new PR review comments

- Remove custom min() function: shadows the Go 1.21+ built-in min for
  the entire package; the built-in handles the 3-arg call in
  levenshteinDistance identically

- Validate rune boundaries on strings.Index candidates: skip any match
  where start or end falls in the middle of a multi-byte UTF-8 rune
  to prevent corrupt UTF-8 output

- Defer preserve-comments API calls until after shouldUpdatePage is
  determined: avoids unnecessary GetPageByIDExpanded + GetInlineComments
  round-trips on no-op --changes-only runs

- Capitalize Usage string for --preserve-comments flag (util/flags.go)
  and matching README.md entry to match sentence case of surrounding flags

- Run gofmt on util/cli.go to fix struct literal field alignment

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* docs: document --preserve-comments feature in README

Add a dedicated 'Preserving Inline Comments' section under Tricks with:
- Usage examples (CLI flag and env var)
- Step-by-step explanation of the Levenshtein-based relocation algorithm
- Limitations (deleted text, overlapping selections, new pages,
  changes-only interaction)

Also add a cross-reference NOTE near the --preserve-comments flag entry
in the Usage section.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* docs: fix markdownlint errors in README

- Change unordered list markers from dashes to asterisks (MD004)
- Remove extra blank line before Issues section (MD012)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Extract named types for InlineComments; optimize Levenshtein search

- Introduce InlineCommentProperties, InlineCommentExtensions, and
  InlineCommentResult named types in confluence/api.go, replacing the
  anonymous nested struct in InlineComments.Results. Callers and tests
  can now construct/inspect comment objects without repeating the JSON
  shape.

- Simplify makeComments helper in mark_test.go to use the new named
  types directly, eliminating the verbose anonymous struct literal.

- Add two Levenshtein candidate-search optimisations in MergeComments:
  * Exact-context fast path: if both the before and after windows match
    exactly, take that occurrence immediately without computing distance.
  * Lower-bound pruning: skip the full O(m*n) Levenshtein computation
    for a candidate when the absolute difference in window lengths alone
    already meets or exceeds the current best distance.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Use stable sort with ref tie-breaker; fix README overlap description

- Replace slices.SortFunc with slices.SortStableFunc for the
  replacements slice, adding ref as a lexicographic tie-breaker when
  two markers resolve to the same start offset. This makes overlap
  resolution fully deterministic across runs.

- Correct the README limitation note: the *earlier* overlapping
  match (lower byte offset) is what gets dropped; the later one
  (higher byte offset, applied first in the back-to-front pass) is
  kept. The previous wording said 'the second one is dropped' which
  was ambiguous and inaccurate.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix rune-based lower-bound pruning; clarify test comment

- Use utf8.RuneCountInString instead of len() for the Levenshtein
  lower-bound pruning computation. The levenshteinDistance function
  operates on rune slices, so byte-length differences can exceed the
  true rune-length difference for multibyte UTF-8 content, causing
  valid candidates to be incorrectly skipped.

- Update TestMergeComments_SelectionMissing comment to say the comment
  is 'dropped with a warning' rather than 'silently dropped', matching
  the actual behavior.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Add missing unit tests for helpers and MergeComments scenarios

Helper function tests:
- TestTruncateSelection: short/exact/long strings and multibyte runes
- TestLevenshteinDistance: empty strings, identical, insertions,
  deletions, substitutions, 'kitten/sitting', and a multibyte UTF-8
  case to exercise rune-based counting
- TestContextBefore / TestContextAfter: basic windowing, window larger
  than string, and a case where the raw byte offset lands mid-rune (é)
  to verify the rune-boundary correction logic

MergeComments scenario tests:
- TestMergeComments_MultipleComments: two non-overlapping comments both
  correctly applied via back-to-front replacement
- TestMergeComments_EmptyResults: non-nil InlineComments with zero
  results returns body unchanged
- TestMergeComments_NonInlineLocation: page-level comments (location
  != 'inline') are skipped; body unchanged
- TestMergeComments_NoContext: when a ref has no marker in oldBody the
  first occurrence of the selection in newBody is used
- TestMergeComments_UTF8: multibyte (Japanese) characters in both body
  and selection are handled correctly

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix three correctness issues in MergeComments

- Fix html import shadowing: alias the 'html' import as 'stdhtml' to
  avoid shadowing by the local 'html' variable used throughout
  ProcessFile. Both callers updated: stdhtml.EscapeString for the
  ref attribute, htmlEscapeText for the selection search.

- Fix selection search with quotes/apostrophes: replace
  html.EscapeString for the selection with a new htmlEscapeText helper
  that only escapes &, <, > — not ' or ". Confluence storage HTML
  often leaves quotes and apostrophes unescaped in text nodes, so
  fully-escaped selections would fail to match and inline comments
  would be silently dropped. Add TestMergeComments_SelectionWithQuotes.

- Fix duplicate-ref warnings: move seenRefs[ref]=true to immediately
  after the duplicate-check, before the search loop. Previously seenRefs
  was only set on a successful match, so multiple results for the same
  MarkerRef with no match in the new body would each emit a 'dropped'
  warning. Add TestMergeComments_DuplicateMarkerRefDropped.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Optimize levenshteinDistance to use two rolling rows instead of full matrix

Reduces memory allocation from O(m×n) to O(n) by keeping only the
previous and current rows. Also swaps r1/r2 so the shorter string is
used for column width, minimizing row allocation size.

This matters in MergeComments where levenshteinDistance is called for
every candidate match of every comment's selection in newBody — on
pages with many comments or short/common selections the number of
calls can be high.

Addresses thread [40] from PR review.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix test description and README algorithm doc

mark_test.go (thread [43]):
- TestMergeComments_HTMLEntities: the description incorrectly claimed
  &#39; (apostrophe) was tested; the selection '<world>' contains no
  apostrophe. Updated comment to accurately describe what is covered
  (&lt;/&gt; entity matching) and note the &#39; limitation.
- Add TestMergeComments_ApostropheSelection: verifies a selection with
  a literal apostrophe is found when the new body also has a literal
  apostrophe (the common case from mark's renderer). This exercises
  the htmlEscapeText path which intentionally does not encode ' or ".

README.md (thread [42]):
- Step 2 of the algorithm description said context was recorded
  'immediately before and after the commented selection' which is
  ambiguous. Clarified that context windows are taken around the
  <ac:inline-comment-marker> tag boundaries in the old body (not
  around the raw selection text), so the context is stable even when
  the marker wraps additional inline markup such as <strong>.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Unexport mergeComments and cap candidate evaluation

Thread [44]: MergeComments was exported but is internal-only — only
called within the mark package and tested from the same package.
Unexport it to mergeComments to avoid expanding the public API surface
unnecessarily. Add a Go doc comment describing the function contract,
HTML expectations, and the candidate cap.

Thread [45]: The candidate-scoring loop had no upper bound. For short
or common selections (e.g. 'a', 'the') on large pages the loop could
invoke levenshteinDistance thousands of times, each allocating rune
and int slices. Add a maxCandidates=100 constant and break once that
many on-rune-boundary occurrences have been evaluated. The exact-context
fast-path and lower-bound pruning already skip many candidates before
Levenshtein is called, so in practice the cap is only reached for very
common selections where the 100th candidate is unlikely to be
meaningfully better than an earlier one anyway.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* test: fix HTMLEntities description and add ApostropheEncoded limitation test

Thread #43: TestMergeComments_HTMLEntities had a misleading note claiming it
covered the &#39; apostrophe case, but the selection under test ('<world>') did
not include an apostrophe. Remove that note and add a dedicated
TestMergeComments_ApostropheEncoded test that explicitly documents the known
limitation: when a Confluence body stores an apostrophe as the numeric entity
&#39;, mergeComments cannot locate the selection (htmlEscapeText does not
encode ' to &#39;), so the comment is dropped with a warning.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix CDATA selection fallback and extract contextWindowBytes constant

Thread #46: mergeComments only searched for htmlEscapeText(selection) and
would fail for selections inside CDATA-backed macro bodies (e.g. ac:code),
where < and > are stored as raw characters rather than HTML entities. Restructure
the search loop to build a searchForms slice: the escaped form is tried first
(covers normal XML text nodes), and the raw unescaped form is appended as a
fallback when they differ. A stopSearch flag exits early on an exact context
match or when maxCandidates is reached, preserving the same performance
guarantees as before. Add TestMergeComments_CDATASelection to cover this path.

Thread #47: The context-window size 100 was repeated in four places across
mergeComments (two in the context-extraction loop and two in the scoring loop).
Extract it to const contextWindowBytes = 100 so it is easy to tune and stays
consistent everywhere.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-04-08 15:44:21 +02:00
+									// Only fetch the old body and inline comments when we know the page will
 									// actually be updated. This avoids unnecessary API round-trips for no-op
 									// runs (e.g. when --changes-only determines the content is unchanged).
 									if shouldUpdatePage && config.PreserveComments && !pageCreated {
 										pg, err := api.GetPageByIDExpanded(target.ID, "ancestors,version,body.storage")
 										if err != nil {
 											return nil, fmt.Errorf("unable to retrieve page body for comments: %w", err)
 										}
 										target = pg
 										comments, err := api.GetInlineComments(target.ID)
 										if err != nil {
 											return nil, fmt.Errorf("unable to retrieve inline comments: %w", err)
 										}
 										html, err = mergeComments(html, target.Body.Storage.Value, comments)
 										if err != nil {
 											return nil, fmt.Errorf("unable to merge inline comments: %w", err)
 										}
 									}
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+									if shouldUpdatePage {
 										err = api.UpdatePage(
 											target,
 											html,
 											config.MinorEdit,
 											finalVersionMessage,
 											contentAppearance,
 											emoji,
 										)
 										if err != nil {
 											return nil, fmt.Errorf("unable to update page: %w", err)
 										}
 									}
-												fix: skip label sync when metadata is absent

When PageID mode is used (meta == nil), labels is nil and calling
updateLabels unconditionally treats that as an empty desired set,
silently removing all existing global labels from the page. Guard
the call so label syncing only runs when metadata is present.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 23:04:33 +01:00
+									if meta != nil {
 										if err := updateLabels(api, target, labels); err != nil {
 											return nil, err
 										}
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+									}
 									if config.EditLock {
-												feat: replace logging with zerolog

											
										
										
											2026-03-28 09:55:58 +01:00
+										log.Info().Msgf(
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
+											`edit locked on page %q by user %q to prevent manual edits`,
 											target.Title,
 											config.Username,
 										)
 										if err := api.RestrictPageUpdates(target, config.Username); err != nil {
 											return nil, fmt.Errorf("unable to restrict page updates: %w", err)
 										}
 									}
 									return target, nil
 								}
 								func updateLabels(api *confluence.API, target *confluence.PageInfo, metaLabels []string) error {
 									labelInfo, err := api.GetPageLabels(target, "global")
 									if err != nil {
 										return err
 									}
-												feat: replace logging with zerolog

											
										
										
											2026-03-28 09:55:58 +01:00
+									log.Debug().Msg("Page Labels:")
 									log.Debug().Interface("labels", labelInfo.Labels).Send()
 									log.Debug().Msg("Meta Labels:")
 									log.Debug().Interface("labels", metaLabels).Send()
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
 									delLabels := determineLabelsToRemove(labelInfo, metaLabels)
-												feat: replace logging with zerolog

											
										
										
											2026-03-28 09:55:58 +01:00
+									log.Debug().Msg("Del Labels:")
 									log.Debug().Interface("labels", delLabels).Send()
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
 									addLabels := determineLabelsToAdd(metaLabels, labelInfo)
-												feat: replace logging with zerolog

											
										
										
											2026-03-28 09:55:58 +01:00
+									log.Debug().Msg("Add Labels:")
 									log.Debug().Interface("labels", addLabels).Send()
-												Add root library package with Config, Run and ProcessFile

Expose the core mark functionality as an importable Go library.
Library users can now import github.com/kovetskiy/mark and call:

  err := mark.Run(mark.Config{
      BaseURL:  "https://confluence.example.com",
      Username: "user",
      Password: "token",
      Files:    "docs/**/*.md",
      Features: []string{"mermaid", "mention"},
  })

The new package provides:
- Config struct: all options decoupled from the CLI framework
- Run(config Config) error: process all files matching Config.Files
- ProcessFile(file, api, config): process a single markdown file

Also moves the CLI entry point to cmd/mark/main.go following standard
Go convention for projects that serve as both a library and a binary.

Fixes a pre-existing nil-pointer dereference on meta.Attachments,
meta.Layout and related fields when using --target-url with a pageId
(meta was nil in that code path).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-03-12 10:07:38 +01:00
 									if len(addLabels) > 0 {
 										if _, err = api.AddPageLabels(target, addLabels); err != nil {
 											return fmt.Errorf("error adding labels: %w", err)
 										}
 									}
 									for _, label := range delLabels {
 										if _, err = api.DeletePageLabel(target, label); err != nil {
 											return fmt.Errorf("error deleting label %q: %w", label, err)
 										}
 									}
 									return nil
 								}
 								// determineLabelsToRemove returns the names of the labels listed in
 								// labelInfo.Labels that have no case-insensitive match (strings.EqualFold)
 								// in metaLabels. The result keeps the order of labelInfo.Labels and is nil
 								// when nothing needs to be removed.
 								func determineLabelsToRemove(labelInfo *confluence.LabelInfo, metaLabels []string) []string {
 									var stale []string
 									for _, existing := range labelInfo.Labels {
 										// A label stays on the page as soon as any desired label matches it.
 										matchesExisting := func(wanted string) bool {
 											return strings.EqualFold(wanted, existing.Name)
 										}
 										if slices.IndexFunc(metaLabels, matchesExisting) >= 0 {
 											continue
 										}
 										stale = append(stale, existing.Name)
 									}
 									return stale
 								}
 								// determineLabelsToAdd returns the entries of metaLabels that have no
 								// case-insensitive match (strings.EqualFold) among labelInfo.Labels. The
 								// result keeps the order of metaLabels and is nil when nothing needs to be
 								// added.
 								func determineLabelsToAdd(metaLabels []string, labelInfo *confluence.LabelInfo) []string {
 									var missing []string
 									for _, wanted := range metaLabels {
 										alreadyPresent := slices.ContainsFunc(
 											labelInfo.Labels,
 											func(existing confluence.Label) bool {
 												return strings.EqualFold(existing.Name, wanted)
 											},
 										)
 										if alreadyPresent {
 											continue
 										}
 										missing = append(missing, wanted)
 									}
 									return missing
 								}
 								// getImageAlign resolves the effective image alignment.
 								//
 								// A non-empty meta.ImageAlign takes precedence over the align argument. An
 								// empty effective value yields ("", nil). Otherwise the value is trimmed of
 								// surrounding whitespace and lower-cased, and must then be one of "left",
 								// "center" or "right"; any other value produces an error that quotes the
 								// normalized value.
 								func getImageAlign(align string, meta *metadata.Meta) (string, error) {
 									if meta != nil && meta.ImageAlign != "" {
 										align = meta.ImageAlign
 									}
 									if align == "" {
 										return "", nil
 									}
 
 									normalized := strings.ToLower(strings.TrimSpace(align))
 									switch normalized {
 									case "left", "center", "right":
 										return normalized, nil
 									default:
 										return "", fmt.Errorf(
 											`unknown image-align %q, expected one of: left, center, right`,
 											normalized,
 										)
 									}
 								}
 								// sha1Hash returns the SHA-1 digest of input encoded as a lower-case
 								// hexadecimal string.
 								func sha1Hash(input string) string {
 									digest := sha1.Sum([]byte(input))
 									return hex.EncodeToString(digest[:])
 								}
-												Feature/robust comment preservation (#768)

This is based on guoweis-work PR https://github.com/kovetskiy/mark/pull/145

* feat(confluence): add support for fetching page body and inline comments

* feat(cmd): add --preserve-comments flag to preserve inline comments

* feat(mark): implement context-aware inline comment preservation

* test(mark): add tests for context-aware MergeComments logic

* fix: remove empty else branch in MergeComments to fix SA9003

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* perf: compile markerRegex once as package-level variable

Avoids recompiling the inline comment marker regex on every call to
MergeComments, which matters for pages with many comment markers.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: guard against nil comments pointer in MergeComments

Prevents a panic when GetInlineComments returns nil (e.g. on pages
where the inline comments feature is not enabled).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* test: add edge-case tests for MergeComments; fix overlapping replacement

Four new test cases:
- SelectionMissing: comment dropped gracefully when text is gone from new body
- OverlappingSelections: overlapping comments no longer corrupt the body;
  the later match (by position) wins and the earlier overlapping one is dropped
- NilComments: nil pointer returns new body unchanged
- HTMLEntities: &lt;, &gt;, &#39; selections match correctly

Also fixes the overlapping replacement bug: apply back-to-front and skip any
replacement whose end exceeds the start of an already-applied one.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: escape ref attribute value in inline comment marker XML

Use html.EscapeString on r.ref before interpolating it into the
ac:ref attribute to prevent malformed XML if the value ever contains
quotes or other special characters.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: use first occurrence when no context is available in MergeComments

Without context the old code left distance=0 for every match and
updated bestStart on each iteration, so the final result depended on
whichever occurrence was visited last (non-deterministic with respect
to the search order).

Restructure the loop to break immediately on the first match when
hasCtx is false, making the behaviour explicit and deterministic.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: log warning when overlapping inline comment marker is dropped

Previously the overlap was silently skipped. Now a zerolog Warn message
is emitted with the ref, the conflicting byte offsets, and the ref of
the already-placed marker, so users can see which comment was lost
rather than silently getting incomplete output.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: warn when inline comments are silently dropped in MergeComments

Three cases now emit a zerolog Warn instead of silently discarding:

1. Comment location != "inline": logs ref and actual location.
2. Selected text not found in new body: logs ref and selection text.
3. Overlapping replacement (existing): adds selection text to the
   already-present overlap warning for easier diagnosis.

Also adds a selection field to the replacement struct so the overlap
warning can report the dropped text.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: update markerRegex to match markers with nested tags

Replace ([^<]*) with (?s)(.*?) so the pattern:
- Matches marker content that contains nested inline tags (e.g. <strong>)
- Matches across newlines ((?s) / DOTALL mode)

The old character class [^<]* stopped at the first < inside the
marker body, causing the context-extraction step to miss any comment
whose original selection spanned formatted text.

Add TestMergeComments_NestedTags to cover this path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: guard against empty OriginalSelection in MergeComments

strings.Index(s, "") always returns 0, so an empty escapedSelection
would spin the search loop indefinitely (or panic when currentPos
advances past len(newBody)).

Skip comments with an empty selection early, emit a Warn log, and
add TestMergeComments_EmptySelection to cover the path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix: paginate GetInlineComments to avoid silently truncating results

The Confluence child/comment endpoint is paginated. The previous
single-request implementation silently dropped any comments beyond
the server's default page size.

Changes:
- Add Links (context, next) to InlineComments struct so the _links
  field from each page response is decoded.
- Rewrite GetInlineComments to loop with limit/start parameters
  (pageSize=100), accumulating all results, following the same pattern
  used by GetAttachments and label fetching.
- Add TestMergeComments_DuplicateMarkerRef to cover the deduplication
  guard added in the previous commit.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix UTF-8 safety, API compat, log verbosity

- levenshteinDistance: convert to []rune before empty-string checks so
  rune counts (not byte counts) are returned for strings with multi-byte
  characters

- Add contextBefore/contextAfter helpers that use utf8.RuneStart to
  avoid slicing in the middle of a multi-byte UTF-8 sequence when
  extracting 100-char context windows from oldBody and newBody

- Add truncateSelection helper (50 runes + ellipsis) and apply it in all
  Warn log messages that include the selected text, preventing large or
  sensitive page content from appearing in logs

- Downgrade non-inline comment log from Warn to Debug with message
  'comment ignored during inline marker merge: not an inline comment';
  page-level comments are not inline markers and are not 'lost'

- Restore original one-argument GetPageByID (expand='ancestors,version')
  and add GetPageByIDExpanded for the one caller that needs a custom
  expand value, preserving backward compatibility for API consumers

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Address new PR review comments

- Remove custom min() function: shadows the Go 1.21+ built-in min for
  the entire package; the built-in handles the 3-arg call in
  levenshteinDistance identically

- Validate rune boundaries on strings.Index candidates: skip any match
  where start or end falls in the middle of a multi-byte UTF-8 rune
  to prevent corrupt UTF-8 output

- Defer preserve-comments API calls until after shouldUpdatePage is
  determined: avoids unnecessary GetPageByIDExpanded + GetInlineComments
  round-trips on no-op --changes-only runs

- Capitalize Usage string for --preserve-comments flag (util/flags.go)
  and matching README.md entry to match sentence case of surrounding flags

- Run gofmt on util/cli.go to fix struct literal field alignment

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* docs: document --preserve-comments feature in README

Add a dedicated 'Preserving Inline Comments' section under Tricks with:
- Usage examples (CLI flag and env var)
- Step-by-step explanation of the Levenshtein-based relocation algorithm
- Limitations (deleted text, overlapping selections, new pages,
  changes-only interaction)

Also add a cross-reference NOTE near the --preserve-comments flag entry
in the Usage section.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* docs: fix markdownlint errors in README

- Change unordered list markers from dashes to asterisks (MD004)
- Remove extra blank line before Issues section (MD012)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Extract named types for InlineComments; optimize Levenshtein search

- Introduce InlineCommentProperties, InlineCommentExtensions, and
  InlineCommentResult named types in confluence/api.go, replacing the
  anonymous nested struct in InlineComments.Results. Callers and tests
  can now construct/inspect comment objects without repeating the JSON
  shape.

- Simplify makeComments helper in mark_test.go to use the new named
  types directly, eliminating the verbose anonymous struct literal.

- Add two Levenshtein candidate-search optimisations in MergeComments:
  * Exact-context fast path: if both the before and after windows match
    exactly, take that occurrence immediately without computing distance.
  * Lower-bound pruning: skip the full O(m*n) Levenshtein computation
    for a candidate when the absolute difference in window lengths alone
    already meets or exceeds the current best distance.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Use stable sort with ref tie-breaker; fix README overlap description

- Replace slices.SortFunc with slices.SortStableFunc for the
  replacements slice, adding ref as a lexicographic tie-breaker when
  two markers resolve to the same start offset. This makes overlap
  resolution fully deterministic across runs.

- Correct the README limitation note: the *earlier* overlapping
  match (lower byte offset) is what gets dropped; the later one
  (higher byte offset, applied first in the back-to-front pass) is
  kept. The previous wording said 'the second one is dropped' which
  was ambiguous and inaccurate.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix rune-based lower-bound pruning; clarify test comment

- Use utf8.RuneCountInString instead of len() for the Levenshtein
  lower-bound pruning computation. The levenshteinDistance function
  operates on rune slices, so byte-length differences can exceed the
  true rune-length difference for multibyte UTF-8 content, causing
  valid candidates to be incorrectly skipped.

- Update TestMergeComments_SelectionMissing comment to say the comment
  is 'dropped with a warning' rather than 'silently dropped', matching
  the actual behavior.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Add missing unit tests for helpers and MergeComments scenarios

Helper function tests:
- TestTruncateSelection: short/exact/long strings and multibyte runes
- TestLevenshteinDistance: empty strings, identical, insertions,
  deletions, substitutions, 'kitten/sitting', and a multibyte UTF-8
  case to exercise rune-based counting
- TestContextBefore / TestContextAfter: basic windowing, window larger
  than string, and a case where the raw byte offset lands mid-rune (é)
  to verify the rune-boundary correction logic

MergeComments scenario tests:
- TestMergeComments_MultipleComments: two non-overlapping comments both
  correctly applied via back-to-front replacement
- TestMergeComments_EmptyResults: non-nil InlineComments with zero
  results returns body unchanged
- TestMergeComments_NonInlineLocation: page-level comments (location
  != 'inline') are skipped; body unchanged
- TestMergeComments_NoContext: when a ref has no marker in oldBody the
  first occurrence of the selection in newBody is used
- TestMergeComments_UTF8: multibyte (Japanese) characters in both body
  and selection are handled correctly

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix three correctness issues in MergeComments

- Fix html import shadowing: alias the 'html' import as 'stdhtml' to
  avoid shadowing by the local 'html' variable used throughout
  ProcessFile. Both callers updated: stdhtml.EscapeString for the
  ref attribute, htmlEscapeText for the selection search.

- Fix selection search with quotes/apostrophes: replace
  html.EscapeString for the selection with a new htmlEscapeText helper
  that only escapes &, <, > — not ' or ". Confluence storage HTML
  often leaves quotes and apostrophes unescaped in text nodes, so
  fully-escaped selections would fail to match and inline comments
  would be silently dropped. Add TestMergeComments_SelectionWithQuotes.

- Fix duplicate-ref warnings: move seenRefs[ref]=true to immediately
  after the duplicate-check, before the search loop. Previously seenRefs
  was only set on a successful match, so multiple results for the same
  MarkerRef with no match in the new body would each emit a 'dropped'
  warning. Add TestMergeComments_DuplicateMarkerRefDropped.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Optimize levenshteinDistance to use two rolling rows instead of full matrix

Reduces memory allocation from O(m×n) to O(n) by keeping only the
previous and current rows. Also swaps r1/r2 so the shorter string is
used for column width, minimizing row allocation size.

This matters in MergeComments where levenshteinDistance is called for
every candidate match of every comment's selection in newBody — on
pages with many comments or short/common selections the number of
calls can be high.

Addresses thread [40] from PR review.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix test description and README algorithm doc

mark_test.go (thread [43]):
- TestMergeComments_HTMLEntities: the description incorrectly claimed
  &#39; (apostrophe) was tested; the selection '<world>' contains no
  apostrophe. Updated comment to accurately describe what is covered
  (&lt;/&gt; entity matching) and note the &#39; limitation.
- Add TestMergeComments_ApostropheSelection: verifies a selection with
  a literal apostrophe is found when the new body also has a literal
  apostrophe (the common case from mark's renderer). This exercises
  the htmlEscapeText path which intentionally does not encode ' or ".

README.md (thread [42]):
- Step 2 of the algorithm description said context was recorded
  'immediately before and after the commented selection' which is
  ambiguous. Clarified that context windows are taken around the
  <ac:inline-comment-marker> tag boundaries in the old body (not
  around the raw selection text), so the context is stable even when
  the marker wraps additional inline markup such as <strong>.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Unexport mergeComments and cap candidate evaluation

Thread [44]: MergeComments was exported but is internal-only — only
called within the mark package and tested from the same package.
Unexport it to mergeComments to avoid expanding the public API surface
unnecessarily. Add a Go doc comment describing the function contract,
HTML expectations, and the candidate cap.

Thread [45]: The candidate-scoring loop had no upper bound. For short
or common selections (e.g. 'a', 'the') on large pages the loop could
invoke levenshteinDistance thousands of times, each allocating rune
and int slices. Add a maxCandidates=100 constant and break once that
many on-rune-boundary occurrences have been evaluated. The exact-context
fast-path and lower-bound pruning already skip many candidates before
Levenshtein is called, so in practice the cap is only reached for very
common selections where the 100th candidate is unlikely to be
meaningfully better than an earlier one anyway.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* test: fix HTMLEntities description and add ApostropheEncoded limitation test

Thread #43: TestMergeComments_HTMLEntities had a misleading note claiming it
covered the &#39; apostrophe case, but the selection under test ('<world>') did
not include an apostrophe. Remove that note and add a dedicated
TestMergeComments_ApostropheEncoded test that explicitly documents the known
limitation: when a Confluence body stores an apostrophe as the numeric entity
&#39;, mergeComments cannot locate the selection (htmlEscapeText does not
encode ' to &#39;), so the comment is dropped with a warning.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix CDATA selection fallback and extract contextWindowBytes constant

Thread #46: mergeComments only searched for htmlEscapeText(selection) and
would fail for selections inside CDATA-backed macro bodies (e.g. ac:code),
where < and > are stored as raw characters rather than HTML entities. Restructure
the search loop to build a searchForms slice: the escaped form is tried first
(covers normal XML text nodes), and the raw unescaped form is appended as a
fallback when they differ. A stopSearch flag exits early on an exact context
match or when maxCandidates is reached, preserving the same performance
guarantees as before. Add TestMergeComments_CDATASelection to cover this path.

Thread #47: The context-window size 100 was repeated in four places across
mergeComments (two in the context-extraction loop and two in the scoring loop).
Extract it to const contextWindowBytes = 100 so it is easy to tune and stays
consistent everywhere.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-04-08 15:44:21 +02:00
 								// htmlTextReplacer rewrites the three characters &, < and > to their HTML
 								// entities. It is built once at package scope and reused by htmlEscapeText.
 								var htmlTextReplacer = strings.NewReplacer(
 									"&", "&amp;",
 									"<", "&lt;",
 									">", "&gt;",
 								)
 								// htmlEscapeText returns s with &, < and > replaced by &amp;, &lt; and &gt;.
 								// In contrast to html.EscapeString, single and double quotes are left
 								// untouched: they are frequently stored unescaped in text nodes (both by the
 								// Confluence editor and by mark's own renderer), so escaping them would stop
 								// the selection search from finding an otherwise valid match.
 								func htmlEscapeText(s string) string {
 									escaped := htmlTextReplacer.Replace(s)
 									return escaped
 								}
 								// truncateSelection returns a preview of s for use in log messages. When s
 								// holds at most maxRunes runes it is returned unchanged; otherwise the first
 								// maxRunes runes are returned with an ellipsis ("…") appended.
 								//
 								// A negative maxRunes is treated as 0 rather than panicking, so any non-empty
 								// s then yields just the ellipsis.
 								func truncateSelection(s string, maxRunes int) string {
 									if maxRunes < 0 {
 										maxRunes = 0
 									}
 									// Count runes without allocating; the []rune conversion below is only
 									// needed when the string actually has to be trimmed.
 									if utf8.RuneCountInString(s) <= maxRunes {
 										return s
 									}
 									return string([]rune(s)[:maxRunes]) + "…"
 								}
 								// contextBefore returns the text of s that immediately precedes byte offset
 								// byteEnd, limited to maxBytes bytes. If the window would begin inside a
 								// multi-byte UTF-8 sequence, the leading continuation bytes are dropped so
 								// the result always starts on a rune boundary (and may therefore be shorter
 								// than maxBytes).
 								func contextBefore(s string, byteEnd, maxBytes int) string {
 									lo := byteEnd - maxBytes
 									if lo < 0 {
 										lo = 0
 									}
 									window := s[lo:byteEnd]
 									// Shed continuation bytes left over from a rune that was cut in half.
 									for len(window) > 0 && !utf8.RuneStart(window[0]) {
 										window = window[1:]
 									}
 									return window
 								}
 								// contextAfter returns the text of s that immediately follows byte offset
 								// byteStart, limited to maxBytes bytes. If the window would end inside a
 								// multi-byte UTF-8 sequence, the end is pulled back to the start of that rune
 								// so the result never contains a partial rune (and may therefore be shorter
 								// than maxBytes).
 								func contextAfter(s string, byteStart, maxBytes int) string {
 									tail := s[byteStart:]
 									if maxBytes >= len(tail) {
 										// The whole remainder fits inside the window.
 										return tail
 									}
 									cut := maxBytes
 									// Retreat until the byte just past the window begins a rune.
 									for cut > 0 && !utf8.RuneStart(tail[cut]) {
 										cut--
 									}
 									return tail[:cut]
 								}
 								// levenshteinDistance returns the edit distance between s1 and s2, counted in
 								// runes: the minimum number of single-rune insertions, deletions and
 								// substitutions needed to turn one string into the other.
 								//
 								// Only two rows of the dynamic-programming table are kept, and the shorter
 								// input is used for the row width, so the working memory is proportional to
 								// the shorter string.
 								func levenshteinDistance(s1, s2 string) int {
 									long, short := []rune(s1), []rune(s2)
 									if len(long) < len(short) {
 										long, short = short, long
 									}
 									if len(short) == 0 {
 										// Turning the empty string into long takes one insertion per rune.
 										return len(long)
 									}
 									width := len(short) + 1
 									// One backing array split into the "row above" and the "row being filled".
 									cells := make([]int, 2*width)
 									above, row := cells[:width], cells[width:]
 									for col := range above {
 										above[col] = col
 									}
 									for i, a := range long {
 										row[0] = i + 1
 										for j, b := range short {
 											substitute := above[j]
 											if a != b {
 												substitute++
 											}
 											row[j+1] = min(
 												above[j+1]+1, // deletion
 												row[j]+1,     // insertion
 												substitute,   // substitution (free when runes match)
 											)
 										}
 										above, row = row, above
 									}
 									// After the final swap the completed row lives in above.
 									return above[width-1]
 								}
 								// commentContext holds the text found immediately before and immediately
 								// after an inline-comment marker; mergeComments compares these windows against
 								// candidate positions in the new body.
 								type commentContext struct {
 									before, after string
 								}
 								// maxCandidates caps how many on-rune-boundary occurrences of a comment's
 								// selection mergeComments examines in the new body. The count is shared
 								// across all search forms of one comment; once it is exceeded the search for
 								// that comment stops, which bounds the number of levenshteinDistance calls
 								// on pages where a selection is short or very common.
 								const maxCandidates = 100
 								// contextWindowBytes is the number of bytes of surrounding text captured as
 								// context around each inline-comment marker. It is used both when extracting
 								// context from oldBody and when scoring candidates in newBody.
 								const contextWindowBytes = 100
 								// mergeComments re-embeds inline comment markers from the Confluence API into
 								// newBody (the updated storage HTML about to be uploaded). It extracts context
 								// from each existing marker in oldBody and uses Levenshtein distance to
 								// relocate each marker to the best-matching position in newBody, so comment
 								// threads survive page edits even when the surrounding text has shifted.
 								//
 								// The work happens in three steps:
 								//  1. For every <ac:inline-comment-marker> in oldBody, record up to
 								//     contextWindowBytes of text before and after the tag, keyed by its
 								//     ac:ref. If a ref occurs more than once, the last occurrence wins.
 								//  2. For every inline comment (first result per MarkerRef only) with a
 								//     non-empty original selection, search newBody for the HTML-escaped
 								//     selection and then, if different, the raw selection. An exact context
 								//     match wins immediately; otherwise the candidate with the smallest
 								//     summed Levenshtein distance of the before/after contexts is chosen.
 								//     Without recorded context the first occurrence is used. At most
 								//     maxCandidates occurrences are examined per comment.
 								//  3. Wrap the chosen ranges in marker tags, working from the end of the
 								//     body towards the start. A range that overlaps one already wrapped is
 								//     dropped with a warning.
 								//
 								// Comments whose selection cannot be found are logged and skipped. A nil
 								// comments argument returns newBody unchanged. The returned error is always
 								// nil in the current implementation.
 								func mergeComments(newBody string, oldBody string, comments *confluence.InlineComments) (string, error) {
 									if comments == nil {
 										return newBody, nil
 									}
 									// 1. Extract context for each comment from oldBody
 									contexts := make(map[string]commentContext)
 									matches := markerRegex.FindAllStringSubmatchIndex(oldBody, -1)
 									for _, match := range matches {
 										// match[0]:match[1] spans the whole marker element; match[2]:match[3]
 										// is the first capture group (the ac:ref attribute value).
 										ref := oldBody[match[2]:match[3]]
 										// context around the tag
 										before := contextBefore(oldBody, match[0], contextWindowBytes)
 										after := contextAfter(oldBody, match[1], contextWindowBytes)
 										contexts[ref] = commentContext{
 											before: before,
 											after:  after,
 										}
 									}
 									// replacement is a byte range [start, end) of newBody that should be
 									// wrapped in a marker for ref; selection is kept for log messages only.
 									type replacement struct {
 										start     int
 										end       int
 										ref       string
 										selection string
 									}
 									var replacements []replacement
 									seenRefs := make(map[string]bool)
 									// 2. Locate the best position in newBody for each inline comment.
 									for _, comment := range comments.Results {
 										if comment.Extensions.Location != "inline" {
 											log.Debug().
 												Str("location", comment.Extensions.Location).
 												Str("ref", comment.Extensions.InlineProperties.MarkerRef).
 												Msg("comment ignored during inline marker merge: not an inline comment")
 											continue
 										}
 										ref := comment.Extensions.InlineProperties.MarkerRef
 										selection := comment.Extensions.InlineProperties.OriginalSelection
 										if seenRefs[ref] {
 											// Multiple results share the same MarkerRef (e.g. threaded replies).
 											// The marker only needs to be inserted once; skip duplicates.
 											continue
 										}
 										// Mark ref as seen immediately so subsequent results for the same ref
 										// (threaded replies) are always deduplicated, even if this one is dropped.
 										seenRefs[ref] = true
 										if selection == "" {
 											log.Warn().
 												Str("ref", ref).
 												Msg("inline comment skipped: original selection is empty; comment will be lost")
 											continue
 										}
 										ctx, hasCtx := contexts[ref]
 										// Build the list of forms to search for in newBody. The escaped form
 										// is tried first (normal XML text nodes). The raw form is appended as a
 										// fallback for text inside CDATA-backed macro bodies (e.g. ac:code),
 										// where < and > are stored unescaped inside <![CDATA[...]]>.
 										escapedSelection := htmlEscapeText(selection)
 										searchForms := []string{escapedSelection}
 										if selection != escapedSelection {
 											searchForms = append(searchForms, selection)
 										}
 										var bestStart = -1
 										var bestEnd = -1
 										// Sentinel larger than any reachable score: each context is at most
 										// contextWindowBytes bytes, so the summed distance stays far below it.
 										var minDistance = 1000000
 										// Iterate over search forms; stop as soon as we have a definitive best.
 										candidates := 0
 										stopSearch := false
 										for _, form := range searchForms {
 											if stopSearch {
 												break
 											}
 											currentPos := 0
 											for {
 												index := strings.Index(newBody[currentPos:], form)
 												if index == -1 {
 													break
 												}
 												start := currentPos + index
 												end := start + len(form)
 												// Skip candidates that start or end in the middle of a multi-byte
 												// UTF-8 rune; such a match would produce invalid UTF-8 output.
 												if !utf8.RuneStart(newBody[start]) || (end < len(newBody) && !utf8.RuneStart(newBody[end])) {
 													currentPos = start + 1
 													continue
 												}
 												// Only rune-aligned occurrences count towards the cap.
 												candidates++
 												if candidates > maxCandidates {
 													stopSearch = true
 													break
 												}
 												if !hasCtx {
 													// No context available; use the first occurrence.
 													bestStart = start
 													bestEnd = end
 													stopSearch = true
 													break
 												}
 												newBefore := contextBefore(newBody, start, contextWindowBytes)
 												newAfter := contextAfter(newBody, end, contextWindowBytes)
 												// Fast path: exact context match is the best possible result.
 												if newBefore == ctx.before && newAfter == ctx.after {
 													bestStart = start
 													bestEnd = end
 													stopSearch = true
 													break
 												}
 												// Lower-bound pruning: Levenshtein distance is at least the
 												// absolute difference in rune counts. Use rune counts (not byte
 												// lengths) to match the unit levenshteinDistance operates on,
 												// avoiding false skips for multibyte UTF-8 content.
 												lbBefore := utf8.RuneCountInString(ctx.before) - utf8.RuneCountInString(newBefore)
 												if lbBefore < 0 {
 													lbBefore = -lbBefore
 												}
 												lbAfter := utf8.RuneCountInString(ctx.after) - utf8.RuneCountInString(newAfter)
 												if lbAfter < 0 {
 													lbAfter = -lbAfter
 												}
 												if lbBefore+lbAfter >= minDistance {
 													// Cannot beat the current best; skip the full computation.
 													currentPos = start + 1
 													continue
 												}
 												distance := levenshteinDistance(ctx.before, newBefore) + levenshteinDistance(ctx.after, newAfter)
 												// Strict comparison: on a tie the earlier candidate is kept.
 												if distance < minDistance {
 													minDistance = distance
 													bestStart = start
 													bestEnd = end
 												}
 												// Advance by one byte so overlapping occurrences are also found.
 												currentPos = start + 1
 											}
 										}
 										if bestStart != -1 {
 											replacements = append(replacements, replacement{
 												start:     bestStart,
 												end:       bestEnd,
 												ref:       ref,
 												selection: selection,
 											})
 										} else {
 											log.Warn().
 												Str("ref", ref).
 												Str("selection_preview", truncateSelection(selection, 50)).
 												Msg("inline comment dropped: selected text not found in new body; comment will be lost")
 										}
 									}
 									// 3. Insert the markers.
 									// Sort replacements from back to front to avoid offset issues.
 									// Use a stable sort with ref as a tie-breaker so the ordering is
 									// deterministic when two markers resolve to the same start offset.
 									slices.SortStableFunc(replacements, func(a, b replacement) int {
 										if a.start != b.start {
 											return b.start - a.start
 										}
 										if a.ref < b.ref {
 											return -1
 										}
 										if a.ref > b.ref {
 											return 1
 										}
 										return 0
 									})
 									// Apply replacements back-to-front. Track the minimum start of any
 									// applied replacement so that overlapping candidates (whose end exceeds
 									// that boundary) are dropped rather than producing nested or malformed
 									// <ac:inline-comment-marker> tags.
 									minAppliedStart := len(newBody)
 									for _, r := range replacements {
 										if r.end > minAppliedStart {
 											// This replacement overlaps with an already-applied one.
 											// Drop it and warn so the user knows the comment was skipped.
 											log.Warn().
 												Str("ref", r.ref).
 												Str("selection_preview", truncateSelection(r.selection, 50)).
 												Int("start", r.start).
 												Int("end", r.end).
 												Int("conflicting_start", minAppliedStart).
 												Msg("inline comment marker dropped: selection overlaps an already-placed marker")
 											continue
 										}
 										minAppliedStart = r.start
 										// Wrap the text exactly as it appears in newBody (already escaped or
 										// raw, depending on which search form matched); only ref is escaped.
 										selection := newBody[r.start:r.end]
 										withComment := fmt.Sprintf(
 											`<ac:inline-comment-marker ac:ref="%s">%s</ac:inline-comment-marker>`,
 											stdhtml.EscapeString(r.ref),
 											selection,
 										)
 										newBody = newBody[:r.start] + withComment + newBody[r.end:]
 									}
 									return newBody, nil
 								}