2024-09-26 15:24:39 +02:00
|
|
|
package metadata
|
2019-04-08 22:12:00 +03:00
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
|
|
|
"bytes"
|
2024-09-30 21:00:49 -04:00
|
|
|
"crypto/sha256"
|
|
|
|
"fmt"
|
2019-04-08 22:12:00 +03:00
|
|
|
"regexp"
|
|
|
|
"strings"
|
2019-04-19 10:31:41 +03:00
|
|
|
|
2020-11-03 17:12:51 +03:00
|
|
|
"github.com/reconquest/pkg/log"
|
2019-04-19 10:31:41 +03:00
|
|
|
)
|
|
|
|
|
2019-04-08 22:12:00 +03:00
|
|
|
// Names of the metadata headers recognised by ExtractMeta. A header is
// written as an HTML comment of the form `<!-- Name: value -->`; the name is
// normalised with strings.Title before it is compared against these values.
const (
	// HeaderParent adds one entry to Meta.Parents; it may be repeated.
	HeaderParent = "Parent"
	// HeaderSpace sets Meta.Space.
	HeaderSpace = "Space"
	// HeaderType sets Meta.Type.
	HeaderType = "Type"
	// HeaderTitle sets Meta.Title.
	HeaderTitle = "Title"
	// HeaderLayout sets Meta.Layout.
	HeaderLayout = "Layout"
	// HeaderEmoji sets Meta.Emoji.
	HeaderEmoji = "Emoji"
	// HeaderAttachment adds one entry to Meta.Attachments; it may be repeated.
	HeaderAttachment = "Attachment"
	// HeaderLabel adds one entry to Meta.Labels; it may be repeated.
	HeaderLabel = "Label"
	// HeaderInclude is recognised but skipped by ExtractMeta.
	HeaderInclude = "Include"
	// HeaderSidebar sets Meta.Sidebar and forces Meta.Layout to "article".
	HeaderSidebar = "Sidebar"
	// ContentAppearance selects Meta.ContentAppearance.
	ContentAppearance = "Content-Appearance"
)
|
|
|
|
|
|
|
|
// Meta holds the page metadata collected by ExtractMeta from the header
// comments at the top of a document.
type Meta struct {
	// Parents lists the values of all Parent headers, in document order,
	// preceded by any parents passed to ExtractMeta.
	Parents []string
	// Space is the value of the Space header, or the fallback passed to
	// ExtractMeta when the header is absent.
	Space string
	// Type is the value of the Type header; it defaults to "page".
	Type string
	// Title is the value of the Title header (ExtractMeta may derive it from
	// the first H1 heading and may append a generated hash).
	Title string
	// Layout is the value of the Layout header; a Sidebar header sets it to
	// "article".
	Layout string
	// Sidebar is the value of the Sidebar header.
	Sidebar string
	// Emoji is the value of the Emoji header.
	Emoji string
	// Attachments lists the values of all Attachment headers.
	Attachments []string
	// Labels lists the values of all Label headers.
	Labels []string
	// ContentAppearance is FixedContentAppearance or
	// FullWidthContentAppearance (the default).
	ContentAppearance string
}
|
|
|
|
|
2023-03-06 10:09:28 +01:00
|
|
|
// Values stored in Meta.ContentAppearance.
const (
	// FullWidthContentAppearance is the default, used whenever the
	// Content-Appearance header is absent or is anything other than "fixed".
	FullWidthContentAppearance = "full-width"
	// FixedContentAppearance is used when the Content-Appearance header is
	// exactly "fixed".
	FixedContentAppearance = "fixed"
)
|
|
|
|
|
2019-08-08 23:41:26 +03:00
|
|
|
// reHeaderPatternV2 matches a single-line header comment of the form
// `<!-- Name: value -->`. Group 1 is the header name, group 2 the value
// (which may still carry trailing whitespace).
var reHeaderPatternV2 = regexp.MustCompile(`<!--\s*([^:]+):\s*(.*)\s*-->`)

// reHeaderPatternMacro matches a line that opens a `<!-- Macro: ...` comment.
var reHeaderPatternMacro = regexp.MustCompile(`<!-- Macro: .*`)
|
|
|
|
|
2024-09-30 21:00:49 -04:00
|
|
|
func ExtractMeta(data []byte, spaceFromCli string, titleFromH1 bool, parents []string, titleAppendGeneratedHash bool) (*Meta, []byte, error) {
|
2019-05-01 16:58:28 +03:00
|
|
|
var (
|
2019-08-08 23:41:26 +03:00
|
|
|
meta *Meta
|
|
|
|
offset int
|
2019-05-01 16:58:28 +03:00
|
|
|
)
|
2019-04-08 22:12:00 +03:00
|
|
|
|
|
|
|
scanner := bufio.NewScanner(bytes.NewBuffer(data))
|
|
|
|
for scanner.Scan() {
|
|
|
|
line := scanner.Text()
|
|
|
|
|
|
|
|
if err := scanner.Err(); err != nil {
|
2019-08-08 23:41:26 +03:00
|
|
|
return nil, nil, err
|
2019-04-08 22:12:00 +03:00
|
|
|
}
|
|
|
|
|
2019-08-13 19:48:11 +03:00
|
|
|
offset += len(line) + 1
|
2019-08-08 23:41:26 +03:00
|
|
|
|
|
|
|
matches := reHeaderPatternV2.FindStringSubmatch(line)
|
2019-04-08 22:12:00 +03:00
|
|
|
if matches == nil {
|
2024-07-25 23:48:48 +02:00
|
|
|
matches = reHeaderPatternMacro.FindStringSubmatch(line)
|
|
|
|
// If we have a match, then we started reading a macro.
|
|
|
|
// We want to keep it in the document for it to be read by ExtractMacros
|
|
|
|
if matches != nil {
|
|
|
|
offset -= len(line) + 1
|
2019-05-01 16:58:28 +03:00
|
|
|
}
|
2024-07-25 23:48:48 +02:00
|
|
|
break
|
2019-04-08 22:12:00 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if meta == nil {
|
|
|
|
meta = &Meta{}
|
2023-03-06 10:09:28 +01:00
|
|
|
meta.Type = "page" // Default if not specified
|
|
|
|
meta.ContentAppearance = FullWidthContentAppearance // Default to full-width for backwards compatibility
|
2019-04-08 22:12:00 +03:00
|
|
|
}
|
|
|
|
|
2023-01-18 19:46:25 +01:00
|
|
|
//nolint:staticcheck
|
2019-04-08 22:12:00 +03:00
|
|
|
header := strings.Title(matches[1])
|
|
|
|
|
2019-04-19 10:31:41 +03:00
|
|
|
var value string
|
|
|
|
if len(matches) > 1 {
|
|
|
|
value = strings.TrimSpace(matches[2])
|
|
|
|
}
|
|
|
|
|
2019-04-08 22:12:00 +03:00
|
|
|
switch header {
|
|
|
|
case HeaderParent:
|
2019-04-19 10:31:41 +03:00
|
|
|
meta.Parents = append(meta.Parents, value)
|
2019-04-08 22:12:00 +03:00
|
|
|
|
|
|
|
case HeaderSpace:
|
2019-11-22 15:20:44 -05:00
|
|
|
meta.Space = strings.TrimSpace(value)
|
2019-04-08 22:12:00 +03:00
|
|
|
|
2021-03-31 17:49:01 +01:00
|
|
|
case HeaderType:
|
|
|
|
meta.Type = strings.TrimSpace(value)
|
|
|
|
|
2019-04-08 22:12:00 +03:00
|
|
|
case HeaderTitle:
|
2019-04-19 10:31:41 +03:00
|
|
|
meta.Title = strings.TrimSpace(value)
|
2019-04-08 22:12:00 +03:00
|
|
|
|
|
|
|
case HeaderLayout:
|
2019-04-19 10:31:41 +03:00
|
|
|
meta.Layout = strings.TrimSpace(value)
|
|
|
|
|
2021-06-17 14:56:27 -04:00
|
|
|
case HeaderSidebar:
|
|
|
|
meta.Layout = "article"
|
|
|
|
meta.Sidebar = strings.TrimSpace(value)
|
|
|
|
|
2025-02-14 17:10:45 +01:00
|
|
|
case HeaderEmoji:
|
|
|
|
meta.Emoji = strings.TrimSpace(value)
|
|
|
|
|
2019-04-19 10:31:41 +03:00
|
|
|
case HeaderAttachment:
|
2022-01-18 09:07:36 +03:00
|
|
|
meta.Attachments = append(meta.Attachments, value)
|
2019-04-08 22:12:00 +03:00
|
|
|
|
2021-01-04 13:08:58 +02:00
|
|
|
case HeaderLabel:
|
|
|
|
meta.Labels = append(meta.Labels, value)
|
|
|
|
|
2021-03-16 07:24:51 +03:00
|
|
|
case HeaderInclude:
|
|
|
|
// Includes are parsed by a different func
|
|
|
|
continue
|
|
|
|
|
2023-03-06 10:09:28 +01:00
|
|
|
case ContentAppearance:
|
|
|
|
if strings.TrimSpace(value) == FixedContentAppearance {
|
|
|
|
meta.ContentAppearance = FixedContentAppearance
|
|
|
|
} else {
|
|
|
|
meta.ContentAppearance = FullWidthContentAppearance
|
|
|
|
}
|
|
|
|
|
2019-04-08 22:12:00 +03:00
|
|
|
default:
|
2019-04-19 10:31:41 +03:00
|
|
|
log.Errorf(
|
|
|
|
nil,
|
|
|
|
`encountered unknown header %q line: %#v`,
|
2019-04-08 22:12:00 +03:00
|
|
|
header,
|
|
|
|
line,
|
|
|
|
)
|
|
|
|
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-20 22:54:11 +01:00
|
|
|
if titleFromH1 || spaceFromCli != "" {
|
2023-03-20 19:19:31 +01:00
|
|
|
if meta == nil {
|
|
|
|
meta = &Meta{}
|
|
|
|
}
|
|
|
|
|
|
|
|
if meta.Type == "" {
|
|
|
|
meta.Type = "page"
|
|
|
|
}
|
|
|
|
|
|
|
|
if meta.ContentAppearance == "" {
|
|
|
|
meta.ContentAppearance = FullWidthContentAppearance // Default to full-width for backwards compatibility
|
|
|
|
}
|
|
|
|
|
2023-03-20 22:54:11 +01:00
|
|
|
if titleFromH1 && meta.Title == "" {
|
2023-03-20 19:19:31 +01:00
|
|
|
meta.Title = ExtractDocumentLeadingH1(data)
|
|
|
|
}
|
2023-03-20 22:54:11 +01:00
|
|
|
if spaceFromCli != "" && meta.Space == "" {
|
|
|
|
meta.Space = spaceFromCli
|
|
|
|
}
|
2023-03-20 19:19:31 +01:00
|
|
|
}
|
|
|
|
|
2019-04-08 22:12:00 +03:00
|
|
|
if meta == nil {
|
2019-08-08 23:41:26 +03:00
|
|
|
return nil, data, nil
|
2019-04-08 22:12:00 +03:00
|
|
|
}
|
|
|
|
|
2023-08-09 13:06:31 +02:00
|
|
|
// Prepend parent pages that are defined via the cli flag
|
|
|
|
if len(parents) > 0 && parents[0] != "" {
|
|
|
|
meta.Parents = append(parents, meta.Parents...)
|
|
|
|
}
|
|
|
|
|
2024-09-30 21:00:49 -04:00
|
|
|
// deterministically generate a hash from the page's parents, space, and title
|
|
|
|
if titleAppendGeneratedHash {
|
|
|
|
path := strings.Join(append(meta.Parents, meta.Space, meta.Title), "/")
|
|
|
|
pathHash := sha256.Sum256([]byte(path))
|
|
|
|
// postfix is an 8-character hexadecimal string representation of the first 4 out of 32 bytes of the hash
|
|
|
|
meta.Title = fmt.Sprintf("%s - %x", meta.Title, pathHash[0:4])
|
|
|
|
log.Debugf(
|
|
|
|
nil,
|
|
|
|
"appended hash to page title: %s",
|
|
|
|
meta.Title,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
|
2019-08-13 19:48:11 +03:00
|
|
|
return meta, data[offset:], nil
|
2019-04-08 22:12:00 +03:00
|
|
|
}
|
2024-09-26 15:24:39 +02:00
|
|
|
|
|
|
|
// reLeadingH1 matches a level-1 ATX heading: a line that begins with exactly
// one '#' followed by at least one space or tab. The capture group is the
// heading text without surrounding blanks or a trailing carriage return.
var reLeadingH1 = regexp.MustCompile(`(?m)^#[ \t]+(.*?)[ \t\r]*$`)

// ExtractDocumentLeadingH1 returns the text of the first level-1 heading
// ("# Title") found in markdown, or "" when there is none.
//
// Only lines that start with a single '#' followed by whitespace count, so
// deeper headings ("## Sub") and '#' characters in the middle of a line are
// ignored. The heading may be the last line of the document without a
// trailing newline, and CRLF line endings are handled.
func ExtractDocumentLeadingH1(markdown []byte) string {
	groups := reLeadingH1.FindSubmatch(markdown)
	if groups == nil {
		return ""
	}

	return string(groups[1])
}
|