// Cyrene/backend/ai-core/internal/tools/markdown_tool.go

package tools

import (
	"context"
	"fmt"
	"regexp"
	"strings"
)

// MarkdownTool provides Markdown processing utilities for the LLM.
// It supports HTML conversion, plain-text extraction, link and code-block
// extraction, and table-of-contents generation. The type has no fields, so
// it carries no state of its own; all behavior lives in its methods.
type MarkdownTool struct{}

// NewMarkdownTool returns a pointer to a freshly allocated, zero-valued
// MarkdownTool.
func NewMarkdownTool() *MarkdownTool {
	return new(MarkdownTool)
}

// Definition describes the tool for LLM function calling: its name, a
// human-readable description, and a JSON-Schema-style parameter object with
// two required string parameters, "action" (one of five enumerated
// operations) and "markdown" (the text to process).
func (t *MarkdownTool) Definition() ToolDefinition {
	// Schema for the operation selector.
	actionParam := map[string]interface{}{
		"type":        "string",
		"enum":        []string{"to_html", "to_text", "extract_links", "extract_code", "table_of_contents"},
		"description": "操作类型。to_html: 转换为HTML；to_text: 提取纯文本；extract_links: 提取所有链接；extract_code: 提取所有代码块；table_of_contents: 生成目录",
	}

	// Schema for the Markdown payload.
	markdownParam := map[string]interface{}{
		"type":        "string",
		"description": "Markdown格式文本，需要处理的Markdown内容",
	}

	schema := map[string]interface{}{
		"type": "object",
		"properties": map[string]interface{}{
			"action":   actionParam,
			"markdown": markdownParam,
		},
		"required": []string{"action", "markdown"},
	}

	return ToolDefinition{
		Name:        "markdown",
		Description: "Markdown处理工具。将Markdown转为HTML、提取纯文本、提取链接/代码块、生成目录。用于处理Markdown格式的文档内容。",
		Parameters:  schema,
	}
}

// Execute validates the "action" and "markdown" arguments and forwards the
// Markdown text to the handler for the requested action.
//
// Validation problems (missing or non-string action, missing or blank
// markdown, unknown action) are reported through a ToolResult with
// Success=false and a nil error. ctx is accepted but not consulted.
func (t *MarkdownTool) Execute(ctx context.Context, arguments map[string]interface{}) (*ToolResult, error) {
	// reject builds the failure result shared by every validation error.
	reject := func(reason string) (*ToolResult, error) {
		return &ToolResult{ToolName: "markdown", Success: false, Error: reason}, nil
	}

	// A failed type assertion yields "", so one emptiness check covers both
	// the "absent / wrong type" and the "empty string" cases.
	action, _ := arguments["action"].(string)
	if action == "" {
		return reject("缺少 action 参数")
	}

	md, _ := arguments["markdown"].(string)
	if strings.TrimSpace(md) == "" {
		return reject("缺少 markdown 参数或内容为空")
	}

	handlers := map[string]func(string) (*ToolResult, error){
		"to_html":           t.handleToHTML,
		"to_text":           t.handleToText,
		"extract_links":     t.handleExtractLinks,
		"extract_code":      t.handleExtractCode,
		"table_of_contents": t.handleTableOfContents,
	}

	handle, known := handlers[action]
	if !known {
		return reject(fmt.Sprintf("未知操作: %s，支持: to_html, to_text, extract_links, extract_code, table_of_contents", action))
	}
	return handle(md)
}

// Patterns used by handleToHTML, compiled once at package initialization
// instead of on every call.
var (
	// Fenced block: non-greedy so the code itself may contain backticks.
	mdHTMLFence      = regexp.MustCompile("(?s)```.*?```")
	mdHTMLInlineCode = regexp.MustCompile("`[^`]+`")
	mdHTMLImage      = regexp.MustCompile(`!\[([^\]]*)\]\(([^)]+)\)`)
	mdHTMLLink       = regexp.MustCompile(`\[([^\]]+)\]\(([^)]+)\)`)
	// Emphasis never spans a line break. "*" emphasis must not start or end
	// with whitespace (so "* item" bullets are left alone) and "_" emphasis
	// must sit on a word boundary (so snake_case words and URLs survive).
	mdHTMLBoldStar    = regexp.MustCompile(`\*\*([^*\n]+)\*\*`)
	mdHTMLBoldUnder   = regexp.MustCompile(`\b__([^_\n]+)__\b`)
	mdHTMLItalicStar  = regexp.MustCompile(`\*([^*\s](?:[^*\n]*[^*\s])?)\*`)
	mdHTMLItalicUnder = regexp.MustCompile(`\b_([^_\n]+)_\b`)
	mdHTMLStrike      = regexp.MustCompile(`~~([^~]+)~~`)
	// [ \t]+ rather than \s+ so an empty "#" line cannot swallow the lines
	// that follow it.
	mdHTMLHeading    = regexp.MustCompile(`(?m)^(#{1,6})[ \t]+(.+)$`)
	mdHTMLRule       = regexp.MustCompile(`(?m)^(---|\*\*\*|___)\s*$`)
	mdHTMLBlockquote = regexp.MustCompile(`(?m)^>\s?(.+)$`)
)

// handleToHTML converts Markdown to HTML with a simple regex-based pipeline.
//
// Code (fenced blocks, then inline spans) is first swapped for NUL-delimited
// placeholders so later passes cannot touch it. Images, links, emphasis,
// strikethrough, headings, horizontal rules, lists, blockquotes and
// paragraphs are then converted in that order, and finally the code is put
// back with its contents HTML-escaped.
//
// Only code contents and the fence language are escaped; any other text,
// including raw HTML in the input, is passed through as-is.
//
// It always returns a successful ToolResult whose Data is the HTML, and a nil
// error.
func (t *MarkdownTool) handleToHTML(md string) (*ToolResult, error) {
	// NUL bytes delimit the placeholders below; dropping any that arrive in
	// the input keeps user text from colliding with them.
	html := strings.ReplaceAll(md, "\x00", "")

	// 1. Fenced code blocks.
	var codeBlocks []string
	html = mdHTMLFence.ReplaceAllStringFunc(html, func(match string) string {
		codeBlocks = append(codeBlocks, match)
		return fmt.Sprintf("\x00CODEBLOCK%d\x00", len(codeBlocks)-1)
	})

	// 2. Inline code spans.
	var inlineCodes []string
	html = mdHTMLInlineCode.ReplaceAllStringFunc(html, func(match string) string {
		inlineCodes = append(inlineCodes, match)
		return fmt.Sprintf("\x00INLINECODE%d\x00", len(inlineCodes)-1)
	})

	// 3-4. Images before links, because an image is a link prefixed by "!".
	html = mdHTMLImage.ReplaceAllString(html, `<img src="$2" alt="$1">`)
	html = mdHTMLLink.ReplaceAllString(html, `<a href="$2">$1</a>`)

	// 5-7. Bold before italic so "**" is not consumed as two "*".
	html = mdHTMLBoldStar.ReplaceAllString(html, `<strong>$1</strong>`)
	html = mdHTMLBoldUnder.ReplaceAllString(html, `<strong>$1</strong>`)
	html = mdHTMLItalicStar.ReplaceAllString(html, `<em>$1</em>`)
	html = mdHTMLItalicUnder.ReplaceAllString(html, `<em>$1</em>`)
	html = mdHTMLStrike.ReplaceAllString(html, `<del>$1</del>`)

	// 8. Headings: the number of "#" characters selects <h1>..<h6>.
	html = mdHTMLHeading.ReplaceAllStringFunc(html, func(line string) string {
		m := mdHTMLHeading.FindStringSubmatch(line)
		if m == nil {
			return line
		}
		level := len(m[1])
		return fmt.Sprintf("<h%d>%s</h%d>", level, strings.TrimSpace(m[2]), level)
	})

	// 9. Horizontal rules.
	html = mdHTMLRule.ReplaceAllString(html, `<hr>`)

	// 10-11. Unordered, then ordered lists.
	html = t.processLists(html, `(?m)^[\-*]\s+`, "ul")
	html = t.processLists(html, `(?m)^\d+\.\s+`, "ol")

	// 12. Blockquotes (one element per quoted line).
	html = mdHTMLBlockquote.ReplaceAllString(html, `<blockquote>$1</blockquote>`)

	// 13. Paragraphs.
	html = t.wrapParagraphs(html)

	// 14. Restore fenced code blocks.
	for i, block := range codeBlocks {
		body := strings.TrimSuffix(strings.TrimPrefix(block, "```"), "```")

		// The language is whatever follows the opening fence on the same
		// line. A fence with nothing after it has no language; its first
		// code line must not be mistaken for one.
		lang := ""
		code := strings.TrimSpace(body)
		if nl := strings.Index(body, "\n"); nl >= 0 {
			if info := strings.Fields(body[:nl]); len(info) > 0 {
				lang = info[0]
			}
			// Drop surrounding blank lines but keep the indentation of the
			// first code line.
			code = strings.TrimRight(strings.TrimLeft(body[nl+1:], "\r\n"), " \t\r\n")
		}

		rendered := fmt.Sprintf("<pre><code>%s</code></pre>", escapeHTML(code))
		if lang != "" {
			rendered = fmt.Sprintf(`<pre><code class="language-%s">%s</code></pre>`,
				escapeHTML(lang), escapeHTML(code))
		}

		token := fmt.Sprintf("\x00CODEBLOCK%d\x00", i)
		// The paragraph pass sees only the placeholder, so a block on its own
		// line arrives wrapped in <p>; unwrap it because <pre> may not be
		// nested inside <p>.
		html = strings.ReplaceAll(html, "<p>"+token+"</p>", rendered)
		html = strings.ReplaceAll(html, token, rendered)
	}

	// 15. Restore inline code spans.
	for i, span := range inlineCodes {
		html = strings.ReplaceAll(html, fmt.Sprintf("\x00INLINECODE%d\x00", i),
			fmt.Sprintf("<code>%s</code>", escapeHTML(strings.Trim(span, "`"))))
	}

	return &ToolResult{
		ToolName: "markdown",
		Success:  true,
		Data:     html,
	}, nil
}

// Patterns used by handleToText, compiled once at package initialization
// instead of on every call.
var (
	// Non-greedy so a fenced block may itself contain backticks.
	mdTextFence      = regexp.MustCompile("(?s)```.*?```")
	mdTextInlineCode = regexp.MustCompile("`[^`]+`")
	mdTextImage      = regexp.MustCompile(`!\[([^\]]*)\]\([^)]+\)`)
	mdTextLink       = regexp.MustCompile(`\[([^\]]+)\]\([^)]+\)`)
	// Emphasis never spans a line break. "*" emphasis must not start or end
	// with whitespace (so "* item" bullets are left for the list pass) and
	// "_" emphasis must sit on a word boundary (so the underscores in
	// snake_case words and URLs are kept).
	mdTextBoldStar    = regexp.MustCompile(`\*\*([^*\n]+)\*\*`)
	mdTextBoldUnder   = regexp.MustCompile(`\b__([^_\n]+)__\b`)
	mdTextItalicStar  = regexp.MustCompile(`\*([^*\s](?:[^*\n]*[^*\s])?)\*`)
	mdTextItalicUnder = regexp.MustCompile(`\b_([^_\n]+)_\b`)
	mdTextStrike      = regexp.MustCompile(`~~([^~]+)~~`)
	mdTextHeading     = regexp.MustCompile(`(?m)^#{1,6}\s+`)
	mdTextRule        = regexp.MustCompile(`(?m)^(---|\*\*\*|___)\s*$`)
	mdTextBullet      = regexp.MustCompile(`(?m)^[\-*]\s+`)
	mdTextNumbered    = regexp.MustCompile(`(?m)^\d+\.\s+`)
	mdTextQuote       = regexp.MustCompile(`(?m)^>\s?`)
	mdTextBlankRun    = regexp.MustCompile(`\n{3,}`)
)

// handleToText strips Markdown formatting and returns the remaining plain
// text.
//
// Fenced code blocks and inline code spans are replaced by the markers
// "[代码块]" and "[代码]". Images keep their alt text and links keep their
// link text. Emphasis, strikethrough, heading, list and blockquote markers
// are removed, horizontal rules are dropped, and runs of three or more
// newlines collapse to two.
//
// The result is always successful; Data holds a header with the rune count of
// the text followed by the text itself. The count is taken from the trimmed
// text, so it describes exactly what is shown.
func (t *MarkdownTool) handleToText(md string) (*ToolResult, error) {
	text := md

	// Code first, so that markup characters inside code are never
	// interpreted by the passes below.
	text = mdTextFence.ReplaceAllString(text, "[代码块]")
	text = mdTextInlineCode.ReplaceAllString(text, "[代码]")

	// Images before links, because an image is a link prefixed by "!".
	text = mdTextImage.ReplaceAllString(text, "$1")
	text = mdTextLink.ReplaceAllString(text, "$1")

	// Bold before italic so "**" is not consumed as two "*".
	text = mdTextBoldStar.ReplaceAllString(text, "$1")
	text = mdTextBoldUnder.ReplaceAllString(text, "$1")
	text = mdTextItalicStar.ReplaceAllString(text, "$1")
	text = mdTextItalicUnder.ReplaceAllString(text, "$1")
	text = mdTextStrike.ReplaceAllString(text, "$1")

	// Line-level markers.
	text = mdTextHeading.ReplaceAllString(text, "")
	text = mdTextRule.ReplaceAllString(text, "")
	text = mdTextBullet.ReplaceAllString(text, "")
	text = mdTextNumbered.ReplaceAllString(text, "")
	text = mdTextQuote.ReplaceAllString(text, "")

	text = mdTextBlankRun.ReplaceAllString(text, "\n\n")

	// Trim before counting so the reported length matches the emitted text.
	text = strings.TrimSpace(text)

	return &ToolResult{
		ToolName: "markdown",
		Success:  true,
		Data: fmt.Sprintf("纯文本提取结果 (%d 字符):\n\n%s",
			len([]rune(text)), text),
	}, nil
}

// handleExtractLinks lists every inline [text](url) occurrence found in md.
//
// Each hit is reported as a numbered entry that repeats the original
// Markdown form followed by its text and URL on separate lines. The pattern
// does not look at the character before "[", so the bracketed part of an
// image (![alt](url)) is reported as well. When nothing matches, Data is a
// fixed "no links found" message. The result is always successful and the
// error is always nil.
func (t *MarkdownTool) handleExtractLinks(md string) (*ToolResult, error) {
	links := regexp.MustCompile(`\[([^\]]+)\]\(([^)]+)\)`).FindAllStringSubmatch(md, -1)

	data := "未找到任何链接"
	if len(links) > 0 {
		var report strings.Builder
		fmt.Fprintf(&report, "提取链接 (共 %d 个):\n\n", len(links))
		for n, link := range links {
			label, target := link[1], link[2]
			fmt.Fprintf(&report, "%d. [%s](%s)\n   - 文本: %s\n   - URL: %s\n\n",
				n+1, label, target, label, target)
		}
		data = strings.TrimSpace(report.String())
	}

	return &ToolResult{
		ToolName: "markdown",
		Success:  true,
		Data:     data,
	}, nil
}

// mdExtractCodeFence matches one fenced code block and captures everything
// between the fences. It is non-greedy so the code may contain backticks, and
// it is compiled once rather than on every call.
var mdExtractCodeFence = regexp.MustCompile("(?s)```(.*?)```")

// handleExtractCode lists every fenced code block found in md.
//
// For each block the language is the first word that follows the opening
// fence on the same line; a fence with nothing after it has no language, so
// its first line of code is reported as code. Surrounding blank lines are
// removed while the indentation of the first code line is kept, and each
// block is passed through truncateText with a limit of 500. When no block is
// found, Data is a fixed "no code blocks found" message. The result is always
// successful and the error is always nil.
func (t *MarkdownTool) handleExtractCode(md string) (*ToolResult, error) {
	matches := mdExtractCodeFence.FindAllStringSubmatch(md, -1)

	if len(matches) == 0 {
		return &ToolResult{
			ToolName: "markdown",
			Success:  true,
			Data:     "未找到任何代码块",
		}, nil
	}

	var result strings.Builder
	fmt.Fprintf(&result, "提取代码块 (共 %d 个):\n\n", len(matches))
	for i, m := range matches {
		raw := m[1]

		// A single-line fence (no newline) is all code and has no language.
		lang := ""
		code := strings.TrimSpace(raw)
		if nl := strings.Index(raw, "\n"); nl >= 0 {
			if info := strings.Fields(raw[:nl]); len(info) > 0 {
				lang = info[0]
			}
			code = strings.TrimRight(strings.TrimLeft(raw[nl+1:], "\r\n"), " \t\r\n")
		}

		fmt.Fprintf(&result, "--- 代码块 %d", i+1)
		if lang != "" {
			fmt.Fprintf(&result, " (语言: %s)", lang)
		}
		fmt.Fprintf(&result, " ---\n%s\n\n", truncateText(code, 500))
	}

	return &ToolResult{
		ToolName: "markdown",
		Success:  true,
		Data:     strings.TrimSpace(result.String()),
	}, nil
}

// mdTOCHeading matches a single ATX heading line: one to six "#" characters,
// at least one space or tab, then the title. It is applied line by line, so
// an empty "#" line can never capture text from the lines that follow it.
var mdTOCHeading = regexp.MustCompile(`^(#{1,6})[ \t]+(.+)$`)

// handleTableOfContents builds an indented outline from the headings in md.
//
// Each heading becomes one line: two spaces of indentation per level below
// the first, the original "#" markers, and the trimmed title. Lines inside
// fenced code blocks are ignored, so "#" comments in code samples are not
// listed as headings. When no heading is found, Data is a fixed "no headings
// found" message. The result is always successful and the error is always
// nil.
func (t *MarkdownTool) handleTableOfContents(md string) (*ToolResult, error) {
	var toc strings.Builder
	count := 0
	inFence := false

	for _, line := range strings.Split(md, "\n") {
		line = strings.TrimRight(line, "\r")

		if trimmed := strings.TrimSpace(line); strings.HasPrefix(trimmed, "```") {
			// A fence that also closes on the same line does not change the
			// state; any other fence line opens or closes a block.
			if !strings.Contains(trimmed[3:], "```") {
				inFence = !inFence
			}
			continue
		}
		if inFence {
			continue
		}

		m := mdTOCHeading.FindStringSubmatch(line)
		if m == nil {
			continue
		}
		title := strings.TrimSpace(m[2])
		if title == "" {
			continue
		}

		level := len(m[1])
		fmt.Fprintf(&toc, "%s%s %s\n", strings.Repeat("  ", level-1), m[1], title)
		count++
	}

	if count == 0 {
		return &ToolResult{
			ToolName: "markdown",
			Success:  true,
			Data:     "未找到任何标题，无法生成目录",
		}, nil
	}

	return &ToolResult{
		ToolName: "markdown",
		Success:  true,
		Data:     fmt.Sprintf("文档目录 (共 %d 个标题):\n\n%s", count, toc.String()),
	}, nil
}

// --- Markdown helper functions below ---

// processLists turns each run of consecutive list-item lines into a
// <listTag>...</listTag> element with one <li> per item.
//
// itemPattern is the regular expression for the item marker; "(.+)$" is
// appended to it to capture the item text, and the first capture group of the
// combined pattern becomes the <li> content. Lines that do not match are
// copied through unchanged and end any list that is currently open. The
// pattern is compiled with regexp.MustCompile, so an invalid itemPattern
// panics.
func (t *MarkdownTool) processLists(html, itemPattern, listTag string) string {
	itemRe := regexp.MustCompile(itemPattern + `(.+)$`)
	opening := fmt.Sprintf("<%s>", listTag)
	closing := fmt.Sprintf("</%s>", listTag)

	var out []string
	open := false
	for _, ln := range strings.Split(html, "\n") {
		isItem := itemRe.MatchString(ln)

		// Emit the list boundary whenever the "inside a list" state flips.
		switch {
		case isItem && !open:
			out = append(out, opening)
			open = true
		case !isItem && open:
			out = append(out, closing)
			open = false
		}

		if isItem {
			ln = fmt.Sprintf("<li>%s</li>", itemRe.ReplaceAllString(ln, "$1"))
		}
		out = append(out, ln)
	}

	// Input that ends on a list item still needs its closing tag.
	if open {
		out = append(out, closing)
	}

	return strings.Join(out, "\n")
}

// wrapParagraphs wraps every line that is not already block-level HTML in a
// <p> element.
//
// Blank (whitespace-only) lines and lines whose trimmed text begins with one
// of the listed tag prefixes are copied through untouched. Every other line
// is trimmed of surrounding whitespace and emitted as <p>line</p>.
func (t *MarkdownTool) wrapParagraphs(html string) string {
	blockPrefixes := []string{
		"<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>",
		"<hr>", "<ul>", "</ul>", "<ol>", "</ol>",
		"<li>", "</li>", "<blockquote>", "</blockquote>",
		"<pre>", "</pre>", "<img",
	}
	startsBlock := func(s string) bool {
		for _, prefix := range blockPrefixes {
			if strings.HasPrefix(s, prefix) {
				return true
			}
		}
		return false
	}

	rows := strings.Split(html, "\n")
	out := make([]string, len(rows))
	for i, row := range rows {
		text := strings.TrimSpace(row)
		if text == "" || startsBlock(text) {
			out[i] = row
			continue
		}
		out[i] = fmt.Sprintf("<p>%s</p>", text)
	}

	return strings.Join(out, "\n")
}

// escapeHTML replaces the four characters &, <, > and " with their HTML
// entity references and copies every other byte through unchanged.
func escapeHTML(s string) string {
	var out strings.Builder
	out.Grow(len(s))

	// All four targets are single ASCII bytes, so a byte-wise scan cannot
	// split a multi-byte UTF-8 sequence.
	for i := 0; i < len(s); i++ {
		switch c := s[i]; c {
		case '&':
			out.WriteString("&" + "amp;")
		case '<':
			out.WriteString("&" + "lt;")
		case '>':
			out.WriteString("&" + "gt;")
		case '"':
			out.WriteString("&" + "quot;")
		default:
			out.WriteByte(c)
		}
	}
	return out.String()
}

// truncateText returns s unchanged when it holds at most maxLen runes;
// otherwise it returns the first maxLen runes of s followed by "...".
//
// A negative maxLen is treated as 0 instead of panicking, so any non-empty
// string is then reduced to "...". The scan stops as soon as the limit is
// passed and the prefix is taken by slicing s, so no rune slice is allocated
// and the kept bytes are exactly those of the input.
func truncateText(s string, maxLen int) string {
	if maxLen < 0 {
		maxLen = 0
	}

	seen := 0
	// Ranging over a string yields the byte offset of each rune, which is
	// the cut point once maxLen runes have been counted.
	for offset := range s {
		if seen == maxLen {
			return s[:offset] + "..."
		}
		seen++
	}
	return s
}