// Cyrene/backend/ai-core/internal/tools/text_tool.go

package tools

import (
	"context"
	"fmt"
	"regexp"
	"strings"
	"unicode"
)

// TextTool exposes text-processing operations to the LLM through function
// calling. It holds no state; Execute dispatches on the "action" argument to
// one of four operations: count, summarize, translate, or extract.
type TextTool struct{}

// NewTextTool returns a ready-to-use TextTool. The type has no fields, so
// there is nothing to configure.
func NewTextTool() *TextTool {
	return new(TextTool)
}

// Definition describes the tool for LLM function calling: its name, a
// human-readable description, and a JSON-Schema-style parameter object.
//
// The schema declares four string properties. "action" and "text" are
// required; "target_lang" and "pattern" are optional.
func (t *TextTool) Definition() ToolDefinition {
	// Operation selector; the enum lists every action Execute dispatches on.
	actionParam := map[string]interface{}{
		"type":        "string",
		"enum":        []string{"count", "summarize", "translate", "extract"},
		"description": "操作类型。count: 统计字符/单词/行/段落数；summarize: 提取首段+关键句生成简单摘要；translate: 翻译文本（需指定target_lang）；extract: 正则提取邮箱/电话/URL等",
	}

	// The text to operate on.
	textParam := map[string]interface{}{
		"type":        "string",
		"description": "输入文本，需要处理的文本内容",
	}

	// Target language code for the translate action.
	targetLangParam := map[string]interface{}{
		"type":        "string",
		"enum":        []string{"en", "zh", "ja", "ko", "fr", "de"},
		"description": "翻译目标语言代码。en: 英语, zh: 中文, ja: 日语, ko: 韩语, fr: 法语, de: 德语",
	}

	// Regular expression (or preset name) for the extract action.
	patternParam := map[string]interface{}{
		"type":        "string",
		"description": "正则表达式模式，用于 extract 操作。常用预设: email(邮箱), phone(电话), url(网址)",
	}

	schema := map[string]interface{}{
		"type": "object",
		"properties": map[string]interface{}{
			"action":      actionParam,
			"text":        textParam,
			"target_lang": targetLangParam,
			"pattern":     patternParam,
		},
		"required": []string{"action", "text"},
	}

	return ToolDefinition{
		Name:        "text",
		Description: "文本处理工具。统计文本、生成摘要、翻译文本、正则提取信息。用于处理用户提供的文本内容。",
		Parameters:  schema,
	}
}

// Execute validates the common arguments and routes the call to the handler
// for the requested action.
//
// "action" and "text" must both be present as strings; "text" must contain at
// least one non-whitespace character. Validation failures and unknown actions
// are reported through a ToolResult with Success=false and a nil error, so
// the returned error is only ever whatever the selected handler returns.
// ctx is accepted for interface compatibility and is not consulted here.
func (t *TextTool) Execute(ctx context.Context, arguments map[string]interface{}) (*ToolResult, error) {
	// reject builds the uniform failure result used for every validation error.
	reject := func(message string) (*ToolResult, error) {
		return &ToolResult{ToolName: "text", Success: false, Error: message}, nil
	}

	action, isString := arguments["action"].(string)
	if !isString || action == "" {
		return reject("缺少 action 参数")
	}

	text, isString := arguments["text"].(string)
	if !isString || strings.TrimSpace(text) == "" {
		return reject("缺少 text 参数或文本为空")
	}

	// count and summarize only need the text; translate and extract read
	// their extra options from the full argument map.
	if action == "count" {
		return t.handleCount(text)
	}
	if action == "summarize" {
		return t.handleSummarize(text)
	}
	if action == "translate" {
		return t.handleTranslate(arguments)
	}
	if action == "extract" {
		return t.handleExtract(arguments)
	}
	return reject(fmt.Sprintf("未知操作: %s，支持: count, summarize, translate, extract", action))
}

// handleCount reports size statistics for text as a formatted, human-readable
// string in ToolResult.Data. It always succeeds and returns a nil error.
//
// Reported figures:
//   - characters including whitespace (Unicode code points)
//   - characters excluding whitespace; every Unicode whitespace rune is
//     excluded (tabs, newlines, the full-width space U+3000, ...), not only
//     the ASCII space
//   - bytes (UTF-8 length)
//   - words: whitespace-delimited tokens as produced by strings.Fields, so a
//     run of CJK text without spaces counts as a single word
//   - lines: a trailing newline terminates the last line instead of starting
//     an extra empty one ("a\nb\n" is two lines)
//   - paragraphs: maximal runs of non-blank lines, i.e. blocks separated by
//     one or more blank (whitespace-only) lines
//   - Han characters
func (t *TextTool) handleCount(text string) (*ToolResult, error) {
	// A single pass over the runes gathers every per-character statistic
	// without materialising intermediate []rune copies of the input.
	var charCount, nonSpaceCount, chineseCount int
	for _, r := range text {
		charCount++
		if !unicode.IsSpace(r) {
			nonSpaceCount++
		}
		if unicode.Is(unicode.Han, r) {
			chineseCount++
		}
	}

	byteCount := len(text)
	wordCount := len(strings.Fields(text))

	var lines []string
	if text != "" {
		lines = strings.Split(text, "\n")
		// Split yields a final empty element when the text ends with a
		// newline; that element is a line terminator, not a line.
		if strings.HasSuffix(text, "\n") {
			lines = lines[:len(lines)-1]
		}
	}
	lineCount := len(lines)

	// A paragraph starts at every non-blank line that follows a blank line
	// (or the start of the text).
	paraCount := 0
	inParagraph := false
	for _, line := range lines {
		blank := strings.TrimSpace(line) == ""
		if !blank && !inParagraph {
			paraCount++
		}
		inParagraph = !blank
	}

	return &ToolResult{
		ToolName: "text",
		Success:  true,
		Data: fmt.Sprintf("文本统计结果:\n- 字符数 (含空格): %d\n- 字符数 (不含空格): %d\n- 字节数: %d\n- 单词数: %d\n- 行数: %d\n- 段落数: %d\n- 中文字符数: %d",
			charCount, nonSpaceCount,
			byteCount, wordCount, lineCount, paraCount, chineseCount),
	}, nil
}

// summaryParagraphSepRe matches the gap between two paragraphs: a newline,
// optional whitespace, and another newline. It is compiled once at package
// initialisation rather than on every handleSummarize call.
var summaryParagraphSepRe = regexp.MustCompile(`\n\s*\n`)

// handleSummarize builds a simple extractive summary of text and returns it
// in ToolResult.Data. It always succeeds and returns a nil error.
//
// The summary has up to three sections:
//   - the first non-blank paragraph, truncated to 300 runes plus "..."
//   - up to five key sentences chosen by extractKeySentences
//   - overall counts of paragraphs, sentences, words and lines
//
// Only non-blank paragraphs are counted, and a trailing newline does not
// count as an additional line.
func (t *TextTool) handleSummarize(text string) (*ToolResult, error) {
	var result strings.Builder
	result.WriteString("文本摘要:\n\n")

	// Splitting on blank lines yields empty segments for leading or trailing
	// blank lines; keep only segments with real content so the paragraph
	// count reflects what a reader would see.
	var paragraphs []string
	for _, segment := range summaryParagraphSepRe.Split(text, -1) {
		if trimmed := strings.TrimSpace(segment); trimmed != "" {
			paragraphs = append(paragraphs, trimmed)
		}
	}

	if len(paragraphs) > 0 {
		firstPara := paragraphs[0]
		// Truncate by runes, not bytes, so multi-byte characters are never
		// cut in half.
		if runes := []rune(firstPara); len(runes) > 300 {
			firstPara = string(runes[:300]) + "..."
		}
		result.WriteString("【首段】\n")
		result.WriteString(firstPara)
		result.WriteString("\n\n")
	}

	sentences := t.splitSentences(text)
	keySentences := t.extractKeySentences(sentences, 5)
	if len(keySentences) > 0 {
		result.WriteString("【关键句】\n")
		for i, s := range keySentences {
			fmt.Fprintf(&result, "%d. %s\n", i+1, s)
		}
	}

	// Every "\n" ends a line; text that does not end with a newline has one
	// more, unterminated, final line.
	lineCount := strings.Count(text, "\n")
	if text != "" && !strings.HasSuffix(text, "\n") {
		lineCount++
	}

	fmt.Fprintf(&result, "\n【概况】共 %d 段、%d 句、%d 词、%d 行",
		len(paragraphs), len(sentences), len(strings.Fields(text)), lineCount)

	return &ToolResult{
		ToolName: "text",
		Success:  true,
		Data:     result.String(),
	}, nil
}

// textSentenceRe matches one sentence: a run of non-terminator characters
// followed by either the whole run of closing punctuation ("...", "?!",
// "。") or a single newline. It is compiled once at package initialisation
// rather than on every splitSentences call.
var textSentenceRe = regexp.MustCompile(`[^。！？.!?\n]+(?:[。！？.!?]+|\n)?`)

// splitSentences splits text into sentences at Chinese and Western sentence
// punctuation (。！？ . ! ?) and at newlines.
//
// Each returned sentence keeps its original surrounding whitespace and all of
// its closing punctuation. Fragments consisting only of whitespace are
// dropped so they are not counted as sentences. The result is empty when
// text contains no sentence.
//
// The split is purely punctuation-based: a "." inside a number or a domain
// name ("3.14", "example.com") is treated as a sentence end.
func (t *TextTool) splitSentences(text string) []string {
	matches := textSentenceRe.FindAllString(text, -1)

	// Filter in place; matches is not used again after this loop.
	sentences := matches[:0]
	for _, m := range matches {
		if strings.TrimSpace(m) != "" {
			sentences = append(sentences, m)
		}
	}
	return sentences
}

// extractKeySentences returns up to maxCount of the highest-scoring
// sentences, best first, with surrounding whitespace trimmed.
//
// Scoring is a heuristic: a sentence scores its length in runes, plus 50 for
// each distinct keyword it contains (case-insensitive substring match).
// Sentences shorter than 10 runes after trimming are ignored. Sentences with
// equal scores keep their original relative order.
//
// A maxCount of zero or less yields an empty result.
func (t *TextTool) extractKeySentences(sentences []string, maxCount int) []string {
	if maxCount <= 0 {
		return []string{}
	}

	type scored struct {
		text  string
		score int
	}

	keywords := []string{"重要", "关键", "核心", "主要", "首先", "最后", "因此", "所以", "总结",
		"important", "key", "critical", "significant", "therefore", "conclusion", "summary"}

	// top holds the best candidates seen so far, ordered by descending
	// score. It never grows beyond maxCount entries, so selecting the
	// winners costs O(len(sentences) * maxCount) instead of sorting every
	// candidate.
	capacity := maxCount
	if len(sentences) < capacity {
		capacity = len(sentences)
	}
	top := make([]scored, 0, capacity)

	for _, s := range sentences {
		trimmed := strings.TrimSpace(s)
		length := len([]rune(trimmed))
		if length < 10 {
			continue
		}

		score := length // longer sentences are more likely informative
		lower := strings.ToLower(trimmed)
		for _, kw := range keywords {
			if strings.Contains(lower, kw) {
				score += 50
			}
		}

		// Walk left past strictly lower scores only, so a candidate lands
		// after earlier sentences with the same score (stable ordering).
		pos := len(top)
		for pos > 0 && top[pos-1].score < score {
			pos--
		}
		if pos >= maxCount {
			continue // not better than anything already selected
		}
		if len(top) < maxCount {
			top = append(top, scored{})
		}
		// Shift the tail right by one; when top is full this drops the
		// current lowest-ranked entry.
		copy(top[pos+1:], top[pos:])
		top[pos] = scored{text: trimmed, score: score}
	}

	result := make([]string, 0, len(top))
	for _, entry := range top {
		result = append(result, entry.text)
	}
	return result
}

// handleTranslate does not translate anything itself. It returns a formatted
// translation request (target language, source length in runes, and the
// source text) that instructs the LLM to perform the translation.
//
// Arguments read from the map:
//   - "text": the source text; a missing or non-string value is treated as
//     empty
//   - "target_lang": the target language code; surrounding whitespace is
//     ignored and known codes are matched case-insensitively. A missing,
//     non-string or blank value defaults to "zh". Codes outside the known
//     set are passed through (trimmed) as both the code and the display name.
//
// It always succeeds and returns a nil error.
func (t *TextTool) handleTranslate(arguments map[string]interface{}) (*ToolResult, error) {
	text, _ := arguments["text"].(string)
	requested, _ := arguments["target_lang"].(string)

	requested = strings.TrimSpace(requested)
	if requested == "" {
		requested = "zh"
	}

	langNames := map[string]string{
		"en": "英语",
		"zh": "中文",
		"ja": "日语",
		"ko": "韩语",
		"fr": "法语",
		"de": "德语",
	}

	// Look up the canonical lower-case code so "EN" and "en" resolve to the
	// same language; unknown codes are echoed as the caller wrote them.
	targetLang := strings.ToLower(requested)
	langName, known := langNames[targetLang]
	if !known {
		targetLang = requested
		langName = requested
	}

	return &ToolResult{
		ToolName: "text",
		Success:  true,
		Data: fmt.Sprintf("【翻译请求】\n目标语言: %s (%s)\n原文 (%d 字符):\n---\n%s\n---\n\n提示: 实际翻译由LLM完成，请基于以上原文和目标语言进行翻译。",
			langName, targetLang, len([]rune(text)), text),
	}, nil
}

// extractPreset pairs a preset name accepted in the "pattern" argument with
// its compiled regular expression.
type extractPreset struct {
	name string
	re   *regexp.Regexp
}

// extractPresets lists the built-in patterns in the order they are reported
// when no pattern is given. A slice is used instead of a map so that the
// output order is deterministic, and the expressions are compiled once at
// package initialisation.
var extractPresets = []extractPreset{
	{name: "email", re: regexp.MustCompile(`[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}`)},
	{name: "phone", re: regexp.MustCompile(`(?:\+?86[\-\s]?)?1[3-9]\d{9}`)},
	{name: "url", re: regexp.MustCompile(`https?://[^\s<>"{}|\\^` + "`" + `\[\]]+`)},
}

// writeUniqueMatches writes each distinct match to b as a "  - value" bullet
// line, in order of first appearance.
func writeUniqueMatches(b *strings.Builder, matches []string) {
	seen := make(map[string]bool, len(matches))
	for _, m := range matches {
		if seen[m] {
			continue
		}
		seen[m] = true
		b.WriteString("  - ")
		b.WriteString(m)
		b.WriteString("\n")
	}
}

// handleExtract pulls substrings matching a regular expression out of the
// "text" argument and returns them as a bullet list in ToolResult.Data.
//
// The "pattern" argument selects what to extract:
//   - empty or missing: every preset (email, phone, url) is applied in that
//     order and a section is emitted for each preset that matched
//   - a preset name (case-insensitive): that preset's expression is used
//   - anything else: compiled as a custom regular expression; a compile
//     failure is reported with Success=false
//
// Each list is de-duplicated, while the reported count is the total number
// of matches including repeats. Finding nothing is still a success. The
// returned error is always nil.
func (t *TextTool) handleExtract(arguments map[string]interface{}) (*ToolResult, error) {
	text, _ := arguments["text"].(string)
	pattern, _ := arguments["pattern"].(string)

	if pattern == "" {
		var result strings.Builder
		result.WriteString("文本提取结果:\n\n")

		found := false
		for _, preset := range extractPresets {
			matches := preset.re.FindAllString(text, -1)
			if len(matches) == 0 {
				continue
			}
			found = true
			fmt.Fprintf(&result, "【%s】(共 %d 个):\n", preset.name, len(matches))
			writeUniqueMatches(&result, matches)
			result.WriteString("\n")
		}

		if !found {
			return &ToolResult{
				ToolName: "text",
				Success:  true,
				Data:     "未提取到匹配的内容（邮箱、电话、URL）",
			}, nil
		}

		return &ToolResult{
			ToolName: "text",
			Success:  true,
			Data:     result.String(),
		}, nil
	}

	// Resolve a preset name first; anything else is a user-supplied
	// expression.
	var re *regexp.Regexp
	presetName := strings.ToLower(pattern)
	for _, preset := range extractPresets {
		if preset.name == presetName {
			re = preset.re
			break
		}
	}
	if re == nil {
		compiled, err := regexp.Compile(pattern)
		if err != nil {
			return &ToolResult{
				ToolName: "text",
				Success:  false,
				Error:    fmt.Sprintf("正则表达式无效: %v", err),
			}, nil
		}
		re = compiled
	}

	// re.String() is the expression source: the preset's expression when a
	// preset name was given, otherwise the caller's pattern unchanged.
	matches := re.FindAllString(text, -1)
	if len(matches) == 0 {
		return &ToolResult{
			ToolName: "text",
			Success:  true,
			Data:     fmt.Sprintf("未找到匹配模式 '%s' 的内容", re.String()),
		}, nil
	}

	var result strings.Builder
	fmt.Fprintf(&result, "正则提取结果 (模式: %s, 共 %d 个匹配):\n", re.String(), len(matches))
	writeUniqueMatches(&result, matches)

	return &ToolResult{
		ToolName: "text",
		Success:  true,
		Data:     result.String(),
	}, nil
}