feat: ASR语音转写管线 + 群聊身份混淆修复

- 新增ASR语音识别管线: QQ语音→下载音频→qwen3-asr-flash转录→注入用户消息
- 模型名称全部从models.json路由获取，无硬编码
- 修复群聊中AI将非管理员用户误称为管理员昵称(叶酱)的问题
- 助手回复缓存时标注[回复昵称 (UID)]，防止对话历史中身份混淆
- 群聊上下文指令改为肯定性表述，移除具体名称提及
- trace面板时间戳改为YYYY-MM-DD HH:MM:SS格式，耗时统一显示为秒
- 修复Go time.Duration纳秒值在前端显示问题(Duration/1e6转毫秒)
- 新增video_tool插件模板
- 优化OpenAI adapter reasoning_content处理

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-31 16:46:47 +08:00
parent d112fdd540
commit a9c79d7887
16 changed files with 780 additions and 67 deletions
@@ -0,0 +1,88 @@
+package tools
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+
+	"git.yeij.top/AskaEth/Cyrene/ai-core/internal/llm"
+	"git.yeij.top/AskaEth/Cyrene/ai-core/internal/model"
+)
+
+// VideoTool is a tool that answers questions about a video by forwarding the
+// video reference and a task prompt to a multimodal LLM provider.
+type VideoTool struct {
+	// videoProvider is the LLM backend that Execute calls. When it is nil,
+	// Execute returns a failed ToolResult instead of calling a model.
+	videoProvider llm.LLMProvider
+}
+
+// NewVideoTool builds a VideoTool around videoProvider.
+//
+// videoProvider may be nil; the tool is still constructed, but Execute then
+// reports "视频理解模型未配置" as a failure rather than analyzing anything.
+func NewVideoTool(videoProvider llm.LLMProvider) *VideoTool {
+	tool := new(VideoTool)
+	tool.videoProvider = videoProvider
+	return tool
+}
+
+// Definition returns the tool schema for "video_analyze": its name, its
+// description, and a JSON-Schema-style parameter object with two required
+// string parameters, video_path and task.
+func (t *VideoTool) Definition() ToolDefinition {
+	// video_path: where the video lives (file path or URL).
+	videoPathParam := map[string]interface{}{
+		"type":        "string",
+		"description": "视频文件路径或URL",
+	}
+
+	// task: which kind of analysis to run; restricted to three values.
+	taskParam := map[string]interface{}{
+		"type":        "string",
+		"description": "分析任务: describe(内容描述), summarize(摘要), analyze(综合分析)",
+		"enum":        []string{"describe", "summarize", "analyze"},
+	}
+
+	parameters := map[string]interface{}{
+		"type": "object",
+		"properties": map[string]interface{}{
+			"video_path": videoPathParam,
+			"task":       taskParam,
+		},
+		"required": []string{"video_path", "task"},
+	}
+
+	return ToolDefinition{
+		Name:        "video_analyze",
+		Description: "分析视频内容。传入视频文件路径或URL，返回视频内容的文字描述和分析结果。支持场景理解、动作识别、文字提取等。",
+		Parameters:  parameters,
+	}
+}
+
+// videoTaskPrompts maps each supported task name to the prompt text that
+// Execute sends to the video model for that task.
+var videoTaskPrompts = map[string]string{
+	"describe":  "请详细描述这个视频的内容，包括场景、人物、动作、对话要点等。",
+	"summarize": "请用简洁的语言总结这个视频的主要内容。",
+	"analyze":   "请综合分析这个视频，包括内容描述、关键片段、文字信息(如有)、以及你的理解。",
+}
+
+func (t *VideoTool) Execute(ctx context.Context, args map[string]interface{}) (*ToolResult, error) {
+	videoPath, _ := args["video_path"].(string)
+	if videoPath == "" {
+		return &ToolResult{ToolName: "video_analyze", Success: false, Error: "video_path 参数不能为空"}, nil
+	}
+
+	task, _ := args["task"].(string)
+	if task == "" {
+		task = "analyze"
+	}
+
+	prompt := videoTaskPrompts[task]
+	if prompt == "" {
+		prompt = videoTaskPrompts["analyze"]
+	}
+
+	if t.videoProvider == nil {
+		return &ToolResult{ToolName: "video_analyze", Success: false, Error: "视频理解模型未配置"}, nil
+	}
+
+	messages := []model.LLMMessage{
+		{Role: model.RoleUser, Content: prompt, VideoURLs: []string{videoPath}},
+	}
+	resp, err := t.videoProvider.Chat(ctx, messages)
+	if err != nil {
+		return &ToolResult{ToolName: "video_analyze", Success: false, Error: fmt.Sprintf("视频模型调用失败: %v", err)}, nil
+	}
+
+	output, _ := json.Marshal(map[string]interface{}{
+		"video_path":        videoPath,
+		"task":              task,
+		"model":             t.videoProvider.ModelName(),
+		"text":              resp.Content,
+		"prompt_tokens":     resp.Usage.PromptTokens,
+		"completion_tokens": resp.Usage.CompletionTokens,
+		"total_tokens":      resp.Usage.TotalTokens,
+	})
+	return &ToolResult{ToolName: "video_analyze", Success: true, Data: string(output)}, nil
+}