feat: ASR语音转写管线 + 群聊身份混淆修复

- 新增ASR语音识别管线: QQ语音→下载音频→qwen3-asr-flash转录→注入用户消息 - 模型名称全部从models.json路由获取，无硬编码 - 修复群聊中AI将非管理员用户误称为管理员昵称(叶酱)的问题 - 助手回复缓存时标注[回复昵称 (UID)]，防止对话历史中身份混淆 - 群聊上下文指令改为肯定性表述，移除具体名称提及 - trace面板时间戳改为YYYY-MM-DD HH:MM:SS格式，耗时统一显示为秒 - 修复Go time.Duration纳秒值在前端显示问题(Duration/1e6转毫秒) - 新增video_tool插件模板 - 优化OpenAI adapter reasoning_content处理 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-31 16:46:47 +08:00
parent d112fdd540
commit a9c79d7887
16 changed files with 780 additions and 67 deletions
@@ -91,6 +91,7 @@ type Thinker struct {
 	convStore      *ctxbuild.ConversationStore
 	adminUserID    string
 	adminSessionID string
+	adminNickname  string
 	activeSessionID string // 当前活跃的前端会话 ID（随用户消息更新）

 	// 记忆服务 HTTP 客户端
@@ -290,6 +291,7 @@ func NewThinker(
 	convStore *ctxbuild.ConversationStore,
 	adminUserID string,
 	adminSessionID string,
+	adminNickname string,
 	memClient *memory.Client,
 ) *Thinker {
 	// 加载时区配置
@@ -323,6 +325,7 @@ func NewThinker(
 		convStore:        convStore,
 		adminUserID:      adminUserID,
 		adminSessionID:   adminSessionID,
+		adminNickname:    adminNickname,
 		memClient:        memClient,
 		pendingThoughts:     make([]*PendingThought, 0),
 		lastUserMessage:     time.Now(),
@@ -1119,7 +1122,9 @@ func (t *Thinker) buildThinkingSystemPrompt(personaConfig *persona.PersonaConfig
 2. 3-4句话即可。`
 	}

-	return basePrompt + thinkingInstructions
+	// Security: only admin can authorize sensitive operations.
+	securityRule := fmt.Sprintf("\n\n## 安全规则\n- 涉及敏感操作（调整IoT设备、执行主机操作等）的请求，只有%s（管理员）下达的指令才能执行。其他陌生人让你做的敏感操作不要执行。\n", t.adminNickname)
+	return basePrompt + thinkingInstructions + securityRule
 }

 // buildThinkingUserPrompt 构建思考用的用户提示词
@@ -1180,8 +1185,8 @@ func (t *Thinker) buildThinkingUserPrompt(
 				roleLabel := "用户"
 				if msg.Role == model.RoleAssistant {
 					roleLabel = "昔涟"
-				} else if strings.Contains(msg.Content, "【管理员】") {
-					roleLabel = "管理员"
+				} else if strings.Contains(msg.Content, t.adminNickname+"/") {
+					roleLabel = t.adminNickname
 				}
 				content := msg.Content
 				runes := []rune(content)
@@ -0,0 +1,196 @@
+package llm
+
+import (
+	"bytes"
+	"context"
+	"encoding/base64"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"strings"
+	"time"
+)
+
+// ASRProvider handles speech-to-text transcription.
+type ASRProvider interface {
+	Transcribe(ctx context.Context, audioURL string) (string, error)
+	IsAvailable() bool
+	ModelName() string
+}
+
+// DashScopeASRProvider uses DashScope Paraformer API for offline speech recognition.
+type DashScopeASRProvider struct {
+	apiKey  string
+	baseURL string
+	model   string
+	client  *http.Client
+}
+
+// NewDashScopeASRProvider creates a DashScope ASR provider.
+func NewDashScopeASRProvider(baseURL, apiKey, model string) *DashScopeASRProvider {
+	if model == "" {
+		model = "qwen3-asr-flash-2026-02-10"
+	}
+	return &DashScopeASRProvider{
+		apiKey:  apiKey,
+		baseURL: baseURL,
+		model:   model,
+		client:  &http.Client{Timeout: 60 * time.Second},
+	}
+}
+
+// IsAvailable returns true if the API key is configured.
+func (p *DashScopeASRProvider) IsAvailable() bool {
+	return p.apiKey != ""
+}
+
+// ModelName returns the ASR model name.
+func (p *DashScopeASRProvider) ModelName() string {
+	return p.model
+}
+
+type asrRequest struct {
+	Model      string       `json:"model"`
+	Input      asrInput     `json:"input"`
+	Parameters asrParams    `json:"parameters"`
+}
+
+type asrInput struct {
+	Audio string `json:"audio"`
+}
+
+type asrParams struct {
+	Format     string `json:"format,omitempty"`
+	SampleRate int    `json:"sample_rate,omitempty"`
+	Language   string `json:"language,omitempty"`
+}
+
+type asrResponse struct {
+	Output struct {
+		Text string `json:"text"`
+	} `json:"output"`
+	Usage struct {
+		TotalTokens int `json:"total_tokens"`
+	} `json:"usage"`
+	RequestID string `json:"request_id"`
+	Code      string `json:"code,omitempty"`
+	Message   string `json:"message,omitempty"`
+}
+
+// downloadAudio fetches audio data from a URL and returns the bytes with inferred format.
+func (p *DashScopeASRProvider) downloadAudio(ctx context.Context, audioURL string) ([]byte, string, error) {
+	req, err := http.NewRequestWithContext(ctx, "GET", audioURL, nil)
+	if err != nil {
+		return nil, "", fmt.Errorf("create download request: %w", err)
+	}
+
+	resp, err := p.client.Do(req)
+	if err != nil {
+		return nil, "", fmt.Errorf("download failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	data, err := io.ReadAll(io.LimitReader(resp.Body, 10<<20)) // 10 MB limit
+	if err != nil {
+		return nil, "", fmt.Errorf("read audio data: %w", err)
+	}
+
+	format := inferAudioFormat(audioURL, resp.Header.Get("Content-Type"))
+	return data, format, nil
+}
+
+// inferAudioFormat determines the audio format from URL extension or Content-Type header.
+func inferAudioFormat(urlStr, contentType string) string {
+	// Try URL extension first
+	u, err := url.Parse(urlStr)
+	if err == nil {
+		path := u.Path
+		if idx := strings.LastIndex(path, "."); idx >= 0 {
+			ext := strings.ToLower(path[idx+1:])
+			switch ext {
+			case "amr", "wav", "mp3", "ogg", "flac", "m4a", "aac", "opus", "webm", "pcm":
+				return ext
+			}
+		}
+	}
+	// Fallback: use Content-Type
+	if strings.Contains(contentType, "audio/amr") || strings.Contains(contentType, "amr") {
+		return "amr"
+	}
+	if strings.Contains(contentType, "audio/wav") || strings.Contains(contentType, "wav") {
+		return "wav"
+	}
+	if strings.Contains(contentType, "audio/mpeg") || strings.Contains(contentType, "mp3") {
+		return "mp3"
+	}
+	if strings.Contains(contentType, "audio/ogg") || strings.Contains(contentType, "opus") {
+		return "ogg"
+	}
+	return "amr" // default for QQ voice messages
+}
+// asrEndpoint derives the DashScope ASR REST endpoint from the provider base URL.
+func asrEndpoint(baseURL string) string {
+	if u, err := url.Parse(baseURL); err == nil {
+		return fmt.Sprintf("%s://%s/api/v1/services/audio/asr/asr", u.Scheme, u.Host)
+	}
+	return strings.TrimRight(baseURL, "/") + "/api/v1/services/audio/asr/asr"
+}
+func (p *DashScopeASRProvider) Transcribe(ctx context.Context, audioURL string) (string, error) {
+	if !p.IsAvailable() {
+		return "", fmt.Errorf("DashScope ASR API key not configured")
+	}
+
+	audioData, format, err := p.downloadAudio(ctx, audioURL)
+	if err != nil {
+		return "", fmt.Errorf("download audio: %w", err)
+	}
+
+	audioB64 := base64.StdEncoding.EncodeToString(audioData)
+
+	reqBody := asrRequest{
+		Model: p.model,
+		Input: asrInput{
+			Audio: fmt.Sprintf("data:audio/%s;base64,%s", format, audioB64),
+		},
+		Parameters: asrParams{
+			Format:   format,
+			Language: "zh",
+		},
+	}
+	bodyBytes, err := json.Marshal(reqBody)
+	if err != nil {
+		return "", fmt.Errorf("marshal ASR request: %w", err)
+	}
+
+	asrURL := asrEndpoint(p.baseURL)
+	req, err := http.NewRequestWithContext(ctx, "POST", asrURL, bytes.NewReader(bodyBytes))
+	if err != nil {
+		return "", fmt.Errorf("create ASR request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Authorization", "Bearer "+p.apiKey)
+
+	resp, err := p.client.Do(req)
+	if err != nil {
+		return "", fmt.Errorf("ASR request failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	respBytes, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return "", fmt.Errorf("read ASR response: %w", err)
+	}
+
+	var asrResp asrResponse
+	if err := json.Unmarshal(respBytes, &asrResp); err != nil {
+		return "", fmt.Errorf("parse ASR response: %w", err)
+	}
+
+	if asrResp.Code != "" && asrResp.Code != "0" {
+		return "", fmt.Errorf("ASR error: %s (code=%s)", asrResp.Message, asrResp.Code)
+	}
+
+	return asrResp.Output.Text, nil
+}
@@ -274,7 +274,7 @@ func (p *OpenAIProvider) doChat(ctx context.Context, messages []model.LLMMessage
 		resolvedImages := p.resolveImages(msg.Images)
 		oaiMsg := openAIMessage{
 			Role:             string(msg.Role),
-			Content:          buildContent(msg.Content, resolvedImages),
+			Content:          buildContent(msg.Content, resolvedImages, msg.VideoURLs),
 			Name:             msg.Name,
 			ToolCallID:       msg.ToolCallID,
 			ReasoningContent: msg.ReasoningContent,
@@ -382,7 +382,7 @@ func (p *OpenAIProvider) doChatStream(ctx context.Context, messages []model.LLMM
 		resolvedImages := p.resolveImages(msg.Images)
 		oaiMsg := openAIMessage{
 			Role:             string(msg.Role),
-			Content:          buildContent(msg.Content, resolvedImages),
+			Content:          buildContent(msg.Content, resolvedImages, msg.VideoURLs),
 			Name:             msg.Name,
 			ToolCallID:       msg.ToolCallID,
 			ReasoningContent: msg.ReasoningContent,
@@ -521,23 +521,27 @@ func (p *OpenAIProvider) downloadAsDataURL(url string) (string, error) {

 // buildContent converts text + optional images to API content format.
 // Returns a plain string if no images, or a multimodal array otherwise.
-func buildContent(text string, images []string) interface{} {
-	if len(images) == 0 {
+func buildContent(text string, images []string, videoURLs []string) interface{} {
+	if len(images) == 0 && len(videoURLs) == 0 {
 		return text
 	}
-	parts := make([]model.ImageContent, 0, len(images)+1)
+	parts := make([]interface{}, 0, len(images)+len(videoURLs)+1)
 	if text != "" {
-		parts = append(parts, model.ImageContent{
-			Type: "text",
-			Text: text,
+		parts = append(parts, map[string]interface{}{
+			"type": "text",
+			"text": text,
 		})
 	}
 	for _, img := range images {
-		parts = append(parts, model.ImageContent{
-			Type: "image_url",
-			ImageURL: &model.ImageURL{
-				URL: img,
-			},
+		parts = append(parts, map[string]interface{}{
+			"type": "image_url",
+			"image_url": map[string]string{"url": img},
+		})
+	}
+	for _, video := range videoURLs {
+		parts = append(parts, map[string]interface{}{
+			"type": "video_url",
+			"video_url": map[string]string{"url": video},
 		})
 	}
 	return parts
@@ -18,8 +18,10 @@ const (
 	PurposeIntentAnalysis   ModelPurpose = "intent_analysis"
 	PurposeToolCalling      ModelPurpose = "tool_calling"
 	PurposeMemoryExtraction ModelPurpose = "memory_extraction"
-	PurposeVision           ModelPurpose = "vision"
-	PurposeOCR              ModelPurpose = "ocr"
+	PurposeVision              ModelPurpose = "vision"
+	PurposeVideo               ModelPurpose = "video"
+	PurposeOCR                 ModelPurpose = "ocr"
+	PurposeSpeechRecognition   ModelPurpose = "speech_recognition"
 )

 // ErrModelNotRequired is returned when an optional model is unavailable.
@@ -17,6 +17,7 @@ type LLMMessage struct {
 	Role             Role       `json:"role"`
 	Content          string     `json:"content"`
 	Images           []string   `json:"images,omitempty"`            // 图片 base64 data URL 列表 (多模态)
+	VideoURLs        []string   `json:"video_urls,omitempty"`        // 视频 URL 列表 (多模态)
 	Name             string     `json:"name,omitempty"`              // 可选发送者名称
 	ToolCallID       string     `json:"tool_call_id,omitempty"`       // 工具调用关联ID (tool role 消息关联调用)
 	ToolCalls        []ToolCall `json:"tool_calls,omitempty"`         // 助手消息中的工具调用列表
@@ -36,6 +37,16 @@ type ImageURL struct {
 	Detail string `json:"detail,omitempty"` // low, high, auto
 }

+// VideoURLContent holds a video URL for multimodal video understanding.
+type VideoURLContent struct {
+	VideoURL *VideoURL `json:"video_url,omitempty"`
+}
+
+// VideoURL holds a video URL.
+type VideoURL struct {
+	URL string `json:"url"`
+}
+
 // ChatMessage 数据库存储的对话消息
 type ChatMessage struct {
 	ID        string    `json:"id" db:"id"`
@@ -41,6 +41,8 @@ type Orchestrator struct {
 	toolRegistry    *plgManager.ToolRegistry
 	visionProvider  llm.LLMProvider // 视觉模型 (图片预处理)
 	ocrProvider     llm.LLMProvider // OCR 模型 (文字提取，与视觉模型并行调用)
+	videoProvider   llm.LLMProvider // 视频模型 (短视频理解)
+	asrProvider     llm.ASRProvider  // ASR 语音识别 (语音消息转录)
 }

 // SetResponseCache sets the response cache (optional, for Phase 0.2).
@@ -84,6 +86,16 @@ func (o *Orchestrator) SetOCRProvider(op llm.LLMProvider) {
 	o.ocrProvider = op
 }

+// SetVideoProvider sets the video model provider for short video understanding.
+func (o *Orchestrator) SetVideoProvider(vp llm.LLMProvider) {
+	o.videoProvider = vp
+}
+
+// SetASRProvider sets the ASR provider for voice message transcription.
+func (o *Orchestrator) SetASRProvider(ap llm.ASRProvider) {
+	o.asrProvider = ap
+}
+
 // getBus returns the bus or a nop fallback.
 func (o *Orchestrator) getBus() bus.Bus {
 	if o.eventBus == nil {
@@ -121,6 +133,8 @@ type ProcessParams struct {
 	SessionID   string
 	Message     string
 	Images      []string // 图片 base64 data URL (多模态)
+	VideoURLs   []string // 视频 URL (多模态), ≤20s short videos
+	VoiceURLs   []string // 语音 URL (ASR 转录)
 	Mode        string   // text / voice_msg / voice_assistant
 	Nickname    string
 	ChannelType string // direct / group
@@ -174,6 +188,34 @@ func (o *Orchestrator) ProcessInput(
 		}
 		// 预处理后清空原始图片，避免后续传给不支持多模态的 Chat 模型
 		params.Images = nil
+
+		// 0.6 视频预处理: 使用视频模型分析短视频 (≤20s)，将描述注入消息
+		if len(params.VideoURLs) > 0 && o.videoProvider != nil {
+			startTime := time.Now()
+			augmented := o.preprocessVideos(ctx, params.Message, params.VideoURLs)
+			if augmented != params.Message {
+				params.Message = augmented
+				logger.Printf("[orchestrator] 视频预处理耗时: %v", time.Since(startTime))
+			}
+			params.VideoURLs = nil
+		} else if len(params.VideoURLs) > 0 {
+			logger.Printf("[orchestrator] 视频模型未配置，丢弃 %d 个视频", len(params.VideoURLs))
+			params.VideoURLs = nil
+		}
+
+		// 0.7 语音预处理: 使用 ASR 模型转录语音消息，将文本注入消息
+		if len(params.VoiceURLs) > 0 && o.asrProvider != nil && o.asrProvider.IsAvailable() {
+			startTime := time.Now()
+			augmented := o.preprocessVoice(ctx, params.Message, params.VoiceURLs)
+			if augmented != params.Message {
+				params.Message = augmented
+				logger.Printf("[orchestrator] 语音预处理耗时: %v", time.Since(startTime))
+			}
+			params.VoiceURLs = nil
+		} else if len(params.VoiceURLs) > 0 {
+			logger.Printf("[orchestrator] ASR模型未配置，丢弃 %d 个语音", len(params.VoiceURLs))
+			params.VoiceURLs = nil
+		}
 	} else if len(params.Images) > 0 {
 		// 未配置 Vision 模型时，告知用户该模型不支持图片，并清空图片避免报错
 		if params.Message == "" {
@@ -234,7 +276,7 @@ func (o *Orchestrator) ProcessInput(
 					eventCh <- model.StreamEvent{Type: model.StreamSegments, Segments: segments}
 				}
 				eventCh <- model.StreamEvent{Type: model.StreamDone}
-				o.contextBuilder.CacheMessage(params.SessionID, model.RoleAssistant, fullContent)
+				o.cacheAssistantMessage(params, fullContent)
 				logger.Printf("[orchestrator] 缓存响应完成: len=%d", len([]rune(fullContent)))
 				return
 			}
@@ -478,7 +520,7 @@ func (o *Orchestrator) ProcessInput(

 		// 10. 后处理：缓存回复
 		if fullContent != "" {
-			o.contextBuilder.CacheMessage(params.SessionID, model.RoleAssistant, fullContent)
+			o.cacheAssistantMessage(params, fullContent)
 			if o.responseCache != nil {
 				o.responseCache.Set(params.Message, fullContent)
 			}
@@ -694,6 +736,19 @@ func (o *Orchestrator) CacheMessage(sessionID string, role model.Role, content s
 	}
 }

+// cacheAssistantMessage caches the assistant response, tagging it with the recipient
+// in group chats so dialog history shows who the AI was addressing.
+func (o *Orchestrator) cacheAssistantMessage(params ProcessParams, fullContent string) {
+	if o.contextBuilder == nil {
+		return
+	}
+	cached := fullContent
+	if params.ChannelType == "group" && params.Nickname != "" {
+		cached = fmt.Sprintf("[回复 %s]\n%s", params.Nickname, fullContent)
+	}
+	o.contextBuilder.CacheMessage(params.SessionID, model.RoleAssistant, cached)
+}
+
 // preprocessImages uses vision and OCR models to analyze images and augments the user message.
 // When both vision and OCR providers are available (and are different models), they are called
 // in parallel and both results are passed to the chat model for autonomous judgment.
@@ -781,6 +836,74 @@ func (o *Orchestrator) preprocessImages(ctx context.Context, message string, ima
 	return augmented
 }

+// preprocessVideos uses the video model to analyze short videos and augments the message.
+func (o *Orchestrator) preprocessVideos(ctx context.Context, message string, videoURLs []string) string {
+	if o.videoProvider == nil {
+		return message
+	}
+
+	var descriptions []string
+	for i, url := range videoURLs {
+		resp, err := o.videoProvider.Chat(ctx, []model.LLMMessage{
+			{Role: model.RoleUser, Content: "请用简短的中文描述这个视频的内容，包括场景、人物、动作等。控制在100字以内。", VideoURLs: []string{url}},
+		})
+		if err != nil {
+			logger.Printf("[orchestrator] 视频 %d 分析失败: %v", i, err)
+			continue
+		}
+		if resp.Content != "" {
+			descriptions = append(descriptions, resp.Content)
+		}
+	}
+
+	if len(descriptions) == 0 {
+		return message
+	}
+
+	if message == "" {
+		return strings.Join(descriptions, "\n\n")
+	}
+
+	augmented := message
+	for i, desc := range descriptions {
+		augmented += fmt.Sprintf("\n\n[视频%d的分析]: %s", i+1, desc)
+	}
+	return augmented
+}
+
+// preprocessVoice transcribes voice messages using the ASR provider and augments the message.
+func (o *Orchestrator) preprocessVoice(ctx context.Context, message string, voiceURLs []string) string {
+	if o.asrProvider == nil || !o.asrProvider.IsAvailable() {
+		return message
+	}
+
+	var transcriptions []string
+	for i, url := range voiceURLs {
+		text, err := o.asrProvider.Transcribe(ctx, url)
+		if err != nil {
+			logger.Printf("[orchestrator] 语音 %d 转录失败: %v", i, err)
+			continue
+		}
+		if text != "" {
+			transcriptions = append(transcriptions, text)
+		}
+	}
+
+	if len(transcriptions) == 0 {
+		return message
+	}
+
+	if message == "" {
+		return strings.Join(transcriptions, "\n\n")
+	}
+
+	augmented := message
+	for i, t := range transcriptions {
+		augmented += fmt.Sprintf("\n\n[语音%d的转写]: %s", i+1, t)
+	}
+	return augmented
+}
+
 // Ensure time, memory are used
 var _ = time.Now
 var _ = memory.NewRetriever
@@ -35,6 +35,7 @@ type SynthesizeParams struct {
 	SessionID          string
 	UserMessage        string
 	Images             []string                    // 图片 base64 data URL (多模态)
+	VideoURLs          []string                    // 视频 URL (多模态)
 	Nickname           string
 	PersonaPrompt      string                      // 完整人格提示词
 	DialogHistory      []model.LLMMessage           // 对话历史
@@ -215,7 +216,7 @@ func (s *Synthesizer) buildSynthesizeMessages(params SynthesizeParams) []model.L
 	if params.ChannelType == "group" {
 		messages = append(messages, model.LLMMessage{
 			Role:    model.RoleSystem,
-			Content: "【群聊上下文】这条消息来自QQ群聊。消息前缀 [群聊 群号] 昵称 (QQ号) 标注了真实发送者。你不是在和开拓者一对一私聊，而是在群聊中和不同成员交流。请用发送者的真实名字称呼，不要叫所有人开拓者或叶酱。只在对你说话或延续已有对话时才回复。",
+			Content: "【群聊上下文】这条消息来自QQ群聊。消息前缀 [群聊 群号] 昵称 (QQ号) 标注了真实发送者。你不是在和开拓者一对一私聊，而是在群聊中和不同成员交流。请根据消息前缀中的发送者名字称呼对方，不同的人有不同的名字。只在对你说话或延续已有对话时才回复。",
 		})
 	}

@@ -280,11 +281,12 @@ func (s *Synthesizer) buildSynthesizeMessages(params SynthesizeParams) []model.L
 		messages = append(messages, history...)
 	}

-	// 当前用户消息 (支持多模态图片)
+	// 当前用户消息 (支持多模态图片和视频)
 	messages = append(messages, model.LLMMessage{
-		Role:    model.RoleUser,
-		Content: params.UserMessage,
-		Images:  params.Images,
+		Role:      model.RoleUser,
+		Content:   params.UserMessage,
+		Images:    params.Images,
+		VideoURLs: params.VideoURLs,
 	})

 	return messages
@@ -0,0 +1,88 @@
+package tools
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+
+	"git.yeij.top/AskaEth/Cyrene/ai-core/internal/llm"
+	"git.yeij.top/AskaEth/Cyrene/ai-core/internal/model"
+)
+
+// VideoTool enables video understanding via multimodal LLM.
+type VideoTool struct {
+	videoProvider llm.LLMProvider
+}
+
+// NewVideoTool creates a video tool. videoProvider is optional (nil = no-op mode).
+func NewVideoTool(videoProvider llm.LLMProvider) *VideoTool {
+	return &VideoTool{videoProvider: videoProvider}
+}
+
+func (t *VideoTool) Definition() ToolDefinition {
+	return ToolDefinition{
+		Name:        "video_analyze",
+		Description: "分析视频内容。传入视频文件路径或URL，返回视频内容的文字描述和分析结果。支持场景理解、动作识别、文字提取等。",
+		Parameters: map[string]interface{}{
+			"type": "object",
+			"properties": map[string]interface{}{
+				"video_path": map[string]interface{}{
+					"type":        "string",
+					"description": "视频文件路径或URL",
+				},
+				"task": map[string]interface{}{
+					"type":        "string",
+					"description": "分析任务: describe(内容描述), summarize(摘要), analyze(综合分析)",
+					"enum":        []string{"describe", "summarize", "analyze"},
+				},
+			},
+			"required": []string{"video_path", "task"},
+		},
+	}
+}
+
+var videoTaskPrompts = map[string]string{
+	"describe":   "请详细描述这个视频的内容，包括场景、人物、动作、对话要点等。",
+	"summarize":  "请用简洁的语言总结这个视频的主要内容。",
+	"analyze":    "请综合分析这个视频，包括内容描述、关键片段、文字信息(如有)、以及你的理解。",
+}
+
+func (t *VideoTool) Execute(ctx context.Context, args map[string]interface{}) (*ToolResult, error) {
+	videoPath, _ := args["video_path"].(string)
+	if videoPath == "" {
+		return &ToolResult{ToolName: "video_analyze", Success: false, Error: "video_path 参数不能为空"}, nil
+	}
+
+	task, _ := args["task"].(string)
+	if task == "" {
+		task = "analyze"
+	}
+
+	prompt := videoTaskPrompts[task]
+	if prompt == "" {
+		prompt = videoTaskPrompts["analyze"]
+	}
+
+	if t.videoProvider == nil {
+		return &ToolResult{ToolName: "video_analyze", Success: false, Error: "视频理解模型未配置"}, nil
+	}
+
+	messages := []model.LLMMessage{
+		{Role: model.RoleUser, Content: prompt, VideoURLs: []string{videoPath}},
+	}
+	resp, err := t.videoProvider.Chat(ctx, messages)
+	if err != nil {
+		return &ToolResult{ToolName: "video_analyze", Success: false, Error: fmt.Sprintf("视频模型调用失败: %v", err)}, nil
+	}
+
+	output, _ := json.Marshal(map[string]interface{}{
+		"video_path":        videoPath,
+		"task":              task,
+		"model":             t.videoProvider.ModelName(),
+		"text":              resp.Content,
+		"prompt_tokens":     resp.Usage.PromptTokens,
+		"completion_tokens": resp.Usage.CompletionTokens,
+		"total_tokens":      resp.Usage.TotalTokens,
+	})
+	return &ToolResult{ToolName: "video_analyze", Success: true, Data: string(output)}, nil
+}