// Cyrene/backend/ai-core/internal/tools/video_tool.go

package tools

import (
	"context"
	"encoding/json"
	"fmt"

	"git.yeij.top/AskaEth/Cyrene/ai-core/internal/llm"
	"git.yeij.top/AskaEth/Cyrene/ai-core/internal/model"
)

// VideoTool is the video_analyze tool: it hands a video reference and a task
// prompt to a multimodal LLM provider and returns the model's answer.
type VideoTool struct {
	// videoProvider is the chat model used for video understanding. It may be
	// nil, in which case Execute reports that no video model is configured.
	videoProvider llm.LLMProvider
}

// NewVideoTool builds a VideoTool around videoProvider.
//
// A nil videoProvider is accepted: the tool can still be constructed and
// described, but Execute returns a failed result stating that the video model
// is not configured instead of calling a model.
func NewVideoTool(videoProvider llm.LLMProvider) *VideoTool {
	tool := new(VideoTool)
	tool.videoProvider = videoProvider
	return tool
}

// Definition returns the metadata that describes this tool: its name
// ("video_analyze"), a human-readable description, and a JSON-Schema-style
// parameter object. The schema declares two string parameters, video_path and
// task (restricted to describe/summarize/analyze), and marks both as required.
func (t *VideoTool) Definition() ToolDefinition {
	// Schema for the video location argument.
	videoPathParam := map[string]interface{}{
		"type":        "string",
		"description": "视频文件路径或URL",
	}

	// Schema for the analysis task selector.
	taskParam := map[string]interface{}{
		"type":        "string",
		"description": "分析任务: describe(内容描述), summarize(摘要), analyze(综合分析)",
		"enum":        []string{"describe", "summarize", "analyze"},
	}

	schema := map[string]interface{}{
		"type": "object",
		"properties": map[string]interface{}{
			"video_path": videoPathParam,
			"task":       taskParam,
		},
		"required": []string{"video_path", "task"},
	}

	return ToolDefinition{
		Name:        "video_analyze",
		Description: "分析视频内容。传入视频文件路径或URL，返回视频内容的文字描述和分析结果。支持场景理解、动作识别、文字提取等。",
		Parameters:  schema,
	}
}

// videoTaskPrompts maps each supported task name to the instruction text that
// is sent to the video model together with the video reference.
var videoTaskPrompts = map[string]string{
	"analyze":   "请综合分析这个视频，包括内容描述、关键片段、文字信息(如有)、以及你的理解。",
	"describe":  "请详细描述这个视频的内容，包括场景、人物、动作、对话要点等。",
	"summarize": "请用简洁的语言总结这个视频的主要内容。",
}

// Execute runs the video_analyze tool.
//
// Recognized entries in args:
//   - "video_path" (string, required): path or URL of the video. It is passed
//     to the provider unchanged in the message's VideoURLs field.
//   - "task" (string): a key of videoTaskPrompts. A missing, non-string, or
//     unrecognized value is treated as "analyze".
//
// Every failure (empty video_path, no provider configured, provider call
// error, result encoding error) is reported through the returned ToolResult
// with Success=false and a message in Error; the error return value is always
// nil. On success, ToolResult.Data is a JSON object containing the video path,
// the task that was actually performed, the provider's model name, the model's
// text, and token usage counters.
func (t *VideoTool) Execute(ctx context.Context, args map[string]interface{}) (*ToolResult, error) {
	const toolName = "video_analyze"

	// fail builds the uniform failure result used by every error path.
	fail := func(msg string) (*ToolResult, error) {
		return &ToolResult{ToolName: toolName, Success: false, Error: msg}, nil
	}

	// A failed type assertion leaves the zero value "", which the emptiness
	// check below rejects, so absent and non-string values are handled alike.
	videoPath, _ := args["video_path"].(string)
	if videoPath == "" {
		return fail("video_path 参数不能为空")
	}

	// Resolve the prompt, normalizing task so the reported task always names
	// the prompt that was really sent (unknown tasks fall back to "analyze").
	task, _ := args["task"].(string)
	prompt := videoTaskPrompts[task]
	if prompt == "" {
		task = "analyze"
		prompt = videoTaskPrompts[task]
	}

	if t.videoProvider == nil {
		return fail("视频理解模型未配置")
	}

	messages := []model.LLMMessage{
		{Role: model.RoleUser, Content: prompt, VideoURLs: []string{videoPath}},
	}
	resp, err := t.videoProvider.Chat(ctx, messages)
	if err != nil {
		return fail(fmt.Sprintf("视频模型调用失败: %v", err))
	}

	output, err := json.Marshal(map[string]interface{}{
		"video_path":        videoPath,
		"task":              task,
		"model":             t.videoProvider.ModelName(),
		"text":              resp.Content,
		"prompt_tokens":     resp.Usage.PromptTokens,
		"completion_tokens": resp.Usage.CompletionTokens,
		"total_tokens":      resp.Usage.TotalTokens,
	})
	if err != nil {
		// Surface encoding problems instead of returning an empty Data string
		// flagged as success.
		return fail(fmt.Sprintf("结果序列化失败: %v", err))
	}
	return &ToolResult{ToolName: toolName, Success: true, Data: string(output)}, nil
}