package tools

import (
	"context"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strings"

	"github.com/yourname/cyrene-ai/ai-core/internal/llm"
	"github.com/yourname/cyrene-ai/ai-core/internal/model"
)

const (
	// visionToolName is the name reported in the tool definition and in every
	// ToolResult produced by VisionTool.
	visionToolName = "vision_analyze"

	// visionDefaultTask is the task used when the caller supplies no task or
	// one that has no entry in taskPrompts.
	visionDefaultTask = "analyze"

	// visionMaxImageBytes is the largest image file that will be encoded (20MB).
	visionMaxImageBytes = 20 * 1024 * 1024
)

// VisionTool enables image understanding via a multimodal LLM.
//
// When visionProvider is non-nil, Execute sends the image to that provider and
// returns the model's text. When it is nil, Execute falls back to returning a
// base64 data URL for the caller to process.
type VisionTool struct {
	visionProvider llm.LLMProvider
}

// NewVisionTool creates a vision tool. visionProvider is optional; passing nil
// selects the base64-only fallback mode.
func NewVisionTool(visionProvider llm.LLMProvider) *VisionTool {
	return &VisionTool{visionProvider: visionProvider}
}

// Definition returns the tool's name, description and JSON-schema-style
// parameter specification. Both image_path and task are declared as required.
func (t *VisionTool) Definition() ToolDefinition {
	return ToolDefinition{
		Name:        visionToolName,
		Description: "分析图片内容。传入图片路径,返回图片的 base64 data URL 用于多模态 LLM 分析。可用于 OCR 文字提取、物体识别、场景理解等。",
		Parameters: map[string]interface{}{
			"type": "object",
			"properties": map[string]interface{}{
				"image_path": map[string]interface{}{
					"type":        "string",
					"description": "图片文件路径",
				},
				"task": map[string]interface{}{
					"type":        "string",
					"description": "分析任务: ocr(文字提取), describe(场景描述), analyze(综合分析)",
					"enum":        []string{"ocr", "describe", "analyze"},
				},
			},
			"required": []string{"image_path", "task"},
		},
	}
}

// taskPrompts maps each supported task name to the prompt sent to the vision
// model (or handed back to the caller in fallback mode).
var taskPrompts = map[string]string{
	"ocr":      "请提取这张图片中的所有文字内容,保持原始格式和排版。只输出文字内容,不要添加额外说明。",
	"describe": "请详细描述这张图片的内容,包括场景、物体、人物、颜色、氛围等。",
	"analyze":  "请综合分析这张图片,包括内容描述、文字提取(如有)、以及你的理解。",
}

// Execute runs the vision tool.
//
// Recognized args:
//   - "image_path" (string, required): path of the image file to read.
//   - "task" (string): one of the keys of taskPrompts; a missing, non-string or
//     unknown value is treated as "analyze".
//
// The returned error is always nil; failures are reported through a ToolResult
// with Success set to false and a message in Error. On success, Data holds a
// JSON object: the model's text and token usage when a vision provider is
// configured, otherwise the data URL, MIME type and prompt.
func (t *VisionTool) Execute(ctx context.Context, args map[string]interface{}) (*ToolResult, error) {
	// A non-string image_path fails the assertion and is handled as missing.
	imagePath, _ := args["image_path"].(string)
	if imagePath == "" {
		return t.fail("image_path 参数不能为空"), nil
	}

	// Normalize the task so the reported task always matches the prompt that
	// is actually used; previously an unknown task was echoed back even though
	// the "analyze" prompt had been substituted.
	task, _ := args["task"].(string)
	prompt, known := taskPrompts[task]
	if !known {
		task = visionDefaultTask
		prompt = taskPrompts[visionDefaultTask]
	}

	dataURL, mimeType, err := encodeImageToDataURL(imagePath)
	if err != nil {
		return t.fail(fmt.Sprintf("读取图片失败: %v", err)), nil
	}

	// Fallback: no vision model configured, hand the encoded image back.
	if t.visionProvider == nil {
		return t.succeed(map[string]interface{}{
			"image_path": imagePath,
			"task":       task,
			"data_url":   dataURL,
			"mime_type":  mimeType,
			"prompt":     prompt,
			// NOTE(review): this is the length of the base64 data URL, not the
			// size of the file on disk. Kept as-is for output compatibility.
			"file_size": len(dataURL),
		}), nil
	}

	// A vision model is available: call it directly with the prompt and image.
	messages := []model.LLMMessage{
		{Role: model.RoleUser, Content: prompt, Images: []string{dataURL}},
	}
	resp, err := t.visionProvider.Chat(ctx, messages)
	if err != nil {
		return t.fail(fmt.Sprintf("视觉模型调用失败: %v", err)), nil
	}

	return t.succeed(map[string]interface{}{
		"image_path":        imagePath,
		"task":              task,
		"model":             t.visionProvider.ModelName(),
		"text":              resp.Content,
		"prompt_tokens":     resp.Usage.PromptTokens,
		"completion_tokens": resp.Usage.CompletionTokens,
		"total_tokens":      resp.Usage.TotalTokens,
	}), nil
}

// fail builds an unsuccessful ToolResult carrying msg as the error text.
func (t *VisionTool) fail(msg string) *ToolResult {
	return &ToolResult{
		ToolName: visionToolName,
		Success:  false,
		Error:    msg,
	}
}

// succeed JSON-encodes payload into a successful ToolResult. If encoding
// fails, a failed ToolResult is returned instead of silently emitting empty
// data.
func (t *VisionTool) succeed(payload map[string]interface{}) *ToolResult {
	encoded, err := json.Marshal(payload)
	if err != nil {
		return t.fail(fmt.Sprintf("序列化结果失败: %v", err))
	}
	return &ToolResult{
		ToolName: visionToolName,
		Success:  true,
		Data:     string(encoded),
	}
}

// imageMIMETypes maps lower-case file extensions to image MIME types.
var imageMIMETypes = map[string]string{
	".png":  "image/png",
	".jpg":  "image/jpeg",
	".jpeg": "image/jpeg",
	".gif":  "image/gif",
	".webp": "image/webp",
	".bmp":  "image/bmp",
	".svg":  "image/svg+xml",
}

// encodeImageToDataURL reads the image file at path and returns it as a
// base64 data URL together with the MIME type used in that URL.
//
// The MIME type is chosen from the file extension (case-insensitive); unknown
// or missing extensions default to "image/png". Files larger than 20MB are
// rejected. The size is checked before reading, and the read itself is capped,
// so an oversized file is never loaded fully into memory.
func encodeImageToDataURL(path string) (dataURL, mimeType string, err error) {
	f, err := os.Open(path)
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	defer f.Close()

	info, err := f.Stat()
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	if info.Size() > visionMaxImageBytes {
		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", info.Size())
	}

	// Read at most one byte past the limit: this bounds memory even when the
	// file grows after Stat or Stat reports no meaningful size.
	data, err := io.ReadAll(io.LimitReader(f, visionMaxImageBytes+1))
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	if len(data) > visionMaxImageBytes {
		return "", "", fmt.Errorf("image too large: over %d bytes (max 20MB)", visionMaxImageBytes)
	}

	mimeType, ok := imageMIMETypes[strings.ToLower(filepath.Ext(path))]
	if !ok {
		mimeType = "image/png"
	}

	b64 := base64.StdEncoding.EncodeToString(data)
	return fmt.Sprintf("data:%s;base64,%s", mimeType, b64), mimeType, nil
}