package tools

import (
	"context"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

// VisionTool prepares an image for analysis by a multimodal LLM.
// It reads an image file, encodes it as a base64 data URL, and returns that
// URL together with a task-specific prompt.
type VisionTool struct{}

// NewVisionTool creates a vision tool.
func NewVisionTool() *VisionTool {
	return &VisionTool{}
}

// Definition describes the "vision_analyze" tool: its name, description and
// the JSON-schema-style parameter object. Both "image_path" and "task" are
// declared as required; "task" is restricted to ocr, describe or analyze.
func (t *VisionTool) Definition() ToolDefinition {
	return ToolDefinition{
		Name:        "vision_analyze",
		Description: "分析图片内容。传入图片路径,返回图片的 base64 data URL 用于多模态 LLM 分析。可用于 OCR 文字提取、物体识别、场景理解等。",
		Parameters: map[string]interface{}{
			"type": "object",
			"properties": map[string]interface{}{
				"image_path": map[string]interface{}{
					"type":        "string",
					"description": "图片文件路径",
				},
				"task": map[string]interface{}{
					"type":        "string",
					"description": "分析任务: ocr(文字提取), describe(场景描述), analyze(综合分析)",
					"enum":        []string{"ocr", "describe", "analyze"},
				},
			},
			"required": []string{"image_path", "task"},
		},
	}
}

// visionFailure builds the unsuccessful ToolResult that Execute returns for
// invalid arguments, unreadable images, or encoding problems.
func visionFailure(msg string) *ToolResult {
	return &ToolResult{
		ToolName: "vision_analyze",
		Success:  false,
		Error:    msg,
	}
}

// Execute encodes the image named by args["image_path"] and returns a JSON
// document (in ToolResult.Data) with the keys image_path, task, data_url,
// mime_type, prompt and file_size.
//
// args["task"] selects the prompt; a missing or empty task defaults to
// "analyze", and any other unsupported value is rejected. file_size is the
// size of the decoded image in bytes.
//
// Failures are reported through a ToolResult with Success set to false; the
// returned error is always nil.
func (t *VisionTool) Execute(ctx context.Context, args map[string]interface{}) (*ToolResult, error) {
	// A missing or non-string value leaves imagePath empty and is rejected below.
	imagePath, _ := args["image_path"].(string)
	if imagePath == "" {
		return visionFailure("image_path 参数不能为空"), nil
	}

	// A missing or non-string task falls back to the default analysis.
	task, _ := args["task"].(string)
	if task == "" {
		task = "analyze"
	}

	taskPrompts := map[string]string{
		"ocr":      "请提取这张图片中的所有文字内容,保持原始格式和排版。只输出文字内容,不要添加额外说明。",
		"describe": "请详细描述这张图片的内容,包括场景、物体、人物、颜色、氛围等。",
		"analyze":  "请综合分析这张图片,包括内容描述、文字提取(如有)、以及你的理解。",
	}
	// Validate the task before touching the file so an unsupported value
	// does not silently produce an empty prompt.
	prompt, ok := taskPrompts[task]
	if !ok {
		return visionFailure(fmt.Sprintf("不支持的 task: %q (可选: ocr, describe, analyze)", task)), nil
	}

	dataURL, mimeType, err := encodeImageToDataURL(imagePath)
	if err != nil {
		return visionFailure(fmt.Sprintf("读取图片失败: %v", err)), nil
	}

	// Report the size of the image itself rather than the length of the
	// data URL: take the base64 payload after the comma and subtract the
	// bytes accounted for by '=' padding.
	payload := dataURL[strings.IndexByte(dataURL, ',')+1:]
	padding := len(payload) - len(strings.TrimRight(payload, "="))
	fileSize := base64.StdEncoding.DecodedLen(len(payload)) - padding
	if fileSize < 0 {
		fileSize = 0
	}

	result, err := json.Marshal(map[string]interface{}{
		"image_path": imagePath,
		"task":       task,
		"data_url":   dataURL,
		"mime_type":  mimeType,
		"prompt":     prompt,
		"file_size":  fileSize,
	})
	if err != nil {
		return visionFailure(fmt.Sprintf("序列化结果失败: %v", err)), nil
	}

	return &ToolResult{
		ToolName: "vision_analyze",
		Success:  true,
		Data:     string(result),
	}, nil
}
// encodeImageToDataURL reads the image at path and returns it as a base64
// data URL of the form "data:<mime>;base64,<payload>", together with the MIME
// type that was used.
//
// The MIME type is chosen from the file extension (case-insensitive); an
// unknown or missing extension falls back to "image/png". Only regular,
// non-empty files of at most 20MB are accepted. The size is checked before
// the file is read so that an oversized file is never loaded into memory.
func encodeImageToDataURL(path string) (dataURL, mimeType string, err error) {
	const maxImageBytes = 20 * 1024 * 1024

	info, err := os.Stat(path)
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	// Directories, devices and pipes are not images; reading some of them
	// could block or never terminate.
	if !info.Mode().IsRegular() {
		return "", "", fmt.Errorf("cannot read image: %q is not a regular file", path)
	}
	if info.Size() > maxImageBytes {
		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", info.Size())
	}

	data, err := os.ReadFile(path)
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	// Re-check after reading: the file may have changed since the stat call.
	if len(data) > maxImageBytes {
		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", len(data))
	}
	if len(data) == 0 {
		return "", "", fmt.Errorf("cannot read image: %q is empty", path)
	}

	switch strings.ToLower(filepath.Ext(path)) {
	case ".png":
		mimeType = "image/png"
	case ".jpg", ".jpeg":
		mimeType = "image/jpeg"
	case ".gif":
		mimeType = "image/gif"
	case ".webp":
		mimeType = "image/webp"
	case ".bmp":
		mimeType = "image/bmp"
	case ".svg":
		mimeType = "image/svg+xml"
	default:
		mimeType = "image/png"
	}

	b64 := base64.StdEncoding.EncodeToString(data)
	return fmt.Sprintf("data:%s;base64,%s", mimeType, b64), mimeType, nil
}