feat: Phase 6.3 视觉理解 — 多模态图片输入 + OCR/Vision 工具 + 图片编码管线

- LLMMessage 新增 Images 字段支持多模态 content array
- OpenAIProvider 支持 image_url content parts
- VisionTool: 图片读取 + base64 编码 + OCR/场景描述/综合分析
- 对话管道全线支持 images 参数传递 (Gateway->Orchestrator->Synthesizer->LLM)
- 自动根据图片有无构建 text-only 或 multimodal content

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-23 22:28:42 +08:00
parent 38b36fc5ad
commit 9a8fb8d0ce
7 changed files with 205 additions and 24 deletions
@@ -0,0 +1,122 @@
+package tools
+
+import (
+	"context"
+	"encoding/base64"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strings"
+)
+
+// VisionTool is the "vision_analyze" tool. It prepares an image for a
+// multimodal LLM; it does not call a model itself. Execute reads the image
+// file, encodes it as a base64 data URL, and returns that URL together with a
+// task-specific prompt.
+//
+// The struct has no fields, so a VisionTool carries no state.
+type VisionTool struct{}
+
+// NewVisionTool returns a pointer to a freshly allocated, zero-valued
+// VisionTool. The type has no fields, so there is nothing to configure.
+func NewVisionTool() *VisionTool {
+	return new(VisionTool)
+}
+
+// Definition describes the tool: its name ("vision_analyze"), a description,
+// and a JSON-schema-style parameter object with two required string
+// properties, "image_path" and "task" (one of "ocr", "describe", "analyze").
+func (t *VisionTool) Definition() ToolDefinition {
+	// Schema of the "image_path" argument.
+	imagePathSchema := map[string]interface{}{
+		"type":        "string",
+		"description": "图片文件路径",
+	}
+
+	// Schema of the "task" argument, restricted to the three supported tasks.
+	taskSchema := map[string]interface{}{
+		"type":        "string",
+		"description": "分析任务: ocr(文字提取), describe(场景描述), analyze(综合分析)",
+		"enum":        []string{"ocr", "describe", "analyze"},
+	}
+
+	paramSchema := map[string]interface{}{
+		"type": "object",
+		"properties": map[string]interface{}{
+			"image_path": imagePathSchema,
+			"task":       taskSchema,
+		},
+		"required": []string{"image_path", "task"},
+	}
+
+	return ToolDefinition{
+		Name:        "vision_analyze",
+		Description: "分析图片内容。传入图片路径，返回图片的 base64 data URL 用于多模态 LLM 分析。可用于 OCR 文字提取、物体识别、场景理解等。",
+		Parameters:  paramSchema,
+	}
+}
+
+// Execute validates the arguments, encodes the image at args["image_path"] as
+// a base64 data URL, and returns a JSON document in ToolResult.Data holding
+// the image path, the task, the data URL, its MIME type, the prompt for the
+// requested task, and the size of the image in bytes.
+//
+// args["task"] selects the prompt ("ocr", "describe" or "analyze"); a missing
+// or empty task defaults to "analyze", and any other value is rejected.
+// Invalid arguments and read failures are reported through a ToolResult with
+// Success=false; the returned error is always nil. ctx is currently unused.
+func (t *VisionTool) Execute(ctx context.Context, args map[string]interface{}) (*ToolResult, error) {
+	// fail builds the unsuccessful result shared by every error path.
+	fail := func(msg string) (*ToolResult, error) {
+		return &ToolResult{
+			ToolName: "vision_analyze",
+			Success:  false,
+			Error:    msg,
+		}, nil
+	}
+
+	taskPrompts := map[string]string{
+		"ocr":      "请提取这张图片中的所有文字内容，保持原始格式和排版。只输出文字内容，不要添加额外说明。",
+		"describe": "请详细描述这张图片的内容，包括场景、物体、人物、颜色、氛围等。",
+		"analyze":  "请综合分析这张图片，包括内容描述、文字提取(如有)、以及你的理解。",
+	}
+
+	imagePath, _ := args["image_path"].(string)
+	if imagePath == "" {
+		return fail("image_path 参数不能为空")
+	}
+
+	task, _ := args["task"].(string)
+	if task == "" {
+		task = "analyze"
+	}
+	// Resolve the prompt before touching the disk so that an unsupported task
+	// is rejected instead of silently producing an empty prompt.
+	prompt, ok := taskPrompts[task]
+	if !ok {
+		return fail(fmt.Sprintf("不支持的 task: %q (可选: ocr, describe, analyze)", task))
+	}
+
+	dataURL, mimeType, err := encodeImageToDataURL(imagePath)
+	if err != nil {
+		return fail(fmt.Sprintf("读取图片失败: %v", err))
+	}
+
+	// file_size is the size of the image itself, not of the data URL: take the
+	// base64 payload after the comma and subtract the "=" padding from the
+	// padded decoded length.
+	payload := dataURL[strings.IndexByte(dataURL, ',')+1:]
+	padding := len(payload) - len(strings.TrimRight(payload, "="))
+	fileSize := base64.StdEncoding.DecodedLen(len(payload)) - padding
+
+	result, err := json.Marshal(map[string]interface{}{
+		"image_path": imagePath,
+		"task":       task,
+		"data_url":   dataURL,
+		"mime_type":  mimeType,
+		"prompt":     prompt,
+		"file_size":  fileSize,
+	})
+	if err != nil {
+		return fail(fmt.Sprintf("序列化结果失败: %v", err))
+	}
+
+	return &ToolResult{
+		ToolName: "vision_analyze",
+		Success:  true,
+		Data:     string(result),
+	}, nil
+}
+
+// encodeImageToDataURL reads an image file and returns a base64 data URL.
+func encodeImageToDataURL(path string) (dataURL, mimeType string, err error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return "", "", fmt.Errorf("cannot read image: %w", err)
+	}
+
+	if len(data) > 20*1024*1024 {
+		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", len(data))
+	}
+
+	ext := strings.ToLower(filepath.Ext(path))
+	switch ext {
+	case ".png":
+		mimeType = "image/png"
+	case ".jpg", ".jpeg":
+		mimeType = "image/jpeg"
+	case ".gif":
+		mimeType = "image/gif"
+	case ".webp":
+		mimeType = "image/webp"
+	case ".bmp":
+		mimeType = "image/bmp"
+	case ".svg":
+		mimeType = "image/svg+xml"
+	default:
+		mimeType = "image/png"
+	}
+
+	b64 := base64.StdEncoding.EncodeToString(data)
+	return fmt.Sprintf("data:%s;base64,%s", mimeType, b64), mimeType, nil
+}