feat: Phase 6.3 视觉理解 — 多模态图片输入 + OCR/Vision 工具 + 图片编码管线

- LLMMessage 新增 Images 字段支持多模态 content array
- OpenAIProvider 支持 image_url content parts
- VisionTool: 图片读取 + base64 编码 + OCR/场景描述/综合分析
- 对话管道全线支持 images 参数传递 (Gateway->Orchestrator->Synthesizer->LLM)
- 自动根据图片有无构建 text-only 或 multimodal content

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-23 22:28:42 +08:00
parent 38b36fc5ad
commit 9a8fb8d0ce
7 changed files with 205 additions and 24 deletions
@@ -0,0 +1,122 @@
package tools
import (
"context"
"encoding/base64"
"encoding/json"
"fmt"
"os"
"path/filepath"
"strings"
)
// VisionTool prepares an image for analysis by a multimodal LLM.
//
// The tool does not call a model itself: Execute reads the image file named
// in its arguments, encodes it as a base64 data URL, and returns that URL
// together with a task-specific prompt so the caller can feed both into the
// vision pipeline. The type carries no state.
type VisionTool struct{}
// NewVisionTool returns a freshly allocated VisionTool. The tool has no
// configuration, so the returned value is ready to use as-is.
func NewVisionTool() *VisionTool {
	tool := new(VisionTool)
	return tool
}
// Definition describes the tool: its name ("vision_analyze"), a
// human-readable description, and a JSON-Schema-style parameter object with
// two required string parameters, "image_path" and "task". "task" is
// restricted to the values "ocr", "describe" and "analyze".
func (t *VisionTool) Definition() ToolDefinition {
	// Schema for the path of the image to load.
	imagePathSchema := map[string]interface{}{
		"type":        "string",
		"description": "图片文件路径",
	}

	// Schema for the kind of analysis being requested.
	taskSchema := map[string]interface{}{
		"type":        "string",
		"description": "分析任务: ocr(文字提取), describe(场景描述), analyze(综合分析)",
		"enum":        []string{"ocr", "describe", "analyze"},
	}

	paramSchema := map[string]interface{}{
		"type": "object",
		"properties": map[string]interface{}{
			"image_path": imagePathSchema,
			"task":       taskSchema,
		},
		"required": []string{"image_path", "task"},
	}

	return ToolDefinition{
		Name:        "vision_analyze",
		Description: "分析图片内容。传入图片路径,返回图片的 base64 data URL 用于多模态 LLM 分析。可用于 OCR 文字提取、物体识别、场景理解等。",
		Parameters:  paramSchema,
	}
}
// Execute loads the image named by args["image_path"] and returns a JSON
// document (in ToolResult.Data) that a multimodal pipeline can consume.
//
// Arguments:
//   - "image_path" (string, required): path of the image file to load.
//   - "task" (string): one of "ocr", "describe" or "analyze". Surrounding
//     whitespace and letter case are ignored; an empty or missing value
//     defaults to "analyze".
//
// On success the JSON payload contains "image_path", "task", "data_url",
// "mime_type", "prompt" (the instruction text for the chosen task) and
// "file_size" (the size of the image in bytes).
//
// Every failure — missing path, unsupported task, unreadable image, encoding
// error — is reported through a ToolResult with Success set to false; the
// returned error is always nil. ctx is currently unused.
func (t *VisionTool) Execute(ctx context.Context, args map[string]interface{}) (*ToolResult, error) {
	const toolName = "vision_analyze"

	// fail builds the uniform failure result used by every error path.
	fail := func(msg string) (*ToolResult, error) {
		return &ToolResult{
			ToolName: toolName,
			Success:  false,
			Error:    msg,
		}, nil
	}

	// Instruction text handed back to the caller for each supported task.
	taskPrompts := map[string]string{
		"ocr":      "请提取这张图片中的所有文字内容,保持原始格式和排版。只输出文字内容,不要添加额外说明。",
		"describe": "请详细描述这张图片的内容,包括场景、物体、人物、颜色、氛围等。",
		"analyze":  "请综合分析这张图片,包括内容描述、文字提取(如有)、以及你的理解。",
	}

	// A missing or non-string value yields "", which is rejected below.
	imagePath, _ := args["image_path"].(string)
	if imagePath == "" {
		return fail("image_path 参数不能为空")
	}

	task, _ := args["task"].(string)
	task = strings.ToLower(strings.TrimSpace(task))
	if task == "" {
		task = "analyze"
	}
	// Reject unknown tasks up front instead of silently returning an empty
	// prompt, and before doing any file I/O.
	prompt, ok := taskPrompts[task]
	if !ok {
		return fail(fmt.Sprintf("不支持的 task: %q (可选: ocr, describe, analyze)", task))
	}

	dataURL, mimeType, err := encodeImageToDataURL(imagePath)
	if err != nil {
		return fail(fmt.Sprintf("读取图片失败: %v", err))
	}

	// Recover the real byte size of the image from the base64 payload: the
	// data URL itself is roughly a third longer than the file, so its length
	// must not be reported as "file_size".
	payload := dataURL[strings.IndexByte(dataURL, ',')+1:]
	padding := len(payload) - len(strings.TrimRight(payload, "="))
	fileSize := base64.StdEncoding.DecodedLen(len(payload)) - padding

	result, err := json.Marshal(map[string]interface{}{
		"image_path": imagePath,
		"task":       task,
		"data_url":   dataURL,
		"mime_type":  mimeType,
		"prompt":     prompt,
		"file_size":  fileSize,
	})
	if err != nil {
		return fail(fmt.Sprintf("序列化结果失败: %v", err))
	}

	return &ToolResult{
		ToolName: toolName,
		Success:  true,
		Data:     string(result),
	}, nil
}
// encodeImageToDataURL reads the image at path and returns it as a base64
// data URL of the form "data:<mime>;base64,<payload>", together with the MIME
// type that was used.
//
// The MIME type is derived from the file extension (case-insensitive):
// .png, .jpg/.jpeg, .gif, .webp, .bmp and .svg are recognised, and any other
// or missing extension falls back to "image/png". The file content itself is
// not inspected.
//
// An error is returned when path cannot be stat'ed or read, does not refer to
// a regular file, or is larger than 20MB. The size is checked before the file
// is loaded, so oversized files and special files (directories, devices,
// pipes) are never read into memory.
func encodeImageToDataURL(path string) (dataURL, mimeType string, err error) {
	const maxImageBytes = 20 * 1024 * 1024

	info, err := os.Stat(path)
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	if !info.Mode().IsRegular() {
		return "", "", fmt.Errorf("cannot read image: %q is not a regular file", path)
	}
	if info.Size() > maxImageBytes {
		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", info.Size())
	}

	data, err := os.ReadFile(path)
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	// Re-check after reading: the file may have grown since the Stat call.
	if len(data) > maxImageBytes {
		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", len(data))
	}

	switch strings.ToLower(filepath.Ext(path)) {
	case ".png":
		mimeType = "image/png"
	case ".jpg", ".jpeg":
		mimeType = "image/jpeg"
	case ".gif":
		mimeType = "image/gif"
	case ".webp":
		mimeType = "image/webp"
	case ".bmp":
		mimeType = "image/bmp"
	case ".svg":
		mimeType = "image/svg+xml"
	default:
		// Unknown extension: keep the historical PNG fallback.
		mimeType = "image/png"
	}

	b64 := base64.StdEncoding.EncodeToString(data)
	return fmt.Sprintf("data:%s;base64,%s", mimeType, b64), mimeType, nil
}