feat: Phase 6.3 视觉理解 — 多模态图片输入 + OCR/Vision 工具 + 图片编码管线
- LLMMessage 新增 Images 字段支持多模态 content array - OpenAIProvider 支持 image_url content parts - VisionTool: 图片读取 + base64 编码 + OCR/场景描述/综合分析 - 对话管道全线支持 images 参数传递 (Gateway->Orchestrator->Synthesizer->LLM) - 自动根据图片有无构建 text-only 或 multimodal content Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,122 @@
|
||||
package tools
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// VisionTool is the tool implementation behind "vision_analyze".
//
// It carries no state of its own: each call reads the requested image file,
// base64-encodes it, and hands back a data URL plus a task prompt that a
// multimodal LLM call can consume.
type VisionTool struct{}
|
||||
|
||||
// NewVisionTool creates a vision tool.
|
||||
func NewVisionTool() *VisionTool {
|
||||
return &VisionTool{}
|
||||
}
|
||||
|
||||
func (t *VisionTool) Definition() ToolDefinition {
|
||||
return ToolDefinition{
|
||||
Name: "vision_analyze",
|
||||
Description: "分析图片内容。传入图片路径,返回图片的 base64 data URL 用于多模态 LLM 分析。可用于 OCR 文字提取、物体识别、场景理解等。",
|
||||
Parameters: map[string]interface{}{
|
||||
"type": "object",
|
||||
"properties": map[string]interface{}{
|
||||
"image_path": map[string]interface{}{
|
||||
"type": "string",
|
||||
"description": "图片文件路径",
|
||||
},
|
||||
"task": map[string]interface{}{
|
||||
"type": "string",
|
||||
"description": "分析任务: ocr(文字提取), describe(场景描述), analyze(综合分析)",
|
||||
"enum": []string{"ocr", "describe", "analyze"},
|
||||
},
|
||||
},
|
||||
"required": []string{"image_path", "task"},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (t *VisionTool) Execute(ctx context.Context, args map[string]interface{}) (*ToolResult, error) {
|
||||
imagePath, _ := args["image_path"].(string)
|
||||
if imagePath == "" {
|
||||
return &ToolResult{
|
||||
ToolName: "vision_analyze",
|
||||
Success: false,
|
||||
Error: "image_path 参数不能为空",
|
||||
}, nil
|
||||
}
|
||||
|
||||
task, _ := args["task"].(string)
|
||||
if task == "" {
|
||||
task = "analyze"
|
||||
}
|
||||
|
||||
dataURL, mimeType, err := encodeImageToDataURL(imagePath)
|
||||
if err != nil {
|
||||
return &ToolResult{
|
||||
ToolName: "vision_analyze",
|
||||
Success: false,
|
||||
Error: fmt.Sprintf("读取图片失败: %v", err),
|
||||
}, nil
|
||||
}
|
||||
|
||||
taskPrompts := map[string]string{
|
||||
"ocr": "请提取这张图片中的所有文字内容,保持原始格式和排版。只输出文字内容,不要添加额外说明。",
|
||||
"describe": "请详细描述这张图片的内容,包括场景、物体、人物、颜色、氛围等。",
|
||||
"analyze": "请综合分析这张图片,包括内容描述、文字提取(如有)、以及你的理解。",
|
||||
}
|
||||
|
||||
result, _ := json.Marshal(map[string]interface{}{
|
||||
"image_path": imagePath,
|
||||
"task": task,
|
||||
"data_url": dataURL,
|
||||
"mime_type": mimeType,
|
||||
"prompt": taskPrompts[task],
|
||||
"file_size": len(dataURL),
|
||||
})
|
||||
|
||||
return &ToolResult{
|
||||
ToolName: "vision_analyze",
|
||||
Success: true,
|
||||
Data: string(result),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// encodeImageToDataURL reads the image at path and returns it as a base64
// data URL of the form "data:<mime>;base64,<payload>", together with the MIME
// type that was used.
//
// The MIME type is chosen from the file extension (case-insensitive). An
// unrecognized or missing extension falls back to "image/png"; the file
// content itself is not inspected.
//
// An error is returned when the path cannot be stat'ed or read, does not name
// a regular file, is empty, or is larger than 20MB.
func encodeImageToDataURL(path string) (dataURL, mimeType string, err error) {
	const maxImageBytes = 20 * 1024 * 1024

	// Check type and size up front so an oversized file is rejected without
	// being loaded into memory, and so FIFOs/devices cannot block the read.
	info, err := os.Stat(path)
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	if !info.Mode().IsRegular() {
		return "", "", fmt.Errorf("cannot read image: %q is not a regular file", path)
	}
	if info.Size() > maxImageBytes {
		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", info.Size())
	}

	data, err := os.ReadFile(path)
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	// Re-check on the bytes actually read: the file may have changed between
	// the stat and the read.
	if len(data) > maxImageBytes {
		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", len(data))
	}
	if len(data) == 0 {
		return "", "", fmt.Errorf("cannot read image: %q is empty", path)
	}

	switch strings.ToLower(filepath.Ext(path)) {
	case ".png":
		mimeType = "image/png"
	case ".jpg", ".jpeg":
		mimeType = "image/jpeg"
	case ".gif":
		mimeType = "image/gif"
	case ".webp":
		mimeType = "image/webp"
	case ".bmp":
		mimeType = "image/bmp"
	case ".svg":
		mimeType = "image/svg+xml"
	default:
		// Unknown extension: keep the historical PNG fallback.
		mimeType = "image/png"
	}

	encoded := base64.StdEncoding.EncodeToString(data)
	return fmt.Sprintf("data:%s;base64,%s", mimeType, encoded), mimeType, nil
}
|
||||
Reference in New Issue
Block a user