feat: Phase 6.3 视觉理解 — 多模态图片输入 + OCR/Vision 工具 + 图片编码管线

- LLMMessage 新增 Images 字段支持多模态 content array
- OpenAIProvider 支持 image_url content parts
- VisionTool: 图片读取 + base64 编码 + OCR/场景描述/综合分析
- 对话管道全线支持 images 参数传递 (Gateway->Orchestrator->Synthesizer->LLM)
- 自动根据图片有无构建 text-only 或 multimodal content

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-23 22:28:42 +08:00
parent 38b36fc5ad
commit 9a8fb8d0ce
7 changed files with 205 additions and 24 deletions
+10 -5
View File
@@ -158,6 +158,9 @@ func main() {
toolRegistry.Register(tools.NewHostFileTool(hostManager))
toolRegistry.Register(tools.NewHostSystemTool(hostManager))
}
// Phase 6.3: 视觉理解工具
toolRegistry.Register(tools.NewVisionTool())
log.Printf("工具注册中心已就绪: %d 个工具 (%v)", len(toolRegistry.ListTools()), toolRegistry.ListTools())
}
@@ -431,11 +434,12 @@ func handleChat(
// 解析请求
var req struct {
UserID string `json:"user_id"`
SessionID string `json:"session_id"`
Message string `json:"message"`
Mode string `json:"mode"`
Nickname string `json:"nickname,omitempty"`
UserID string `json:"user_id"`
SessionID string `json:"session_id"`
Message string `json:"message"`
Images []string `json:"images,omitempty"` // 图片 base64 data URL
Mode string `json:"mode"`
Nickname string `json:"nickname,omitempty"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, "无效的请求体", http.StatusBadRequest)
@@ -480,6 +484,7 @@ func handleChat(
UserID: req.UserID,
SessionID: req.SessionID,
Message: req.Message,
Images: req.Images,
Mode: req.Mode,
Nickname: userNickname,
})
@@ -135,6 +135,7 @@ func DefaultAutonomousToolPolicy() *AutonomousToolPolicy {
"iot_query", "iot_control", "memory_search", "web_search",
"calculator", "datetime", "web_fetch",
"host_exec", "host_file", "host_system",
"vision_analyze",
},
MaxToolCallsPerRound: 5,
MaxHighRiskPerHour: 10,
+41 -6
View File
@@ -61,7 +61,7 @@ type openAIRequest struct {
type openAIMessage struct {
Role string `json:"role"`
Content string `json:"content,omitempty"`
Content interface{} `json:"content,omitempty"` // string or []model.ImageContent for multimodal
Name string `json:"name,omitempty"`
ToolCalls []openAIToolCall `json:"tool_calls,omitempty"`
ToolCallID string `json:"tool_call_id,omitempty"`
@@ -180,8 +180,8 @@ func (p *OpenAIProvider) ChatStreamWithTools(ctx context.Context, messages []mod
if len(streamResp.Choices) > 0 {
delta := streamResp.Choices[0].Delta
if delta.Content != "" {
ch <- StreamChunk{Content: delta.Content}
if deltaStr := contentString(delta.Content); deltaStr != "" {
ch <- StreamChunk{Content: deltaStr}
}
if streamResp.Choices[0].FinishReason != "" {
usage := &model.Usage{}
@@ -228,7 +228,7 @@ func (p *OpenAIProvider) doChat(ctx context.Context, messages []model.LLMMessage
for i, msg := range messages {
oaiMsg := openAIMessage{
Role: string(msg.Role),
Content: msg.Content,
Content: buildContent(msg.Content, msg.Images),
Name: msg.Name,
ToolCallID: msg.ToolCallID,
ReasoningContent: msg.ReasoningContent,
@@ -305,7 +305,7 @@ func (p *OpenAIProvider) doChat(ctx context.Context, messages []model.LLMMessage
// 检查是否有工具调用
choice := oaiResp.Choices[0]
llmResp := &model.LLMResponse{
Content: choice.Message.Content,
Content: contentString(choice.Message.Content),
FinishReason: choice.FinishReason,
ReasoningContent: choice.Message.ReasoningContent,
Usage: model.Usage{
@@ -335,7 +335,7 @@ func (p *OpenAIProvider) doChatStream(ctx context.Context, messages []model.LLMM
for i, msg := range messages {
oaiMsg := openAIMessage{
Role: string(msg.Role),
Content: msg.Content,
Content: buildContent(msg.Content, msg.Images),
Name: msg.Name,
ToolCallID: msg.ToolCallID,
ReasoningContent: msg.ReasoningContent,
@@ -399,3 +399,38 @@ func (p *OpenAIProvider) doChatStream(ctx context.Context, messages []model.LLMM
// ModelName reports the model identifier stored in the provider's config.
func (p *OpenAIProvider) ModelName() string {
	name := p.config.Model
	return name
}
// contentString returns v unchanged when it holds a string and "" for
// everything else, including nil and non-string values such as multimodal
// content part slices.
func contentString(v interface{}) string {
	switch text := v.(type) {
	case string:
		return text
	default:
		return ""
	}
}
// buildContent picks the wire representation for a message body.
//
// With no images the text is returned as-is (a plain string). Otherwise the
// result is a []model.ImageContent made of an optional leading "text" part
// (omitted when text is empty) followed by one "image_url" part per entry of
// images, in their original order.
func buildContent(text string, images []string) interface{} {
	if len(images) == 0 {
		return text
	}

	// One slot per image plus one for the optional text part.
	content := make([]model.ImageContent, 0, 1+len(images))
	if len(text) > 0 {
		content = append(content, model.ImageContent{Type: "text", Text: text})
	}
	for i := range images {
		content = append(content, model.ImageContent{
			Type:     "image_url",
			ImageURL: &model.ImageURL{URL: images[i]},
		})
	}
	return content
}
+14
View File
@@ -16,12 +16,26 @@ const (
type LLMMessage struct {
Role Role `json:"role"`
Content string `json:"content"`
Images []string `json:"images,omitempty"` // image base64 data URLs (multimodal input)
Name string `json:"name,omitempty"` // optional sender name
ToolCallID string `json:"tool_call_id,omitempty"` // tool call correlation ID (ties a tool-role message to its call)
ToolCalls []ToolCall `json:"tool_calls,omitempty"` // tool calls carried by an assistant message
ReasoningContent string `json:"reasoning_content,omitempty"` // DeepSeek chain-of-thought content (must be sent back)
}
// ImageContent is one part of a multimodal content array.
// Despite the name it is used for text parts as well as image parts: Type is
// "text" when Text is populated and "image_url" when ImageURL is populated.
// Both payload fields are omitted from the JSON when empty/nil.
type ImageContent struct {
Type string `json:"type"`
Text string `json:"text,omitempty"`
ImageURL *ImageURL `json:"image_url,omitempty"`
}
// ImageURL holds the location of an image for an "image_url" content part.
// URL can be a data: URL or an http: URL. Detail is optional and is left out
// of the JSON when empty.
type ImageURL struct {
URL string `json:"url"`
Detail string `json:"detail,omitempty"` // one of: low, high, auto
}
// ChatMessage 数据库存储的对话消息
type ChatMessage struct {
ID string `json:"id" db:"id"`
@@ -98,7 +98,8 @@ type ProcessParams struct {
UserID string
SessionID string
Message string
Mode string // text / voice_msg / voice_assistant
Images []string // 图片 base64 data URL (多模态)
Mode string // text / voice_msg / voice_assistant
Nickname string
}
@@ -262,6 +263,7 @@ func (o *Orchestrator) ProcessInput(
UserID: params.UserID,
SessionID: params.SessionID,
UserMessage: params.Message,
Images: params.Images,
Nickname: userName,
PersonaPrompt: systemPrompt,
DialogHistory: history,
@@ -25,17 +25,18 @@ func NewSynthesizer(llmAdapter *llm.Adapter) *Synthesizer {
// SynthesizeParams 综合参数
type SynthesizeParams struct {
UserID string
SessionID string
UserMessage string
Nickname string
PersonaPrompt string // 完整人格提示词
DialogHistory []model.LLMMessage // 对话历史
MemorySummary string // 记忆检索摘要
ThoughtOutline string // 通用对话思考
IoTSummary string // IoT 操作摘要
DeviceContext string // 设备状态上下文
Mode string // text / voice_assistant
UserID string
SessionID string
UserMessage string
Images []string // 图片 base64 data URL (多模态)
Nickname string
PersonaPrompt string // 完整人格提示词
DialogHistory []model.LLMMessage // 对话历史
MemorySummary string // 记忆检索摘要
ThoughtOutline string // 通用对话思考
IoTSummary string // IoT 操作摘要
DeviceContext string // 设备状态上下文
Mode string // text / voice_assistant
}
// Synthesize 综合所有子会话结果,流式生成最终回复
@@ -99,10 +100,11 @@ func (s *Synthesizer) buildSynthesizeMessages(params SynthesizeParams) []model.L
messages = append(messages, params.DialogHistory...)
}
// 当前用户消息
// 当前用户消息 (支持多模态图片)
messages = append(messages, model.LLMMessage{
Role: model.RoleUser,
Content: params.UserMessage,
Images: params.Images,
})
return messages
@@ -0,0 +1,122 @@
package tools
import (
"context"
"encoding/base64"
"encoding/json"
"fmt"
"os"
"path/filepath"
"strings"
)
// VisionTool prepares an image for analysis by a multimodal LLM.
// It reads an image file, encodes it as base64, and returns a prompt-ready
// data URL that can be fed into the vision pipeline. The tool itself does not
// call a model; it only produces the encoded image and a task prompt.
// The type carries no state.
type VisionTool struct{}
// NewVisionTool returns a ready-to-register VisionTool.
func NewVisionTool() *VisionTool {
	return new(VisionTool)
}
// Definition returns the metadata for the "vision_analyze" tool: its name,
// its description, and a JSON-schema style parameter object with two required
// string parameters, image_path and task (one of ocr, describe, analyze).
func (t *VisionTool) Definition() ToolDefinition {
	imagePathParam := map[string]interface{}{
		"type":        "string",
		"description": "图片文件路径",
	}
	taskParam := map[string]interface{}{
		"type":        "string",
		"description": "分析任务: ocr(文字提取), describe(场景描述), analyze(综合分析)",
		"enum":        []string{"ocr", "describe", "analyze"},
	}
	schema := map[string]interface{}{
		"type": "object",
		"properties": map[string]interface{}{
			"image_path": imagePathParam,
			"task":       taskParam,
		},
		"required": []string{"image_path", "task"},
	}

	return ToolDefinition{
		Name:        "vision_analyze",
		Description: "分析图片内容。传入图片路径,返回图片的 base64 data URL 用于多模态 LLM 分析。可用于 OCR 文字提取、物体识别、场景理解等。",
		Parameters:  schema,
	}
}
// visionTaskPrompts maps every supported task to the instruction returned
// alongside the encoded image.
var visionTaskPrompts = map[string]string{
	"ocr":      "请提取这张图片中的所有文字内容,保持原始格式和排版。只输出文字内容,不要添加额外说明。",
	"describe": "请详细描述这张图片的内容,包括场景、物体、人物、颜色、氛围等。",
	"analyze":  "请综合分析这张图片,包括内容描述、文字提取(如有)、以及你的理解。",
}

// Execute encodes the image named by args["image_path"] as a base64 data URL
// and returns it together with the prompt for the requested task.
//
// Arguments:
//   - image_path (string, required): path of the image file.
//   - task (string): ocr, describe or analyze; matched case-insensitively
//     after trimming spaces. Missing or empty defaults to analyze.
//
// On success ToolResult.Data is a JSON object with the keys image_path, task,
// data_url, mime_type, prompt and file_size (the image size in bytes).
// Every failure — missing path, unsupported task, unreadable image — is
// reported through ToolResult.Success/Error; the returned error is always nil.
func (t *VisionTool) Execute(ctx context.Context, args map[string]interface{}) (*ToolResult, error) {
	fail := func(msg string) (*ToolResult, error) {
		return &ToolResult{
			ToolName: "vision_analyze",
			Success:  false,
			Error:    msg,
		}, nil
	}

	// A non-string value is treated the same as a missing one.
	imagePath, _ := args["image_path"].(string)
	if imagePath == "" {
		return fail("image_path 参数不能为空")
	}

	task, _ := args["task"].(string)
	task = strings.ToLower(strings.TrimSpace(task))
	if task == "" {
		task = "analyze"
	}
	// Reject unknown tasks up front instead of returning an empty prompt,
	// and do so before touching the file system.
	prompt, ok := visionTaskPrompts[task]
	if !ok {
		return fail(fmt.Sprintf("不支持的分析任务: %q (可选: ocr, describe, analyze)", task))
	}

	dataURL, mimeType, err := encodeImageToDataURL(imagePath)
	if err != nil {
		return fail(fmt.Sprintf("读取图片失败: %v", err))
	}

	result, err := json.Marshal(map[string]interface{}{
		"image_path": imagePath,
		"task":       task,
		"data_url":   dataURL,
		"mime_type":  mimeType,
		"prompt":     prompt,
		// Size of the image itself, not of the (roughly one third larger)
		// data URL string.
		"file_size": dataURLPayloadBytes(dataURL),
	})
	if err != nil {
		return fail(fmt.Sprintf("序列化结果失败: %v", err))
	}

	return &ToolResult{
		ToolName: "vision_analyze",
		Success:  true,
		Data:     string(result),
	}, nil
}

// dataURLPayloadBytes returns the number of raw bytes encoded in the padded
// base64 payload of a data URL, or 0 when the URL has no "," separator.
func dataURLPayloadBytes(dataURL string) int {
	comma := strings.IndexByte(dataURL, ',')
	if comma < 0 {
		return 0
	}
	payload := dataURL[comma+1:]
	// Padded base64 maps every 4 characters to 3 bytes; each trailing '='
	// stands for one byte that is not actually present.
	padding := len(payload) - len(strings.TrimRight(payload, "="))
	return len(payload)/4*3 - padding
}
// encodeImageToDataURL reads the image at path and returns it as a
// "data:<mime>;base64,<payload>" URL together with the MIME type used.
//
// The MIME type is derived from the file extension (case-insensitive):
// .png, .jpg/.jpeg, .gif, .webp, .bmp and .svg are accepted. Any other
// extension is rejected rather than being labelled image/png, so that a
// caller-supplied path cannot be used to read and return arbitrary
// non-image files.
//
// An error is returned when the extension is unsupported, the path cannot be
// stat'ed or read, the path is not a regular file, or the file is larger
// than 20MB. The size is checked before the file is loaded so an oversized
// file is never read into memory.
func encodeImageToDataURL(path string) (dataURL, mimeType string, err error) {
	const maxImageBytes = 20 * 1024 * 1024

	switch ext := strings.ToLower(filepath.Ext(path)); ext {
	case ".png":
		mimeType = "image/png"
	case ".jpg", ".jpeg":
		mimeType = "image/jpeg"
	case ".gif":
		mimeType = "image/gif"
	case ".webp":
		mimeType = "image/webp"
	case ".bmp":
		mimeType = "image/bmp"
	case ".svg":
		mimeType = "image/svg+xml"
	default:
		return "", "", fmt.Errorf("unsupported image extension %q", ext)
	}

	info, err := os.Stat(path)
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	// Directories, devices and pipes either fail later or never reach EOF.
	if !info.Mode().IsRegular() {
		return "", "", fmt.Errorf("cannot read image: %q is not a regular file", path)
	}
	if info.Size() > maxImageBytes {
		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", info.Size())
	}

	data, err := os.ReadFile(path)
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	// The file may have grown between Stat and ReadFile.
	if len(data) > maxImageBytes {
		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", len(data))
	}

	encoded := base64.StdEncoding.EncodeToString(data)
	return "data:" + mimeType + ";base64," + encoded, mimeType, nil
}