feat: Phase 6.3 视觉理解 — 多模态图片输入 + OCR/Vision 工具 + 图片编码管线
- LLMMessage 新增 Images 字段支持多模态 content array
- OpenAIProvider 支持 image_url content parts
- VisionTool: 图片读取 + base64 编码 + OCR/场景描述/综合分析
- 对话管道全线支持 images 参数传递 (Gateway->Orchestrator->Synthesizer->LLM)
- 自动根据图片有无构建 text-only 或 multimodal content

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -158,6 +158,9 @@ func main() {
|
||||
toolRegistry.Register(tools.NewHostFileTool(hostManager))
|
||||
toolRegistry.Register(tools.NewHostSystemTool(hostManager))
|
||||
}
|
||||
|
||||
// Phase 6.3: 视觉理解工具
|
||||
toolRegistry.Register(tools.NewVisionTool())
|
||||
log.Printf("工具注册中心已就绪: %d 个工具 (%v)", len(toolRegistry.ListTools()), toolRegistry.ListTools())
|
||||
}
|
||||
|
||||
@@ -431,11 +434,12 @@ func handleChat(
|
||||
|
||||
// 解析请求
|
||||
var req struct {
|
||||
UserID string `json:"user_id"`
|
||||
SessionID string `json:"session_id"`
|
||||
Message string `json:"message"`
|
||||
Mode string `json:"mode"`
|
||||
Nickname string `json:"nickname,omitempty"`
|
||||
UserID string `json:"user_id"`
|
||||
SessionID string `json:"session_id"`
|
||||
Message string `json:"message"`
|
||||
Images []string `json:"images,omitempty"` // 图片 base64 data URL
|
||||
Mode string `json:"mode"`
|
||||
Nickname string `json:"nickname,omitempty"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
http.Error(w, "无效的请求体", http.StatusBadRequest)
|
||||
@@ -480,6 +484,7 @@ func handleChat(
|
||||
UserID: req.UserID,
|
||||
SessionID: req.SessionID,
|
||||
Message: req.Message,
|
||||
Images: req.Images,
|
||||
Mode: req.Mode,
|
||||
Nickname: userNickname,
|
||||
})
|
||||
|
||||
@@ -135,6 +135,7 @@ func DefaultAutonomousToolPolicy() *AutonomousToolPolicy {
|
||||
"iot_query", "iot_control", "memory_search", "web_search",
|
||||
"calculator", "datetime", "web_fetch",
|
||||
"host_exec", "host_file", "host_system",
|
||||
"vision_analyze",
|
||||
},
|
||||
MaxToolCallsPerRound: 5,
|
||||
MaxHighRiskPerHour: 10,
|
||||
|
||||
@@ -61,7 +61,7 @@ type openAIRequest struct {
|
||||
|
||||
type openAIMessage struct {
|
||||
Role string `json:"role"`
|
||||
Content string `json:"content,omitempty"`
|
||||
Content interface{} `json:"content,omitempty"` // string or []model.ImageContent for multimodal
|
||||
Name string `json:"name,omitempty"`
|
||||
ToolCalls []openAIToolCall `json:"tool_calls,omitempty"`
|
||||
ToolCallID string `json:"tool_call_id,omitempty"`
|
||||
@@ -180,8 +180,8 @@ func (p *OpenAIProvider) ChatStreamWithTools(ctx context.Context, messages []mod
|
||||
|
||||
if len(streamResp.Choices) > 0 {
|
||||
delta := streamResp.Choices[0].Delta
|
||||
if delta.Content != "" {
|
||||
ch <- StreamChunk{Content: delta.Content}
|
||||
if deltaStr := contentString(delta.Content); deltaStr != "" {
|
||||
ch <- StreamChunk{Content: deltaStr}
|
||||
}
|
||||
if streamResp.Choices[0].FinishReason != "" {
|
||||
usage := &model.Usage{}
|
||||
@@ -228,7 +228,7 @@ func (p *OpenAIProvider) doChat(ctx context.Context, messages []model.LLMMessage
|
||||
for i, msg := range messages {
|
||||
oaiMsg := openAIMessage{
|
||||
Role: string(msg.Role),
|
||||
Content: msg.Content,
|
||||
Content: buildContent(msg.Content, msg.Images),
|
||||
Name: msg.Name,
|
||||
ToolCallID: msg.ToolCallID,
|
||||
ReasoningContent: msg.ReasoningContent,
|
||||
@@ -305,7 +305,7 @@ func (p *OpenAIProvider) doChat(ctx context.Context, messages []model.LLMMessage
|
||||
// 检查是否有工具调用
|
||||
choice := oaiResp.Choices[0]
|
||||
llmResp := &model.LLMResponse{
|
||||
Content: choice.Message.Content,
|
||||
Content: contentString(choice.Message.Content),
|
||||
FinishReason: choice.FinishReason,
|
||||
ReasoningContent: choice.Message.ReasoningContent,
|
||||
Usage: model.Usage{
|
||||
@@ -335,7 +335,7 @@ func (p *OpenAIProvider) doChatStream(ctx context.Context, messages []model.LLMM
|
||||
for i, msg := range messages {
|
||||
oaiMsg := openAIMessage{
|
||||
Role: string(msg.Role),
|
||||
Content: msg.Content,
|
||||
Content: buildContent(msg.Content, msg.Images),
|
||||
Name: msg.Name,
|
||||
ToolCallID: msg.ToolCallID,
|
||||
ReasoningContent: msg.ReasoningContent,
|
||||
@@ -399,3 +399,38 @@ func (p *OpenAIProvider) doChatStream(ctx context.Context, messages []model.LLMM
|
||||
// ModelName returns the model identifier stored in the provider's
// configuration (p.config.Model).
func (p *OpenAIProvider) ModelName() string {
|
||||
return p.config.Model
|
||||
}
|
||||
|
||||
// contentString extracts the textual payload from a message Content value.
//
// Content is typed interface{} so that it can carry either a plain string or
// a multimodal array of content parts. After JSON decoding it can therefore
// arrive in several shapes:
//
//   - nil (field absent or JSON null): returns "".
//   - string: returned unchanged.
//   - []interface{} of JSON objects (content parts): returns the
//     concatenation, in order, of the string "text" field of every part whose
//     "type" is "text" or missing. Non-text parts (e.g. "image_url") are
//     ignored.
//
// Any other shape yields "". Previously an array-shaped content was silently
// dropped, which made replies from providers that answer with content parts
// look empty.
func contentString(v interface{}) string {
	switch c := v.(type) {
	case string:
		return c
	case []interface{}:
		var text string
		for _, part := range c {
			obj, ok := part.(map[string]interface{})
			if !ok {
				continue
			}
			// Only text parts contribute; a missing "type" is tolerated.
			if kind, hasKind := obj["type"]; hasKind && kind != "text" {
				continue
			}
			if s, ok := obj["text"].(string); ok {
				text += s
			}
		}
		return text
	default:
		// nil and unsupported shapes carry no text.
		return ""
	}
}
|
||||
|
||||
// buildContent converts text + optional images to API content format.
|
||||
// Returns a plain string if no images, or a multimodal array otherwise.
|
||||
func buildContent(text string, images []string) interface{} {
|
||||
if len(images) == 0 {
|
||||
return text
|
||||
}
|
||||
parts := make([]model.ImageContent, 0, len(images)+1)
|
||||
if text != "" {
|
||||
parts = append(parts, model.ImageContent{
|
||||
Type: "text",
|
||||
Text: text,
|
||||
})
|
||||
}
|
||||
for _, img := range images {
|
||||
parts = append(parts, model.ImageContent{
|
||||
Type: "image_url",
|
||||
ImageURL: &model.ImageURL{
|
||||
URL: img,
|
||||
},
|
||||
})
|
||||
}
|
||||
return parts
|
||||
}
|
||||
|
||||
@@ -16,12 +16,26 @@ const (
|
||||
// LLMMessage is a single chat message exchanged with the LLM layer: a role,
// its text content, and optional multimodal / tool-calling metadata.
type LLMMessage struct {
|
||||
Role Role `json:"role"`
|
||||
Content string `json:"content"`
|
||||
Images []string `json:"images,omitempty"` // list of image base64 data URLs (multimodal)
|
||||
Name string `json:"name,omitempty"` // optional sender name
|
||||
ToolCallID string `json:"tool_call_id,omitempty"` // tool call correlation ID (links a tool-role message to its call)
|
||||
ToolCalls []ToolCall `json:"tool_calls,omitempty"` // tool calls carried by an assistant message
|
||||
ReasoningContent string `json:"reasoning_content,omitempty"` // DeepSeek chain-of-thought content (must be sent back)
|
||||
}
|
||||
|
||||
// ImageContent is one part of a multimodal content array. Despite the name it
// carries either kind of part: Type "text" with Text set, or Type "image_url"
// with ImageURL set. The unused field is dropped from JSON via omitempty.
|
||||
type ImageContent struct {
|
||||
Type string `json:"type"`
|
||||
Text string `json:"text,omitempty"`
|
||||
ImageURL *ImageURL `json:"image_url,omitempty"`
|
||||
}
|
||||
|
||||
// ImageURL holds an image URL (can be a data: URL or http: URL) together with
// an optional detail hint.
|
||||
type ImageURL struct {
|
||||
URL string `json:"url"`
|
||||
Detail string `json:"detail,omitempty"` // low, high, auto; omitted from JSON when empty
|
||||
}
|
||||
|
||||
// ChatMessage 数据库存储的对话消息
|
||||
type ChatMessage struct {
|
||||
ID string `json:"id" db:"id"`
|
||||
|
||||
@@ -98,7 +98,8 @@ type ProcessParams struct {
|
||||
UserID string
|
||||
SessionID string
|
||||
Message string
|
||||
Mode string // text / voice_msg / voice_assistant
|
||||
Images []string // 图片 base64 data URL (多模态)
|
||||
Mode string // text / voice_msg / voice_assistant
|
||||
Nickname string
|
||||
}
|
||||
|
||||
@@ -262,6 +263,7 @@ func (o *Orchestrator) ProcessInput(
|
||||
UserID: params.UserID,
|
||||
SessionID: params.SessionID,
|
||||
UserMessage: params.Message,
|
||||
Images: params.Images,
|
||||
Nickname: userName,
|
||||
PersonaPrompt: systemPrompt,
|
||||
DialogHistory: history,
|
||||
|
||||
@@ -25,17 +25,18 @@ func NewSynthesizer(llmAdapter *llm.Adapter) *Synthesizer {
|
||||
|
||||
// SynthesizeParams 综合参数
|
||||
type SynthesizeParams struct {
|
||||
UserID string
|
||||
SessionID string
|
||||
UserMessage string
|
||||
Nickname string
|
||||
PersonaPrompt string // 完整人格提示词
|
||||
DialogHistory []model.LLMMessage // 对话历史
|
||||
MemorySummary string // 记忆检索摘要
|
||||
ThoughtOutline string // 通用对话思考
|
||||
IoTSummary string // IoT 操作摘要
|
||||
DeviceContext string // 设备状态上下文
|
||||
Mode string // text / voice_assistant
|
||||
UserID string
|
||||
SessionID string
|
||||
UserMessage string
|
||||
Images []string // 图片 base64 data URL (多模态)
|
||||
Nickname string
|
||||
PersonaPrompt string // 完整人格提示词
|
||||
DialogHistory []model.LLMMessage // 对话历史
|
||||
MemorySummary string // 记忆检索摘要
|
||||
ThoughtOutline string // 通用对话思考
|
||||
IoTSummary string // IoT 操作摘要
|
||||
DeviceContext string // 设备状态上下文
|
||||
Mode string // text / voice_assistant
|
||||
}
|
||||
|
||||
// Synthesize 综合所有子会话结果,流式生成最终回复
|
||||
@@ -99,10 +100,11 @@ func (s *Synthesizer) buildSynthesizeMessages(params SynthesizeParams) []model.L
|
||||
messages = append(messages, params.DialogHistory...)
|
||||
}
|
||||
|
||||
// 当前用户消息
|
||||
// 当前用户消息 (支持多模态图片)
|
||||
messages = append(messages, model.LLMMessage{
|
||||
Role: model.RoleUser,
|
||||
Content: params.UserMessage,
|
||||
Images: params.Images,
|
||||
})
|
||||
|
||||
return messages
|
||||
|
||||
@@ -0,0 +1,122 @@
|
||||
package tools
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// VisionTool enables image understanding via multimodal LLM.
|
||||
// It reads an image file, encodes it as base64, and returns a prompt-ready
|
||||
// data URL that can be fed into the vision pipeline. The type has no fields,
// so it carries no state of its own.
|
||||
type VisionTool struct{}
|
||||
|
||||
// NewVisionTool creates a vision tool.
|
||||
func NewVisionTool() *VisionTool {
|
||||
return &VisionTool{}
|
||||
}
|
||||
|
||||
func (t *VisionTool) Definition() ToolDefinition {
|
||||
return ToolDefinition{
|
||||
Name: "vision_analyze",
|
||||
Description: "分析图片内容。传入图片路径,返回图片的 base64 data URL 用于多模态 LLM 分析。可用于 OCR 文字提取、物体识别、场景理解等。",
|
||||
Parameters: map[string]interface{}{
|
||||
"type": "object",
|
||||
"properties": map[string]interface{}{
|
||||
"image_path": map[string]interface{}{
|
||||
"type": "string",
|
||||
"description": "图片文件路径",
|
||||
},
|
||||
"task": map[string]interface{}{
|
||||
"type": "string",
|
||||
"description": "分析任务: ocr(文字提取), describe(场景描述), analyze(综合分析)",
|
||||
"enum": []string{"ocr", "describe", "analyze"},
|
||||
},
|
||||
},
|
||||
"required": []string{"image_path", "task"},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (t *VisionTool) Execute(ctx context.Context, args map[string]interface{}) (*ToolResult, error) {
|
||||
imagePath, _ := args["image_path"].(string)
|
||||
if imagePath == "" {
|
||||
return &ToolResult{
|
||||
ToolName: "vision_analyze",
|
||||
Success: false,
|
||||
Error: "image_path 参数不能为空",
|
||||
}, nil
|
||||
}
|
||||
|
||||
task, _ := args["task"].(string)
|
||||
if task == "" {
|
||||
task = "analyze"
|
||||
}
|
||||
|
||||
dataURL, mimeType, err := encodeImageToDataURL(imagePath)
|
||||
if err != nil {
|
||||
return &ToolResult{
|
||||
ToolName: "vision_analyze",
|
||||
Success: false,
|
||||
Error: fmt.Sprintf("读取图片失败: %v", err),
|
||||
}, nil
|
||||
}
|
||||
|
||||
taskPrompts := map[string]string{
|
||||
"ocr": "请提取这张图片中的所有文字内容,保持原始格式和排版。只输出文字内容,不要添加额外说明。",
|
||||
"describe": "请详细描述这张图片的内容,包括场景、物体、人物、颜色、氛围等。",
|
||||
"analyze": "请综合分析这张图片,包括内容描述、文字提取(如有)、以及你的理解。",
|
||||
}
|
||||
|
||||
result, _ := json.Marshal(map[string]interface{}{
|
||||
"image_path": imagePath,
|
||||
"task": task,
|
||||
"data_url": dataURL,
|
||||
"mime_type": mimeType,
|
||||
"prompt": taskPrompts[task],
|
||||
"file_size": len(dataURL),
|
||||
})
|
||||
|
||||
return &ToolResult{
|
||||
ToolName: "vision_analyze",
|
||||
Success: true,
|
||||
Data: string(result),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// encodeImageToDataURL reads the image file at path and returns it as a
// "data:<mime>;base64,<payload>" URL together with the MIME type used.
//
// The MIME type is chosen from the (case-insensitive) file extension:
// .png, .jpg/.jpeg, .gif, .webp, .bmp and .svg are recognized; any other
// extension falls back to "image/png".
//
// The file is inspected with os.Stat before it is read, so that:
//   - anything that is not a regular file (directory, device, FIFO, ...) is
//     rejected instead of being read without bound;
//   - files larger than 20MB are rejected without loading them into memory;
//   - empty files are rejected, since they cannot contain an image.
//
// The size limit is checked again after reading in case the file grew in the
// meantime. On any error both string results are empty.
func encodeImageToDataURL(path string) (dataURL, mimeType string, err error) {
	const maxImageBytes = 20 * 1024 * 1024

	info, err := os.Stat(path)
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	if !info.Mode().IsRegular() {
		return "", "", fmt.Errorf("cannot read image: %q is not a regular file", path)
	}
	if info.Size() > maxImageBytes {
		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", info.Size())
	}
	if info.Size() == 0 {
		return "", "", fmt.Errorf("cannot read image: %q is empty", path)
	}

	data, err := os.ReadFile(path)
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	if len(data) > maxImageBytes {
		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", len(data))
	}

	switch strings.ToLower(filepath.Ext(path)) {
	case ".png":
		mimeType = "image/png"
	case ".jpg", ".jpeg":
		mimeType = "image/jpeg"
	case ".gif":
		mimeType = "image/gif"
	case ".webp":
		mimeType = "image/webp"
	case ".bmp":
		mimeType = "image/bmp"
	case ".svg":
		mimeType = "image/svg+xml"
	default:
		// Unknown or missing extension: keep the historical PNG fallback.
		mimeType = "image/png"
	}

	b64 := base64.StdEncoding.EncodeToString(data)
	return fmt.Sprintf("data:%s;base64,%s", mimeType, b64), mimeType, nil
}
|
||||
Reference in New Issue
Block a user