feat: Phase 6.3 视觉理解 — 多模态图片输入 + OCR/Vision 工具 + 图片编码管线

- LLMMessage 新增 Images 字段支持多模态 content array
- OpenAIProvider 支持 image_url content parts
- VisionTool: 图片读取 + base64 编码 + OCR/场景描述/综合分析
- 对话管道全线支持 images 参数传递 (Gateway->Orchestrator->Synthesizer->LLM)
- 自动根据图片有无构建 text-only 或 multimodal content

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-23 22:28:42 +08:00
parent 38b36fc5ad
commit 9a8fb8d0ce
7 changed files with 205 additions and 24 deletions
@@ -158,6 +158,9 @@ func main() {
 			toolRegistry.Register(tools.NewHostFileTool(hostManager))
 			toolRegistry.Register(tools.NewHostSystemTool(hostManager))
 		}
+
+		// Phase 6.3: 视觉理解工具
+		toolRegistry.Register(tools.NewVisionTool())
 		log.Printf("工具注册中心已就绪: %d 个工具 (%v)", len(toolRegistry.ListTools()), toolRegistry.ListTools())
 	}

@@ -431,11 +434,12 @@ func handleChat(

 	// 解析请求
 	var req struct {
-		UserID    string `json:"user_id"`
-		SessionID string `json:"session_id"`
-		Message   string `json:"message"`
-		Mode      string `json:"mode"`
-		Nickname  string `json:"nickname,omitempty"`
+		UserID    string   `json:"user_id"`
+		SessionID string   `json:"session_id"`
+		Message   string   `json:"message"`
+		Images    []string `json:"images,omitempty"` // 图片 base64 data URL
+		Mode      string   `json:"mode"`
+		Nickname  string   `json:"nickname,omitempty"`
 	}
 	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
 		http.Error(w, "无效的请求体", http.StatusBadRequest)
@@ -480,6 +484,7 @@ func handleChat(
 		UserID:    req.UserID,
 		SessionID: req.SessionID,
 		Message:   req.Message,
+		Images:    req.Images,
 		Mode:      req.Mode,
 		Nickname:  userNickname,
 	})
@@ -135,6 +135,7 @@ func DefaultAutonomousToolPolicy() *AutonomousToolPolicy {
 			"iot_query", "iot_control", "memory_search", "web_search",
 			"calculator", "datetime", "web_fetch",
 			"host_exec", "host_file", "host_system",
+			"vision_analyze",
 		},
 		MaxToolCallsPerRound: 5,
 		MaxHighRiskPerHour:   10,
@@ -61,7 +61,7 @@ type openAIRequest struct {

 type openAIMessage struct {
 	Role             string            `json:"role"`
-	Content          string            `json:"content,omitempty"`
+	Content          interface{}       `json:"content,omitempty"` // string or []model.ImageContent for multimodal
 	Name             string            `json:"name,omitempty"`
 	ToolCalls        []openAIToolCall  `json:"tool_calls,omitempty"`
 	ToolCallID       string            `json:"tool_call_id,omitempty"`
@@ -180,8 +180,8 @@ func (p *OpenAIProvider) ChatStreamWithTools(ctx context.Context, messages []mod

 			if len(streamResp.Choices) > 0 {
 				delta := streamResp.Choices[0].Delta
-				if delta.Content != "" {
-					ch <- StreamChunk{Content: delta.Content}
+				if deltaStr := contentString(delta.Content); deltaStr != "" {
+					ch <- StreamChunk{Content: deltaStr}
 				}
 				if streamResp.Choices[0].FinishReason != "" {
 					usage := &model.Usage{}
@@ -228,7 +228,7 @@ func (p *OpenAIProvider) doChat(ctx context.Context, messages []model.LLMMessage
 	for i, msg := range messages {
 		oaiMsg := openAIMessage{
 			Role:             string(msg.Role),
-			Content:          msg.Content,
+			Content:          buildContent(msg.Content, msg.Images),
 			Name:             msg.Name,
 			ToolCallID:       msg.ToolCallID,
 			ReasoningContent: msg.ReasoningContent,
@@ -305,7 +305,7 @@ func (p *OpenAIProvider) doChat(ctx context.Context, messages []model.LLMMessage
 	// 检查是否有工具调用
 	choice := oaiResp.Choices[0]
 	llmResp := &model.LLMResponse{
-		Content:          choice.Message.Content,
+		Content:          contentString(choice.Message.Content),
 		FinishReason:     choice.FinishReason,
 		ReasoningContent: choice.Message.ReasoningContent,
 		Usage: model.Usage{
@@ -335,7 +335,7 @@ func (p *OpenAIProvider) doChatStream(ctx context.Context, messages []model.LLMM
 	for i, msg := range messages {
 		oaiMsg := openAIMessage{
 			Role:             string(msg.Role),
-			Content:          msg.Content,
+			Content:          buildContent(msg.Content, msg.Images),
 			Name:             msg.Name,
 			ToolCallID:       msg.ToolCallID,
 			ReasoningContent: msg.ReasoningContent,
@@ -399,3 +399,38 @@ func (p *OpenAIProvider) doChatStream(ctx context.Context, messages []model.LLMM
 func (p *OpenAIProvider) ModelName() string {
 	return p.config.Model
 }
+
+// contentString extracts the text carried by a message Content value.
+//
+// Content is declared as interface{}, so after JSON decoding it can hold:
+//   - nil: returns "".
+//   - string: returned unchanged.
+//   - []interface{} of content-part objects: the "text" values of the text
+//     parts are concatenated in order; parts that declare another type
+//     (for example "image_url") are skipped.
+//
+// Any other shape yields "".
+func contentString(v interface{}) string {
+	switch content := v.(type) {
+	case string:
+		return content
+	case []interface{}:
+		var text string
+		for _, part := range content {
+			obj, ok := part.(map[string]interface{})
+			if !ok {
+				continue
+			}
+			// A part that declares a type other than "text" carries no text.
+			if kind, ok := obj["type"].(string); ok && kind != "text" {
+				continue
+			}
+			if s, ok := obj["text"].(string); ok {
+				text += s
+			}
+		}
+		return text
+	default:
+		return ""
+	}
+}
+
+// buildContent converts text plus optional images into the value sent as a
+// message's "content".
+//
+// Return value:
+//   - nil when there is neither text nor a usable image. The content field
+//     is an interface{} tagged omitempty, and encoding/json only omits a nil
+//     interface, not one holding an empty string; returning nil keeps
+//     empty-content messages serialized without a "content" key.
+//   - the plain text string when there are no usable images.
+//   - a []model.ImageContent otherwise: the text part first (if text is
+//     non-empty), followed by one "image_url" part per image, in order.
+//
+// Empty image strings are skipped, since an image part without a URL carries
+// nothing to analyze.
+func buildContent(text string, images []string) interface{} {
+	var imageParts []model.ImageContent
+	for _, img := range images {
+		if img == "" {
+			continue
+		}
+		imageParts = append(imageParts, model.ImageContent{
+			Type:     "image_url",
+			ImageURL: &model.ImageURL{URL: img},
+		})
+	}
+
+	if len(imageParts) == 0 {
+		if text == "" {
+			return nil
+		}
+		return text
+	}
+	if text == "" {
+		return imageParts
+	}
+
+	parts := make([]model.ImageContent, 0, len(imageParts)+1)
+	parts = append(parts, model.ImageContent{Type: "text", Text: text})
+	return append(parts, imageParts...)
+}
@@ -16,12 +16,26 @@ const (
 type LLMMessage struct {
 	Role             Role       `json:"role"`
 	Content          string     `json:"content"`
+	Images           []string   `json:"images,omitempty"`            // 图片 base64 data URL 列表 (多模态)
 	Name             string     `json:"name,omitempty"`              // 可选发送者名称
 	ToolCallID       string     `json:"tool_call_id,omitempty"`       // 工具调用关联ID (tool role 消息关联调用)
 	ToolCalls        []ToolCall `json:"tool_calls,omitempty"`         // 助手消息中的工具调用列表
 	ReasoningContent string     `json:"reasoning_content,omitempty"` // DeepSeek 思考链内容（需回传）
 }

+// ImageContent is one part of a multimodal content array.
+// Despite its name it represents text parts as well as image parts:
+// Type selects which of the optional fields is populated.
+type ImageContent struct {
+	Type     string    `json:"type"`                // part kind, e.g. "text" or "image_url"
+	Text     string    `json:"text,omitempty"`      // text payload, set for "text" parts
+	ImageURL *ImageURL `json:"image_url,omitempty"` // image reference, set for "image_url" parts
+}
+
+// ImageURL identifies the image of an "image_url" content part.
+type ImageURL struct {
+	URL    string `json:"url"`              // image location: a data: URL or an http: URL
+	Detail string `json:"detail,omitempty"` // optional detail level: low, high, auto
+}
+
 // ChatMessage 数据库存储的对话消息
 type ChatMessage struct {
 	ID        string    `json:"id" db:"id"`
@@ -98,7 +98,8 @@ type ProcessParams struct {
 	UserID    string
 	SessionID string
 	Message   string
-	Mode      string // text / voice_msg / voice_assistant
+	Images    []string // 图片 base64 data URL (多模态)
+	Mode      string   // text / voice_msg / voice_assistant
 	Nickname  string
 }

@@ -262,6 +263,7 @@ func (o *Orchestrator) ProcessInput(
 			UserID:        params.UserID,
 			SessionID:     params.SessionID,
 			UserMessage:   params.Message,
+			Images:        params.Images,
 			Nickname:      userName,
 			PersonaPrompt: systemPrompt,
 			DialogHistory: history,
@@ -25,17 +25,18 @@ func NewSynthesizer(llmAdapter *llm.Adapter) *Synthesizer {

 // SynthesizeParams 综合参数
 type SynthesizeParams struct {
-	UserID        string
-	SessionID     string
-	UserMessage   string
-	Nickname      string
-	PersonaPrompt string // 完整人格提示词
-	DialogHistory []model.LLMMessage // 对话历史
-	MemorySummary string            // 记忆检索摘要
-	ThoughtOutline string           // 通用对话思考
-	IoTSummary    string            // IoT 操作摘要
-	DeviceContext string            // 设备状态上下文
-	Mode          string            // text / voice_assistant
+	UserID         string
+	SessionID      string
+	UserMessage    string
+	Images         []string           // 图片 base64 data URL (多模态)
+	Nickname       string
+	PersonaPrompt  string             // 完整人格提示词
+	DialogHistory  []model.LLMMessage // 对话历史
+	MemorySummary  string             // 记忆检索摘要
+	ThoughtOutline string             // 通用对话思考
+	IoTSummary     string             // IoT 操作摘要
+	DeviceContext  string             // 设备状态上下文
+	Mode           string             // text / voice_assistant
 }

 // Synthesize 综合所有子会话结果，流式生成最终回复
@@ -99,10 +100,11 @@ func (s *Synthesizer) buildSynthesizeMessages(params SynthesizeParams) []model.L
 		messages = append(messages, params.DialogHistory...)
 	}

-	// 当前用户消息
+	// 当前用户消息 (支持多模态图片)
 	messages = append(messages, model.LLMMessage{
 		Role:    model.RoleUser,
 		Content: params.UserMessage,
+		Images:  params.Images,
 	})

 	return messages
@@ -0,0 +1,122 @@
+package tools
+
+import (
+	"context"
+	"encoding/base64"
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+)
+
+// VisionTool implements the "vision_analyze" tool.
+// It does not call a model itself: Execute reads an image file, encodes it
+// as a base64 data URL, and returns that URL together with a task-specific
+// prompt so the caller can feed both into a multimodal LLM.
+// The type carries no state.
+type VisionTool struct{}
+
+// NewVisionTool returns a ready-to-use VisionTool.
+func NewVisionTool() *VisionTool {
+	return new(VisionTool)
+}
+
+// Definition describes the tool: its name, a human-readable description, and
+// a JSON-Schema-style parameter object with two required string parameters,
+// "image_path" and "task" (one of ocr, describe, analyze).
+func (t *VisionTool) Definition() ToolDefinition {
+	imagePathParam := map[string]interface{}{
+		"type":        "string",
+		"description": "图片文件路径",
+	}
+	taskParam := map[string]interface{}{
+		"type":        "string",
+		"description": "分析任务: ocr(文字提取), describe(场景描述), analyze(综合分析)",
+		"enum":        []string{"ocr", "describe", "analyze"},
+	}
+	schema := map[string]interface{}{
+		"type": "object",
+		"properties": map[string]interface{}{
+			"image_path": imagePathParam,
+			"task":       taskParam,
+		},
+		"required": []string{"image_path", "task"},
+	}
+
+	return ToolDefinition{
+		Name:        "vision_analyze",
+		Description: "分析图片内容。传入图片路径，返回图片的 base64 data URL 用于多模态 LLM 分析。可用于 OCR 文字提取、物体识别、场景理解等。",
+		Parameters:  schema,
+	}
+}
+
+// Execute loads the image named by args["image_path"] and returns a JSON
+// document describing it for a multimodal model.
+//
+// Arguments:
+//   - "image_path" (string, required): path of the image file.
+//   - "task" (string): one of "ocr", "describe", "analyze"; an empty or
+//     missing value defaults to "analyze", any other value is rejected.
+//
+// On success ToolResult.Data holds a JSON object with the keys image_path,
+// task, data_url, mime_type, prompt and file_size, where file_size is the
+// size of the image in bytes. Invalid arguments and read failures are
+// reported through a ToolResult with Success=false; the returned error is
+// always nil.
+//
+// NOTE(review): data_url embeds the whole image, so the result can be tens
+// of megabytes — confirm that consumers of the tool result can handle that.
+func (t *VisionTool) Execute(ctx context.Context, args map[string]interface{}) (*ToolResult, error) {
+	const toolName = "vision_analyze"
+	fail := func(msg string) (*ToolResult, error) {
+		return &ToolResult{ToolName: toolName, Success: false, Error: msg}, nil
+	}
+
+	imagePath, _ := args["image_path"].(string)
+	if imagePath == "" {
+		return fail("image_path 参数不能为空")
+	}
+
+	task, _ := args["task"].(string)
+	if task == "" {
+		task = "analyze"
+	}
+
+	// Validate the task before touching the file system, so an unknown task
+	// never produces a "successful" result with an empty prompt.
+	var prompt string
+	switch task {
+	case "ocr":
+		prompt = "请提取这张图片中的所有文字内容，保持原始格式和排版。只输出文字内容，不要添加额外说明。"
+	case "describe":
+		prompt = "请详细描述这张图片的内容，包括场景、物体、人物、颜色、氛围等。"
+	case "analyze":
+		prompt = "请综合分析这张图片，包括内容描述、文字提取(如有)、以及你的理解。"
+	default:
+		return fail(fmt.Sprintf("不支持的 task: %q (可选: ocr, describe, analyze)", task))
+	}
+
+	dataURL, mimeType, err := encodeImageToDataURL(imagePath)
+	if err != nil {
+		return fail(fmt.Sprintf("读取图片失败: %v", err))
+	}
+
+	// Recover the image's byte size from the base64 payload: the data URL
+	// itself is roughly a third longer than the file it encodes.
+	payload := dataURL[strings.LastIndexByte(dataURL, ',')+1:]
+	padding := len(payload) - len(strings.TrimRight(payload, "="))
+	fileSize := base64.StdEncoding.DecodedLen(len(payload)) - padding
+
+	result, err := json.Marshal(map[string]interface{}{
+		"image_path": imagePath,
+		"task":       task,
+		"data_url":   dataURL,
+		"mime_type":  mimeType,
+		"prompt":     prompt,
+		"file_size":  fileSize,
+	})
+	if err != nil {
+		return fail(fmt.Sprintf("序列化结果失败: %v", err))
+	}
+
+	return &ToolResult{
+		ToolName: toolName,
+		Success:  true,
+		Data:     string(result),
+	}, nil
+}
+
+// encodeImageToDataURL reads the image file at path and returns it as a
+// base64 data URL together with its MIME type.
+//
+// The MIME type is derived from the case-insensitive file extension. Files
+// with an unrecognised extension are rejected rather than labelled as PNG,
+// so the function cannot be used to encode arbitrary non-image files.
+// Only regular, non-empty files of at most 20MB are accepted, and the size
+// is checked before the file is loaded into memory.
+func encodeImageToDataURL(path string) (dataURL, mimeType string, err error) {
+	const maxImageBytes = 20 * 1024 * 1024
+
+	switch ext := strings.ToLower(filepath.Ext(path)); ext {
+	case ".png":
+		mimeType = "image/png"
+	case ".jpg", ".jpeg":
+		mimeType = "image/jpeg"
+	case ".gif":
+		mimeType = "image/gif"
+	case ".webp":
+		mimeType = "image/webp"
+	case ".bmp":
+		mimeType = "image/bmp"
+	case ".svg":
+		mimeType = "image/svg+xml"
+	default:
+		return "", "", fmt.Errorf("unsupported image extension %q", ext)
+	}
+
+	// Inspect the file first: reading a device or a huge file just to find
+	// out afterwards that it is too large would be unbounded.
+	info, err := os.Stat(path)
+	if err != nil {
+		return "", "", fmt.Errorf("cannot read image: %w", err)
+	}
+	if !info.Mode().IsRegular() {
+		return "", "", fmt.Errorf("cannot read image: %s is not a regular file", path)
+	}
+	if info.Size() == 0 {
+		return "", "", fmt.Errorf("cannot read image: %s is empty", path)
+	}
+	if info.Size() > maxImageBytes {
+		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", info.Size())
+	}
+
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return "", "", fmt.Errorf("cannot read image: %w", err)
+	}
+	// The file may have grown between Stat and ReadFile.
+	if len(data) > maxImageBytes {
+		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", len(data))
+	}
+
+	b64 := base64.StdEncoding.EncodeToString(data)
+	return "data:" + mimeType + ";base64," + b64, mimeType, nil
+}