feat: Phase 6.3 视觉理解 — 多模态图片输入 + OCR/Vision 工具 + 图片编码管线

- LLMMessage 新增 Images 字段支持多模态 content array
- OpenAIProvider 支持 image_url content parts
- VisionTool: 图片读取 + base64 编码 + OCR/场景描述/综合分析
- 对话管道全线支持 images 参数传递 (Gateway->Orchestrator->Synthesizer->LLM)
- 自动根据图片有无构建 text-only 或 multimodal content

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-23 22:28:42 +08:00
parent 38b36fc5ad
commit 9a8fb8d0ce
7 changed files with 205 additions and 24 deletions
@@ -61,7 +61,7 @@ type openAIRequest struct {

 type openAIMessage struct {
 	Role             string            `json:"role"`
-	Content          string            `json:"content,omitempty"`
+	Content          interface{}       `json:"content,omitempty"` // string or []model.ImageContent for multimodal
 	Name             string            `json:"name,omitempty"`
 	ToolCalls        []openAIToolCall  `json:"tool_calls,omitempty"`
 	ToolCallID       string            `json:"tool_call_id,omitempty"`
@@ -180,8 +180,8 @@ func (p *OpenAIProvider) ChatStreamWithTools(ctx context.Context, messages []mod

 			if len(streamResp.Choices) > 0 {
 				delta := streamResp.Choices[0].Delta
-				if delta.Content != "" {
-					ch <- StreamChunk{Content: delta.Content}
+				if deltaStr := contentString(delta.Content); deltaStr != "" {
+					ch <- StreamChunk{Content: deltaStr}
 				}
 				if streamResp.Choices[0].FinishReason != "" {
 					usage := &model.Usage{}
@@ -228,7 +228,7 @@ func (p *OpenAIProvider) doChat(ctx context.Context, messages []model.LLMMessage
 	for i, msg := range messages {
 		oaiMsg := openAIMessage{
 			Role:             string(msg.Role),
-			Content:          msg.Content,
+			Content:          buildContent(msg.Content, msg.Images),
 			Name:             msg.Name,
 			ToolCallID:       msg.ToolCallID,
 			ReasoningContent: msg.ReasoningContent,
@@ -305,7 +305,7 @@ func (p *OpenAIProvider) doChat(ctx context.Context, messages []model.LLMMessage
 	// 检查是否有工具调用
 	choice := oaiResp.Choices[0]
 	llmResp := &model.LLMResponse{
-		Content:          choice.Message.Content,
+		Content:          contentString(choice.Message.Content),
 		FinishReason:     choice.FinishReason,
 		ReasoningContent: choice.Message.ReasoningContent,
 		Usage: model.Usage{
@@ -335,7 +335,7 @@ func (p *OpenAIProvider) doChatStream(ctx context.Context, messages []model.LLMM
 	for i, msg := range messages {
 		oaiMsg := openAIMessage{
 			Role:             string(msg.Role),
-			Content:          msg.Content,
+			Content:          buildContent(msg.Content, msg.Images),
 			Name:             msg.Name,
 			ToolCallID:       msg.ToolCallID,
 			ReasoningContent: msg.ReasoningContent,
@@ -399,3 +399,38 @@ func (p *OpenAIProvider) doChatStream(ctx context.Context, messages []model.LLMM
 func (p *OpenAIProvider) ModelName() string {
 	return p.config.Model
 }
+
+// contentString returns v unchanged when its dynamic type is string.
+// Any other value, a nil interface included, yields the empty string.
+func contentString(v interface{}) string {
+	text, isString := v.(string)
+	if !isString {
+		// Covers nil as well: asserting on a nil interface fails.
+		return ""
+	}
+	return text
+}
+
+// buildContent converts text plus optional images into the value placed in
+// openAIMessage.Content. With no images it returns the plain string, or nil
+// when text is also empty: `omitempty` drops a nil interface but not one
+// holding "", so returning nil keeps an empty content field out of the
+// request JSON. With images it returns a []model.ImageContent: an optional
+// "text" part followed by one "image_url" part per entry of images.
+func buildContent(text string, images []string) interface{} {
+	if len(images) == 0 {
+		if text == "" {
+			return nil
+		}
+		return text
+	}
+	parts := make([]model.ImageContent, 0, len(images)+1)
+	if text != "" {
+		parts = append(parts, model.ImageContent{Type: "text", Text: text})
+	}
+	for _, img := range images {
+		// img is sent verbatim as the URL; presumably an http(s) or data: URI — TODO confirm with callers.
+		parts = append(parts, model.ImageContent{Type: "image_url", ImageURL: &model.ImageURL{URL: img}})
+	}
+	return parts
+}