feat: Phase 6.3 视觉理解 — 多模态图片输入 + OCR/Vision 工具 + 图片编码管线

- LLMMessage 新增 Images 字段支持多模态 content array
- OpenAIProvider 支持 image_url content parts
- VisionTool: 图片读取 + base64 编码 + OCR/场景描述/综合分析
- 对话管道全线支持 images 参数传递 (Gateway->Orchestrator->Synthesizer->LLM)
- 自动根据图片有无构建 text-only 或 multimodal content

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-23 22:28:42 +08:00
parent 38b36fc5ad
commit 9a8fb8d0ce
7 changed files with 205 additions and 24 deletions
+41 -6
View File
@@ -61,7 +61,7 @@ type openAIRequest struct {
type openAIMessage struct {
Role string `json:"role"`
Content string `json:"content,omitempty"`
Content interface{} `json:"content,omitempty"` // string or []model.ImageContent for multimodal
Name string `json:"name,omitempty"`
ToolCalls []openAIToolCall `json:"tool_calls,omitempty"`
ToolCallID string `json:"tool_call_id,omitempty"`
@@ -180,8 +180,8 @@ func (p *OpenAIProvider) ChatStreamWithTools(ctx context.Context, messages []mod
if len(streamResp.Choices) > 0 {
delta := streamResp.Choices[0].Delta
if delta.Content != "" {
ch <- StreamChunk{Content: delta.Content}
if deltaStr := contentString(delta.Content); deltaStr != "" {
ch <- StreamChunk{Content: deltaStr}
}
if streamResp.Choices[0].FinishReason != "" {
usage := &model.Usage{}
@@ -228,7 +228,7 @@ func (p *OpenAIProvider) doChat(ctx context.Context, messages []model.LLMMessage
for i, msg := range messages {
oaiMsg := openAIMessage{
Role: string(msg.Role),
Content: msg.Content,
Content: buildContent(msg.Content, msg.Images),
Name: msg.Name,
ToolCallID: msg.ToolCallID,
ReasoningContent: msg.ReasoningContent,
@@ -305,7 +305,7 @@ func (p *OpenAIProvider) doChat(ctx context.Context, messages []model.LLMMessage
// 检查是否有工具调用
choice := oaiResp.Choices[0]
llmResp := &model.LLMResponse{
Content: choice.Message.Content,
Content: contentString(choice.Message.Content),
FinishReason: choice.FinishReason,
ReasoningContent: choice.Message.ReasoningContent,
Usage: model.Usage{
@@ -335,7 +335,7 @@ func (p *OpenAIProvider) doChatStream(ctx context.Context, messages []model.LLMM
for i, msg := range messages {
oaiMsg := openAIMessage{
Role: string(msg.Role),
Content: msg.Content,
Content: buildContent(msg.Content, msg.Images),
Name: msg.Name,
ToolCallID: msg.ToolCallID,
ReasoningContent: msg.ReasoningContent,
@@ -399,3 +399,38 @@ func (p *OpenAIProvider) doChatStream(ctx context.Context, messages []model.LLMM
// ModelName reports the Model value held in this provider's config.
func (p *OpenAIProvider) ModelName() string {
	configured := p.config.Model
	return configured
}
// contentString extracts the text carried by a message Content value.
//
// Content is typed interface{} so that it can hold either a plain string or a
// multimodal content array. The shapes handled here are:
//   - nil: returns "".
//   - string: returned unchanged.
//   - []interface{} (the type encoding/json stores for a JSON array decoded
//     into an interface{}): the "text" field of every element whose "type" is
//     "text" is concatenated in order; all other elements are ignored.
//
// Any other dynamic type yields "".
func contentString(v interface{}) string {
	switch c := v.(type) {
	case string:
		return c
	case []interface{}:
		// Plain concatenation keeps this helper free of extra imports; a
		// content array only ever holds a handful of parts.
		var text string
		for _, part := range c {
			obj, ok := part.(map[string]interface{})
			if !ok {
				continue
			}
			if kind, _ := obj["type"].(string); kind != "text" {
				continue
			}
			if s, ok := obj["text"].(string); ok {
				text += s
			}
		}
		return text
	default:
		// Covers a nil interface as well as unsupported dynamic types.
		return ""
	}
}
// buildContent converts a message's text and optional images into the value
// assigned to openAIMessage.Content.
//
// Parameters:
//   - text: the textual part of the message; may be empty.
//   - images: image references, each passed through verbatim as the URL of an
//     "image_url" content part.
//
// Returns:
//   - nil when there is neither text nor any image. Content is an interface{}
//     field tagged `omitempty`, and encoding/json omits such a field only when
//     the interface itself is nil; returning "" would serialize an explicit
//     "content":"" instead of leaving the key out.
//   - the plain text string when there are no images.
//   - a []model.ImageContent otherwise: one "text" part first (only if text is
//     non-empty) followed by one "image_url" part per image, in input order.
func buildContent(text string, images []string) interface{} {
	if len(images) == 0 {
		if text == "" {
			return nil
		}
		return text
	}

	// Capacity covers every image plus the optional leading text part.
	parts := make([]model.ImageContent, 0, len(images)+1)
	if text != "" {
		parts = append(parts, model.ImageContent{
			Type: "text",
			Text: text,
		})
	}
	for _, img := range images {
		parts = append(parts, model.ImageContent{
			Type: "image_url",
			ImageURL: &model.ImageURL{
				URL: img,
			},
		})
	}
	return parts
}