feat: Phase 6.3 视觉理解 — 多模态图片输入 + OCR/Vision 工具 + 图片编码管线
- LLMMessage 新增 Images 字段支持多模态 content array
- OpenAIProvider 支持 image_url content parts
- VisionTool: 图片读取 + base64 编码 + OCR/场景描述/综合分析
- 对话管道全线支持 images 参数传递 (Gateway->Orchestrator->Synthesizer->LLM)
- 自动根据图片有无构建 text-only 或 multimodal content

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -61,7 +61,7 @@ type openAIRequest struct {
|
||||
|
||||
type openAIMessage struct {
|
||||
Role string `json:"role"`
|
||||
Content string `json:"content,omitempty"`
|
||||
Content interface{} `json:"content,omitempty"` // string or []model.ImageContent for multimodal
|
||||
Name string `json:"name,omitempty"`
|
||||
ToolCalls []openAIToolCall `json:"tool_calls,omitempty"`
|
||||
ToolCallID string `json:"tool_call_id,omitempty"`
|
||||
@@ -180,8 +180,8 @@ func (p *OpenAIProvider) ChatStreamWithTools(ctx context.Context, messages []mod
|
||||
|
||||
if len(streamResp.Choices) > 0 {
|
||||
delta := streamResp.Choices[0].Delta
|
||||
if delta.Content != "" {
|
||||
ch <- StreamChunk{Content: delta.Content}
|
||||
if deltaStr := contentString(delta.Content); deltaStr != "" {
|
||||
ch <- StreamChunk{Content: deltaStr}
|
||||
}
|
||||
if streamResp.Choices[0].FinishReason != "" {
|
||||
usage := &model.Usage{}
|
||||
@@ -228,7 +228,7 @@ func (p *OpenAIProvider) doChat(ctx context.Context, messages []model.LLMMessage
|
||||
for i, msg := range messages {
|
||||
oaiMsg := openAIMessage{
|
||||
Role: string(msg.Role),
|
||||
Content: msg.Content,
|
||||
Content: buildContent(msg.Content, msg.Images),
|
||||
Name: msg.Name,
|
||||
ToolCallID: msg.ToolCallID,
|
||||
ReasoningContent: msg.ReasoningContent,
|
||||
@@ -305,7 +305,7 @@ func (p *OpenAIProvider) doChat(ctx context.Context, messages []model.LLMMessage
|
||||
// 检查是否有工具调用
|
||||
choice := oaiResp.Choices[0]
|
||||
llmResp := &model.LLMResponse{
|
||||
Content: choice.Message.Content,
|
||||
Content: contentString(choice.Message.Content),
|
||||
FinishReason: choice.FinishReason,
|
||||
ReasoningContent: choice.Message.ReasoningContent,
|
||||
Usage: model.Usage{
|
||||
@@ -335,7 +335,7 @@ func (p *OpenAIProvider) doChatStream(ctx context.Context, messages []model.LLMM
|
||||
for i, msg := range messages {
|
||||
oaiMsg := openAIMessage{
|
||||
Role: string(msg.Role),
|
||||
Content: msg.Content,
|
||||
Content: buildContent(msg.Content, msg.Images),
|
||||
Name: msg.Name,
|
||||
ToolCallID: msg.ToolCallID,
|
||||
ReasoningContent: msg.ReasoningContent,
|
||||
@@ -399,3 +399,38 @@ func (p *OpenAIProvider) doChatStream(ctx context.Context, messages []model.LLMM
|
||||
func (p *OpenAIProvider) ModelName() string {
|
||||
return p.config.Model
|
||||
}
|
||||
|
||||
// contentString extracts the text carried by a message content value.
//
// The content field is held as an interface{}, so v may be:
//   - nil: "" is returned.
//   - a string: returned unchanged.
//   - a JSON-decoded array of content parts ([]interface{} whose elements are
//     map[string]interface{}): the "text" values of every part whose "type"
//     is "text" are concatenated in order; other parts (e.g. "image_url")
//     are skipped.
//
// Any other shape yields "".
func contentString(v interface{}) string {
	switch c := v.(type) {
	case string:
		return c
	case []interface{}:
		// Multimodal-style content: keep only the textual parts instead of
		// silently discarding the whole message.
		var text string
		for _, part := range c {
			fields, ok := part.(map[string]interface{})
			if !ok {
				continue
			}
			if kind, _ := fields["type"].(string); kind != "text" {
				continue
			}
			if s, ok := fields["text"].(string); ok {
				text += s
			}
		}
		return text
	default:
		// Covers nil as well as any unexpected type.
		return ""
	}
}
|
||||
|
||||
// buildContent converts text + optional images to API content format.
|
||||
// Returns a plain string if no images, or a multimodal array otherwise.
|
||||
func buildContent(text string, images []string) interface{} {
|
||||
if len(images) == 0 {
|
||||
return text
|
||||
}
|
||||
parts := make([]model.ImageContent, 0, len(images)+1)
|
||||
if text != "" {
|
||||
parts = append(parts, model.ImageContent{
|
||||
Type: "text",
|
||||
Text: text,
|
||||
})
|
||||
}
|
||||
for _, img := range images {
|
||||
parts = append(parts, model.ImageContent{
|
||||
Type: "image_url",
|
||||
ImageURL: &model.ImageURL{
|
||||
URL: img,
|
||||
},
|
||||
})
|
||||
}
|
||||
return parts
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user