feat: 双模型并行图片预处理 — 视觉理解+OCR 同时调用,由会话模型自主判断
- preprocessImages 同时调用 vision 和 OCR 模型(并行 goroutine)
- 当两个模型不同时,OCR 专注文字提取,视觉模型负责场景理解
- 两种结果合并传给会话模型,由 LLM 自主判断和融合
- 修复 LoadFromDB 旧 action 角色记录映射为 assistant

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -5,6 +5,7 @@ import (
|
||||
"fmt"
|
||||
"github.com/yourname/cyrene-ai/pkg/logger"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/yourname/cyrene-ai/ai-core/internal/cache"
|
||||
@@ -38,7 +39,8 @@ type Orchestrator struct {
|
||||
msgScheduler *scheduler.MessageScheduler
|
||||
emotionTracker *persona.EmotionTracker
|
||||
toolRegistry *plgManager.ToolRegistry
|
||||
visionProvider llm.LLMProvider // 视觉模型 (图片预处理/OCR)
|
||||
visionProvider llm.LLMProvider // 视觉模型 (图片预处理)
|
||||
ocrProvider llm.LLMProvider // OCR 模型 (文字提取,与视觉模型并行调用)
|
||||
}
|
||||
|
||||
// SetResponseCache sets the response cache (optional, for Phase 0.2).
|
||||
@@ -77,6 +79,11 @@ func (o *Orchestrator) SetVisionProvider(vp llm.LLMProvider) {
|
||||
o.visionProvider = vp
|
||||
}
|
||||
|
||||
// SetOCRProvider sets the OCR model provider for text extraction.
// It stores op on the Orchestrator as ocrProvider without any validation.
// In the preprocessImages code visible in this change, the OCR provider is
// only invoked when a vision provider is also set and the two providers
// report different ModelName() values; otherwise only the vision provider
// is called.
|
||||
func (o *Orchestrator) SetOCRProvider(op llm.LLMProvider) {
|
||||
o.ocrProvider = op
|
||||
}
|
||||
|
||||
// getBus returns the bus or a nop fallback.
|
||||
func (o *Orchestrator) getBus() bus.Bus {
|
||||
if o.eventBus == nil {
|
||||
@@ -650,28 +657,75 @@ func (o *Orchestrator) CacheMessage(sessionID string, role model.Role, content s
|
||||
}
|
||||
}
|
||||
|
||||
// preprocessImages uses the vision model to analyze images and augments the user message.
|
||||
// preprocessImages uses vision and OCR models to analyze images and augments the user message.
|
||||
// When both vision and OCR providers are available (and are different models), they are called
|
||||
// in parallel and both results are passed to the chat model for autonomous judgment.
|
||||
// For standalone images (no text): generates a comprehensive description as the message.
|
||||
// For text+images: appends image descriptions as contextual annotations.
|
||||
func (o *Orchestrator) preprocessImages(ctx context.Context, message string, images []string) string {
|
||||
var prompt string
|
||||
if message == "" {
|
||||
prompt = "请详细描述这张图片的内容,包括场景、物体、人物、文字(如有)、颜色、氛围等所有视觉信息。"
|
||||
} else {
|
||||
prompt = fmt.Sprintf("用户的问题是:「%s」\n\n请根据用户的问题,分析这张图片中相关的视觉信息,帮助回答用户的问题。如果图片中有文字,请完整提取。", message)
|
||||
visionPromptBase := "请详细描述这张图片的内容,包括场景、物体、人物、文字(如有)、颜色、氛围等所有视觉信息。"
|
||||
ocrPromptBase := `请逐字逐句完整提取图片中的所有文字内容,保持原有格式和排版。如果图片中没有文字,请回复"无文字"。`
|
||||
|
||||
if message != "" {
|
||||
visionPromptBase = fmt.Sprintf("用户的问题是:「%s」\n\n请根据用户的问题,分析这张图片中相关的视觉信息,帮助回答用户的问题。如果图片中有文字,请完整提取。", message)
|
||||
ocrPromptBase = fmt.Sprintf(`用户的问题是:「%s」
|
||||
|
||||
请逐字逐句完整提取图片中的所有文字内容,保持原有格式和排版。如果图片中没有文字,请回复"无文字"。`, message)
|
||||
}
|
||||
|
||||
// Determine if OCR is a distinct model (avoid double-calling the same model)
|
||||
useDual := o.ocrProvider != nil && o.visionProvider != nil &&
|
||||
o.ocrProvider.ModelName() != o.visionProvider.ModelName()
|
||||
|
||||
var descriptions []string
|
||||
for i, img := range images {
|
||||
resp, err := o.visionProvider.Chat(ctx, []model.LLMMessage{
|
||||
{Role: model.RoleUser, Content: prompt, Images: []string{img}},
|
||||
})
|
||||
if err != nil {
|
||||
logger.Printf("[orchestrator] 图片 %d 预处理失败: %v", i, err)
|
||||
continue
|
||||
var visionDesc, ocrDesc string
|
||||
var wg sync.WaitGroup
|
||||
|
||||
if o.visionProvider != nil {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
resp, err := o.visionProvider.Chat(ctx, []model.LLMMessage{
|
||||
{Role: model.RoleUser, Content: visionPromptBase, Images: []string{img}},
|
||||
})
|
||||
if err != nil {
|
||||
logger.Printf("[orchestrator] 图片 %d 视觉分析失败: %v", i, err)
|
||||
return
|
||||
}
|
||||
visionDesc = resp.Content
|
||||
}()
|
||||
}
|
||||
if resp.Content != "" {
|
||||
descriptions = append(descriptions, resp.Content)
|
||||
|
||||
if useDual {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
resp, err := o.ocrProvider.Chat(ctx, []model.LLMMessage{
|
||||
{Role: model.RoleUser, Content: ocrPromptBase, Images: []string{img}},
|
||||
})
|
||||
if err != nil {
|
||||
logger.Printf("[orchestrator] 图片 %d OCR提取失败: %v", i, err)
|
||||
return
|
||||
}
|
||||
ocrDesc = resp.Content
|
||||
}()
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
var combined string
|
||||
switch {
|
||||
case visionDesc != "" && ocrDesc != "":
|
||||
combined = fmt.Sprintf("[视觉分析]: %s\n[文字提取(OCR)]: %s", visionDesc, ocrDesc)
|
||||
case visionDesc != "":
|
||||
combined = visionDesc
|
||||
case ocrDesc != "":
|
||||
combined = ocrDesc
|
||||
}
|
||||
|
||||
if combined != "" {
|
||||
descriptions = append(descriptions, combined)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user