feat: 双模型并行图片预处理 — 视觉理解+OCR 同时调用,由会话模型自主判断

- preprocessImages 同时调用 vision 和 OCR 模型(并行 goroutine)
- 当两个模型不同时,OCR 专注文字提取,视觉模型负责场景理解
- 两种结果合并传给会话模型,由 LLM 自主判断和融合
- 修复 LoadFromDB 旧 action 角色记录映射为 assistant

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-29 13:03:04 +08:00
parent 91c9ee4b2d
commit 3e15285065
3 changed files with 99 additions and 16 deletions
@@ -5,6 +5,7 @@ import (
"fmt"
"github.com/yourname/cyrene-ai/pkg/logger"
"strings"
"sync"
"time"
"github.com/yourname/cyrene-ai/ai-core/internal/cache"
@@ -38,7 +39,8 @@ type Orchestrator struct {
msgScheduler *scheduler.MessageScheduler
emotionTracker *persona.EmotionTracker
toolRegistry *plgManager.ToolRegistry
visionProvider llm.LLMProvider // 视觉模型 (图片预处理/OCR)
visionProvider llm.LLMProvider // 视觉模型 (图片预处理)
ocrProvider llm.LLMProvider // OCR 模型 (文字提取,与视觉模型并行调用)
}
// SetResponseCache sets the response cache (optional, for Phase 0.2).
@@ -77,6 +79,11 @@ func (o *Orchestrator) SetVisionProvider(vp llm.LLMProvider) {
o.visionProvider = vp
}
// SetOCRProvider stores op as the orchestrator's OCR model provider, used for
// extracting text from images. preprocessImages issues the OCR call only when
// both this provider and the vision provider are non-nil and their ModelName()
// values differ, so the same model is not called twice for one image.
func (o *Orchestrator) SetOCRProvider(op llm.LLMProvider) {
o.ocrProvider = op
}
// getBus returns the bus or a nop fallback.
func (o *Orchestrator) getBus() bus.Bus {
if o.eventBus == nil {
@@ -650,28 +657,75 @@ func (o *Orchestrator) CacheMessage(sessionID string, role model.Role, content s
}
}
// preprocessImages uses the vision model to analyze images and augments the user message.
// preprocessImages uses vision and OCR models to analyze images and augments the user message.
// When both vision and OCR providers are available (and are different models), they are called
// in parallel and both results are passed to the chat model for autonomous judgment.
// For standalone images (no text): generates a comprehensive description as the message.
// For text+images: appends image descriptions as contextual annotations.
func (o *Orchestrator) preprocessImages(ctx context.Context, message string, images []string) string {
var prompt string
if message == "" {
prompt = "请详细描述这张图片的内容,包括场景、物体、人物、文字(如有)、颜色、氛围等所有视觉信息。"
} else {
prompt = fmt.Sprintf("用户的问题是:「%s」\n\n请根据用户的问题,分析这张图片中相关的视觉信息,帮助回答用户的问题。如果图片中有文字,请完整提取。", message)
visionPromptBase := "请详细描述这张图片的内容,包括场景、物体、人物、文字(如有)、颜色、氛围等所有视觉信息。"
ocrPromptBase := `请逐字逐句完整提取图片中的所有文字内容,保持原有格式和排版。如果图片中没有文字,请回复"无文字"。`
if message != "" {
visionPromptBase = fmt.Sprintf("用户的问题是:「%s」\n\n请根据用户的问题,分析这张图片中相关的视觉信息,帮助回答用户的问题。如果图片中有文字,请完整提取。", message)
ocrPromptBase = fmt.Sprintf(`用户的问题是:「%s」
请逐字逐句完整提取图片中的所有文字内容,保持原有格式和排版。如果图片中没有文字,请回复"无文字"。`, message)
}
// Determine if OCR is a distinct model (avoid double-calling the same model)
useDual := o.ocrProvider != nil && o.visionProvider != nil &&
o.ocrProvider.ModelName() != o.visionProvider.ModelName()
var descriptions []string
for i, img := range images {
resp, err := o.visionProvider.Chat(ctx, []model.LLMMessage{
{Role: model.RoleUser, Content: prompt, Images: []string{img}},
})
if err != nil {
logger.Printf("[orchestrator] 图片 %d 预处理失败: %v", i, err)
continue
var visionDesc, ocrDesc string
var wg sync.WaitGroup
if o.visionProvider != nil {
wg.Add(1)
go func() {
defer wg.Done()
resp, err := o.visionProvider.Chat(ctx, []model.LLMMessage{
{Role: model.RoleUser, Content: visionPromptBase, Images: []string{img}},
})
if err != nil {
logger.Printf("[orchestrator] 图片 %d 视觉分析失败: %v", i, err)
return
}
visionDesc = resp.Content
}()
}
if resp.Content != "" {
descriptions = append(descriptions, resp.Content)
if useDual {
wg.Add(1)
go func() {
defer wg.Done()
resp, err := o.ocrProvider.Chat(ctx, []model.LLMMessage{
{Role: model.RoleUser, Content: ocrPromptBase, Images: []string{img}},
})
if err != nil {
logger.Printf("[orchestrator] 图片 %d OCR提取失败: %v", i, err)
return
}
ocrDesc = resp.Content
}()
}
wg.Wait()
var combined string
switch {
case visionDesc != "" && ocrDesc != "":
combined = fmt.Sprintf("[视觉分析]: %s\n[文字提取(OCR)]: %s", visionDesc, ocrDesc)
case visionDesc != "":
combined = visionDesc
case ocrDesc != "":
combined = ocrDesc
}
if combined != "" {
descriptions = append(descriptions, combined)
}
}