feat: 双模型并行图片预处理 — 视觉理解+OCR 同时调用,由会话模型自主判断

- preprocessImages 同时调用 vision 和 OCR 模型(并行 goroutine)
- 当 OCR 模型与视觉模型不是同一个模型时,OCR 专注文字提取,视觉模型负责场景理解
- 两种结果合并传给会话模型,由 LLM 自主判断和融合
- 修复 LoadFromDB 旧 action 角色记录映射为 assistant

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-29 13:03:04 +08:00
parent 91c9ee4b2d
commit 3e15285065
3 changed files with 99 additions and 16 deletions
+24
View File
@@ -178,6 +178,7 @@ func main() {
// 初始化工具注册中心 (使用共享插件模块)
toolRegistry := plgManager.NewToolRegistry()
var visionProvider llm.LLMProvider
var ocrProvider llm.LLMProvider
if getEnvBool("ENABLE_TOOLS", true) {
// 11 个共享通用插件 — 注册其工具到统一注册中心
registerPluginTools(toolRegistry, &pluginCalc.CalculatorPlugin{})
@@ -231,6 +232,25 @@ func main() {
if visionProvider == nil {
log.Println("视觉模型未配置,vision_analyze 将使用 base64 模式")
}
// 初始化 OCR 模型(与视觉模型并行调用,提供文字提取结果给会话模型自主判断)
ocrProvider = nil
if configLoader != nil && configLoader.HasConfig() {
cfg := configLoader.GetConfig()
if route, ok := cfg.Routing["ocr"]; ok && len(route.FallbackChain) > 0 {
for _, mid := range route.FallbackChain {
if _, ok := cfg.Models[mid]; ok {
ocrProvider, _ = modelSelector.Select(context.Background(), llm.PurposeOCR)
log.Printf("OCR模型已启用: %s", ocrProvider.ModelName())
break
}
}
}
}
if ocrProvider == nil {
log.Println("OCR模型未配置,图片文字提取将复用视觉模型")
}
toolRegistry.Register(wrapTool(tools.NewVisionTool(visionProvider), "vision_analyze", "Image Vision Analysis & OCR", "multimodal"))
if knowledgeRetriever != nil {
@@ -345,6 +365,10 @@ func main() {
orch.SetVisionProvider(visionProvider)
log.Printf("对话编排器: 视觉模型已注入 (%s)", visionProvider.ModelName())
}
if ocrProvider != nil {
orch.SetOCRProvider(ocrProvider)
log.Printf("对话编排器: OCR模型已注入 (%s)", ocrProvider.ModelName())
}
log.Println("对话编排器 v2.0 已就绪")
_ = orch
+6 -1
View File
@@ -127,8 +127,13 @@ func (cs *ConversationStore) LoadFromDB(databaseURL, sessionID string, limit int
if err := rows.Scan(&roleStr, &content); err != nil {
return fmt.Errorf("扫描消息行失败: %w", err)
}
// 将旧数据中的 "action" 角色映射为 "assistant"(LLM 模型不支持自定义角色)
role := model.Role(roleStr)
if role == "action" {
role = model.RoleAssistant
}
cs.messages[sessionID] = append(cs.messages[sessionID], model.LLMMessage{
Role: model.Role(roleStr),
Role: role,
Content: content,
})
loaded++
@@ -5,6 +5,7 @@ import (
"fmt"
"github.com/yourname/cyrene-ai/pkg/logger"
"strings"
"sync"
"time"
"github.com/yourname/cyrene-ai/ai-core/internal/cache"
@@ -38,7 +39,8 @@ type Orchestrator struct {
msgScheduler *scheduler.MessageScheduler
emotionTracker *persona.EmotionTracker
toolRegistry *plgManager.ToolRegistry
visionProvider llm.LLMProvider // 视觉模型 (图片预处理/OCR)
visionProvider llm.LLMProvider // 视觉模型 (图片预处理)
ocrProvider llm.LLMProvider // OCR 模型 (文字提取,与视觉模型并行调用)
}
// SetResponseCache sets the response cache (optional, for Phase 0.2).
@@ -77,6 +79,11 @@ func (o *Orchestrator) SetVisionProvider(vp llm.LLMProvider) {
o.visionProvider = vp
}
// SetOCRProvider stores op in the orchestrator's ocrProvider field, the
// provider used for text extraction from images.
// The setter performs no validation: whatever value is passed, including nil,
// overwrites the previous one.
func (o *Orchestrator) SetOCRProvider(op llm.LLMProvider) {
o.ocrProvider = op
}
// getBus returns the bus or a nop fallback.
func (o *Orchestrator) getBus() bus.Bus {
if o.eventBus == nil {
@@ -650,28 +657,75 @@ func (o *Orchestrator) CacheMessage(sessionID string, role model.Role, content s
}
}
// preprocessImages uses the vision model to analyze images and augments the user message.
// preprocessImages uses vision and OCR models to analyze images and augments the user message.
// When both vision and OCR providers are available (and are different models), they are called
// in parallel and both results are passed to the chat model for autonomous judgment.
// For standalone images (no text): generates a comprehensive description as the message.
// For text+images: appends image descriptions as contextual annotations.
func (o *Orchestrator) preprocessImages(ctx context.Context, message string, images []string) string {
var prompt string
if message == "" {
prompt = "请详细描述这张图片的内容,包括场景、物体、人物、文字(如有)、颜色、氛围等所有视觉信息。"
} else {
prompt = fmt.Sprintf("用户的问题是:「%s」\n\n请根据用户的问题,分析这张图片中相关的视觉信息,帮助回答用户的问题。如果图片中有文字,请完整提取。", message)
visionPromptBase := "请详细描述这张图片的内容,包括场景、物体、人物、文字(如有)、颜色、氛围等所有视觉信息。"
ocrPromptBase := `请逐字逐句完整提取图片中的所有文字内容,保持原有格式和排版。如果图片中没有文字,请回复"无文字"。`
if message != "" {
visionPromptBase = fmt.Sprintf("用户的问题是:「%s」\n\n请根据用户的问题,分析这张图片中相关的视觉信息,帮助回答用户的问题。如果图片中有文字,请完整提取。", message)
ocrPromptBase = fmt.Sprintf(`用户的问题是:「%s」
请逐字逐句完整提取图片中的所有文字内容,保持原有格式和排版。如果图片中没有文字,请回复"无文字"。`, message)
}
// Determine if OCR is a distinct model (avoid double-calling the same model)
useDual := o.ocrProvider != nil && o.visionProvider != nil &&
o.ocrProvider.ModelName() != o.visionProvider.ModelName()
var descriptions []string
for i, img := range images {
resp, err := o.visionProvider.Chat(ctx, []model.LLMMessage{
{Role: model.RoleUser, Content: prompt, Images: []string{img}},
})
if err != nil {
logger.Printf("[orchestrator] 图片 %d 预处理失败: %v", i, err)
continue
var visionDesc, ocrDesc string
var wg sync.WaitGroup
if o.visionProvider != nil {
wg.Add(1)
go func() {
defer wg.Done()
resp, err := o.visionProvider.Chat(ctx, []model.LLMMessage{
{Role: model.RoleUser, Content: visionPromptBase, Images: []string{img}},
})
if err != nil {
logger.Printf("[orchestrator] 图片 %d 视觉分析失败: %v", i, err)
return
}
visionDesc = resp.Content
}()
}
if resp.Content != "" {
descriptions = append(descriptions, resp.Content)
if useDual {
wg.Add(1)
go func() {
defer wg.Done()
resp, err := o.ocrProvider.Chat(ctx, []model.LLMMessage{
{Role: model.RoleUser, Content: ocrPromptBase, Images: []string{img}},
})
if err != nil {
logger.Printf("[orchestrator] 图片 %d OCR提取失败: %v", i, err)
return
}
ocrDesc = resp.Content
}()
}
wg.Wait()
var combined string
switch {
case visionDesc != "" && ocrDesc != "":
combined = fmt.Sprintf("[视觉分析]: %s\n[文字提取(OCR)]: %s", visionDesc, ocrDesc)
case visionDesc != "":
combined = visionDesc
case ocrDesc != "":
combined = ocrDesc
}
if combined != "" {
descriptions = append(descriptions, combined)
}
}