feat: 双模型并行图片预处理 — 视觉理解+OCR 同时调用,由会话模型自主判断
- preprocessImages 同时调用 vision 和 OCR 模型(并行 goroutine)
- 当两个模型不同时,OCR 专注文字提取,视觉模型负责场景理解
- 两种结果合并传给会话模型,由 LLM 自主判断和融合
- 修复 LoadFromDB:将旧数据中的 action 角色记录映射为 assistant

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -178,6 +178,7 @@ func main() {
|
||||
// 初始化工具注册中心 (使用共享插件模块)
|
||||
toolRegistry := plgManager.NewToolRegistry()
|
||||
var visionProvider llm.LLMProvider
|
||||
var ocrProvider llm.LLMProvider
|
||||
if getEnvBool("ENABLE_TOOLS", true) {
|
||||
// 11 个共享通用插件 — 注册其工具到统一注册中心
|
||||
registerPluginTools(toolRegistry, &pluginCalc.CalculatorPlugin{})
|
||||
@@ -231,6 +232,25 @@ func main() {
|
||||
if visionProvider == nil {
|
||||
log.Println("视觉模型未配置,vision_analyze 将使用 base64 模式")
|
||||
}
|
||||
|
||||
// 初始化 OCR 模型(与视觉模型并行调用,提供文字提取结果给会话模型自主判断)
|
||||
ocrProvider = nil
|
||||
if configLoader != nil && configLoader.HasConfig() {
|
||||
cfg := configLoader.GetConfig()
|
||||
if route, ok := cfg.Routing["ocr"]; ok && len(route.FallbackChain) > 0 {
|
||||
for _, mid := range route.FallbackChain {
|
||||
if _, ok := cfg.Models[mid]; ok {
|
||||
ocrProvider, _ = modelSelector.Select(context.Background(), llm.PurposeOCR)
|
||||
log.Printf("OCR模型已启用: %s", ocrProvider.ModelName())
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if ocrProvider == nil {
|
||||
log.Println("OCR模型未配置,图片文字提取将复用视觉模型")
|
||||
}
|
||||
|
||||
toolRegistry.Register(wrapTool(tools.NewVisionTool(visionProvider), "vision_analyze", "Image Vision Analysis & OCR", "multimodal"))
|
||||
|
||||
if knowledgeRetriever != nil {
|
||||
@@ -345,6 +365,10 @@ func main() {
|
||||
orch.SetVisionProvider(visionProvider)
|
||||
log.Printf("对话编排器: 视觉模型已注入 (%s)", visionProvider.ModelName())
|
||||
}
|
||||
if ocrProvider != nil {
|
||||
orch.SetOCRProvider(ocrProvider)
|
||||
log.Printf("对话编排器: OCR模型已注入 (%s)", ocrProvider.ModelName())
|
||||
}
|
||||
log.Println("对话编排器 v2.0 已就绪")
|
||||
_ = orch
|
||||
|
||||
|
||||
@@ -127,8 +127,13 @@ func (cs *ConversationStore) LoadFromDB(databaseURL, sessionID string, limit int
|
||||
if err := rows.Scan(&roleStr, &content); err != nil {
|
||||
return fmt.Errorf("扫描消息行失败: %w", err)
|
||||
}
|
||||
// 将旧数据中的 "action" 角色映射为 "assistant"(LLM 模型不支持自定义角色)
|
||||
role := model.Role(roleStr)
|
||||
if role == "action" {
|
||||
role = model.RoleAssistant
|
||||
}
|
||||
cs.messages[sessionID] = append(cs.messages[sessionID], model.LLMMessage{
|
||||
Role: model.Role(roleStr),
|
||||
Role: role,
|
||||
Content: content,
|
||||
})
|
||||
loaded++
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"fmt"
|
||||
"github.com/yourname/cyrene-ai/pkg/logger"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/yourname/cyrene-ai/ai-core/internal/cache"
|
||||
@@ -38,7 +39,8 @@ type Orchestrator struct {
|
||||
msgScheduler *scheduler.MessageScheduler
|
||||
emotionTracker *persona.EmotionTracker
|
||||
toolRegistry *plgManager.ToolRegistry
|
||||
visionProvider llm.LLMProvider // 视觉模型 (图片预处理/OCR)
|
||||
visionProvider llm.LLMProvider // 视觉模型 (图片预处理)
|
||||
ocrProvider llm.LLMProvider // OCR 模型 (文字提取,与视觉模型并行调用)
|
||||
}
|
||||
|
||||
// SetResponseCache sets the response cache (optional, for Phase 0.2).
|
||||
@@ -77,6 +79,11 @@ func (o *Orchestrator) SetVisionProvider(vp llm.LLMProvider) {
|
||||
o.visionProvider = vp
|
||||
}
|
||||
|
||||
// SetOCRProvider sets the OCR model provider for text extraction.
// It stores op in the orchestrator's ocrProvider field, replacing any
// previously stored value. No nil check or validation is performed here.
|
||||
func (o *Orchestrator) SetOCRProvider(op llm.LLMProvider) {
|
||||
o.ocrProvider = op
|
||||
}
|
||||
|
||||
// getBus returns the bus or a nop fallback.
|
||||
func (o *Orchestrator) getBus() bus.Bus {
|
||||
if o.eventBus == nil {
|
||||
@@ -650,28 +657,75 @@ func (o *Orchestrator) CacheMessage(sessionID string, role model.Role, content s
|
||||
}
|
||||
}
|
||||
|
||||
// preprocessImages uses the vision model to analyze images and augments the user message.
|
||||
// preprocessImages uses vision and OCR models to analyze images and augments the user message.
|
||||
// When both vision and OCR providers are available (and are different models), they are called
|
||||
// in parallel and both results are passed to the chat model for autonomous judgment.
|
||||
// For standalone images (no text): generates a comprehensive description as the message.
|
||||
// For text+images: appends image descriptions as contextual annotations.
|
||||
func (o *Orchestrator) preprocessImages(ctx context.Context, message string, images []string) string {
|
||||
var prompt string
|
||||
if message == "" {
|
||||
prompt = "请详细描述这张图片的内容,包括场景、物体、人物、文字(如有)、颜色、氛围等所有视觉信息。"
|
||||
} else {
|
||||
prompt = fmt.Sprintf("用户的问题是:「%s」\n\n请根据用户的问题,分析这张图片中相关的视觉信息,帮助回答用户的问题。如果图片中有文字,请完整提取。", message)
|
||||
visionPromptBase := "请详细描述这张图片的内容,包括场景、物体、人物、文字(如有)、颜色、氛围等所有视觉信息。"
|
||||
ocrPromptBase := `请逐字逐句完整提取图片中的所有文字内容,保持原有格式和排版。如果图片中没有文字,请回复"无文字"。`
|
||||
|
||||
if message != "" {
|
||||
visionPromptBase = fmt.Sprintf("用户的问题是:「%s」\n\n请根据用户的问题,分析这张图片中相关的视觉信息,帮助回答用户的问题。如果图片中有文字,请完整提取。", message)
|
||||
ocrPromptBase = fmt.Sprintf(`用户的问题是:「%s」
|
||||
|
||||
请逐字逐句完整提取图片中的所有文字内容,保持原有格式和排版。如果图片中没有文字,请回复"无文字"。`, message)
|
||||
}
|
||||
|
||||
// Determine if OCR is a distinct model (avoid double-calling the same model)
|
||||
useDual := o.ocrProvider != nil && o.visionProvider != nil &&
|
||||
o.ocrProvider.ModelName() != o.visionProvider.ModelName()
|
||||
|
||||
var descriptions []string
|
||||
for i, img := range images {
|
||||
resp, err := o.visionProvider.Chat(ctx, []model.LLMMessage{
|
||||
{Role: model.RoleUser, Content: prompt, Images: []string{img}},
|
||||
})
|
||||
if err != nil {
|
||||
logger.Printf("[orchestrator] 图片 %d 预处理失败: %v", i, err)
|
||||
continue
|
||||
var visionDesc, ocrDesc string
|
||||
var wg sync.WaitGroup
|
||||
|
||||
if o.visionProvider != nil {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
resp, err := o.visionProvider.Chat(ctx, []model.LLMMessage{
|
||||
{Role: model.RoleUser, Content: visionPromptBase, Images: []string{img}},
|
||||
})
|
||||
if err != nil {
|
||||
logger.Printf("[orchestrator] 图片 %d 视觉分析失败: %v", i, err)
|
||||
return
|
||||
}
|
||||
visionDesc = resp.Content
|
||||
}()
|
||||
}
|
||||
if resp.Content != "" {
|
||||
descriptions = append(descriptions, resp.Content)
|
||||
|
||||
if useDual {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
resp, err := o.ocrProvider.Chat(ctx, []model.LLMMessage{
|
||||
{Role: model.RoleUser, Content: ocrPromptBase, Images: []string{img}},
|
||||
})
|
||||
if err != nil {
|
||||
logger.Printf("[orchestrator] 图片 %d OCR提取失败: %v", i, err)
|
||||
return
|
||||
}
|
||||
ocrDesc = resp.Content
|
||||
}()
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
var combined string
|
||||
switch {
|
||||
case visionDesc != "" && ocrDesc != "":
|
||||
combined = fmt.Sprintf("[视觉分析]: %s\n[文字提取(OCR)]: %s", visionDesc, ocrDesc)
|
||||
case visionDesc != "":
|
||||
combined = visionDesc
|
||||
case ocrDesc != "":
|
||||
combined = ocrDesc
|
||||
}
|
||||
|
||||
if combined != "" {
|
||||
descriptions = append(descriptions, combined)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user