feat: 双模型并行图片预处理 — 视觉理解+OCR 同时调用，由会话模型自主判断

- preprocessImages 同时调用 vision 和 OCR 模型（并行 goroutine）
- 当两个模型不同时，OCR 专注文字提取，视觉模型负责场景理解
- 两种结果合并后传给会话模型，由 LLM 自主判断和融合
- 修复 LoadFromDB：将旧数据中的 action 角色记录映射为 assistant

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-29 13:03:04 +08:00
parent 91c9ee4b2d
commit 3e15285065
3 changed files with 99 additions and 16 deletions
@@ -178,6 +178,7 @@ func main() {
 	// 初始化工具注册中心 (使用共享插件模块)
 	toolRegistry := plgManager.NewToolRegistry()
 	var visionProvider llm.LLMProvider
+	var ocrProvider llm.LLMProvider
 	if getEnvBool("ENABLE_TOOLS", true) {
 		// 11 个共享通用插件 — 注册其工具到统一注册中心
 		registerPluginTools(toolRegistry, &pluginCalc.CalculatorPlugin{})
@@ -231,6 +232,25 @@ func main() {
 		if visionProvider == nil {
 			log.Println("视觉模型未配置，vision_analyze 将使用 base64 模式")
 		}
+
+		// 初始化 OCR 模型（与视觉模型并行调用，提供文字提取结果给会话模型自主判断）
+		ocrProvider = nil
+		if configLoader != nil && configLoader.HasConfig() {
+			cfg := configLoader.GetConfig()
+			if route, ok := cfg.Routing["ocr"]; ok && len(route.FallbackChain) > 0 {
+				for _, mid := range route.FallbackChain {
+					if _, ok := cfg.Models[mid]; ok {
+						ocrProvider, _ = modelSelector.Select(context.Background(), llm.PurposeOCR)
+						log.Printf("OCR模型已启用: %s", ocrProvider.ModelName())
+						break
+					}
+				}
+			}
+		}
+		if ocrProvider == nil {
+			log.Println("OCR模型未配置，图片文字提取将复用视觉模型")
+		}
+
 		toolRegistry.Register(wrapTool(tools.NewVisionTool(visionProvider), "vision_analyze", "Image Vision Analysis & OCR", "multimodal"))

 		if knowledgeRetriever != nil {
@@ -345,6 +365,10 @@ func main() {
 		orch.SetVisionProvider(visionProvider)
 		log.Printf("对话编排器: 视觉模型已注入 (%s)", visionProvider.ModelName())
 	}
+	if ocrProvider != nil {
+		orch.SetOCRProvider(ocrProvider)
+		log.Printf("对话编排器: OCR模型已注入 (%s)", ocrProvider.ModelName())
+	}
 	log.Println("对话编排器 v2.0 已就绪")
 	_ = orch

@@ -127,8 +127,13 @@ func (cs *ConversationStore) LoadFromDB(databaseURL, sessionID string, limit int
 		if err := rows.Scan(&roleStr, &content); err != nil {
 			return fmt.Errorf("扫描消息行失败: %w", err)
 		}
+		// 将旧数据中的 "action" 角色映射为 "assistant"（LLM 模型不支持自定义角色）
+		role := model.Role(roleStr)
+		if role == "action" {
+			role = model.RoleAssistant
+		}
 		cs.messages[sessionID] = append(cs.messages[sessionID], model.LLMMessage{
-			Role:    model.Role(roleStr),
+			Role:    role,
 			Content: content,
 		})
 		loaded++
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"github.com/yourname/cyrene-ai/pkg/logger"
 	"strings"
+	"sync"
 	"time"

 	"github.com/yourname/cyrene-ai/ai-core/internal/cache"
@@ -38,7 +39,8 @@ type Orchestrator struct {
 	msgScheduler    *scheduler.MessageScheduler
 	emotionTracker  *persona.EmotionTracker
 	toolRegistry    *plgManager.ToolRegistry
-	visionProvider  llm.LLMProvider // 视觉模型 (图片预处理/OCR)
+	visionProvider  llm.LLMProvider // 视觉模型 (图片预处理)
+	ocrProvider     llm.LLMProvider // OCR 模型 (文字提取，与视觉模型并行调用)
 }

 // SetResponseCache sets the response cache (optional, for Phase 0.2).
@@ -77,6 +79,11 @@ func (o *Orchestrator) SetVisionProvider(vp llm.LLMProvider) {
 	o.visionProvider = vp
 }

+// SetOCRProvider stores op as the OCR model provider (o.ocrProvider) for text extraction; preprocessImages only calls it when both the OCR and vision providers are set and their ModelName values differ.
+func (o *Orchestrator) SetOCRProvider(op llm.LLMProvider) {
+	o.ocrProvider = op
+}
+
 // getBus returns the bus or a nop fallback.
 func (o *Orchestrator) getBus() bus.Bus {
 	if o.eventBus == nil {
@@ -650,28 +657,75 @@ func (o *Orchestrator) CacheMessage(sessionID string, role model.Role, content s
 	}
 }

-// preprocessImages uses the vision model to analyze images and augments the user message.
+// preprocessImages uses vision and OCR models to analyze images and augments the user message.
+// When both vision and OCR providers are available (and are different models), they are called
+// in parallel and both results are passed to the chat model for autonomous judgment.
 // For standalone images (no text): generates a comprehensive description as the message.
 // For text+images: appends image descriptions as contextual annotations.
 func (o *Orchestrator) preprocessImages(ctx context.Context, message string, images []string) string {
-	var prompt string
-	if message == "" {
-		prompt = "请详细描述这张图片的内容，包括场景、物体、人物、文字（如有）、颜色、氛围等所有视觉信息。"
-	} else {
-		prompt = fmt.Sprintf("用户的问题是：「%s」\n\n请根据用户的问题，分析这张图片中相关的视觉信息，帮助回答用户的问题。如果图片中有文字，请完整提取。", message)
+	visionPromptBase := "请详细描述这张图片的内容，包括场景、物体、人物、文字（如有）、颜色、氛围等所有视觉信息。"
+	ocrPromptBase := `请逐字逐句完整提取图片中的所有文字内容，保持原有格式和排版。如果图片中没有文字，请回复"无文字"。`
+
+	if message != "" {
+		visionPromptBase = fmt.Sprintf("用户的问题是：「%s」\n\n请根据用户的问题，分析这张图片中相关的视觉信息，帮助回答用户的问题。如果图片中有文字，请完整提取。", message)
+		ocrPromptBase = fmt.Sprintf(`用户的问题是：「%s」
+
+请逐字逐句完整提取图片中的所有文字内容，保持原有格式和排版。如果图片中没有文字，请回复"无文字"。`, message)
 	}

+	// Determine if OCR is a distinct model (avoid double-calling the same model)
+	useDual := o.ocrProvider != nil && o.visionProvider != nil &&
+		o.ocrProvider.ModelName() != o.visionProvider.ModelName()
+
 	var descriptions []string
 	for i, img := range images {
-		resp, err := o.visionProvider.Chat(ctx, []model.LLMMessage{
-			{Role: model.RoleUser, Content: prompt, Images: []string{img}},
-		})
-		if err != nil {
-			logger.Printf("[orchestrator] 图片 %d 预处理失败: %v", i, err)
-			continue
+		var visionDesc, ocrDesc string
+		var wg sync.WaitGroup
+
+		if o.visionProvider != nil {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				resp, err := o.visionProvider.Chat(ctx, []model.LLMMessage{
+					{Role: model.RoleUser, Content: visionPromptBase, Images: []string{img}},
+				})
+				if err != nil {
+					logger.Printf("[orchestrator] 图片 %d 视觉分析失败: %v", i, err)
+					return
+				}
+				visionDesc = resp.Content
+			}()
 		}
-		if resp.Content != "" {
-			descriptions = append(descriptions, resp.Content)
+
+		if useDual {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				resp, err := o.ocrProvider.Chat(ctx, []model.LLMMessage{
+					{Role: model.RoleUser, Content: ocrPromptBase, Images: []string{img}},
+				})
+				if err != nil {
+					logger.Printf("[orchestrator] 图片 %d OCR提取失败: %v", i, err)
+					return
+				}
+				ocrDesc = resp.Content
+			}()
+		}
+
+		wg.Wait()
+
+		var combined string
+		switch {
+		case visionDesc != "" && ocrDesc != "":
+			combined = fmt.Sprintf("[视觉分析]: %s\n[文字提取(OCR)]: %s", visionDesc, ocrDesc)
+		case visionDesc != "":
+			combined = visionDesc
+		case ocrDesc != "":
+			combined = ocrDesc
+		}
+
+		if combined != "" {
+			descriptions = append(descriptions, combined)
 		}
 	}