feat: 双模型并行图片预处理 — 视觉理解+OCR 同时调用，由会话模型自主判断

- preprocessImages 同时调用 vision 和 OCR 模型（并行 goroutine）
- 当两个模型不同时，OCR 专注文字提取，视觉模型负责场景理解
- 两种结果合并传给会话模型，由 LLM 自主判断和融合
- 修复 LoadFromDB 旧 action 角色记录映射为 assistant

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-29 13:03:04 +08:00
parent 91c9ee4b2d
commit 3e15285065
3 changed files with 99 additions and 16 deletions
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"github.com/yourname/cyrene-ai/pkg/logger"
 	"strings"
+	"sync"
 	"time"

 	"github.com/yourname/cyrene-ai/ai-core/internal/cache"
@@ -38,7 +39,8 @@ type Orchestrator struct {
 	msgScheduler    *scheduler.MessageScheduler
 	emotionTracker  *persona.EmotionTracker
 	toolRegistry    *plgManager.ToolRegistry
-	visionProvider  llm.LLMProvider // 视觉模型 (图片预处理/OCR)
+	visionProvider  llm.LLMProvider // 视觉模型 (图片预处理)
+	ocrProvider     llm.LLMProvider // OCR 模型 (文字提取，与视觉模型并行调用)
 }

 // SetResponseCache sets the response cache (optional, for Phase 0.2).
@@ -77,6 +79,11 @@ func (o *Orchestrator) SetVisionProvider(vp llm.LLMProvider) {
 	o.visionProvider = vp
 }

+// SetOCRProvider sets the OCR provider used for text extraction; preprocessImages calls it only when both providers are set and its ModelName differs from the vision provider's.
+func (o *Orchestrator) SetOCRProvider(op llm.LLMProvider) {
+	o.ocrProvider = op
+}
+
 // getBus returns the bus or a nop fallback.
 func (o *Orchestrator) getBus() bus.Bus {
 	if o.eventBus == nil {
@@ -650,28 +657,75 @@ func (o *Orchestrator) CacheMessage(sessionID string, role model.Role, content s
 	}
 }

-// preprocessImages uses the vision model to analyze images and augments the user message.
+// preprocessImages uses vision and OCR models to analyze images and augments the user message.
+// When both vision and OCR providers are available (and are different models), they are called
+// in parallel and both results are passed to the chat model for autonomous judgment.
 // For standalone images (no text): generates a comprehensive description as the message.
 // For text+images: appends image descriptions as contextual annotations.
 func (o *Orchestrator) preprocessImages(ctx context.Context, message string, images []string) string {
-	var prompt string
-	if message == "" {
-		prompt = "请详细描述这张图片的内容，包括场景、物体、人物、文字（如有）、颜色、氛围等所有视觉信息。"
-	} else {
-		prompt = fmt.Sprintf("用户的问题是：「%s」\n\n请根据用户的问题，分析这张图片中相关的视觉信息，帮助回答用户的问题。如果图片中有文字，请完整提取。", message)
+	visionPromptBase := "请详细描述这张图片的内容，包括场景、物体、人物、文字（如有）、颜色、氛围等所有视觉信息。"
+	ocrPromptBase := `请逐字逐句完整提取图片中的所有文字内容，保持原有格式和排版。如果图片中没有文字，请回复"无文字"。`
+
+	if message != "" {
+		visionPromptBase = fmt.Sprintf("用户的问题是：「%s」\n\n请根据用户的问题，分析这张图片中相关的视觉信息，帮助回答用户的问题。如果图片中有文字，请完整提取。", message)
+		ocrPromptBase = fmt.Sprintf(`用户的问题是：「%s」
+
+请逐字逐句完整提取图片中的所有文字内容，保持原有格式和排版。如果图片中没有文字，请回复"无文字"。`, message)
 	}

+	// Determine if OCR is a distinct model (avoid double-calling the same model)
+	useDual := o.ocrProvider != nil && o.visionProvider != nil &&
+		o.ocrProvider.ModelName() != o.visionProvider.ModelName()
+
 	var descriptions []string
 	for i, img := range images {
-		resp, err := o.visionProvider.Chat(ctx, []model.LLMMessage{
-			{Role: model.RoleUser, Content: prompt, Images: []string{img}},
-		})
-		if err != nil {
-			logger.Printf("[orchestrator] 图片 %d 预处理失败: %v", i, err)
-			continue
+		var visionDesc, ocrDesc string
+		var wg sync.WaitGroup
+
+		if o.visionProvider != nil {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				resp, err := o.visionProvider.Chat(ctx, []model.LLMMessage{
+					{Role: model.RoleUser, Content: visionPromptBase, Images: []string{img}},
+				})
+				if err != nil {
+					logger.Printf("[orchestrator] 图片 %d 视觉分析失败: %v", i, err)
+					return
+				}
+				visionDesc = resp.Content
+			}()
 		}
-		if resp.Content != "" {
-			descriptions = append(descriptions, resp.Content)
+
+		if useDual {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				resp, err := o.ocrProvider.Chat(ctx, []model.LLMMessage{
+					{Role: model.RoleUser, Content: ocrPromptBase, Images: []string{img}},
+				})
+				if err != nil {
+					logger.Printf("[orchestrator] 图片 %d OCR提取失败: %v", i, err)
+					return
+				}
+				ocrDesc = resp.Content
+			}()
+		}
+
+		wg.Wait()
+
+		var combined string
+		switch {
+		case visionDesc != "" && ocrDesc != "":
+			combined = fmt.Sprintf("[视觉分析]: %s\n[文字提取(OCR)]: %s", visionDesc, ocrDesc)
+		case visionDesc != "":
+			combined = visionDesc
+		case ocrDesc != "":
+			combined = ocrDesc
+		}
+
+		if combined != "" {
+			descriptions = append(descriptions, combined)
 		}
 	}