feat: VisionTool 集成多模态 LLM 直接调用 — OCR/视觉分析

- VisionTool 改为接受可选的 llm.LLMProvider：有视觉模型时直接调用模型进行分析，无模型时回退到 base64 data URL 模式，不影响基本功能
- ModelSelector 新增 PurposeVision 路由用途
- main.go 按 vision routing 自动发现并注入视觉模型 provider
- 支持 models.json 中 qwen3.6-flash / qwen-vl-ocr-latest fallback 链

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-24 16:25:56 +08:00
parent 47f9de2409
commit 189f7b999b
3 changed files with 77 additions and 20 deletions
@@ -170,8 +170,24 @@ func main() {
 			toolRegistry.Register(tools.NewHostSystemTool(hostManager))
 		}

-		// Phase 6.3: 视觉理解工具
-		toolRegistry.Register(tools.NewVisionTool())
+		// Phase 6.3: 视觉理解工具 — 可选 LLM 增强，无视觉模型时回退 base64 模式
+		var visionProvider llm.LLMProvider
+		if configLoader != nil && configLoader.HasConfig() {
+			cfg := configLoader.GetConfig()
+			if route, ok := cfg.Routing["vision"]; ok && len(route.FallbackChain) > 0 {
+				for _, mid := range route.FallbackChain {
+					if _, ok := cfg.Models[mid]; ok {
+						visionProvider, _ = modelSelector.Select(context.Background(), llm.PurposeVision)
+						log.Printf("视觉模型已启用: %s", visionProvider.ModelName())
+						break
+					}
+				}
+			}
+		}
+		if visionProvider == nil {
+			log.Println("视觉模型未配置，vision_analyze 将使用 base64 模式")
+		}
+		toolRegistry.Register(tools.NewVisionTool(visionProvider))

 		// Phase 6.6: 知识库 RAG 工具
 		if knowledgeRetriever != nil {
@@ -18,6 +18,7 @@ const (
 	PurposeIntentAnalysis   ModelPurpose = "intent_analysis"
 	PurposeToolCalling      ModelPurpose = "tool_calling"
 	PurposeMemoryExtraction ModelPurpose = "memory_extraction"
+	PurposeVision           ModelPurpose = "vision"
 )

 // ErrModelNotRequired is returned when an optional model is unavailable.
@@ -8,16 +8,21 @@ import (
 	"os"
 	"path/filepath"
 	"strings"
+
+	"github.com/yourname/cyrene-ai/ai-core/internal/llm"
+	"github.com/yourname/cyrene-ai/ai-core/internal/model"
 )

 // VisionTool enables image understanding via multimodal LLM.
-// It reads an image file, encodes it as base64, and returns a prompt-ready
-// data URL that can be fed into the vision pipeline.
-type VisionTool struct{}
+// When visionProvider is available, it calls the vision model directly for OCR/analysis.
+// When nil, it falls back to returning a base64 data URL for the caller to process.
+type VisionTool struct {
+	visionProvider llm.LLMProvider
+}

-// NewVisionTool creates a vision tool.
-func NewVisionTool() *VisionTool {
-	return &VisionTool{}
+// NewVisionTool creates a vision tool. visionProvider is optional (nil = base64-only mode).
+func NewVisionTool(visionProvider llm.LLMProvider) *VisionTool {
+	return &VisionTool{visionProvider: visionProvider}
 }

 func (t *VisionTool) Definition() ToolDefinition {
@@ -42,6 +47,12 @@ func (t *VisionTool) Definition() ToolDefinition {
 	}
 }

+var taskPrompts = map[string]string{
+	"ocr":      "请提取这张图片中的所有文字内容，保持原始格式和排版。只输出文字内容，不要添加额外说明。",
+	"describe": "请详细描述这张图片的内容，包括场景、物体、人物、颜色、氛围等。",
+	"analyze":  "请综合分析这张图片，包括内容描述、文字提取(如有)、以及你的理解。",
+}
+
 func (t *VisionTool) Execute(ctx context.Context, args map[string]interface{}) (*ToolResult, error) {
 	imagePath, _ := args["image_path"].(string)
 	if imagePath == "" {
@@ -66,21 +77,50 @@ func (t *VisionTool) Execute(ctx context.Context, args map[string]interface{}) (
 		}, nil
 	}

-	taskPrompts := map[string]string{
-		"ocr":       "请提取这张图片中的所有文字内容，保持原始格式和排版。只输出文字内容，不要添加额外说明。",
-		"describe":  "请详细描述这张图片的内容，包括场景、物体、人物、颜色、氛围等。",
-		"analyze":   "请综合分析这张图片，包括内容描述、文字提取(如有)、以及你的理解。",
+	prompt := taskPrompts[task]
+	if prompt == "" {
+		prompt = taskPrompts["analyze"]
 	}

-	result, _ := json.Marshal(map[string]interface{}{
-		"image_path":  imagePath,
-		"task":        task,
-		"data_url":    dataURL,
-		"mime_type":   mimeType,
-		"prompt":      taskPrompts[task],
-		"file_size":   len(dataURL),
-	})
+	// If a vision model is available, call it directly for OCR/analysis
+	if t.visionProvider != nil {
+		messages := []model.LLMMessage{
+			{Role: model.RoleUser, Content: prompt, Images: []string{dataURL}},
+		}
+		resp, err := t.visionProvider.Chat(ctx, messages)
+		if err != nil {
+			return &ToolResult{
+				ToolName: "vision_analyze",
+				Success:  false,
+				Error:    fmt.Sprintf("视觉模型调用失败: %v", err),
+			}, nil
+		}

+		output, _ := json.Marshal(map[string]interface{}{
+			"image_path":   imagePath,
+			"task":         task,
+			"model":        t.visionProvider.ModelName(),
+			"text":         resp.Content,
+			"prompt_tokens":  resp.Usage.PromptTokens,
+			"completion_tokens": resp.Usage.CompletionTokens,
+			"total_tokens":  resp.Usage.TotalTokens,
+		})
+		return &ToolResult{
+			ToolName: "vision_analyze",
+			Success:  true,
+			Data:     string(output),
+		}, nil
+	}
+
+	// Fallback: return base64 data URL for caller to process
+	result, _ := json.Marshal(map[string]interface{}{
+		"image_path": imagePath,
+		"task":       task,
+		"data_url":   dataURL,
+		"mime_type":  mimeType,
+		"prompt":     prompt,
+		"file_size":  len(dataURL),
+	})
 	return &ToolResult{
 		ToolName: "vision_analyze",
 		Success:  true,