feat: VisionTool 集成多模态 LLM 直接调用 — OCR/视觉分析

- VisionTool 改为接受可选 llm.LLMProvider,有模型时直接调用视觉模型分析,
  无模型时回退 base64 data URL 模式,不影响基本功能
- ModelSelector 新增 PurposeVision 路由用途
- main.go 按 vision routing 自动发现并注入视觉模型 provider
- 支持 models.json 中 qwen3.6-flash / qwen-vl-ocr-latest fallback 链

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-24 16:25:56 +08:00
parent 47f9de2409
commit 189f7b999b
3 changed files with 77 additions and 20 deletions
+18 -2
View File
@@ -170,8 +170,24 @@ func main() {
toolRegistry.Register(tools.NewHostSystemTool(hostManager))
}
// Phase 6.3: 视觉理解工具
toolRegistry.Register(tools.NewVisionTool())
// Phase 6.3: vision understanding tool — optionally backed by a vision LLM.
// visionProvider stays nil (base64 fallback mode) unless a provider is
// successfully selected below.
var visionProvider llm.LLMProvider
if configLoader != nil && configLoader.HasConfig() {
	cfg := configLoader.GetConfig()
	if route, ok := cfg.Routing["vision"]; ok {
		// Only consult the selector when at least one model named in the
		// vision fallback chain is actually declared in the config.
		for _, mid := range route.FallbackChain {
			if _, found := cfg.Models[mid]; !found {
				continue
			}
			// A failed selection must not abort startup: keep visionProvider nil
			// so the tool degrades to base64 mode instead of dereferencing a nil
			// provider when logging its model name.
			provider, err := modelSelector.Select(context.Background(), llm.PurposeVision)
			if err != nil {
				log.Printf("视觉模型选择失败: %v", err)
			} else if provider != nil {
				visionProvider = provider
				log.Printf("视觉模型已启用: %s", visionProvider.ModelName())
			}
			// Selection is by purpose, not by model ID, so one attempt is enough.
			break
		}
	}
}
if visionProvider == nil {
log.Println("视觉模型未配置,vision_analyze 将使用 base64 模式")
}
toolRegistry.Register(tools.NewVisionTool(visionProvider))
// Phase 6.6: 知识库 RAG 工具
if knowledgeRetriever != nil {
+1
View File
@@ -18,6 +18,7 @@ const (
PurposeIntentAnalysis ModelPurpose = "intent_analysis"
PurposeToolCalling ModelPurpose = "tool_calling"
PurposeMemoryExtraction ModelPurpose = "memory_extraction"
PurposeVision ModelPurpose = "vision"
)
// ErrModelNotRequired is returned when an optional model is unavailable.
+58 -18
View File
@@ -8,16 +8,21 @@ import (
"os"
"path/filepath"
"strings"
"github.com/yourname/cyrene-ai/ai-core/internal/llm"
"github.com/yourname/cyrene-ai/ai-core/internal/model"
)
// VisionTool enables image understanding via multimodal LLM.
// It reads an image file, encodes it as base64, and returns a prompt-ready
// data URL that can be fed into the vision pipeline.
type VisionTool struct{}
// When visionProvider is available, it calls the vision model directly for OCR/analysis.
// When nil, it falls back to returning a base64 data URL for the caller to process.
type VisionTool struct {
// visionProvider is the optional model provider whose Chat method Execute
// calls with the prompt and image data URL; nil selects the base64 fallback.
visionProvider llm.LLMProvider
}
// NewVisionTool creates a vision tool.
func NewVisionTool() *VisionTool {
return &VisionTool{}
// NewVisionTool builds a VisionTool around an optional vision model provider.
// Passing nil leaves the tool in base64-only mode.
func NewVisionTool(visionProvider llm.LLMProvider) *VisionTool {
	tool := new(VisionTool)
	tool.visionProvider = visionProvider
	return tool
}
func (t *VisionTool) Definition() ToolDefinition {
@@ -42,6 +47,12 @@ func (t *VisionTool) Definition() ToolDefinition {
}
}
// taskPrompts maps a vision task name to the instruction text sent along with
// the image. Lookups for an unknown task yield the empty string.
var taskPrompts = map[string]string{
	// Combined description, text extraction and interpretation.
	"analyze": "请综合分析这张图片,包括内容描述、文字提取(如有)、以及你的理解。",
	// Detailed description of the picture's content.
	"describe": "请详细描述这张图片的内容,包括场景、物体、人物、颜色、氛围等。",
	// Plain text extraction that preserves the original layout.
	"ocr": "请提取这张图片中的所有文字内容,保持原始格式和排版。只输出文字内容,不要添加额外说明。",
}
func (t *VisionTool) Execute(ctx context.Context, args map[string]interface{}) (*ToolResult, error) {
imagePath, _ := args["image_path"].(string)
if imagePath == "" {
@@ -66,21 +77,50 @@ func (t *VisionTool) Execute(ctx context.Context, args map[string]interface{}) (
}, nil
}
taskPrompts := map[string]string{
"ocr": "请提取这张图片中的所有文字内容,保持原始格式和排版。只输出文字内容,不要添加额外说明。",
"describe": "请详细描述这张图片的内容,包括场景、物体、人物、颜色、氛围等。",
"analyze": "请综合分析这张图片,包括内容描述、文字提取(如有)、以及你的理解。",
prompt := taskPrompts[task]
if prompt == "" {
prompt = taskPrompts["analyze"]
}
result, _ := json.Marshal(map[string]interface{}{
"image_path": imagePath,
"task": task,
"data_url": dataURL,
"mime_type": mimeType,
"prompt": taskPrompts[task],
"file_size": len(dataURL),
})
// If a vision model is available, call it directly for OCR/analysis
if t.visionProvider != nil {
messages := []model.LLMMessage{
{Role: model.RoleUser, Content: prompt, Images: []string{dataURL}},
}
resp, err := t.visionProvider.Chat(ctx, messages)
if err != nil {
return &ToolResult{
ToolName: "vision_analyze",
Success: false,
Error: fmt.Sprintf("视觉模型调用失败: %v", err),
}, nil
}
output, _ := json.Marshal(map[string]interface{}{
"image_path": imagePath,
"task": task,
"model": t.visionProvider.ModelName(),
"text": resp.Content,
"prompt_tokens": resp.Usage.PromptTokens,
"completion_tokens": resp.Usage.CompletionTokens,
"total_tokens": resp.Usage.TotalTokens,
})
return &ToolResult{
ToolName: "vision_analyze",
Success: true,
Data: string(output),
}, nil
}
// Fallback: return base64 data URL for caller to process
result, _ := json.Marshal(map[string]interface{}{
"image_path": imagePath,
"task": task,
"data_url": dataURL,
"mime_type": mimeType,
"prompt": prompt,
"file_size": len(dataURL),
})
return &ToolResult{
ToolName: "vision_analyze",
Success: true,