feat: VisionTool 集成多模态 LLM 直接调用 — OCR/视觉分析
- VisionTool 改为接受可选 llm.LLMProvider,有模型时直接调用视觉模型分析, 无模型时回退 base64 data URL 模式,不影响基本功能 - ModelSelector 新增 PurposeVision 路由用途 - main.go 按 vision routing 自动发现并注入视觉模型 provider - 支持 models.json 中 qwen3.6-flash / qwen-vl-ocr-latest fallback 链 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -170,8 +170,24 @@ func main() {
|
||||
toolRegistry.Register(tools.NewHostSystemTool(hostManager))
|
||||
}
|
||||
|
||||
// Phase 6.3: 视觉理解工具
|
||||
toolRegistry.Register(tools.NewVisionTool())
|
||||
// Phase 6.3: 视觉理解工具 — 可选 LLM 增强,无视觉模型时回退 base64 模式
|
||||
var visionProvider llm.LLMProvider
|
||||
if configLoader != nil && configLoader.HasConfig() {
|
||||
cfg := configLoader.GetConfig()
|
||||
if route, ok := cfg.Routing["vision"]; ok && len(route.FallbackChain) > 0 {
|
||||
for _, mid := range route.FallbackChain {
|
||||
if _, ok := cfg.Models[mid]; ok {
|
||||
visionProvider, _ = modelSelector.Select(context.Background(), llm.PurposeVision)
|
||||
log.Printf("视觉模型已启用: %s", visionProvider.ModelName())
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if visionProvider == nil {
|
||||
log.Println("视觉模型未配置,vision_analyze 将使用 base64 模式")
|
||||
}
|
||||
toolRegistry.Register(tools.NewVisionTool(visionProvider))
|
||||
|
||||
// Phase 6.6: 知识库 RAG 工具
|
||||
if knowledgeRetriever != nil {
|
||||
|
||||
@@ -18,6 +18,7 @@ const (
|
||||
PurposeIntentAnalysis ModelPurpose = "intent_analysis"
|
||||
PurposeToolCalling ModelPurpose = "tool_calling"
|
||||
PurposeMemoryExtraction ModelPurpose = "memory_extraction"
|
||||
PurposeVision ModelPurpose = "vision"
|
||||
)
|
||||
|
||||
// ErrModelNotRequired is returned when an optional model is unavailable.
|
||||
|
||||
@@ -8,16 +8,21 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/yourname/cyrene-ai/ai-core/internal/llm"
|
||||
"github.com/yourname/cyrene-ai/ai-core/internal/model"
|
||||
)
|
||||
|
||||
// VisionTool enables image understanding via multimodal LLM.
|
||||
// It reads an image file, encodes it as base64, and returns a prompt-ready
|
||||
// data URL that can be fed into the vision pipeline.
|
||||
type VisionTool struct{}
|
||||
// When visionProvider is non-nil, Execute sends the prompt and image to that provider's Chat method and returns the model's text.
|
||||
// When it is nil, Execute falls back to returning a base64 data URL for the caller to process.
|
||||
type VisionTool struct {
|
||||
// visionProvider is the optional vision-capable LLM provider; nil selects the base64-only fallback.
visionProvider llm.LLMProvider
|
||||
}
|
||||
|
||||
// NewVisionTool creates a vision tool.
|
||||
func NewVisionTool() *VisionTool {
|
||||
return &VisionTool{}
|
||||
// NewVisionTool creates a vision tool. visionProvider is optional (nil = base64-only mode).
|
||||
func NewVisionTool(visionProvider llm.LLMProvider) *VisionTool {
|
||||
return &VisionTool{visionProvider: visionProvider}
|
||||
}
|
||||
|
||||
func (t *VisionTool) Definition() ToolDefinition {
|
||||
@@ -42,6 +47,12 @@ func (t *VisionTool) Definition() ToolDefinition {
|
||||
}
|
||||
}
|
||||
|
||||
// taskPrompts maps a task name to the instruction text used for that task.
// Execute looks the requested task up here and uses the "analyze" entry when
// the task has no prompt of its own.
var taskPrompts = map[string]string{
|
||||
// ocr: extract all text from the image, keeping the original layout; output text only.
"ocr": "请提取这张图片中的所有文字内容,保持原始格式和排版。只输出文字内容,不要添加额外说明。",
|
||||
// describe: describe the image in detail (scene, objects, people, colors, mood).
"describe": "请详细描述这张图片的内容,包括场景、物体、人物、颜色、氛围等。",
|
||||
// analyze: combined analysis — description, text extraction if any, and interpretation.
"analyze": "请综合分析这张图片,包括内容描述、文字提取(如有)、以及你的理解。",
|
||||
}
|
||||
|
||||
func (t *VisionTool) Execute(ctx context.Context, args map[string]interface{}) (*ToolResult, error) {
|
||||
imagePath, _ := args["image_path"].(string)
|
||||
if imagePath == "" {
|
||||
@@ -66,21 +77,50 @@ func (t *VisionTool) Execute(ctx context.Context, args map[string]interface{}) (
|
||||
}, nil
|
||||
}
|
||||
|
||||
taskPrompts := map[string]string{
|
||||
"ocr": "请提取这张图片中的所有文字内容,保持原始格式和排版。只输出文字内容,不要添加额外说明。",
|
||||
"describe": "请详细描述这张图片的内容,包括场景、物体、人物、颜色、氛围等。",
|
||||
"analyze": "请综合分析这张图片,包括内容描述、文字提取(如有)、以及你的理解。",
|
||||
prompt := taskPrompts[task]
|
||||
if prompt == "" {
|
||||
prompt = taskPrompts["analyze"]
|
||||
}
|
||||
|
||||
result, _ := json.Marshal(map[string]interface{}{
|
||||
"image_path": imagePath,
|
||||
"task": task,
|
||||
"data_url": dataURL,
|
||||
"mime_type": mimeType,
|
||||
"prompt": taskPrompts[task],
|
||||
"file_size": len(dataURL),
|
||||
})
|
||||
// If a vision model is available, call it directly for OCR/analysis
|
||||
if t.visionProvider != nil {
|
||||
messages := []model.LLMMessage{
|
||||
{Role: model.RoleUser, Content: prompt, Images: []string{dataURL}},
|
||||
}
|
||||
resp, err := t.visionProvider.Chat(ctx, messages)
|
||||
if err != nil {
|
||||
return &ToolResult{
|
||||
ToolName: "vision_analyze",
|
||||
Success: false,
|
||||
Error: fmt.Sprintf("视觉模型调用失败: %v", err),
|
||||
}, nil
|
||||
}
|
||||
|
||||
output, _ := json.Marshal(map[string]interface{}{
|
||||
"image_path": imagePath,
|
||||
"task": task,
|
||||
"model": t.visionProvider.ModelName(),
|
||||
"text": resp.Content,
|
||||
"prompt_tokens": resp.Usage.PromptTokens,
|
||||
"completion_tokens": resp.Usage.CompletionTokens,
|
||||
"total_tokens": resp.Usage.TotalTokens,
|
||||
})
|
||||
return &ToolResult{
|
||||
ToolName: "vision_analyze",
|
||||
Success: true,
|
||||
Data: string(output),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Fallback: return base64 data URL for caller to process
|
||||
result, _ := json.Marshal(map[string]interface{}{
|
||||
"image_path": imagePath,
|
||||
"task": task,
|
||||
"data_url": dataURL,
|
||||
"mime_type": mimeType,
|
||||
"prompt": prompt,
|
||||
"file_size": len(dataURL),
|
||||
})
|
||||
return &ToolResult{
|
||||
ToolName: "vision_analyze",
|
||||
Success: true,
|
||||
|
||||
Reference in New Issue
Block a user