Files
Cyrene/backend/ai-core/internal/tools/vision_tool.go
T
AskaEth 71f0a1abdb feat: Go模块路径迁移 + Docker生产部署适配 + ethend Docker兼容
- 所有Go模块路径从 github.com/yourname/cyrene-ai 迁移到 git.yeij.top/AskaEth/Cyrene
- 5个Go Dockerfile添加 GOPROXY=https://goproxy.cn,direct 解决国内构建问题
- ai-core go.mod 添加 pkg/plugins replace 指令
- Caddyfile 简化为 http:// 通配 + handle 保留 /api 前缀
- ethend Dockerfile 适配 (npm install + 仅 COPY package.json)
- ethend 新增 RUNNING_IN_DOCKER 环境变量,健康检查改用Docker服务名
- ethend 数据库状态检查支持Docker hostname (postgres/redis/qdrant/minio)
- process-manager 新增 CONTAINER_SVC_MAP + Docker模式自动检测
- 统一 docker-compose.dev.db.yml 卷名 (pg_data/redis_data/qdrant_data/minio_data)
- docker-compose.yml ethend服务挂载docker.sock + 端口变量化
- 清理 .env 统一后的残留文件与提示信息

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-30 13:43:22 +08:00

163 lines
4.6 KiB
Go

package tools
import (
"context"
"encoding/base64"
"encoding/json"
"fmt"
"os"
"path/filepath"
"strings"
"git.yeij.top/AskaEth/Cyrene/ai-core/internal/llm"
"git.yeij.top/AskaEth/Cyrene/ai-core/internal/model"
)
// VisionTool is the image-understanding tool exposed under the name
// "vision_analyze".
//
// It loads an image file and encodes it as a base64 data URL. If a vision
// provider is configured, the image and a task-specific prompt are sent to
// that provider and its reply is returned. If the provider is nil, the tool
// returns the data URL itself so the caller can run the multimodal request.
type VisionTool struct {
// visionProvider is the multimodal model backend; nil selects the
// base64-only fallback mode.
visionProvider llm.LLMProvider
}
// NewVisionTool builds a VisionTool around visionProvider.
//
// visionProvider may be nil, in which case the tool never calls a model and
// only hands back the encoded image (base64-only mode).
func NewVisionTool(visionProvider llm.LLMProvider) *VisionTool {
	tool := new(VisionTool)
	tool.visionProvider = visionProvider
	return tool
}
// Definition describes the tool to the caller: its name, a human-readable
// description, and a JSON-Schema-style parameter object with two required
// string parameters, "image_path" and "task" (one of ocr/describe/analyze).
func (t *VisionTool) Definition() ToolDefinition {
	// Schema for the path of the image to load.
	imagePathParam := map[string]interface{}{
		"type":        "string",
		"description": "图片文件路径",
	}

	// Schema for the analysis mode; the enum mirrors the supported tasks.
	taskParam := map[string]interface{}{
		"type":        "string",
		"description": "分析任务: ocr(文字提取), describe(场景描述), analyze(综合分析)",
		"enum":        []string{"ocr", "describe", "analyze"},
	}

	schema := map[string]interface{}{
		"type": "object",
		"properties": map[string]interface{}{
			"image_path": imagePathParam,
			"task":       taskParam,
		},
		"required": []string{"image_path", "task"},
	}

	return ToolDefinition{
		Name:        "vision_analyze",
		Description: "分析图片内容。传入图片路径,返回图片的 base64 data URL 用于多模态 LLM 分析。可用于 OCR 文字提取、物体识别、场景理解等。",
		Parameters:  schema,
	}
}
// taskPrompts maps each supported task name to the instruction text that
// accompanies the image. Execute looks the task up here and uses the
// "analyze" entry when the task has no prompt of its own.
var taskPrompts = map[string]string{
// ocr: extract all text, keeping the original layout, with no commentary.
"ocr": "请提取这张图片中的所有文字内容,保持原始格式和排版。只输出文字内容,不要添加额外说明。",
// describe: detailed description of the scene, objects, people, colors, mood.
"describe": "请详细描述这张图片的内容,包括场景、物体、人物、颜色、氛围等。",
// analyze: combined description, text extraction (if any) and interpretation.
"analyze": "请综合分析这张图片,包括内容描述、文字提取(如有)、以及你的理解。",
}
// Execute runs the vision_analyze tool.
//
// Recognized keys in args:
//   - "image_path" (string, required): path of the image file to load. A
//     missing, empty or non-string value produces a failed result.
//   - "task" (string): selects the prompt from taskPrompts. A missing, empty
//     or non-string value defaults to "analyze". A task with no prompt is
//     still echoed back in the result, but the "analyze" prompt is used.
//
// With a vision provider configured, the image (as a data URL) and the prompt
// are sent to it in a single user message; Data is then a JSON object with
// the model name, the reply text and token usage. Without a provider, Data is
// a JSON object carrying the data URL, MIME type and prompt so the caller can
// perform the model call itself.
//
// Every failure is reported through the ToolResult (Success=false, Error
// set); the returned error is always nil.
func (t *VisionTool) Execute(ctx context.Context, args map[string]interface{}) (*ToolResult, error) {
	const toolName = "vision_analyze"

	// fail builds the failed-result return value used by every error path.
	fail := func(msg string) (*ToolResult, error) {
		return &ToolResult{ToolName: toolName, Success: false, Error: msg}, nil
	}
	// succeed serializes payload into Data. A marshalling failure is surfaced
	// as a tool failure instead of a "successful" result with empty Data.
	succeed := func(payload map[string]interface{}) (*ToolResult, error) {
		encoded, err := json.Marshal(payload)
		if err != nil {
			return fail(fmt.Sprintf("结果序列化失败: %v", err))
		}
		return &ToolResult{ToolName: toolName, Success: true, Data: string(encoded)}, nil
	}

	// A failed type assertion leaves the zero value, so a non-string
	// argument is treated the same as a missing one.
	imagePath, _ := args["image_path"].(string)
	if imagePath == "" {
		return fail("image_path 参数不能为空")
	}

	task, _ := args["task"].(string)
	if task == "" {
		task = "analyze"
	}

	// NOTE(review): imagePath comes straight from the tool arguments and is
	// read from disk with no directory restriction here — confirm that the
	// caller constrains which paths may be requested.
	dataURL, mimeType, err := encodeImageToDataURL(imagePath)
	if err != nil {
		return fail(fmt.Sprintf("读取图片失败: %v", err))
	}

	prompt := taskPrompts[task]
	if prompt == "" {
		prompt = taskPrompts["analyze"]
	}

	// Fallback mode: no vision model, hand the encoded image to the caller.
	if t.visionProvider == nil {
		return succeed(map[string]interface{}{
			"image_path": imagePath,
			"task":       task,
			"data_url":   dataURL,
			"mime_type":  mimeType,
			"prompt":     prompt,
			// NOTE(review): this is the length of the data URL string, not
			// the size of the file on disk — kept as-is for compatibility.
			"file_size": len(dataURL),
		})
	}

	// A vision model is available: call it directly for OCR/analysis.
	messages := []model.LLMMessage{
		{Role: model.RoleUser, Content: prompt, Images: []string{dataURL}},
	}
	resp, err := t.visionProvider.Chat(ctx, messages)
	if err != nil {
		return fail(fmt.Sprintf("视觉模型调用失败: %v", err))
	}

	return succeed(map[string]interface{}{
		"image_path":        imagePath,
		"task":              task,
		"model":             t.visionProvider.ModelName(),
		"text":              resp.Content,
		"prompt_tokens":     resp.Usage.PromptTokens,
		"completion_tokens": resp.Usage.CompletionTokens,
		"total_tokens":      resp.Usage.TotalTokens,
	})
}
// encodeImageToDataURL reads the image at path and returns it as a base64
// data URL ("data:<mime>;base64,<payload>") together with the MIME type that
// was chosen for it.
//
// The MIME type is derived from the file extension, case-insensitively
// (.png, .jpg/.jpeg, .gif, .webp, .bmp, .svg); any other or missing extension
// falls back to "image/png".
//
// Files larger than 20MB are rejected. The size reported by the file's
// metadata is checked before the content is loaded, so an oversized file is
// not read into memory just to be refused. The length is checked again after
// reading in case the file grew in between or the reported size was not
// meaningful.
//
// On error both returned strings are empty and err wraps the underlying
// cause.
func encodeImageToDataURL(path string) (dataURL, mimeType string, err error) {
	const maxImageBytes = 20 * 1024 * 1024

	info, err := os.Stat(path)
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	if info.Size() > maxImageBytes {
		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", info.Size())
	}

	data, err := os.ReadFile(path)
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	if len(data) > maxImageBytes {
		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", len(data))
	}

	switch strings.ToLower(filepath.Ext(path)) {
	case ".png":
		mimeType = "image/png"
	case ".jpg", ".jpeg":
		mimeType = "image/jpeg"
	case ".gif":
		mimeType = "image/gif"
	case ".webp":
		mimeType = "image/webp"
	case ".bmp":
		mimeType = "image/bmp"
	case ".svg":
		mimeType = "image/svg+xml"
	default:
		// Unknown extension: keep the historical default.
		mimeType = "image/png"
	}

	payload := base64.StdEncoding.EncodeToString(data)
	return "data:" + mimeType + ";base64," + payload, mimeType, nil
}