71f0a1abdb
- 所有Go模块路径从 github.com/yourname/cyrene-ai 迁移到 git.yeij.top/AskaEth/Cyrene - 5个Go Dockerfile添加 GOPROXY=https://goproxy.cn,direct 解决国内构建问题 - ai-core go.mod 添加 pkg/plugins replace 指令 - Caddyfile 简化为 http:// 通配 + handle 保留 /api 前缀 - ethend Dockerfile 适配 (npm install + 仅 COPY package.json) - ethend 新增 RUNNING_IN_DOCKER 环境变量,健康检查改用Docker服务名 - ethend 数据库状态检查支持Docker hostname (postgres/redis/qdrant/minio) - process-manager 新增 CONTAINER_SVC_MAP + Docker模式自动检测 - 统一 docker-compose.dev.db.yml 卷名 (pg_data/redis_data/qdrant_data/minio_data) - docker-compose.yml ethend服务挂载docker.sock + 端口变量化 - 清理 .env 统一后的残留文件与提示信息 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
163 lines
4.6 KiB
Go
163 lines
4.6 KiB
Go
package tools
|
|
|
|
import (
|
|
"context"
|
|
"encoding/base64"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"git.yeij.top/AskaEth/Cyrene/ai-core/internal/llm"
|
|
"git.yeij.top/AskaEth/Cyrene/ai-core/internal/model"
|
|
)
|
|
|
|
// VisionTool enables image understanding via multimodal LLM.
|
|
// When visionProvider is available, it calls the vision model directly for OCR/analysis.
|
|
// When nil, it falls back to returning a base64 data URL for the caller to process.
|
|
type VisionTool struct {
|
|
visionProvider llm.LLMProvider
|
|
}
|
|
|
|
// NewVisionTool creates a vision tool. visionProvider is optional (nil = base64-only mode).
|
|
func NewVisionTool(visionProvider llm.LLMProvider) *VisionTool {
|
|
return &VisionTool{visionProvider: visionProvider}
|
|
}
|
|
|
|
func (t *VisionTool) Definition() ToolDefinition {
|
|
return ToolDefinition{
|
|
Name: "vision_analyze",
|
|
Description: "分析图片内容。传入图片路径,返回图片的 base64 data URL 用于多模态 LLM 分析。可用于 OCR 文字提取、物体识别、场景理解等。",
|
|
Parameters: map[string]interface{}{
|
|
"type": "object",
|
|
"properties": map[string]interface{}{
|
|
"image_path": map[string]interface{}{
|
|
"type": "string",
|
|
"description": "图片文件路径",
|
|
},
|
|
"task": map[string]interface{}{
|
|
"type": "string",
|
|
"description": "分析任务: ocr(文字提取), describe(场景描述), analyze(综合分析)",
|
|
"enum": []string{"ocr", "describe", "analyze"},
|
|
},
|
|
},
|
|
"required": []string{"image_path", "task"},
|
|
},
|
|
}
|
|
}
|
|
|
|
// taskPrompts maps each supported task name to the instruction text that
// accompanies the image. "analyze" doubles as the default prompt for
// unrecognised task names.
var taskPrompts = map[string]string{
	"analyze":  "请综合分析这张图片,包括内容描述、文字提取(如有)、以及你的理解。",
	"describe": "请详细描述这张图片的内容,包括场景、物体、人物、颜色、氛围等。",
	"ocr":      "请提取这张图片中的所有文字内容,保持原始格式和排版。只输出文字内容,不要添加额外说明。",
}
|
|
|
|
func (t *VisionTool) Execute(ctx context.Context, args map[string]interface{}) (*ToolResult, error) {
|
|
imagePath, _ := args["image_path"].(string)
|
|
if imagePath == "" {
|
|
return &ToolResult{
|
|
ToolName: "vision_analyze",
|
|
Success: false,
|
|
Error: "image_path 参数不能为空",
|
|
}, nil
|
|
}
|
|
|
|
task, _ := args["task"].(string)
|
|
if task == "" {
|
|
task = "analyze"
|
|
}
|
|
|
|
dataURL, mimeType, err := encodeImageToDataURL(imagePath)
|
|
if err != nil {
|
|
return &ToolResult{
|
|
ToolName: "vision_analyze",
|
|
Success: false,
|
|
Error: fmt.Sprintf("读取图片失败: %v", err),
|
|
}, nil
|
|
}
|
|
|
|
prompt := taskPrompts[task]
|
|
if prompt == "" {
|
|
prompt = taskPrompts["analyze"]
|
|
}
|
|
|
|
// If a vision model is available, call it directly for OCR/analysis
|
|
if t.visionProvider != nil {
|
|
messages := []model.LLMMessage{
|
|
{Role: model.RoleUser, Content: prompt, Images: []string{dataURL}},
|
|
}
|
|
resp, err := t.visionProvider.Chat(ctx, messages)
|
|
if err != nil {
|
|
return &ToolResult{
|
|
ToolName: "vision_analyze",
|
|
Success: false,
|
|
Error: fmt.Sprintf("视觉模型调用失败: %v", err),
|
|
}, nil
|
|
}
|
|
|
|
output, _ := json.Marshal(map[string]interface{}{
|
|
"image_path": imagePath,
|
|
"task": task,
|
|
"model": t.visionProvider.ModelName(),
|
|
"text": resp.Content,
|
|
"prompt_tokens": resp.Usage.PromptTokens,
|
|
"completion_tokens": resp.Usage.CompletionTokens,
|
|
"total_tokens": resp.Usage.TotalTokens,
|
|
})
|
|
return &ToolResult{
|
|
ToolName: "vision_analyze",
|
|
Success: true,
|
|
Data: string(output),
|
|
}, nil
|
|
}
|
|
|
|
// Fallback: return base64 data URL for caller to process
|
|
result, _ := json.Marshal(map[string]interface{}{
|
|
"image_path": imagePath,
|
|
"task": task,
|
|
"data_url": dataURL,
|
|
"mime_type": mimeType,
|
|
"prompt": prompt,
|
|
"file_size": len(dataURL),
|
|
})
|
|
return &ToolResult{
|
|
ToolName: "vision_analyze",
|
|
Success: true,
|
|
Data: string(result),
|
|
}, nil
|
|
}
|
|
|
|
// encodeImageToDataURL reads the image file at path and returns it as a
// base64 data URL ("data:<mime>;base64,<payload>") together with the MIME type.
//
// The MIME type is chosen from the file extension (case-insensitive):
// .png, .jpg/.jpeg, .gif, .webp, .bmp and .svg are recognised; any other
// extension is treated as image/png.
//
// Files larger than 20MB are rejected. The size is checked via os.Stat before
// the file is read so an oversized file is never loaded into memory, and
// checked again after reading in case the file grew in between.
func encodeImageToDataURL(path string) (dataURL, mimeType string, err error) {
	const maxImageBytes = 20 * 1024 * 1024

	info, err := os.Stat(path)
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	if info.Size() > maxImageBytes {
		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", info.Size())
	}

	data, err := os.ReadFile(path)
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	if len(data) > maxImageBytes {
		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", len(data))
	}

	switch strings.ToLower(filepath.Ext(path)) {
	case ".png":
		mimeType = "image/png"
	case ".jpg", ".jpeg":
		mimeType = "image/jpeg"
	case ".gif":
		mimeType = "image/gif"
	case ".webp":
		mimeType = "image/webp"
	case ".bmp":
		mimeType = "image/bmp"
	case ".svg":
		mimeType = "image/svg+xml"
	default:
		// Unknown extensions are labelled as PNG.
		mimeType = "image/png"
	}

	b64 := base64.StdEncoding.EncodeToString(data)
	return fmt.Sprintf("data:%s;base64,%s", mimeType, b64), mimeType, nil
}
|