// Cyrene/backend/ai-core/internal/tools/vision_tool.go

package tools

import (
	"context"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"strings"

	"git.yeij.top/AskaEth/Cyrene/ai-core/internal/llm"
	"git.yeij.top/AskaEth/Cyrene/ai-core/internal/model"
)

// VisionTool is the "vision_analyze" tool: it loads an image file and either
// has a multimodal LLM analyze it or hands the encoded image back to the caller.
//
// When visionProvider is non-nil, Execute sends the image and a task prompt to
// that provider and returns the model's text.
// When visionProvider is nil, Execute returns the image as a base64 data URL
// (plus the prompt) so the caller can process it.
type VisionTool struct {
	// visionProvider is the optional multimodal model backend; nil selects
	// the base64-only fallback.
	visionProvider llm.LLMProvider
}

// NewVisionTool returns a VisionTool backed by visionProvider.
// Passing nil is allowed and puts the tool in base64-only mode, where no
// model is called.
func NewVisionTool(visionProvider llm.LLMProvider) *VisionTool {
	tool := new(VisionTool)
	tool.visionProvider = visionProvider
	return tool
}

// Definition returns the tool's name, description and parameter schema.
// The schema is a JSON-Schema-shaped object with two required string
// properties: "image_path" and "task" (one of ocr, describe, analyze).
func (t *VisionTool) Definition() ToolDefinition {
	imagePathParam := map[string]interface{}{
		"type":        "string",
		"description": "图片文件路径",
	}
	taskParam := map[string]interface{}{
		"type":        "string",
		"description": "分析任务: ocr(文字提取), describe(场景描述), analyze(综合分析)",
		"enum":        []string{"ocr", "describe", "analyze"},
	}
	properties := map[string]interface{}{
		"image_path": imagePathParam,
		"task":       taskParam,
	}
	schema := map[string]interface{}{
		"type":       "object",
		"properties": properties,
		"required":   []string{"image_path", "task"},
	}

	return ToolDefinition{
		Name:        "vision_analyze",
		Description: "分析图片内容。传入图片路径，返回图片的 base64 data URL 用于多模态 LLM 分析。可用于 OCR 文字提取、物体识别、场景理解等。",
		Parameters:  schema,
	}
}

// taskPrompts maps each supported task name to the instruction text that
// accompanies the image. Execute looks the task up here and uses the
// "analyze" entry when the task name has no entry.
var taskPrompts = map[string]string{
	// ocr: extract all text, keep the original layout, output text only.
	"ocr":      "请提取这张图片中的所有文字内容，保持原始格式和排版。只输出文字内容，不要添加额外说明。",
	// describe: detailed description of scene, objects, people, colors, mood.
	"describe": "请详细描述这张图片的内容，包括场景、物体、人物、颜色、氛围等。",
	// analyze: combined description, text extraction (if any) and interpretation.
	"analyze":  "请综合分析这张图片，包括内容描述、文字提取(如有)、以及你的理解。",
}

// Execute runs the vision_analyze tool.
//
// Arguments read from args:
//   - "image_path" (string, required): path of the image file to load.
//   - "task" (string, optional): key into taskPrompts. An empty value becomes
//     "analyze"; a value with no prompt entry uses the "analyze" prompt but is
//     still echoed back unchanged in the result.
//
// With a vision provider configured, the image and prompt are sent to it and
// the result Data is a JSON object holding the model name, the returned text
// and the token usage. Without one, Data is a JSON object holding the data
// URL, MIME type, prompt and the image size in bytes so the caller can run the
// analysis itself.
//
// Every failure is reported through the ToolResult (Success=false, Error set);
// the returned error is always nil.
func (t *VisionTool) Execute(ctx context.Context, args map[string]interface{}) (*ToolResult, error) {
	// A missing or non-string value yields "" and is rejected just below.
	imagePath, _ := args["image_path"].(string)
	if imagePath == "" {
		return visionFailure("image_path 参数不能为空"), nil
	}

	task, _ := args["task"].(string)
	if task == "" {
		task = "analyze"
	}

	dataURL, mimeType, err := encodeImageToDataURL(imagePath)
	if err != nil {
		return visionFailure(fmt.Sprintf("读取图片失败: %v", err)), nil
	}

	prompt := taskPrompts[task]
	if prompt == "" {
		prompt = taskPrompts["analyze"]
	}

	// If a vision model is available, call it directly for OCR/analysis.
	if t.visionProvider != nil {
		messages := []model.LLMMessage{
			{Role: model.RoleUser, Content: prompt, Images: []string{dataURL}},
		}
		resp, chatErr := t.visionProvider.Chat(ctx, messages)
		if chatErr != nil {
			return visionFailure(fmt.Sprintf("视觉模型调用失败: %v", chatErr)), nil
		}

		return visionSuccess(map[string]interface{}{
			"image_path":        imagePath,
			"task":              task,
			"model":             t.visionProvider.ModelName(),
			"text":              resp.Content,
			"prompt_tokens":     resp.Usage.PromptTokens,
			"completion_tokens": resp.Usage.CompletionTokens,
			"total_tokens":      resp.Usage.TotalTokens,
		}), nil
	}

	// Fallback: return the base64 data URL for the caller to process.
	return visionSuccess(map[string]interface{}{
		"image_path": imagePath,
		"task":       task,
		"data_url":   dataURL,
		"mime_type":  mimeType,
		"prompt":     prompt,
		// Size of the image itself, not of the (roughly 4/3 larger) data URL.
		"file_size": decodedImageSize(dataURL),
	}), nil
}

// visionFailure builds the failed ToolResult reported by vision_analyze.
func visionFailure(msg string) *ToolResult {
	return &ToolResult{
		ToolName: "vision_analyze",
		Success:  false,
		Error:    msg,
	}
}

// visionSuccess JSON-encodes payload into a successful ToolResult. If the
// payload cannot be encoded, a failed result is returned instead of an empty
// Data string.
func visionSuccess(payload map[string]interface{}) *ToolResult {
	encoded, err := json.Marshal(payload)
	if err != nil {
		return visionFailure(fmt.Sprintf("结果序列化失败: %v", err))
	}
	return &ToolResult{
		ToolName: "vision_analyze",
		Success:  true,
		Data:     string(encoded),
	}
}

// decodedImageSize returns how many raw bytes are encoded in a padded
// standard-base64 data URL of the form "data:<mime>;base64,<payload>".
func decodedImageSize(dataURL string) int {
	// IndexByte returns -1 when there is no comma, so the whole string is
	// treated as the payload in that case.
	payload := dataURL[strings.IndexByte(dataURL, ',')+1:]
	padding := len(payload) - len(strings.TrimRight(payload, "="))
	// DecodedLen counts padding as data for padded encodings; subtract it.
	return base64.StdEncoding.DecodedLen(len(payload)) - padding
}

// encodeImageToDataURL reads the image file at path and returns it as a
// "data:<mime>;base64,<payload>" URL together with the MIME type used.
//
// The MIME type is chosen from the file extension (case-insensitive); an
// unrecognized or missing extension is labeled "image/png".
//
// An error is returned when the path cannot be stat'ed or read, when it is not
// a regular file, or when the file is larger than 20MB. The size and file type
// are checked before reading so an oversized or non-regular path is rejected
// without being loaded into memory.
func encodeImageToDataURL(path string) (dataURL, mimeType string, err error) {
	const maxImageBytes = 20 * 1024 * 1024

	info, err := os.Stat(path)
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	if !info.Mode().IsRegular() {
		return "", "", fmt.Errorf("cannot read image: %q is not a regular file", path)
	}
	if info.Size() > maxImageBytes {
		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", info.Size())
	}

	data, err := os.ReadFile(path)
	if err != nil {
		return "", "", fmt.Errorf("cannot read image: %w", err)
	}
	// Re-check after reading: the file may have grown since the Stat call.
	if len(data) > maxImageBytes {
		return "", "", fmt.Errorf("image too large: %d bytes (max 20MB)", len(data))
	}

	switch strings.ToLower(filepath.Ext(path)) {
	case ".png":
		mimeType = "image/png"
	case ".jpg", ".jpeg":
		mimeType = "image/jpeg"
	case ".gif":
		mimeType = "image/gif"
	case ".webp":
		mimeType = "image/webp"
	case ".bmp":
		mimeType = "image/bmp"
	case ".svg":
		mimeType = "image/svg+xml"
	default:
		mimeType = "image/png"
	}

	b64 := base64.StdEncoding.EncodeToString(data)
	return fmt.Sprintf("data:%s;base64,%s", mimeType, b64), mimeType, nil
}