// Cyrene/backend/ai-core/internal/tools/web_fetch.go

package tools

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"
)

// WebFetchTool is the web-access tool: it lets Cyrene fetch the content of web pages.
type WebFetchTool struct {
	// client is the HTTP client used to issue the fetch requests.
	client  *http.Client
	// timeout is stored alongside the client by NewWebFetchTool.
	// NOTE(review): no method visible in this file reads it — confirm whether it is still needed.
	timeout time.Duration
}

// NewWebFetchTool returns a WebFetchTool whose HTTP client and timeout field
// are both set to a 15-second limit.
func NewWebFetchTool() *WebFetchTool {
	// Single source for the limit so the client and the field cannot drift apart.
	const limit = 15 * time.Second

	tool := new(WebFetchTool)
	tool.client = &http.Client{Timeout: limit}
	tool.timeout = limit
	return tool
}

// Definition returns the tool definition for "web_fetch": its name, its
// description, and a JSON-schema-style parameter object with a single
// required string parameter, "url".
func (t *WebFetchTool) Definition() ToolDefinition {
	// Schema of the only parameter the tool accepts.
	urlParam := map[string]interface{}{
		"type":        "string",
		"description": "要获取的网页URL，必须是完整的 http:// 或 https:// 链接",
	}

	// Top-level parameter object wrapping the "url" property.
	schema := map[string]interface{}{
		"type":       "object",
		"properties": map[string]interface{}{"url": urlParam},
		"required":   []string{"url"},
	}

	return ToolDefinition{
		Name:        "web_fetch",
		Description: "获取指定URL的网页内容。用于查阅新闻、文档、资料等。返回纯文本摘要（前2000字符）。仅支持 HTTP/HTTPS URL。",
		Parameters:  schema,
	}
}

// Execute fetches the page named by arguments["url"] and returns a plain-text
// summary of it.
//
// The URL must be a non-empty string with an http:// or https:// scheme; the
// scheme is matched case-insensitively and surrounding whitespace is ignored.
// Only a 200 response counts as success. At most 100KB of the body is read,
// passed through extractText, and cut to the first 2000 characters (runes).
//
// Every failure (bad argument, request error, non-200 status, read error) is
// reported as a ToolResult with Success=false and a message in Error; the
// returned error is always nil.
//
// NOTE(review): the destination host is not restricted, so a URL supplied
// through tool arguments can point at loopback or private-network addresses
// (SSRF). Consider a check on the resolved address if this tool runs next to
// internal services.
func (t *WebFetchTool) Execute(ctx context.Context, arguments map[string]interface{}) (*ToolResult, error) {
	const (
		maxBodyBytes    = 100 * 1024 // upper bound on bytes read from the response body
		maxSummaryRunes = 2000       // upper bound on characters returned to the caller
	)

	// fail builds the uniform failure result used by every error path.
	fail := func(msg string) (*ToolResult, error) {
		return &ToolResult{
			ToolName: "web_fetch",
			Success:  false,
			Error:    msg,
		}, nil
	}

	rawURL, ok := arguments["url"].(string)
	rawURL = strings.TrimSpace(rawURL)
	if !ok || rawURL == "" {
		return fail("缺少 url 参数")
	}

	// Safety check: only HTTP/HTTPS is allowed. URI schemes are
	// case-insensitive, so compare against a lowered copy.
	lowered := strings.ToLower(rawURL)
	if !strings.HasPrefix(lowered, "http://") && !strings.HasPrefix(lowered, "https://") {
		return fail("仅支持 http:// 或 https:// 链接")
	}

	req, err := http.NewRequestWithContext(ctx, "GET", rawURL, nil)
	if err != nil {
		return fail(fmt.Sprintf("创建请求失败: %v", err))
	}

	// Send a browser-like User-Agent so the request is less likely to be refused.
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; CyreneBot/1.0; +https://github.com/AskaEth/Cyrene)")
	req.Header.Set("Accept", "text/html,text/plain,*/*")

	resp, err := t.client.Do(req)
	if err != nil {
		return fail(fmt.Sprintf("请求失败: %v", err))
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fail(fmt.Sprintf("HTTP %d", resp.StatusCode))
	}

	// Cap how much of the body is read so a huge page cannot exhaust memory.
	body, err := io.ReadAll(io.LimitReader(resp.Body, maxBodyBytes))
	if err != nil {
		return fail(fmt.Sprintf("读取响应失败: %v", err))
	}

	// Reduce the page to plain text (HTML tags removed).
	text := extractText(string(body))

	// Truncate by runes, not bytes, so multi-byte characters are never split.
	// The rune slice is built once and reused for both the cut and the count.
	if runes := []rune(text); len(runes) > maxSummaryRunes {
		text = string(runes[:maxSummaryRunes]) + "\n\n... [内容已截断，共" + fmt.Sprintf("%d", len(runes)) + "字符]"
	}

	result := fmt.Sprintf("URL: %s\n状态: %d\n内容类型: %s\n\n%s",
		rawURL, resp.StatusCode, resp.Header.Get("Content-Type"), text)

	return &ToolResult{
		ToolName: "web_fetch",
		Success:  true,
		Data:     result,
	}, nil
}

// extractText strips markup from raw HTML/text and returns the remaining text
// with every line trimmed and all blank lines removed.
//
// Tag removal is a simple character scan, not an HTML parser: everything from
// a '<' up to and including the next '>' is discarded. A '>' that does not
// close a tag is ordinary text and is kept.
//
// Known limitations of the scan: the contents of <script>/<style> elements
// are kept as text, HTML entities are not decoded, and a literal '<' in the
// text swallows everything up to the next '>'.
func extractText(raw string) string {
	var visible strings.Builder
	visible.Grow(len(raw))

	inTag := false
	for _, r := range raw {
		switch {
		case inTag:
			// Inside a tag everything is dropped; '>' ends the tag.
			if r == '>' {
				inTag = false
			}
		case r == '<':
			inTag = true
		default:
			visible.WriteRune(r)
		}
	}

	// Trim each line and drop the ones that end up empty.
	lines := strings.Split(strings.TrimSpace(visible.String()), "\n")
	kept := make([]string, 0, len(lines))
	for _, line := range lines {
		if line = strings.TrimSpace(line); line != "" {
			kept = append(kept, line)
		}
	}
	return strings.Join(kept, "\n")
}