Files
AskaEth 71f0a1abdb feat: Go模块路径迁移 + Docker生产部署适配 + ethend Docker兼容
- 所有Go模块路径从 github.com/yourname/cyrene-ai 迁移到 git.yeij.top/AskaEth/Cyrene
- 5个Go Dockerfile添加 GOPROXY=https://goproxy.cn,direct 解决国内构建问题
- ai-core go.mod 添加 pkg/plugins replace 指令
- Caddyfile 简化为 http:// 通配 + handle 保留 /api 前缀
- ethend Dockerfile 适配 (npm install + 仅 COPY package.json)
- ethend 新增 RUNNING_IN_DOCKER 环境变量,健康检查改用Docker服务名
- ethend 数据库状态检查支持Docker hostname (postgres/redis/qdrant/minio)
- process-manager 新增 CONTAINER_SVC_MAP + Docker模式自动检测
- 统一 docker-compose.dev.db.yml 卷名 (pg_data/redis_data/qdrant_data/minio_data)
- docker-compose.yml ethend服务挂载docker.sock + 端口变量化
- 清理 .env 统一后的残留文件与提示信息

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-30 13:43:22 +08:00

823 lines
23 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package store
import (
"crypto/rand"
"database/sql"
"fmt"
"git.yeij.top/AskaEth/Cyrene/pkg/logger"
"strings"
"time"
"unicode/utf8"
)
// ========== 模型定义 ==========
// KnowledgeBase is one knowledge base owned by a user. Its fields map
// one-to-one onto the columns of the knowledge_bases table.
type KnowledgeBase struct {
ID string `json:"id"`
UserID string `json:"user_id"`
Name string `json:"name"`
Description string `json:"description"`
DocumentCount int `json:"document_count"` // stored counter column, not computed on read
ChunkCount int `json:"chunk_count"` // stored counter column, not computed on read
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// KnowledgeDocument is a single document stored inside a knowledge base. Its
// fields map onto the columns of the knowledge_documents table.
type KnowledgeDocument struct {
ID string `json:"id"`
KBID string `json:"kb_id"`
UserID string `json:"user_id"`
Title string `json:"title"`
SourceType string `json:"source_type"` // "file", "text", "url"
SourceRef string `json:"source_ref"` // file ID or URL
ContentType string `json:"content_type"` // "text/plain", "text/markdown", "text/html"
RawContent string `json:"raw_content"`
ChunkCount int `json:"chunk_count"`
CreatedAt time.Time `json:"created_at"`
}
// KnowledgeChunk is one piece of a document's text. Its fields map onto the
// columns of the knowledge_chunks table (the tsv column is not exposed here).
type KnowledgeChunk struct {
ID string `json:"id"`
DocID string `json:"doc_id"`
KBID string `json:"kb_id"`
ChunkIndex int `json:"chunk_index"` // position of the chunk within its document
Content string `json:"content"`
TokenCount int `json:"token_count"`
CreatedAt time.Time `json:"created_at"`
}
// SearchChunkResult is a chunk returned by a search, enriched with context
// about where it came from.
type SearchChunkResult struct {
KnowledgeChunk
Relevance float64 `json:"relevance"` // ts_rank score for full-text hits; 0.0 for ILIKE hits
DocumentTitle string `json:"document_title"`
KBName string `json:"kb_name"`
Headline string `json:"headline"` // snippet shown for the hit
}
// ========== KnowledgeStore ==========
// KnowledgeStore persists knowledge bases, documents and chunks through a
// database/sql handle.
type KnowledgeStore struct {
db *sql.DB
}
// NewKnowledgeStore wraps an already-open database handle in a KnowledgeStore
// and runs the schema migration before handing the store back.
//
// It returns a nil store and a wrapped error when the migration fails.
func NewKnowledgeStore(db *sql.DB) (*KnowledgeStore, error) {
	ks := &KnowledgeStore{db: db}
	err := ks.migrate()
	if err != nil {
		return nil, fmt.Errorf("知识库表迁移失败: %w", err)
	}

	logger.Println("[KnowledgeStore] 知识库持久化存储已初始化")
	return ks, nil
}
// migrate creates the knowledge-base tables and their indexes when they do
// not exist yet.
//
// The statements in the list are mandatory: the first one that fails aborts
// the migration and is reported together with its SQL text. The GIN index on
// the tsv column is attempted afterwards on a best-effort basis; a failure
// there is only logged.
func (s *KnowledgeStore) migrate() error {
	ddl := []string{
		// knowledge bases
		`CREATE TABLE IF NOT EXISTS knowledge_bases (
id VARCHAR(64) PRIMARY KEY,
user_id VARCHAR(64) NOT NULL,
name VARCHAR(255) NOT NULL,
description TEXT DEFAULT '',
document_count INT DEFAULT 0,
chunk_count INT DEFAULT 0,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
)`,
		`CREATE INDEX IF NOT EXISTS idx_kb_user_id ON knowledge_bases(user_id)`,
		// documents
		`CREATE TABLE IF NOT EXISTS knowledge_documents (
id VARCHAR(64) PRIMARY KEY,
kb_id VARCHAR(64) NOT NULL REFERENCES knowledge_bases(id) ON DELETE CASCADE,
user_id VARCHAR(64) NOT NULL,
title VARCHAR(512) NOT NULL,
source_type VARCHAR(32) DEFAULT 'text',
source_ref VARCHAR(1024) DEFAULT '',
content_type VARCHAR(64) DEFAULT 'text/plain',
raw_content TEXT DEFAULT '',
chunk_count INT DEFAULT 0,
created_at TIMESTAMPTZ DEFAULT NOW()
)`,
		`CREATE INDEX IF NOT EXISTS idx_kd_kb_id ON knowledge_documents(kb_id)`,
		`CREATE INDEX IF NOT EXISTS idx_kd_user_id ON knowledge_documents(user_id)`,
		// chunks
		`CREATE TABLE IF NOT EXISTS knowledge_chunks (
id VARCHAR(64) PRIMARY KEY,
doc_id VARCHAR(64) NOT NULL REFERENCES knowledge_documents(id) ON DELETE CASCADE,
kb_id VARCHAR(64) NOT NULL,
chunk_index INT NOT NULL,
content TEXT NOT NULL,
token_count INT DEFAULT 0,
tsv TSVECTOR,
created_at TIMESTAMPTZ DEFAULT NOW()
)`,
		`CREATE INDEX IF NOT EXISTS idx_kc_doc_id ON knowledge_chunks(doc_id)`,
		`CREATE INDEX IF NOT EXISTS idx_kc_kb_id ON knowledge_chunks(kb_id)`,
	}

	for i := range ddl {
		_, err := s.db.Exec(ddl[i])
		if err != nil {
			return fmt.Errorf("迁移SQL执行失败: %w\nSQL: %s", err, ddl[i])
		}
	}

	// Optional: the GIN index only speeds up full-text search, so a failure
	// is logged and the migration still succeeds.
	const ginIndex = `CREATE INDEX IF NOT EXISTS idx_kc_tsv_gin ON knowledge_chunks USING GIN(tsv)`
	if _, ginErr := s.db.Exec(ginIndex); ginErr != nil {
		logger.Printf("[KnowledgeStore] ⚠ GIN索引创建失败(将使用ILIKE降级搜索): %v", ginErr)
	}
	return nil
}
// ========== 知识库 CRUD ==========
// CreateKB inserts kb as a new row of knowledge_bases.
//
// CreatedAt and UpdatedAt are filled in on kb (from a single clock reading)
// when the caller left them at the zero value. Insert failures are returned
// wrapped.
func (s *KnowledgeStore) CreateKB(kb *KnowledgeBase) error {
	stamp := time.Now()
	for _, field := range []*time.Time{&kb.CreatedAt, &kb.UpdatedAt} {
		if field.IsZero() {
			*field = stamp
		}
	}

	const insertKB = `INSERT INTO knowledge_bases (id, user_id, name, description, document_count, chunk_count, created_at, updated_at)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`
	if _, err := s.db.Exec(insertKB,
		kb.ID, kb.UserID, kb.Name, kb.Description,
		kb.DocumentCount, kb.ChunkCount, kb.CreatedAt, kb.UpdatedAt,
	); err != nil {
		return fmt.Errorf("创建知识库失败: %w", err)
	}
	return nil
}
// GetKBsByUser lists every knowledge base owned by userID, most recently
// updated first.
//
// The returned slice is never nil on success: a user without knowledge bases
// yields an empty slice.
func (s *KnowledgeStore) GetKBsByUser(userID string) ([]KnowledgeBase, error) {
	const listKBs = `SELECT id, user_id, name, description, document_count, chunk_count, created_at, updated_at
FROM knowledge_bases WHERE user_id = $1
ORDER BY updated_at DESC`

	rows, err := s.db.Query(listKBs, userID)
	if err != nil {
		return nil, fmt.Errorf("查询知识库列表失败: %w", err)
	}
	defer rows.Close()

	// Start from an empty, non-nil slice so "no rows" is reported as [].
	list := []KnowledgeBase{}
	for rows.Next() {
		var item KnowledgeBase
		scanErr := rows.Scan(
			&item.ID, &item.UserID, &item.Name, &item.Description,
			&item.DocumentCount, &item.ChunkCount, &item.CreatedAt, &item.UpdatedAt,
		)
		if scanErr != nil {
			return nil, fmt.Errorf("扫描知识库行失败: %w", scanErr)
		}
		list = append(list, item)
	}
	return list, rows.Err()
}
// GetKB loads the knowledge base with the given id.
//
// A missing row is not an error: the function then returns (nil, nil).
func (s *KnowledgeStore) GetKB(id string) (*KnowledgeBase, error) {
	const selectKB = `SELECT id, user_id, name, description, document_count, chunk_count, created_at, updated_at
FROM knowledge_bases WHERE id = $1`

	found := new(KnowledgeBase)
	row := s.db.QueryRow(selectKB, id)
	err := row.Scan(
		&found.ID, &found.UserID, &found.Name, &found.Description,
		&found.DocumentCount, &found.ChunkCount, &found.CreatedAt, &found.UpdatedAt,
	)
	switch {
	case err == sql.ErrNoRows:
		return nil, nil
	case err != nil:
		return nil, fmt.Errorf("查询知识库失败: %w", err)
	}
	return found, nil
}
// UpdateKB overwrites the name and description of the knowledge base with the
// given id and bumps its updated_at column to the database's current time.
//
// No check is made that a row was actually matched.
func (s *KnowledgeStore) UpdateKB(id string, name, description string) error {
	const updateKB = `UPDATE knowledge_bases SET name = $1, description = $2, updated_at = NOW() WHERE id = $3`

	if _, err := s.db.Exec(updateKB, name, description, id); err != nil {
		return fmt.Errorf("更新知识库失败: %w", err)
	}
	return nil
}
// DeleteKB removes the knowledge base with the given id. Its documents are
// removed with it through the ON DELETE CASCADE foreign key declared on
// knowledge_documents, and their chunks in turn through the one on
// knowledge_chunks.
func (s *KnowledgeStore) DeleteKB(id string) error {
	const deleteKB = `DELETE FROM knowledge_bases WHERE id = $1`

	if _, err := s.db.Exec(deleteKB, id); err != nil {
		return fmt.Errorf("删除知识库失败: %w", err)
	}
	return nil
}
// updateKBStats recomputes document_count and chunk_count of the knowledge
// base kbID from the documents and chunks tables and refreshes updated_at.
// The database error, if any, is returned unwrapped.
func (s *KnowledgeStore) updateKBStats(kbID string) error {
	const refreshStats = `UPDATE knowledge_bases SET
document_count = (SELECT COUNT(*) FROM knowledge_documents WHERE kb_id = $1),
chunk_count = (SELECT COUNT(*) FROM knowledge_chunks WHERE kb_id = $1),
updated_at = NOW()
WHERE id = $1`

	if _, err := s.db.Exec(refreshStats, kbID); err != nil {
		return err
	}
	return nil
}
// ========== 文档 CRUD ==========
// AddDocument inserts doc into knowledge_documents and then refreshes the
// counters of the owning knowledge base.
//
// doc.CreatedAt is set to the current time when it is zero. Only the insert
// can fail the call; a failed counter refresh is logged and otherwise ignored.
func (s *KnowledgeStore) AddDocument(doc *KnowledgeDocument) error {
	if doc.CreatedAt.IsZero() {
		doc.CreatedAt = time.Now()
	}

	const insertDoc = `INSERT INTO knowledge_documents (id, kb_id, user_id, title, source_type, source_ref, content_type, raw_content, chunk_count, created_at)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)`
	_, insertErr := s.db.Exec(insertDoc,
		doc.ID, doc.KBID, doc.UserID, doc.Title, doc.SourceType,
		doc.SourceRef, doc.ContentType, doc.RawContent, doc.ChunkCount, doc.CreatedAt,
	)
	if insertErr != nil {
		return fmt.Errorf("添加文档失败: %w", insertErr)
	}

	// Best effort: stale counters must not fail an insert that succeeded.
	if statsErr := s.updateKBStats(doc.KBID); statsErr != nil {
		logger.Printf("[KnowledgeStore] 更新知识库统计失败: %v", statsErr)
	}
	return nil
}
// GetDocument loads the document with the given id, including its raw content.
//
// A missing row is not an error: the function then returns (nil, nil).
func (s *KnowledgeStore) GetDocument(id string) (*KnowledgeDocument, error) {
	const selectDoc = `SELECT id, kb_id, user_id, title, source_type, source_ref, content_type, raw_content, chunk_count, created_at
FROM knowledge_documents WHERE id = $1`

	found := new(KnowledgeDocument)
	row := s.db.QueryRow(selectDoc, id)
	err := row.Scan(
		&found.ID, &found.KBID, &found.UserID, &found.Title, &found.SourceType,
		&found.SourceRef, &found.ContentType, &found.RawContent, &found.ChunkCount, &found.CreatedAt,
	)
	switch {
	case err == sql.ErrNoRows:
		return nil, nil
	case err != nil:
		return nil, fmt.Errorf("查询文档失败: %w", err)
	}
	return found, nil
}
// GetDocumentsByKB lists every document of the knowledge base kbID, newest
// first. Each entry carries the full raw_content column.
//
// The returned slice is never nil on success.
func (s *KnowledgeStore) GetDocumentsByKB(kbID string) ([]KnowledgeDocument, error) {
	const listDocs = `SELECT id, kb_id, user_id, title, source_type, source_ref, content_type, raw_content, chunk_count, created_at
FROM knowledge_documents WHERE kb_id = $1
ORDER BY created_at DESC`

	rows, err := s.db.Query(listDocs, kbID)
	if err != nil {
		return nil, fmt.Errorf("查询文档列表失败: %w", err)
	}
	defer rows.Close()

	// Start from an empty, non-nil slice so "no rows" is reported as [].
	list := []KnowledgeDocument{}
	for rows.Next() {
		var item KnowledgeDocument
		scanErr := rows.Scan(
			&item.ID, &item.KBID, &item.UserID, &item.Title, &item.SourceType,
			&item.SourceRef, &item.ContentType, &item.RawContent, &item.ChunkCount, &item.CreatedAt,
		)
		if scanErr != nil {
			return nil, fmt.Errorf("扫描文档行失败: %w", scanErr)
		}
		list = append(list, item)
	}
	return list, rows.Err()
}
// UpdateDocumentChunkCount stores count as the chunk_count of document docID.
// The database error, if any, is returned unwrapped.
func (s *KnowledgeStore) UpdateDocumentChunkCount(docID string, count int) error {
	const setChunkCount = `UPDATE knowledge_documents SET chunk_count = $1 WHERE id = $2`

	if _, err := s.db.Exec(setChunkCount, count, docID); err != nil {
		return err
	}
	return nil
}
// DeleteDocument removes the document with the given id; its chunks go with
// it through the ON DELETE CASCADE foreign key on knowledge_chunks.
//
// Deleting a document that does not exist succeeds silently. After a
// successful delete the owning knowledge base's counters are refreshed on a
// best-effort basis (a failure there is only logged).
func (s *KnowledgeStore) DeleteDocument(id string) error {
	// The owning knowledge base has to be looked up before the row is gone.
	var ownerKB string
	lookupErr := s.db.QueryRow(`SELECT kb_id FROM knowledge_documents WHERE id = $1`, id).Scan(&ownerKB)
	switch {
	case lookupErr == sql.ErrNoRows:
		return nil
	case lookupErr != nil:
		return fmt.Errorf("查询文档失败: %w", lookupErr)
	}

	if _, delErr := s.db.Exec(`DELETE FROM knowledge_documents WHERE id = $1`, id); delErr != nil {
		return fmt.Errorf("删除文档失败: %w", delErr)
	}

	if statsErr := s.updateKBStats(ownerKB); statsErr != nil {
		logger.Printf("[KnowledgeStore] 更新知识库统计失败: %v", statsErr)
	}
	return nil
}
// ========== 分块操作 ==========
// AddChunk inserts a single chunk.
//
// chunk.CreatedAt is set to the current time when it is zero. The first
// attempt also fills the tsv column, using the 'chinese' text-search
// configuration when pg_ts_config lists one and 'simple' otherwise. If that
// statement fails for any reason, the chunk is inserted again without tsv;
// only a failure of this second attempt is returned (wrapped).
func (s *KnowledgeStore) AddChunk(chunk *KnowledgeChunk) error {
	if chunk.CreatedAt.IsZero() {
		chunk.CreatedAt = time.Now()
	}

	const (
		insertWithTSV = `INSERT INTO knowledge_chunks (id, doc_id, kb_id, chunk_index, content, token_count, tsv, created_at)
VALUES ($1, $2, $3, $4, $5, $6,
CASE WHEN (SELECT count(*) FROM pg_ts_config WHERE cfgname = 'chinese') > 0
THEN to_tsvector('chinese', $5)
ELSE to_tsvector('simple', $5)
END,
$7)`
		insertPlain = `INSERT INTO knowledge_chunks (id, doc_id, kb_id, chunk_index, content, token_count, created_at)
VALUES ($1, $2, $3, $4, $5, $6, $7)`
	)

	// Both statements bind exactly the same seven values.
	params := []interface{}{
		chunk.ID, chunk.DocID, chunk.KBID, chunk.ChunkIndex,
		chunk.Content, chunk.TokenCount, chunk.CreatedAt,
	}

	if _, err := s.db.Exec(insertWithTSV, params...); err == nil {
		return nil
	}
	// Fallback: store the chunk without a tsvector.
	if _, err := s.db.Exec(insertPlain, params...); err != nil {
		return fmt.Errorf("添加分块失败: %w", err)
	}
	return nil
}
// DeleteChunksByDocID removes every chunk that belongs to document docID.
// The database error, if any, is returned unwrapped.
func (s *KnowledgeStore) DeleteChunksByDocID(docID string) error {
	const deleteChunks = `DELETE FROM knowledge_chunks WHERE doc_id = $1`

	if _, err := s.db.Exec(deleteChunks, docID); err != nil {
		return err
	}
	return nil
}
// GetChunksByDocID returns the chunks of document docID ordered by
// chunk_index ascending.
//
// The returned slice is never nil on success.
func (s *KnowledgeStore) GetChunksByDocID(docID string) ([]KnowledgeChunk, error) {
	const listChunks = `SELECT id, doc_id, kb_id, chunk_index, content, token_count, created_at
FROM knowledge_chunks WHERE doc_id = $1
ORDER BY chunk_index ASC`

	rows, err := s.db.Query(listChunks, docID)
	if err != nil {
		return nil, fmt.Errorf("查询分块失败: %w", err)
	}
	defer rows.Close()

	// Start from an empty, non-nil slice so "no rows" is reported as [].
	list := []KnowledgeChunk{}
	for rows.Next() {
		var item KnowledgeChunk
		scanErr := rows.Scan(
			&item.ID, &item.DocID, &item.KBID, &item.ChunkIndex,
			&item.Content, &item.TokenCount, &item.CreatedAt,
		)
		if scanErr != nil {
			return nil, fmt.Errorf("扫描分块行失败: %w", scanErr)
		}
		list = append(list, item)
	}
	return list, rows.Err()
}
// ========== 分块逻辑 ==========
// ChunkDocument (re)builds the chunks of document docID and returns how many
// chunks were stored.
//
// Steps: load the document, delete its existing chunks, split its raw content
// (500 runes per chunk, 50 runes of overlap), insert the new chunks, then
// refresh the document's chunk count and the knowledge base's counters.
//
// The steps run as separate statements, not in a transaction started here:
// if storing a chunk fails, the call returns 0 and an error while the chunks
// written so far remain. Failures of the two counter updates are only logged.
func (s *KnowledgeStore) ChunkDocument(docID string) (int, error) {
	const (
		chunkMaxRunes     = 500
		chunkOverlapRunes = 50
	)

	doc, err := s.GetDocument(docID)
	switch {
	case err != nil:
		return 0, err
	case doc == nil:
		return 0, fmt.Errorf("文档不存在: %s", docID)
	}

	// Drop the previous chunking before writing the new one.
	if err = s.DeleteChunksByDocID(docID); err != nil {
		return 0, fmt.Errorf("删除旧分块失败: %w", err)
	}

	pieces := splitTextIntoChunks(doc.RawContent, chunkMaxRunes, chunkOverlapRunes)
	for idx := range pieces {
		err = s.AddChunk(&KnowledgeChunk{
			ID:         generateUUIDv4(),
			DocID:      docID,
			KBID:       doc.KBID,
			ChunkIndex: idx,
			Content:    pieces[idx],
			TokenCount: estimateTokenCount(pieces[idx]),
		})
		if err != nil {
			return 0, fmt.Errorf("存储分块 %d 失败: %w", idx, err)
		}
	}

	stored := len(pieces)
	if countErr := s.UpdateDocumentChunkCount(docID, stored); countErr != nil {
		logger.Printf("[KnowledgeStore] 更新文档分块计数失败: %v", countErr)
	}
	if statsErr := s.updateKBStats(doc.KBID); statsErr != nil {
		logger.Printf("[KnowledgeStore] 更新知识库统计失败: %v", statsErr)
	}
	return stored, nil
}
// ========== 搜索 ==========
// SearchChunks searches the chunks of knowledge base kbID for query and
// returns at most limit hits (5 when limit is not positive).
//
// PostgreSQL full-text search is tried first. If that query fails, the
// failure is logged and an ILIKE keyword search is used instead; only an
// error from this fallback is returned. The result slice is never nil on
// success.
func (s *KnowledgeStore) SearchChunks(kbID, query string, limit int) ([]SearchChunkResult, error) {
	if limit <= 0 {
		limit = 5
	}

	hits, err := s.searchWithFullText(kbID, query, limit)
	if err != nil {
		logger.Printf("[KnowledgeStore] 全文搜索失败,降级为ILIKE: %v", err)
		if hits, err = s.searchWithILike(kbID, query, limit); err != nil {
			return nil, err
		}
	}

	if hits == nil {
		return []SearchChunkResult{}, nil
	}
	return hits, nil
}
// SearchAllKBs searches the chunks of every knowledge base owned by userID
// with an ILIKE keyword search and returns at most limit hits (5 when limit
// is not positive). The result slice is never nil on success.
func (s *KnowledgeStore) SearchAllKBs(userID, query string, limit int) ([]SearchChunkResult, error) {
	if limit <= 0 {
		limit = 5
	}

	hits, err := s.searchAllWithILike(userID, query, limit)
	switch {
	case err != nil:
		return nil, err
	case hits == nil:
		return []SearchChunkResult{}, nil
	}
	return hits, nil
}
// searchWithFullText runs a PostgreSQL full-text search over the chunks of
// knowledge base kbID. query is parsed with plainto_tsquery under the
// 'chinese' configuration, and hits are ordered by ts_rank descending.
//
// The query error is returned unwrapped.
func (s *KnowledgeStore) searchWithFullText(kbID, query string, limit int) ([]SearchChunkResult, error) {
	const fullTextSQL = `SELECT kc.id, kc.doc_id, kc.kb_id, kc.chunk_index, kc.content, kc.token_count, kc.created_at,
ts_rank(kc.tsv, plainto_tsquery('chinese', $2)) AS relevance,
kd.title AS document_title,
kb.name AS kb_name
FROM knowledge_chunks kc
JOIN knowledge_documents kd ON kc.doc_id = kd.id
JOIN knowledge_bases kb ON kc.kb_id = kb.id
WHERE kc.kb_id = $1 AND kc.tsv @@ plainto_tsquery('chinese', $2)
ORDER BY relevance DESC
LIMIT $3`

	rows, queryErr := s.db.Query(fullTextSQL, kbID, query, limit)
	if queryErr != nil {
		return nil, queryErr
	}
	defer rows.Close()

	return scanSearchResults(rows)
}
// searchWithILike is the fallback keyword search inside knowledge base kbID.
//
// query is split with tokenizeQuery; a chunk matches when its content
// contains every keyword (case-insensitively, via ILIKE). A query without
// keywords yields an empty result. Hits carry a relevance of 0.0 and come
// back in no particular order, since the statement has no ORDER BY.
//
// Keywords are matched literally: the LIKE metacharacters `%`, `_` and the
// escape character `\` are backslash-escaped before the keyword is wrapped in
// `%…%`, so a query such as "50%" or "a_b" does not act as a wildcard.
func (s *KnowledgeStore) searchWithILike(kbID, query string, limit int) ([]SearchChunkResult, error) {
	keywords := tokenizeQuery(query)
	if len(keywords) == 0 {
		return []SearchChunkResult{}, nil
	}

	// Backslash is the default escape character of (I)LIKE in PostgreSQL; it
	// must be escaped first so already-escaped output is not escaped twice.
	likeEscaper := strings.NewReplacer(`\`, `\\`, `%`, `\%`, `_`, `\_`)

	// $1 is the knowledge base, $2..$n+1 the keyword patterns, $n+2 the limit.
	conditions := make([]string, 0, len(keywords))
	args := make([]interface{}, 0, len(keywords)+2)
	args = append(args, kbID)
	for _, kw := range keywords {
		args = append(args, "%"+likeEscaper.Replace(kw)+"%")
		conditions = append(conditions, fmt.Sprintf("kc.content ILIKE $%d", len(args)))
	}
	args = append(args, limit)

	querySQL := fmt.Sprintf(
		`SELECT kc.id, kc.doc_id, kc.kb_id, kc.chunk_index, kc.content, kc.token_count, kc.created_at,
0.0 AS relevance,
kd.title AS document_title,
kb.name AS kb_name
FROM knowledge_chunks kc
JOIN knowledge_documents kd ON kc.doc_id = kd.id
JOIN knowledge_bases kb ON kc.kb_id = kb.id
WHERE kc.kb_id = $1 AND (%s)
LIMIT $%d`,
		strings.Join(conditions, " AND "),
		len(args),
	)

	rows, err := s.db.Query(querySQL, args...)
	if err != nil {
		return nil, fmt.Errorf("ILIKE搜索失败: %w", err)
	}
	defer rows.Close()

	return scanSearchResults(rows)
}
// searchAllWithILike is the keyword search across every knowledge base owned
// by userID.
//
// query is split with tokenizeQuery; a chunk matches when its content
// contains every keyword (case-insensitively, via ILIKE). A query without
// keywords yields an empty result. Hits carry a relevance of 0.0 and are
// ordered by chunk creation time, newest first.
//
// Keywords are matched literally: the LIKE metacharacters `%`, `_` and the
// escape character `\` are backslash-escaped before the keyword is wrapped in
// `%…%`, so a query such as "50%" or "a_b" does not act as a wildcard.
func (s *KnowledgeStore) searchAllWithILike(userID, query string, limit int) ([]SearchChunkResult, error) {
	keywords := tokenizeQuery(query)
	if len(keywords) == 0 {
		return []SearchChunkResult{}, nil
	}

	// Backslash is the default escape character of (I)LIKE in PostgreSQL; it
	// must be escaped first so already-escaped output is not escaped twice.
	likeEscaper := strings.NewReplacer(`\`, `\\`, `%`, `\%`, `_`, `\_`)

	// $1 is the user, $2..$n+1 the keyword patterns, $n+2 the limit.
	conditions := make([]string, 0, len(keywords))
	args := make([]interface{}, 0, len(keywords)+2)
	args = append(args, userID)
	for _, kw := range keywords {
		args = append(args, "%"+likeEscaper.Replace(kw)+"%")
		conditions = append(conditions, fmt.Sprintf("kc.content ILIKE $%d", len(args)))
	}
	args = append(args, limit)

	querySQL := fmt.Sprintf(
		`SELECT kc.id, kc.doc_id, kc.kb_id, kc.chunk_index, kc.content, kc.token_count, kc.created_at,
0.0 AS relevance,
kd.title AS document_title,
kb.name AS kb_name
FROM knowledge_chunks kc
JOIN knowledge_documents kd ON kc.doc_id = kd.id
JOIN knowledge_bases kb ON kc.kb_id = kb.id
WHERE kb.user_id = $1 AND (%s)
ORDER BY kc.created_at DESC
LIMIT $%d`,
		strings.Join(conditions, " AND "),
		len(args),
	)

	rows, err := s.db.Query(querySQL, args...)
	if err != nil {
		return nil, fmt.Errorf("全知识库ILIKE搜索失败: %w", err)
	}
	defer rows.Close()

	return scanSearchResults(rows)
}
// scanSearchResults drains rows into SearchChunkResult values.
//
// Each row must provide, in order: id, doc_id, kb_id, chunk_index, content,
// token_count, created_at, relevance, document title, knowledge base name.
// Headline is set to the full chunk content. The caller remains responsible
// for closing rows. The returned slice is never nil unless a row fails to
// scan.
func scanSearchResults(rows *sql.Rows) ([]SearchChunkResult, error) {
	hits := []SearchChunkResult{}
	for rows.Next() {
		hit := SearchChunkResult{}
		dest := []interface{}{
			&hit.ID, &hit.DocID, &hit.KBID, &hit.ChunkIndex, &hit.Content,
			&hit.TokenCount, &hit.CreatedAt, &hit.Relevance, &hit.DocumentTitle, &hit.KBName,
		}
		if err := rows.Scan(dest...); err != nil {
			return nil, fmt.Errorf("扫描搜索结果行失败: %w", err)
		}
		// No real highlighting is done: the headline is the whole chunk.
		hit.Headline = hit.Content
		hits = append(hits, hit)
	}
	return hits, rows.Err()
}
// ========== 文本分块函数 ==========
// splitTextIntoChunks splits text into chunks of at most maxLen runes.
//
// Strategy:
//   - The text is cut into paragraphs at blank lines ("\n\n"); blank
//     paragraphs are dropped and each paragraph is trimmed.
//   - Paragraphs that fit are packed into the open chunk, joined by "\n\n",
//     as long as the chunk (separator included) stays within maxLen.
//   - A paragraph longer than maxLen is cut into sentences with
//     splitIntoSentences and packed sentence by sentence. Whitespace that
//     separated two sentences in the source is kept as a single "\n" (if it
//     contained a line break) or a single space, so words are not glued.
//   - A sentence longer than maxLen is cut into windows of maxLen runes;
//     consecutive windows share overlap runes. This is the only place where
//     overlap is applied.
//
// Parameters:
//   - maxLen: maximum chunk length in runes. If it is not positive, no
//     splitting is done and the trimmed text is returned as one chunk.
//   - overlap: runes shared by consecutive windows of an over-long sentence;
//     it is clamped to the range [0, maxLen-1] so the window always advances.
//
// It returns nil when text contains no non-blank content.
func splitTextIntoChunks(text string, maxLen int, overlap int) []string {
	if text == "" {
		return nil
	}
	if maxLen <= 0 {
		if whole := strings.TrimSpace(text); whole != "" {
			return []string{whole}
		}
		return nil
	}
	if overlap < 0 {
		overlap = 0
	}
	if overlap >= maxLen {
		overlap = maxLen - 1
	}

	var (
		chunks []string
		cur    strings.Builder
		curLen int // rune count of cur, tracked to avoid re-counting the buffer
	)

	// flush closes the open chunk, if there is one.
	flush := func() {
		if cur.Len() > 0 {
			chunks = append(chunks, cur.String())
			cur.Reset()
			curLen = 0
		}
	}

	// add appends piece to the open chunk, preceded by sep when the chunk is
	// not empty. If piece (plus sep) does not fit, the open chunk is closed
	// first and piece starts a new one.
	add := func(sep, piece string) {
		sepLen := utf8.RuneCountInString(sep)
		pieceLen := utf8.RuneCountInString(piece)
		if curLen > 0 && curLen+sepLen+pieceLen > maxLen {
			flush()
		}
		if curLen > 0 {
			cur.WriteString(sep)
			curLen += sepLen
		}
		cur.WriteString(piece)
		curLen += pieceLen
	}

	for _, para := range strings.Split(text, "\n\n") {
		para = strings.TrimSpace(para)
		if para == "" {
			continue
		}
		if utf8.RuneCountInString(para) <= maxLen {
			add("\n\n", para)
			continue
		}

		// Over-long paragraph: never mix it into the chunk of the previous one.
		flush()

		// cursor walks through para so the text between two sentences (the
		// whitespace removed by trimming) can be recovered.
		cursor := 0
		for _, raw := range splitIntoSentences(para) {
			sent := strings.TrimSpace(raw)
			if sent == "" {
				continue
			}

			gap := ""
			if at := strings.Index(para[cursor:], sent); at >= 0 {
				gap = para[cursor : cursor+at]
				cursor += at + len(sent)
			}
			sep := ""
			switch {
			case strings.Contains(gap, "\n"):
				sep = "\n"
			case gap != "":
				sep = " "
			}

			if utf8.RuneCountInString(sent) <= maxLen {
				add(sep, sent)
				continue
			}

			// Over-long sentence: emit fixed-size windows with overlap.
			flush()
			runes := []rune(sent)
			for start := 0; ; start += maxLen - overlap {
				end := start + maxLen
				if end > len(runes) {
					end = len(runes)
				}
				chunks = append(chunks, string(runes[start:end]))
				if end == len(runes) {
					break
				}
			}
		}
	}

	flush()
	return chunks
}
// splitIntoSentences cuts text into sentences.
//
// A sentence ends after any of: the ideographic full stop (U+3002), the
// full-width exclamation mark (U+FF01), the full-width question mark
// (U+FF1F), ASCII '!' or '?', or a line break that is not directly followed
// by another line break. The terminator stays attached to its sentence.
// The ASCII period is deliberately not a terminator here.
//
// Sentences are returned untrimmed, except for the trailing remainder after
// the last terminator, which is trimmed and dropped when blank.
func splitIntoSentences(text string) []string {
	var sentences []string
	runes := []rune(text)
	start := 0 // index of the first rune of the sentence being collected

	for i, r := range runes {
		terminal := false
		switch r {
		case '\u3002', '\uFF01', '\uFF1F', '!', '?':
			terminal = true
		case '\n':
			// A run of line breaks is only cut after its last one.
			terminal = i+1 < len(runes) && runes[i+1] != '\n'
		}
		if terminal {
			sentences = append(sentences, string(runes[start:i+1]))
			start = i + 1
		}
	}

	// Whatever follows the last terminator.
	if tail := strings.TrimSpace(string(runes[start:])); tail != "" {
		sentences = append(sentences, tail)
	}
	return sentences
}
// estimateTokenCount gives a rough token estimate for text.
//
// Runes in U+4E00..U+9FFF are weighted at 1.2 tokens each (the weighted sum
// is truncated to an integer); every other rune counts a quarter of a token
// (integer division). The result is never below 1, even for an empty string.
func estimateTokenCount(text string) int {
	var cjk, other int
	for _, r := range text {
		if r < 0x4e00 || r > 0x9fff {
			other++
			continue
		}
		cjk++
	}

	estimate := int(float64(cjk)*1.2) + other/4
	if estimate >= 1 {
		return estimate
	}
	return 1
}
// tokenizeQuery turns a search query into keywords.
//
// The query is split on whitespace, and each field then has leading and
// trailing punctuation removed; fields that consist only of punctuation are
// dropped. Punctuation inside a field is kept, so "foo,bar" stays one
// keyword. Both ASCII punctuation and the corresponding CJK / full-width
// forms are trimmed.
//
// It returns nil when the query yields no keywords.
func tokenizeQuery(query string) []string {
	// The cutset is spelled with escapes so the full-width characters cannot
	// be lost or normalised by editors and tooling.
	const trimCutset = "" +
		"\u3002\uFF01\uFF1F\uFF0C\uFF1B\uFF1A\u3001" + // 。!?,;:、
		"\uFF08\uFF09\u300A\u300B\u3010\u3011" + // ()《》【】
		"\u201C\u201D\u2018\u2019" + // curly double and single quotes
		"!?,.;:()[]{}\"'"

	var tokens []string
	for _, field := range strings.Fields(query) {
		if kw := strings.Trim(field, trimCutset); kw != "" {
			tokens = append(tokens, kw)
		}
	}
	return tokens
}
// GenerateUUID returns a new identifier in UUID v4 text form. It is the
// exported entry point to generateUUIDv4 for use by other packages.
func GenerateUUID() string {
	id := generateUUIDv4()
	return id
}
// generateUUIDv4 builds a UUID v4 string (8-4-4-4-12 lowercase hex digits)
// from 16 bytes read from crypto/rand.
//
// If the random source reports an error, the bytes are derived from the
// current Unix time in nanoseconds instead; such identifiers are not random.
func generateUUIDv4() string {
	var raw [16]byte
	if _, err := rand.Read(raw[:]); err != nil {
		// Fallback: byte i is the low byte of the timestamp shifted right by
		// 4*i bits.
		nanos := time.Now().UnixNano()
		for i := range raw {
			raw[i] = byte(nanos >> uint(4*i))
		}
	}

	raw[6] = raw[6]&0x0f | 0x40 // version nibble = 4
	raw[8] = raw[8]&0x3f | 0x80 // variant bits = 10xx

	return fmt.Sprintf("%08x-%04x-%04x-%04x-%012x",
		raw[0:4], raw[4:6], raw[6:8], raw[8:10], raw[10:16])
}