87214b9441
Phase 1 (基础设施): - ThinkChain 思考链连续性 + 差异化思考提示词 (persistent) - AutonomousToolPolicy 工具安全策略 (safe/unsafe/conditional) - MessageScheduler 自适应消息节奏 (Idle/Available/Busy) - SessionEnrichmentStore 渐进式上下文丰富 (5层) - ConversationBus 事件总线 + ResponseCache (dedup) - pkg/logger 统一日志 + 所有 handler 替换 fmt.Printf - NPE 守卫/链路优化/数据库表修复/Go workspace Phase 2 (人格交互): - EmotionState/EmotionTracker 情感状态机 (5种心情, 情绪衰减) - ProactiveGuard 主动消息多维决策 (静默时段/紧急度/频率/校验) - Gateway↔ai-core 在线状态感知链路 (presence notification) - 离线思考频率控制 + 重连问候 + 离线消息排队 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
823 lines
23 KiB
Go
823 lines
23 KiB
Go
package store
|
||
|
||
import (
|
||
"crypto/rand"
|
||
"database/sql"
|
||
"fmt"
|
||
"github.com/yourname/cyrene-ai/pkg/logger"
|
||
"strings"
|
||
"time"
|
||
"unicode/utf8"
|
||
)
|
||
|
||
// ========== 模型定义 ==========
|
||
|
||
// KnowledgeBase is one knowledge base row (table knowledge_bases) owned by a
// single user.
type KnowledgeBase struct {
	ID            string    `json:"id"`             // primary key
	UserID        string    `json:"user_id"`        // owning user
	Name          string    `json:"name"`           // display name
	Description   string    `json:"description"`    // free-form description
	DocumentCount int       `json:"document_count"` // cached number of documents, recomputed by updateKBStats
	ChunkCount    int       `json:"chunk_count"`    // cached number of chunks, recomputed by updateKBStats
	CreatedAt     time.Time `json:"created_at"`
	UpdatedAt     time.Time `json:"updated_at"`
}
|
||
|
||
// KnowledgeDocument is one document stored inside a knowledge base
// (table knowledge_documents).
type KnowledgeDocument struct {
	ID          string    `json:"id"`           // primary key
	KBID        string    `json:"kb_id"`        // knowledge base this document belongs to
	UserID      string    `json:"user_id"`      // owning user
	Title       string    `json:"title"`        // document title
	SourceType  string    `json:"source_type"`  // "file", "text", "url"
	SourceRef   string    `json:"source_ref"`   // file ID or URL
	ContentType string    `json:"content_type"` // "text/plain", "text/markdown", "text/html"
	RawContent  string    `json:"raw_content"`  // full text; this is what ChunkDocument splits
	ChunkCount  int       `json:"chunk_count"`  // number of chunks last written for this document
	CreatedAt   time.Time `json:"created_at"`
}
|
||
|
||
// KnowledgeChunk is one piece of a document's text (table knowledge_chunks).
type KnowledgeChunk struct {
	ID         string    `json:"id"`          // primary key
	DocID      string    `json:"doc_id"`      // document the chunk was cut from
	KBID       string    `json:"kb_id"`       // knowledge base of that document
	ChunkIndex int       `json:"chunk_index"` // 0-based position of the chunk within the document
	Content    string    `json:"content"`     // chunk text
	TokenCount int       `json:"token_count"` // estimated token count of Content
	CreatedAt  time.Time `json:"created_at"`
}
|
||
|
||
// SearchChunkResult is a chunk returned by a search, together with extra
// context taken from the joined document and knowledge base rows.
type SearchChunkResult struct {
	KnowledgeChunk
	Relevance     float64 `json:"relevance"`      // ts_rank score for full-text hits; 0.0 for ILIKE hits
	DocumentTitle string  `json:"document_title"` // title of the source document
	KBName        string  `json:"kb_name"`        // name of the source knowledge base
	Headline      string  `json:"headline"`       // snippet for display; scanSearchResults sets it to the full chunk content
}
|
||
|
||
// ========== KnowledgeStore ==========
|
||
|
||
// KnowledgeStore is the persistence layer for knowledge bases, their
// documents and their chunks. All operations go through the wrapped
// database handle.
type KnowledgeStore struct {
	db *sql.DB // connection supplied by the caller of NewKnowledgeStore
}
|
||
|
||
// NewKnowledgeStore 使用已有数据库连接初始化知识库存储并自动建表
|
||
func NewKnowledgeStore(db *sql.DB) (*KnowledgeStore, error) {
|
||
store := &KnowledgeStore{db: db}
|
||
|
||
if err := store.migrate(); err != nil {
|
||
return nil, fmt.Errorf("知识库表迁移失败: %w", err)
|
||
}
|
||
|
||
logger.Println("[KnowledgeStore] 知识库持久化存储已初始化")
|
||
return store, nil
|
||
}
|
||
|
||
// migrate creates the knowledge base tables and their indexes if they do not
// exist yet.
//
// The statements in queries run in order and the first failure aborts the
// migration with an error that includes the offending SQL. The GIN index on
// knowledge_chunks.tsv is created afterwards on a best-effort basis: a
// failure there is only logged and migrate still returns nil.
func (s *KnowledgeStore) migrate() error {
	queries := []string{
		// Knowledge base table.
		`CREATE TABLE IF NOT EXISTS knowledge_bases (
			id VARCHAR(64) PRIMARY KEY,
			user_id VARCHAR(64) NOT NULL,
			name VARCHAR(255) NOT NULL,
			description TEXT DEFAULT '',
			document_count INT DEFAULT 0,
			chunk_count INT DEFAULT 0,
			created_at TIMESTAMPTZ DEFAULT NOW(),
			updated_at TIMESTAMPTZ DEFAULT NOW()
		)`,
		`CREATE INDEX IF NOT EXISTS idx_kb_user_id ON knowledge_bases(user_id)`,

		// Document table; rows are removed together with their knowledge base.
		`CREATE TABLE IF NOT EXISTS knowledge_documents (
			id VARCHAR(64) PRIMARY KEY,
			kb_id VARCHAR(64) NOT NULL REFERENCES knowledge_bases(id) ON DELETE CASCADE,
			user_id VARCHAR(64) NOT NULL,
			title VARCHAR(512) NOT NULL,
			source_type VARCHAR(32) DEFAULT 'text',
			source_ref VARCHAR(1024) DEFAULT '',
			content_type VARCHAR(64) DEFAULT 'text/plain',
			raw_content TEXT DEFAULT '',
			chunk_count INT DEFAULT 0,
			created_at TIMESTAMPTZ DEFAULT NOW()
		)`,
		`CREATE INDEX IF NOT EXISTS idx_kd_kb_id ON knowledge_documents(kb_id)`,
		`CREATE INDEX IF NOT EXISTS idx_kd_user_id ON knowledge_documents(user_id)`,

		// Chunk table; rows are removed together with their document.
		`CREATE TABLE IF NOT EXISTS knowledge_chunks (
			id VARCHAR(64) PRIMARY KEY,
			doc_id VARCHAR(64) NOT NULL REFERENCES knowledge_documents(id) ON DELETE CASCADE,
			kb_id VARCHAR(64) NOT NULL,
			chunk_index INT NOT NULL,
			content TEXT NOT NULL,
			token_count INT DEFAULT 0,
			tsv TSVECTOR,
			created_at TIMESTAMPTZ DEFAULT NOW()
		)`,
		`CREATE INDEX IF NOT EXISTS idx_kc_doc_id ON knowledge_chunks(doc_id)`,
		`CREATE INDEX IF NOT EXISTS idx_kc_kb_id ON knowledge_chunks(kb_id)`,
	}

	for _, q := range queries {
		if _, err := s.db.Exec(q); err != nil {
			return fmt.Errorf("迁移SQL执行失败: %w\nSQL: %s", err, q)
		}
	}

	// Try to create the GIN index. It may fail (for example because of
	// permissions or missing extensions); that is tolerated and only logged.
	_, err := s.db.Exec(`CREATE INDEX IF NOT EXISTS idx_kc_tsv_gin ON knowledge_chunks USING GIN(tsv)`)
	if err != nil {
		logger.Printf("[KnowledgeStore] ⚠ GIN索引创建失败(将使用ILIKE降级搜索): %v", err)
	}

	return nil
}
|
||
|
||
// ========== 知识库 CRUD ==========
|
||
|
||
// CreateKB 创建知识库
|
||
func (s *KnowledgeStore) CreateKB(kb *KnowledgeBase) error {
|
||
now := time.Now()
|
||
if kb.CreatedAt.IsZero() {
|
||
kb.CreatedAt = now
|
||
}
|
||
if kb.UpdatedAt.IsZero() {
|
||
kb.UpdatedAt = now
|
||
}
|
||
_, err := s.db.Exec(
|
||
`INSERT INTO knowledge_bases (id, user_id, name, description, document_count, chunk_count, created_at, updated_at)
|
||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`,
|
||
kb.ID, kb.UserID, kb.Name, kb.Description, kb.DocumentCount, kb.ChunkCount, kb.CreatedAt, kb.UpdatedAt,
|
||
)
|
||
if err != nil {
|
||
return fmt.Errorf("创建知识库失败: %w", err)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// GetKBsByUser 获取用户的所有知识库
|
||
func (s *KnowledgeStore) GetKBsByUser(userID string) ([]KnowledgeBase, error) {
|
||
rows, err := s.db.Query(
|
||
`SELECT id, user_id, name, description, document_count, chunk_count, created_at, updated_at
|
||
FROM knowledge_bases WHERE user_id = $1
|
||
ORDER BY updated_at DESC`,
|
||
userID,
|
||
)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("查询知识库列表失败: %w", err)
|
||
}
|
||
defer rows.Close()
|
||
|
||
var kbs []KnowledgeBase
|
||
for rows.Next() {
|
||
var kb KnowledgeBase
|
||
if err := rows.Scan(&kb.ID, &kb.UserID, &kb.Name, &kb.Description,
|
||
&kb.DocumentCount, &kb.ChunkCount, &kb.CreatedAt, &kb.UpdatedAt); err != nil {
|
||
return nil, fmt.Errorf("扫描知识库行失败: %w", err)
|
||
}
|
||
kbs = append(kbs, kb)
|
||
}
|
||
if kbs == nil {
|
||
kbs = []KnowledgeBase{}
|
||
}
|
||
return kbs, rows.Err()
|
||
}
|
||
|
||
// GetKB 获取单个知识库
|
||
func (s *KnowledgeStore) GetKB(id string) (*KnowledgeBase, error) {
|
||
var kb KnowledgeBase
|
||
err := s.db.QueryRow(
|
||
`SELECT id, user_id, name, description, document_count, chunk_count, created_at, updated_at
|
||
FROM knowledge_bases WHERE id = $1`,
|
||
id,
|
||
).Scan(&kb.ID, &kb.UserID, &kb.Name, &kb.Description,
|
||
&kb.DocumentCount, &kb.ChunkCount, &kb.CreatedAt, &kb.UpdatedAt)
|
||
if err != nil {
|
||
if err == sql.ErrNoRows {
|
||
return nil, nil
|
||
}
|
||
return nil, fmt.Errorf("查询知识库失败: %w", err)
|
||
}
|
||
return &kb, nil
|
||
}
|
||
|
||
// UpdateKB 更新知识库名称和描述
|
||
func (s *KnowledgeStore) UpdateKB(id string, name, description string) error {
|
||
_, err := s.db.Exec(
|
||
`UPDATE knowledge_bases SET name = $1, description = $2, updated_at = NOW() WHERE id = $3`,
|
||
name, description, id,
|
||
)
|
||
if err != nil {
|
||
return fmt.Errorf("更新知识库失败: %w", err)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// DeleteKB 删除知识库(级联删除文档和块)
|
||
func (s *KnowledgeStore) DeleteKB(id string) error {
|
||
_, err := s.db.Exec(`DELETE FROM knowledge_bases WHERE id = $1`, id)
|
||
if err != nil {
|
||
return fmt.Errorf("删除知识库失败: %w", err)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// updateKBStats 更新知识库的统计计数
|
||
func (s *KnowledgeStore) updateKBStats(kbID string) error {
|
||
_, err := s.db.Exec(
|
||
`UPDATE knowledge_bases SET
|
||
document_count = (SELECT COUNT(*) FROM knowledge_documents WHERE kb_id = $1),
|
||
chunk_count = (SELECT COUNT(*) FROM knowledge_chunks WHERE kb_id = $1),
|
||
updated_at = NOW()
|
||
WHERE id = $1`,
|
||
kbID,
|
||
)
|
||
return err
|
||
}
|
||
|
||
// ========== 文档 CRUD ==========
|
||
|
||
// AddDocument 添加文档,返回创建的文档
|
||
func (s *KnowledgeStore) AddDocument(doc *KnowledgeDocument) error {
|
||
if doc.CreatedAt.IsZero() {
|
||
doc.CreatedAt = time.Now()
|
||
}
|
||
_, err := s.db.Exec(
|
||
`INSERT INTO knowledge_documents (id, kb_id, user_id, title, source_type, source_ref, content_type, raw_content, chunk_count, created_at)
|
||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)`,
|
||
doc.ID, doc.KBID, doc.UserID, doc.Title, doc.SourceType, doc.SourceRef,
|
||
doc.ContentType, doc.RawContent, doc.ChunkCount, doc.CreatedAt,
|
||
)
|
||
if err != nil {
|
||
return fmt.Errorf("添加文档失败: %w", err)
|
||
}
|
||
|
||
// 更新知识库统计
|
||
if err := s.updateKBStats(doc.KBID); err != nil {
|
||
logger.Printf("[KnowledgeStore] 更新知识库统计失败: %v", err)
|
||
}
|
||
|
||
return nil
|
||
}
|
||
|
||
// GetDocument 获取单个文档
|
||
func (s *KnowledgeStore) GetDocument(id string) (*KnowledgeDocument, error) {
|
||
var doc KnowledgeDocument
|
||
err := s.db.QueryRow(
|
||
`SELECT id, kb_id, user_id, title, source_type, source_ref, content_type, raw_content, chunk_count, created_at
|
||
FROM knowledge_documents WHERE id = $1`,
|
||
id,
|
||
).Scan(&doc.ID, &doc.KBID, &doc.UserID, &doc.Title, &doc.SourceType, &doc.SourceRef,
|
||
&doc.ContentType, &doc.RawContent, &doc.ChunkCount, &doc.CreatedAt)
|
||
if err != nil {
|
||
if err == sql.ErrNoRows {
|
||
return nil, nil
|
||
}
|
||
return nil, fmt.Errorf("查询文档失败: %w", err)
|
||
}
|
||
return &doc, nil
|
||
}
|
||
|
||
// GetDocumentsByKB 获取知识库中的所有文档
|
||
func (s *KnowledgeStore) GetDocumentsByKB(kbID string) ([]KnowledgeDocument, error) {
|
||
rows, err := s.db.Query(
|
||
`SELECT id, kb_id, user_id, title, source_type, source_ref, content_type, raw_content, chunk_count, created_at
|
||
FROM knowledge_documents WHERE kb_id = $1
|
||
ORDER BY created_at DESC`,
|
||
kbID,
|
||
)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("查询文档列表失败: %w", err)
|
||
}
|
||
defer rows.Close()
|
||
|
||
var docs []KnowledgeDocument
|
||
for rows.Next() {
|
||
var doc KnowledgeDocument
|
||
if err := rows.Scan(&doc.ID, &doc.KBID, &doc.UserID, &doc.Title, &doc.SourceType, &doc.SourceRef,
|
||
&doc.ContentType, &doc.RawContent, &doc.ChunkCount, &doc.CreatedAt); err != nil {
|
||
return nil, fmt.Errorf("扫描文档行失败: %w", err)
|
||
}
|
||
docs = append(docs, doc)
|
||
}
|
||
if docs == nil {
|
||
docs = []KnowledgeDocument{}
|
||
}
|
||
return docs, rows.Err()
|
||
}
|
||
|
||
// UpdateDocumentChunkCount 更新文档的分块计数
|
||
func (s *KnowledgeStore) UpdateDocumentChunkCount(docID string, count int) error {
|
||
_, err := s.db.Exec(
|
||
`UPDATE knowledge_documents SET chunk_count = $1 WHERE id = $2`,
|
||
count, docID,
|
||
)
|
||
return err
|
||
}
|
||
|
||
// DeleteDocument 删除文档(级联删除块)
|
||
func (s *KnowledgeStore) DeleteDocument(id string) error {
|
||
// 先获取 kb_id 以便后续更新统计
|
||
var kbID string
|
||
err := s.db.QueryRow(`SELECT kb_id FROM knowledge_documents WHERE id = $1`, id).Scan(&kbID)
|
||
if err != nil {
|
||
if err == sql.ErrNoRows {
|
||
return nil
|
||
}
|
||
return fmt.Errorf("查询文档失败: %w", err)
|
||
}
|
||
|
||
_, err = s.db.Exec(`DELETE FROM knowledge_documents WHERE id = $1`, id)
|
||
if err != nil {
|
||
return fmt.Errorf("删除文档失败: %w", err)
|
||
}
|
||
|
||
// 更新知识库统计
|
||
if err := s.updateKBStats(kbID); err != nil {
|
||
logger.Printf("[KnowledgeStore] 更新知识库统计失败: %v", err)
|
||
}
|
||
|
||
return nil
|
||
}
|
||
|
||
// ========== 分块操作 ==========
|
||
|
||
// AddChunk 添加单个分块
|
||
func (s *KnowledgeStore) AddChunk(chunk *KnowledgeChunk) error {
|
||
if chunk.CreatedAt.IsZero() {
|
||
chunk.CreatedAt = time.Now()
|
||
}
|
||
|
||
// 尝试使用 to_tsvector('chinese', content) 设置 tsv
|
||
// 如果中文分词不可用,使用 simple 配置
|
||
_, err := s.db.Exec(
|
||
`INSERT INTO knowledge_chunks (id, doc_id, kb_id, chunk_index, content, token_count, tsv, created_at)
|
||
VALUES ($1, $2, $3, $4, $5, $6,
|
||
CASE WHEN (SELECT count(*) FROM pg_ts_config WHERE cfgname = 'chinese') > 0
|
||
THEN to_tsvector('chinese', $5)
|
||
ELSE to_tsvector('simple', $5)
|
||
END,
|
||
$7)`,
|
||
chunk.ID, chunk.DocID, chunk.KBID, chunk.ChunkIndex, chunk.Content, chunk.TokenCount, chunk.CreatedAt,
|
||
)
|
||
if err != nil {
|
||
// 降级:不使用 tsv
|
||
_, err = s.db.Exec(
|
||
`INSERT INTO knowledge_chunks (id, doc_id, kb_id, chunk_index, content, token_count, created_at)
|
||
VALUES ($1, $2, $3, $4, $5, $6, $7)`,
|
||
chunk.ID, chunk.DocID, chunk.KBID, chunk.ChunkIndex, chunk.Content, chunk.TokenCount, chunk.CreatedAt,
|
||
)
|
||
if err != nil {
|
||
return fmt.Errorf("添加分块失败: %w", err)
|
||
}
|
||
}
|
||
|
||
return nil
|
||
}
|
||
|
||
// DeleteChunksByDocID 删除文档的所有分块
|
||
func (s *KnowledgeStore) DeleteChunksByDocID(docID string) error {
|
||
_, err := s.db.Exec(`DELETE FROM knowledge_chunks WHERE doc_id = $1`, docID)
|
||
return err
|
||
}
|
||
|
||
// GetChunksByDocID 获取文档的所有分块
|
||
func (s *KnowledgeStore) GetChunksByDocID(docID string) ([]KnowledgeChunk, error) {
|
||
rows, err := s.db.Query(
|
||
`SELECT id, doc_id, kb_id, chunk_index, content, token_count, created_at
|
||
FROM knowledge_chunks WHERE doc_id = $1
|
||
ORDER BY chunk_index ASC`,
|
||
docID,
|
||
)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("查询分块失败: %w", err)
|
||
}
|
||
defer rows.Close()
|
||
|
||
var chunks []KnowledgeChunk
|
||
for rows.Next() {
|
||
var c KnowledgeChunk
|
||
if err := rows.Scan(&c.ID, &c.DocID, &c.KBID, &c.ChunkIndex, &c.Content, &c.TokenCount, &c.CreatedAt); err != nil {
|
||
return nil, fmt.Errorf("扫描分块行失败: %w", err)
|
||
}
|
||
chunks = append(chunks, c)
|
||
}
|
||
if chunks == nil {
|
||
chunks = []KnowledgeChunk{}
|
||
}
|
||
return chunks, rows.Err()
|
||
}
|
||
|
||
// ========== 分块逻辑 ==========
|
||
|
||
// ChunkDocument 将文档分块并存储
|
||
func (s *KnowledgeStore) ChunkDocument(docID string) (int, error) {
|
||
// 获取文档
|
||
doc, err := s.GetDocument(docID)
|
||
if err != nil {
|
||
return 0, err
|
||
}
|
||
if doc == nil {
|
||
return 0, fmt.Errorf("文档不存在: %s", docID)
|
||
}
|
||
|
||
// 删除旧的分块
|
||
if err := s.DeleteChunksByDocID(docID); err != nil {
|
||
return 0, fmt.Errorf("删除旧分块失败: %w", err)
|
||
}
|
||
|
||
// 分块
|
||
chunks := splitTextIntoChunks(doc.RawContent, 500, 50)
|
||
|
||
// 存储分块
|
||
for i, content := range chunks {
|
||
chunk := &KnowledgeChunk{
|
||
ID: generateUUIDv4(),
|
||
DocID: docID,
|
||
KBID: doc.KBID,
|
||
ChunkIndex: i,
|
||
Content: content,
|
||
TokenCount: estimateTokenCount(content),
|
||
}
|
||
if err := s.AddChunk(chunk); err != nil {
|
||
return 0, fmt.Errorf("存储分块 %d 失败: %w", i, err)
|
||
}
|
||
}
|
||
|
||
// 更新文档的分块计数
|
||
if err := s.UpdateDocumentChunkCount(docID, len(chunks)); err != nil {
|
||
logger.Printf("[KnowledgeStore] 更新文档分块计数失败: %v", err)
|
||
}
|
||
|
||
// 更新知识库统计
|
||
if err := s.updateKBStats(doc.KBID); err != nil {
|
||
logger.Printf("[KnowledgeStore] 更新知识库统计失败: %v", err)
|
||
}
|
||
|
||
return len(chunks), nil
|
||
}
|
||
|
||
// ========== 搜索 ==========
|
||
|
||
// SearchChunks 在指定知识库中搜索
|
||
func (s *KnowledgeStore) SearchChunks(kbID, query string, limit int) ([]SearchChunkResult, error) {
|
||
if limit <= 0 {
|
||
limit = 5
|
||
}
|
||
|
||
// 尝试使用 PostgreSQL 全文搜索
|
||
results, err := s.searchWithFullText(kbID, query, limit)
|
||
if err != nil {
|
||
logger.Printf("[KnowledgeStore] 全文搜索失败,降级为ILIKE: %v", err)
|
||
// 降级为 ILIKE
|
||
results, err = s.searchWithILike(kbID, query, limit)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
}
|
||
|
||
if results == nil {
|
||
results = []SearchChunkResult{}
|
||
}
|
||
return results, nil
|
||
}
|
||
|
||
// SearchAllKBs 在用户的所有知识库中搜索
|
||
func (s *KnowledgeStore) SearchAllKBs(userID, query string, limit int) ([]SearchChunkResult, error) {
|
||
if limit <= 0 {
|
||
limit = 5
|
||
}
|
||
|
||
results, err := s.searchAllWithILike(userID, query, limit)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
if results == nil {
|
||
results = []SearchChunkResult{}
|
||
}
|
||
return results, nil
|
||
}
|
||
|
||
// searchWithFullText 使用 PostgreSQL ts_rank + plainto_tsquery 搜索
|
||
func (s *KnowledgeStore) searchWithFullText(kbID, query string, limit int) ([]SearchChunkResult, error) {
|
||
rows, err := s.db.Query(
|
||
`SELECT kc.id, kc.doc_id, kc.kb_id, kc.chunk_index, kc.content, kc.token_count, kc.created_at,
|
||
ts_rank(kc.tsv, plainto_tsquery('chinese', $2)) AS relevance,
|
||
kd.title AS document_title,
|
||
kb.name AS kb_name
|
||
FROM knowledge_chunks kc
|
||
JOIN knowledge_documents kd ON kc.doc_id = kd.id
|
||
JOIN knowledge_bases kb ON kc.kb_id = kb.id
|
||
WHERE kc.kb_id = $1 AND kc.tsv @@ plainto_tsquery('chinese', $2)
|
||
ORDER BY relevance DESC
|
||
LIMIT $3`,
|
||
kbID, query, limit,
|
||
)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
defer rows.Close()
|
||
|
||
return scanSearchResults(rows)
|
||
}
|
||
|
||
// searchWithILike 使用 ILIKE 降级搜索
|
||
func (s *KnowledgeStore) searchWithILike(kbID, query string, limit int) ([]SearchChunkResult, error) {
|
||
// 构建 ILIKE 模式
|
||
keywords := tokenizeQuery(query)
|
||
if len(keywords) == 0 {
|
||
return []SearchChunkResult{}, nil
|
||
}
|
||
|
||
// 对每个关键词构建 ILIKE 条件
|
||
conditions := make([]string, len(keywords))
|
||
args := []interface{}{kbID}
|
||
placeholderIdx := 2
|
||
for i, kw := range keywords {
|
||
conditions[i] = fmt.Sprintf("kc.content ILIKE $%d", placeholderIdx)
|
||
args = append(args, "%"+kw+"%")
|
||
placeholderIdx++
|
||
}
|
||
args = append(args, limit)
|
||
|
||
querySQL := fmt.Sprintf(
|
||
`SELECT kc.id, kc.doc_id, kc.kb_id, kc.chunk_index, kc.content, kc.token_count, kc.created_at,
|
||
0.0 AS relevance,
|
||
kd.title AS document_title,
|
||
kb.name AS kb_name
|
||
FROM knowledge_chunks kc
|
||
JOIN knowledge_documents kd ON kc.doc_id = kd.id
|
||
JOIN knowledge_bases kb ON kc.kb_id = kb.id
|
||
WHERE kc.kb_id = $1 AND (%s)
|
||
LIMIT $%d`,
|
||
strings.Join(conditions, " AND "),
|
||
placeholderIdx,
|
||
)
|
||
|
||
rows, err := s.db.Query(querySQL, args...)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("ILIKE搜索失败: %w", err)
|
||
}
|
||
defer rows.Close()
|
||
|
||
return scanSearchResults(rows)
|
||
}
|
||
|
||
// searchAllWithILike 跨所有用户知识库使用 ILIKE 搜索
|
||
func (s *KnowledgeStore) searchAllWithILike(userID, query string, limit int) ([]SearchChunkResult, error) {
|
||
keywords := tokenizeQuery(query)
|
||
if len(keywords) == 0 {
|
||
return []SearchChunkResult{}, nil
|
||
}
|
||
|
||
conditions := make([]string, len(keywords))
|
||
args := []interface{}{userID}
|
||
placeholderIdx := 2
|
||
for i, kw := range keywords {
|
||
conditions[i] = fmt.Sprintf("kc.content ILIKE $%d", placeholderIdx)
|
||
args = append(args, "%"+kw+"%")
|
||
placeholderIdx++
|
||
}
|
||
args = append(args, limit)
|
||
|
||
querySQL := fmt.Sprintf(
|
||
`SELECT kc.id, kc.doc_id, kc.kb_id, kc.chunk_index, kc.content, kc.token_count, kc.created_at,
|
||
0.0 AS relevance,
|
||
kd.title AS document_title,
|
||
kb.name AS kb_name
|
||
FROM knowledge_chunks kc
|
||
JOIN knowledge_documents kd ON kc.doc_id = kd.id
|
||
JOIN knowledge_bases kb ON kc.kb_id = kb.id
|
||
WHERE kb.user_id = $1 AND (%s)
|
||
ORDER BY kc.created_at DESC
|
||
LIMIT $%d`,
|
||
strings.Join(conditions, " AND "),
|
||
placeholderIdx,
|
||
)
|
||
|
||
rows, err := s.db.Query(querySQL, args...)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("全知识库ILIKE搜索失败: %w", err)
|
||
}
|
||
defer rows.Close()
|
||
|
||
return scanSearchResults(rows)
|
||
}
|
||
|
||
// scanSearchResults 扫描搜索结果
|
||
func scanSearchResults(rows *sql.Rows) ([]SearchChunkResult, error) {
|
||
var results []SearchChunkResult
|
||
for rows.Next() {
|
||
var r SearchChunkResult
|
||
if err := rows.Scan(&r.ID, &r.DocID, &r.KBID, &r.ChunkIndex, &r.Content,
|
||
&r.TokenCount, &r.CreatedAt, &r.Relevance, &r.DocumentTitle, &r.KBName); err != nil {
|
||
return nil, fmt.Errorf("扫描搜索结果行失败: %w", err)
|
||
}
|
||
// 生成高亮片段
|
||
r.Headline = r.Content
|
||
results = append(results, r)
|
||
}
|
||
if results == nil {
|
||
results = []SearchChunkResult{}
|
||
}
|
||
return results, rows.Err()
|
||
}
|
||
|
||
// ========== 文本分块函数 ==========
|
||
|
||
// splitTextIntoChunks cuts text into chunks of at most maxLen runes.
//
// The text is split into paragraphs on blank lines ("\n\n"). Consecutive
// paragraphs are packed into one chunk, joined by "\n\n", as long as the
// chunk (separator included) stays within maxLen. A paragraph longer than
// maxLen is split into sentences (see splitIntoSentences) which are packed
// back to back; a single sentence longer than maxLen is cut into windows of
// maxLen runes, where each window starts overlap runes before the end of the
// previous one. The overlap applies to those hard-cut windows only.
//
// Parameters:
//   - text: the text to split; empty or whitespace-only text yields nil.
//   - maxLen: maximum chunk length in runes. When it is not positive, no
//     splitting is done and the trimmed text is returned as a single chunk.
//   - overlap: runes shared between consecutive hard-cut windows. Negative
//     values are treated as 0 and values >= maxLen are reduced to maxLen-1 so
//     that every window advances by at least one rune.
func splitTextIntoChunks(text string, maxLen int, overlap int) []string {
	if text == "" {
		return nil
	}
	if maxLen <= 0 {
		whole := strings.TrimSpace(text)
		if whole == "" {
			return nil
		}
		return []string{whole}
	}
	if overlap < 0 {
		overlap = 0
	}
	if overlap >= maxLen {
		overlap = maxLen - 1
	}

	const paraSep = "\n\n"
	paraSepLen := utf8.RuneCountInString(paraSep)

	var (
		chunks  []string
		current strings.Builder
		curLen  int // rune length of current, tracked to avoid recounting
	)

	// flush emits the chunk under construction, if there is one.
	flush := func() {
		if current.Len() == 0 {
			return
		}
		chunks = append(chunks, current.String())
		current.Reset()
		curLen = 0
	}

	// add appends piece to the chunk under construction, preceded by sep when
	// the chunk is not empty, or starts a new chunk when it would not fit.
	add := func(piece string, pieceLen int, sep string, sepLen int) {
		switch {
		case curLen == 0:
			// First piece of a chunk: no separator needed.
		case curLen+sepLen+pieceLen <= maxLen:
			current.WriteString(sep)
			curLen += sepLen
		default:
			flush()
		}
		current.WriteString(piece)
		curLen += pieceLen
	}

	for _, para := range strings.Split(text, paraSep) {
		para = strings.TrimSpace(para)
		if para == "" {
			continue
		}

		paraLen := utf8.RuneCountInString(para)
		if paraLen <= maxLen {
			add(para, paraLen, paraSep, paraSepLen)
			continue
		}

		// Oversized paragraph: close the pending chunk and go sentence by sentence.
		flush()
		for _, sent := range splitIntoSentences(para) {
			sent = strings.TrimSpace(sent)
			if sent == "" {
				continue
			}

			sentLen := utf8.RuneCountInString(sent)
			if sentLen <= maxLen {
				add(sent, sentLen, "", 0)
				continue
			}

			// Oversized sentence: hard-cut into overlapping windows.
			flush()
			runes := []rune(sent)
			step := maxLen - overlap // >= 1 after the clamping above
			for start := 0; start < len(runes); start += step {
				end := start + maxLen
				if end > len(runes) {
					end = len(runes)
				}
				chunks = append(chunks, string(runes[start:end]))
				if end == len(runes) {
					break
				}
			}
		}
	}

	flush()
	return chunks
}

// splitIntoSentences splits text after every sentence terminator and returns
// the pieces in order, terminators included.
//
// Terminators are the ideographic full stop '。', the full-width '!' and '?'
// (U+FF01, U+FF1F), the ASCII '!' and '?', and a newline that is directly
// followed by a non-newline character. Text left after the last terminator is
// trimmed and appended when it is not blank; the other pieces are returned
// untrimmed.
func splitIntoSentences(text string) []string {
	var sentences []string
	runes := []rune(text)
	start := 0

	for i, r := range runes {
		terminal := false
		switch r {
		case '。', '\uff01', '\uff1f', '!', '?':
			terminal = true
		case '\n':
			// A run of newlines only ends a sentence at its last newline.
			terminal = i+1 < len(runes) && runes[i+1] != '\n'
		}
		if terminal {
			sentences = append(sentences, string(runes[start:i+1]))
			start = i + 1
		}
	}

	if start < len(runes) {
		if rest := strings.TrimSpace(string(runes[start:])); rest != "" {
			sentences = append(sentences, rest)
		}
	}

	return sentences
}
|
||
|
||
// estimateTokenCount gives a rough token count for text: runes in the range
// U+4E00..U+9FFF are weighted at 1.2 tokens each (truncated to an integer),
// every other rune counts as a quarter of a token (integer division). The
// result is never less than 1, even for empty text.
func estimateTokenCount(text string) int {
	cjk, other := 0, 0
	for _, r := range text {
		if r >= 0x4e00 && r <= 0x9fff {
			cjk++
		} else {
			other++
		}
	}

	estimate := int(float64(cjk)*1.2) + other/4
	if estimate < 1 {
		return 1
	}
	return estimate
}
|
||
|
||
// tokenizeQuery turns a search query into keywords: the query is split on
// whitespace and each piece has leading and trailing punctuation (the
// characters of the cutset below) stripped. Pieces that end up empty are
// dropped. A blank query, or one made of punctuation only, yields nil.
func tokenizeQuery(query string) []string {
	if strings.TrimSpace(query) == "" {
		return nil
	}

	const punctuation = "。!?!?,.;::;、()()[]{}《》\"'"

	var tokens []string
	for _, field := range strings.Fields(query) {
		word := strings.Trim(field, punctuation)
		if word == "" {
			continue
		}
		tokens = append(tokens, word)
	}
	return tokens
}
|
||
|
||
// GenerateUUID 使用 crypto/rand 生成 UUID v4 格式的字符串(导出供其他包使用)
|
||
func GenerateUUID() string {
|
||
return generateUUIDv4()
|
||
}
|
||
|
||
// generateUUIDv4 builds a UUID v4 formatted string (8-4-4-4-12 lowercase hex
// digits) from 16 bytes read from crypto/rand.
//
// If the random source reports an error, all 16 bytes are instead derived
// from the current UnixNano timestamp, shifted by four more bits per byte.
// In both cases the version nibble is forced to 4 and the variant bits to
// binary 10.
func generateUUIDv4() string {
	var raw [16]byte
	if _, err := rand.Read(raw[:]); err != nil {
		// Fallback without a random source: timestamp-derived bytes.
		nanos := time.Now().UnixNano()
		for i := range raw {
			raw[i] = byte((nanos >> (i * 4)) & 0xFF)
		}
	}

	raw[6] = raw[6]&0x0f | 0x40 // version 4
	raw[8] = raw[8]&0x3f | 0x80 // variant 10xx

	return fmt.Sprintf("%08x-%04x-%04x-%04x-%012x",
		raw[0:4], raw[4:6], raw[6:8], raw[8:10], raw[10:16])
}
|