dev 分支暂存

This commit is contained in:
2026-05-16 08:26:56 +08:00
parent 58c8caa570
commit eb4129176c
71 changed files with 8474 additions and 214 deletions
+191
View File
@@ -0,0 +1,191 @@
package llm
import (
"strings"
"sync"
"unicode"
)
// Segmenter 断句器 —— 将流式文本按句号切分为语音播放片段
type Segmenter struct {
mu sync.Mutex
buffer strings.Builder
segments []Segment
index int
}
// Segment 语音片段
type Segment struct {
Index int `json:"index"`
Text string `json:"text"`
}
// NewSegmenter 创建断句器
func NewSegmenter() *Segmenter {
return &Segmenter{}
}
// Feed 喂入新的文本片段
// 返回已完成的断句列表
func (s *Segmenter) Feed(delta string) []Segment {
s.mu.Lock()
defer s.mu.Unlock()
s.buffer.WriteString(delta)
content := s.buffer.String()
var newSegments []Segment
for {
idx := findSentenceEnd(content)
if idx == -1 {
break
}
segmentText := strings.TrimSpace(content[:idx+len(string(content[idx]))])
// 检查是否是完整中文字符的句末
// idx 指向标点符号的位置
runes := []rune(content)
var byteIdx int
for i, r := range runes {
if i == idx {
// 标点之后的字符
break
}
byteIdx += len(string(r))
}
// 简化处理:直接取到idx+1字节 (对于ASCII标点)
// 对于中文标点,需要用rune处理
realIdx := 0
runeCount := 0
for i, r := range content {
if runeCount == idx {
realIdx = i
break
}
runeCount++
_ = r
}
// 包含标点符号本身
endIdx := realIdx + len(string([]rune(content)[idx]))
if endIdx <= realIdx {
endIdx = realIdx + 3 // fallback for UTF-8 multi-byte
}
segmentText = strings.TrimSpace(content[:endIdx])
if segmentText == "" {
content = strings.TrimSpace(content[endIdx:])
s.buffer.Reset()
s.buffer.WriteString(content)
continue
}
s.index++
seg := Segment{
Index: s.index,
Text: segmentText,
}
s.segments = append(s.segments, seg)
newSegments = append(newSegments, seg)
// 更新buffer,移除已处理的部分
content = strings.TrimSpace(content[endIdx:])
s.buffer.Reset()
s.buffer.WriteString(content)
}
return newSegments
}
// Flush 强制输出buffer中剩余的内容
func (s *Segmenter) Flush() *Segment {
s.mu.Lock()
defer s.mu.Unlock()
remaining := strings.TrimSpace(s.buffer.String())
if remaining == "" {
return nil
}
s.index++
seg := Segment{
Index: s.index,
Text: remaining,
}
s.segments = append(s.segments, seg)
s.buffer.Reset()
return &seg
}
// AllSegments 返回所有已完成的断句
func (s *Segmenter) AllSegments() []Segment {
s.mu.Lock()
defer s.mu.Unlock()
result := make([]Segment, len(s.segments))
copy(result, s.segments)
return result
}
// findSentenceEnd 查找句子结束位置(返回标点符号在rune数组中的索引)
// 中文标点:。!? 英文标点:. ! ?
func findSentenceEnd(text string) int {
runes := []rune(text)
for i, r := range runes {
if isSentenceEnd(r) {
return i
}
}
return -1
}
// isSentenceEnd 判断是否为句末标点
func isSentenceEnd(r rune) bool {
switch r {
case '。', '', '', '.', '!', '?', '\n':
return true
}
return false
}
// splitIntoSegments 将完整文本按句号断句(用于post-processing
func splitIntoSegments(text string) []Segment {
var segments []Segment
runes := []rune(text)
start := 0
index := 0
for i, r := range runes {
if isSentenceEnd(r) {
segText := strings.TrimSpace(string(runes[start : i+1]))
if segText != "" {
index++
segments = append(segments, Segment{
Index: index,
Text: segText,
})
}
start = i + 1
}
}
// 处理末尾无标点的剩余文本
if start < len(runes) {
remaining := strings.TrimSpace(string(runes[start:]))
if remaining != "" {
index++
segments = append(segments, Segment{
Index: index,
Text: remaining,
})
}
}
return segments
}
// Ensure unicode is used
var _ = unicode.Is