dev 分支暂存
This commit is contained in:
@@ -0,0 +1,191 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"sync"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// Segmenter 断句器 —— 将流式文本按句号切分为语音播放片段
|
||||
type Segmenter struct {
|
||||
mu sync.Mutex
|
||||
buffer strings.Builder
|
||||
segments []Segment
|
||||
index int
|
||||
}
|
||||
|
||||
// Segment is one piece of text to be spoken.
type Segment struct {
	// Index is the segment's sequence number, serialized as "index".
	Index int `json:"index"`
	// Text is the segment's content, serialized as "text".
	Text string `json:"text"`
}
|
||||
|
||||
// NewSegmenter 创建断句器
|
||||
func NewSegmenter() *Segmenter {
|
||||
return &Segmenter{}
|
||||
}
|
||||
|
||||
// Feed 喂入新的文本片段
|
||||
// 返回已完成的断句列表
|
||||
func (s *Segmenter) Feed(delta string) []Segment {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
s.buffer.WriteString(delta)
|
||||
content := s.buffer.String()
|
||||
|
||||
var newSegments []Segment
|
||||
|
||||
for {
|
||||
idx := findSentenceEnd(content)
|
||||
if idx == -1 {
|
||||
break
|
||||
}
|
||||
|
||||
segmentText := strings.TrimSpace(content[:idx+len(string(content[idx]))])
|
||||
// 检查是否是完整中文字符的句末
|
||||
// idx 指向标点符号的位置
|
||||
runes := []rune(content)
|
||||
var byteIdx int
|
||||
for i, r := range runes {
|
||||
if i == idx {
|
||||
// 标点之后的字符
|
||||
break
|
||||
}
|
||||
byteIdx += len(string(r))
|
||||
}
|
||||
|
||||
// 简化处理:直接取到idx+1字节 (对于ASCII标点)
|
||||
// 对于中文标点,需要用rune处理
|
||||
realIdx := 0
|
||||
runeCount := 0
|
||||
for i, r := range content {
|
||||
if runeCount == idx {
|
||||
realIdx = i
|
||||
break
|
||||
}
|
||||
runeCount++
|
||||
_ = r
|
||||
}
|
||||
// 包含标点符号本身
|
||||
endIdx := realIdx + len(string([]rune(content)[idx]))
|
||||
if endIdx <= realIdx {
|
||||
endIdx = realIdx + 3 // fallback for UTF-8 multi-byte
|
||||
}
|
||||
|
||||
segmentText = strings.TrimSpace(content[:endIdx])
|
||||
if segmentText == "" {
|
||||
content = strings.TrimSpace(content[endIdx:])
|
||||
s.buffer.Reset()
|
||||
s.buffer.WriteString(content)
|
||||
continue
|
||||
}
|
||||
|
||||
s.index++
|
||||
seg := Segment{
|
||||
Index: s.index,
|
||||
Text: segmentText,
|
||||
}
|
||||
s.segments = append(s.segments, seg)
|
||||
newSegments = append(newSegments, seg)
|
||||
|
||||
// 更新buffer,移除已处理的部分
|
||||
content = strings.TrimSpace(content[endIdx:])
|
||||
s.buffer.Reset()
|
||||
s.buffer.WriteString(content)
|
||||
}
|
||||
|
||||
return newSegments
|
||||
}
|
||||
|
||||
// Flush 强制输出buffer中剩余的内容
|
||||
func (s *Segmenter) Flush() *Segment {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
remaining := strings.TrimSpace(s.buffer.String())
|
||||
if remaining == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
s.index++
|
||||
seg := Segment{
|
||||
Index: s.index,
|
||||
Text: remaining,
|
||||
}
|
||||
s.segments = append(s.segments, seg)
|
||||
s.buffer.Reset()
|
||||
|
||||
return &seg
|
||||
}
|
||||
|
||||
// AllSegments 返回所有已完成的断句
|
||||
func (s *Segmenter) AllSegments() []Segment {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
result := make([]Segment, len(s.segments))
|
||||
copy(result, s.segments)
|
||||
return result
|
||||
}
|
||||
|
||||
// findSentenceEnd 查找句子结束位置(返回标点符号在rune数组中的索引)
|
||||
// 中文标点:。!? 英文标点:. ! ?
|
||||
func findSentenceEnd(text string) int {
|
||||
runes := []rune(text)
|
||||
for i, r := range runes {
|
||||
if isSentenceEnd(r) {
|
||||
return i
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// isSentenceEnd reports whether r terminates a sentence.
//
// Terminators are the Chinese full stop, the full-width exclamation and
// question marks, the ASCII '.', '!' and '?', and the newline. The non-ASCII
// runes are written as escapes so they cannot be silently folded into their
// ASCII look-alikes, which would produce duplicate case constants and drop
// support for the full-width marks.
func isSentenceEnd(r rune) bool {
	switch r {
	case '\u3002', // ideographic full stop
		'\uFF01', // full-width exclamation mark
		'\uFF1F', // full-width question mark
		'.', '!', '?',
		'\n':
		return true
	}
	return false
}
|
||||
|
||||
// splitIntoSegments 将完整文本按句号断句(用于post-processing)
|
||||
func splitIntoSegments(text string) []Segment {
|
||||
var segments []Segment
|
||||
runes := []rune(text)
|
||||
|
||||
start := 0
|
||||
index := 0
|
||||
|
||||
for i, r := range runes {
|
||||
if isSentenceEnd(r) {
|
||||
segText := strings.TrimSpace(string(runes[start : i+1]))
|
||||
if segText != "" {
|
||||
index++
|
||||
segments = append(segments, Segment{
|
||||
Index: index,
|
||||
Text: segText,
|
||||
})
|
||||
}
|
||||
start = i + 1
|
||||
}
|
||||
}
|
||||
|
||||
// 处理末尾无标点的剩余文本
|
||||
if start < len(runes) {
|
||||
remaining := strings.TrimSpace(string(runes[start:]))
|
||||
if remaining != "" {
|
||||
index++
|
||||
segments = append(segments, Segment{
|
||||
Index: index,
|
||||
Text: remaining,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return segments
|
||||
}
|
||||
|
||||
// Blank reference to the unicode package. Go rejects imports that are never
// used, so this assignment keeps the file's "unicode" import compiling.
var _ = unicode.Is
|
||||
|
||||
Reference in New Issue
Block a user