bcf4d4e621
W1-W14 全部完成: - W1: 消息搜索 (ILIKE全文检索 + SearchModal) - W2: 对话导出 (JSON/Markdown/TXT三格式) - W3: 记忆时间线 DevTools 可视化 - W4: 通知推送系统 (WebSocket + Browser Notification API) - W5: 定时提醒 (30s轮询 + 重复提醒 + WebSocket推送) - W6: 每日简报 (08:00自动生成: 天气+新闻+提醒+AI摘要) - W7: IoT场景自动化 (规则引擎 10s轮询 + 条件评估 + 场景执行) - W8: 语音输入 (浏览器 Speech Recognition API) - W9: STT服务 (voice-service + whisper.cpp) - W10: TTS服务 (浏览器 Speech Synthesis + edge-tts三档回退) - W11: 文件管理 (上传/下载/缩略图/纯Go bilinear缩放) - W12: 知识库RAG (PostgreSQL tsvector + 文档分块 + 检索) - W13: 多模态 (图片上传+分析: Vision API + 本地Go分析回退) - W14: PWA (Service Worker + 离线页 + install prompt) 总计: 6个Go微服务 + 10+前端组件 + 10+ PostgreSQL表 + 4个后台调度器
295 lines
7.7 KiB
Go
295 lines
7.7 KiB
Go
package service
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"strings"
|
|
)
|
|
|
|
// TTSVoice describes a single voice that the TTS service can offer.
type TTSVoice struct {
	// Name is the engine-level voice identifier, e.g. "zh-CN-XiaoxiaoNeural".
	Name string `json:"name"`
	// DisplayName is the human-readable label for the voice.
	DisplayName string `json:"display_name"`
	// Gender is the voice gender label, e.g. "Female" or "Male".
	Gender string `json:"gender"`
	// Locale is the locale tag of the voice, e.g. "zh-CN".
	Locale string `json:"locale"`
}
|
|
|
|
// BuiltinVoices 内置的 edge-tts 中文语音列表
|
|
var BuiltinVoices = []TTSVoice{
|
|
{Name: "zh-CN-XiaoxiaoNeural", DisplayName: "晓晓 (女声)", Gender: "Female", Locale: "zh-CN"},
|
|
{Name: "zh-CN-YunxiNeural", DisplayName: "云希 (男声)", Gender: "Male", Locale: "zh-CN"},
|
|
{Name: "zh-CN-XiaoyiNeural", DisplayName: "晓伊 (女声)", Gender: "Female", Locale: "zh-CN"},
|
|
}
|
|
|
|
// TTSService is the text-to-speech service. It holds no state; every method
// probes for the external engines at call time.
type TTSService struct{}
|
|
|
|
// NewTTSService 创建 TTS 服务
|
|
func NewTTSService() *TTSService {
|
|
return &TTSService{}
|
|
}
|
|
|
|
// IsAvailable 检查 TTS 引擎是否可用
|
|
// 优先级: edge-tts > espeak-ng > 纯 Go fallback
|
|
func (s *TTSService) IsAvailable() bool {
|
|
return s.edgeTTSAvailable() || s.espeakAvailable()
|
|
}
|
|
|
|
// edgeTTSAvailable 检查 edge-tts 是否可用
|
|
func (s *TTSService) edgeTTSAvailable() bool {
|
|
_, err := exec.LookPath("edge-tts")
|
|
return err == nil
|
|
}
|
|
|
|
// espeakAvailable 检查 espeak-ng 是否可用
|
|
func (s *TTSService) espeakAvailable() bool {
|
|
_, err := exec.LookPath("espeak-ng")
|
|
return err == nil
|
|
}
|
|
|
|
// Synthesize 将文字合成为音频
|
|
// text: 要合成的文字
|
|
// voice: 语音名称 (zh-CN-XiaoxiaoNeural 等)
|
|
// rate: 语速调整 ("+0%", "+20%", "-20%" 等)
|
|
// 返回: 音频数据, 音频格式 (mp3/wav), 错误
|
|
func (s *TTSService) Synthesize(text string, voice string, rate string) ([]byte, string, error) {
|
|
if text == "" {
|
|
return nil, "", fmt.Errorf("文字内容为空")
|
|
}
|
|
|
|
// 方案 A: edge-tts (推荐)
|
|
if s.edgeTTSAvailable() {
|
|
return s.synthesizeEdgeTTS(text, voice, rate)
|
|
}
|
|
|
|
// 方案 B: espeak-ng
|
|
if s.espeakAvailable() {
|
|
return s.synthesizeEspeak(text, voice)
|
|
}
|
|
|
|
// 方案 C: 纯 Go fallback
|
|
return s.synthesizeFallback()
|
|
}
|
|
|
|
// synthesizeEdgeTTS 使用 edge-tts 合成语音
|
|
func (s *TTSService) synthesizeEdgeTTS(text string, voice string, rate string) ([]byte, string, error) {
|
|
if voice == "" {
|
|
voice = "zh-CN-XiaoxiaoNeural"
|
|
}
|
|
if rate == "" {
|
|
rate = "+0%"
|
|
}
|
|
|
|
// 写入文本到临时文件
|
|
tmpText, err := os.CreateTemp("/tmp", "cyrene-tts-text-*.txt")
|
|
if err != nil {
|
|
return nil, "", fmt.Errorf("创建临时文本文件失败: %w", err)
|
|
}
|
|
tmpTextPath := tmpText.Name()
|
|
defer os.Remove(tmpTextPath)
|
|
|
|
if _, err := tmpText.WriteString(text); err != nil {
|
|
tmpText.Close()
|
|
return nil, "", fmt.Errorf("写入临时文本失败: %w", err)
|
|
}
|
|
tmpText.Close()
|
|
|
|
// 输出音频文件
|
|
tmpOutput, err := os.CreateTemp("/tmp", "cyrene-tts-output-*.mp3")
|
|
if err != nil {
|
|
return nil, "", fmt.Errorf("创建临时输出文件失败: %w", err)
|
|
}
|
|
tmpOutputPath := tmpOutput.Name()
|
|
tmpOutput.Close()
|
|
defer os.Remove(tmpOutputPath)
|
|
|
|
// 构建 edge-tts 命令
|
|
cmd := exec.Command("edge-tts",
|
|
"--voice", voice,
|
|
"--rate="+rate,
|
|
"--text", text,
|
|
"--write-media", tmpOutputPath,
|
|
)
|
|
|
|
output, err := cmd.CombinedOutput()
|
|
if err != nil {
|
|
return nil, "", fmt.Errorf("edge-tts 合成失败: %w\n输出: %s", err, string(output))
|
|
}
|
|
|
|
// 读取生成的音频
|
|
audioData, err := os.ReadFile(tmpOutputPath)
|
|
if err != nil {
|
|
return nil, "", fmt.Errorf("读取合成的音频失败: %w", err)
|
|
}
|
|
|
|
if len(audioData) == 0 {
|
|
return nil, "", fmt.Errorf("edge-tts 生成的音频为空")
|
|
}
|
|
|
|
return audioData, "mp3", nil
|
|
}
|
|
|
|
// synthesizeEspeak 使用 espeak-ng 合成语音
|
|
func (s *TTSService) synthesizeEspeak(text string, voice string) ([]byte, string, error) {
|
|
if voice == "" {
|
|
voice = "zh"
|
|
}
|
|
|
|
// 输出 WAV 文件
|
|
tmpOutput, err := os.CreateTemp("/tmp", "cyrene-tts-espeak-*.wav")
|
|
if err != nil {
|
|
return nil, "", fmt.Errorf("创建临时输出文件失败: %w", err)
|
|
}
|
|
tmpOutputPath := tmpOutput.Name()
|
|
tmpOutput.Close()
|
|
defer os.Remove(tmpOutputPath)
|
|
|
|
cmd := exec.Command("espeak-ng",
|
|
"-v", voice,
|
|
"-w", tmpOutputPath,
|
|
text,
|
|
)
|
|
|
|
output, err := cmd.CombinedOutput()
|
|
if err != nil {
|
|
return nil, "", fmt.Errorf("espeak-ng 合成失败: %w\n输出: %s", err, string(output))
|
|
}
|
|
|
|
audioData, err := os.ReadFile(tmpOutputPath)
|
|
if err != nil {
|
|
return nil, "", fmt.Errorf("读取合成的音频失败: %w", err)
|
|
}
|
|
|
|
if len(audioData) == 0 {
|
|
return nil, "", fmt.Errorf("espeak-ng 生成的音频为空")
|
|
}
|
|
|
|
return audioData, "wav", nil
|
|
}
|
|
|
|
// synthesizeFallback 生成静默 WAV 作为降级方案
|
|
// 生成 1 秒 16kHz 16-bit mono 静默 PCM WAV
|
|
func (s *TTSService) synthesizeFallback() ([]byte, string, error) {
|
|
// 1 秒 @ 16kHz mono 16-bit = 32000 字节采样数据
|
|
sampleRate := 16000
|
|
numChannels := 1
|
|
bitsPerSample := 16
|
|
durationSec := 1
|
|
|
|
dataSize := sampleRate * numChannels * (bitsPerSample / 8) * durationSec
|
|
// WAV header 44 bytes + data
|
|
wav := make([]byte, 44+dataSize)
|
|
|
|
// RIFF header
|
|
copy(wav[0:4], "RIFF")
|
|
writeUint32LE(wav[4:8], uint32(36+dataSize))
|
|
copy(wav[8:12], "WAVE")
|
|
|
|
// fmt chunk
|
|
copy(wav[12:16], "fmt ")
|
|
writeUint32LE(wav[16:20], 16) // chunk size
|
|
writeUint16LE(wav[20:22], 1) // PCM
|
|
writeUint16LE(wav[22:24], uint16(numChannels)) // channels
|
|
writeUint32LE(wav[24:28], uint32(sampleRate)) // sample rate
|
|
writeUint32LE(wav[28:32], uint32(sampleRate*numChannels*bitsPerSample/8)) // byte rate
|
|
writeUint16LE(wav[32:34], uint16(numChannels*bitsPerSample/8)) // block align
|
|
writeUint16LE(wav[34:36], uint16(bitsPerSample)) // bits per sample
|
|
|
|
// data chunk
|
|
copy(wav[36:40], "data")
|
|
writeUint32LE(wav[40:44], uint32(dataSize))
|
|
// 采样数据全是 0 (静默)
|
|
|
|
return wav, "wav", nil
|
|
}
|
|
|
|
// writeUint16LE stores v into the first two bytes of buf in little-endian
// order (least significant byte first). buf must hold at least two bytes.
func writeUint16LE(buf []byte, v uint16) {
	for i := 0; i < 2; i++ {
		buf[i] = byte(v >> (8 * uint(i)))
	}
}
|
|
|
|
// writeUint32LE stores v into the first four bytes of buf in little-endian
// order (least significant byte first). buf must hold at least four bytes.
func writeUint32LE(buf []byte, v uint32) {
	for i := 0; i < 4; i++ {
		buf[i] = byte(v >> (8 * uint(i)))
	}
}
|
|
|
|
// GetVoices 返回可用语音列表
|
|
func (s *TTSService) GetVoices() []TTSVoice {
|
|
// 检查 edge-tts 是否可用,尝试获取完整语音列表
|
|
if s.edgeTTSAvailable() {
|
|
cmd := exec.Command("edge-tts", "--list-voices")
|
|
output, err := cmd.Output()
|
|
if err == nil {
|
|
voices := s.parseEdgeTTSVoices(string(output))
|
|
if len(voices) > 0 {
|
|
return voices
|
|
}
|
|
}
|
|
}
|
|
return BuiltinVoices
|
|
}
|
|
|
|
// parseEdgeTTSVoices 解析 edge-tts --list-voices 输出
|
|
// 简单解析:查找包含 "zh-CN" 的语音
|
|
func (s *TTSService) parseEdgeTTSVoices(output string) []TTSVoice {
|
|
var voices []TTSVoice
|
|
for _, line := range strings.Split(output, "\n") {
|
|
line = strings.TrimSpace(line)
|
|
if !strings.Contains(line, "zh-CN") {
|
|
continue
|
|
}
|
|
|
|
voice := TTSVoice{
|
|
Name: "",
|
|
Gender: "Unknown",
|
|
Locale: "zh-CN",
|
|
}
|
|
|
|
// 简单解析 "Name: zh-CN-XiaoxiaoNeural" 和 "Gender: Female" 格式
|
|
for _, field := range strings.Split(line, ",") {
|
|
field = strings.TrimSpace(field)
|
|
if strings.HasPrefix(field, "Name:") {
|
|
voice.Name = strings.TrimSpace(strings.TrimPrefix(field, "Name:"))
|
|
}
|
|
if strings.HasPrefix(field, "Gender:") {
|
|
voice.Gender = strings.TrimSpace(strings.TrimPrefix(field, "Gender:"))
|
|
}
|
|
}
|
|
|
|
if voice.Name != "" {
|
|
voice.DisplayName = voice.Name
|
|
voices = append(voices, voice)
|
|
}
|
|
}
|
|
|
|
if len(voices) == 0 {
|
|
return nil
|
|
}
|
|
return voices
|
|
}
|
|
|
|
// GetEngineStatus 返回 TTS 引擎状态
|
|
func (s *TTSService) GetEngineStatus() map[string]interface{} {
|
|
status := map[string]interface{}{
|
|
"available": s.IsAvailable(),
|
|
"edge_tts": s.edgeTTSAvailable(),
|
|
"espeak_ng": s.espeakAvailable(),
|
|
"engine": "none",
|
|
"default_voice": "zh-CN-XiaoxiaoNeural",
|
|
"builtin_voices": len(BuiltinVoices),
|
|
}
|
|
|
|
if s.edgeTTSAvailable() {
|
|
status["engine"] = "edge-tts"
|
|
} else if s.espeakAvailable() {
|
|
status["engine"] = "espeak-ng"
|
|
} else {
|
|
status["engine"] = "fallback (silent WAV)"
|
|
}
|
|
|
|
return status
|
|
}
|