package service import ( "fmt" "os" "os/exec" "strings" ) // TTSVoice 表示一个可用的 TTS 语音 type TTSVoice struct { Name string `json:"name"` DisplayName string `json:"display_name"` Gender string `json:"gender"` Locale string `json:"locale"` } // BuiltinVoices 内置的 edge-tts 中文语音列表 var BuiltinVoices = []TTSVoice{ {Name: "zh-CN-XiaoxiaoNeural", DisplayName: "晓晓 (女声)", Gender: "Female", Locale: "zh-CN"}, {Name: "zh-CN-YunxiNeural", DisplayName: "云希 (男声)", Gender: "Male", Locale: "zh-CN"}, {Name: "zh-CN-XiaoyiNeural", DisplayName: "晓伊 (女声)", Gender: "Female", Locale: "zh-CN"}, } // TTSService 文字转语音服务 type TTSService struct{} // NewTTSService 创建 TTS 服务 func NewTTSService() *TTSService { return &TTSService{} } // IsAvailable 检查 TTS 引擎是否可用 // 优先级: edge-tts > espeak-ng > 纯 Go fallback func (s *TTSService) IsAvailable() bool { return s.edgeTTSAvailable() || s.espeakAvailable() } // edgeTTSAvailable 检查 edge-tts 是否可用 func (s *TTSService) edgeTTSAvailable() bool { _, err := exec.LookPath("edge-tts") return err == nil } // espeakAvailable 检查 espeak-ng 是否可用 func (s *TTSService) espeakAvailable() bool { _, err := exec.LookPath("espeak-ng") return err == nil } // Synthesize 将文字合成为音频 // text: 要合成的文字 // voice: 语音名称 (zh-CN-XiaoxiaoNeural 等) // rate: 语速调整 ("+0%", "+20%", "-20%" 等) // 返回: 音频数据, 音频格式 (mp3/wav), 错误 func (s *TTSService) Synthesize(text string, voice string, rate string) ([]byte, string, error) { if text == "" { return nil, "", fmt.Errorf("文字内容为空") } // 方案 A: edge-tts (推荐) if s.edgeTTSAvailable() { return s.synthesizeEdgeTTS(text, voice, rate) } // 方案 B: espeak-ng if s.espeakAvailable() { return s.synthesizeEspeak(text, voice) } // 方案 C: 纯 Go fallback return s.synthesizeFallback() } // synthesizeEdgeTTS 使用 edge-tts 合成语音 func (s *TTSService) synthesizeEdgeTTS(text string, voice string, rate string) ([]byte, string, error) { if voice == "" { voice = "zh-CN-XiaoxiaoNeural" } if rate == "" { rate = "+0%" } // 写入文本到临时文件 tmpText, err := os.CreateTemp("/tmp", "cyrene-tts-text-*.txt") if err != nil { return nil, "", fmt.Errorf("创建临时文本文件失败: %w", err) } tmpTextPath := tmpText.Name() defer os.Remove(tmpTextPath) if _, err := tmpText.WriteString(text); err != nil { tmpText.Close() return nil, "", fmt.Errorf("写入临时文本失败: %w", err) } tmpText.Close() // 输出音频文件 tmpOutput, err := os.CreateTemp("/tmp", "cyrene-tts-output-*.mp3") if err != nil { return nil, "", fmt.Errorf("创建临时输出文件失败: %w", err) } tmpOutputPath := tmpOutput.Name() tmpOutput.Close() defer os.Remove(tmpOutputPath) // 构建 edge-tts 命令 cmd := exec.Command("edge-tts", "--voice", voice, "--rate="+rate, "--text", text, "--write-media", tmpOutputPath, ) output, err := cmd.CombinedOutput() if err != nil { return nil, "", fmt.Errorf("edge-tts 合成失败: %w\n输出: %s", err, string(output)) } // 读取生成的音频 audioData, err := os.ReadFile(tmpOutputPath) if err != nil { return nil, "", fmt.Errorf("读取合成的音频失败: %w", err) } if len(audioData) == 0 { return nil, "", fmt.Errorf("edge-tts 生成的音频为空") } return audioData, "mp3", nil } // synthesizeEspeak 使用 espeak-ng 合成语音 func (s *TTSService) synthesizeEspeak(text string, voice string) ([]byte, string, error) { if voice == "" { voice = "zh" } // 输出 WAV 文件 tmpOutput, err := os.CreateTemp("/tmp", "cyrene-tts-espeak-*.wav") if err != nil { return nil, "", fmt.Errorf("创建临时输出文件失败: %w", err) } tmpOutputPath := tmpOutput.Name() tmpOutput.Close() defer os.Remove(tmpOutputPath) cmd := exec.Command("espeak-ng", "-v", voice, "-w", tmpOutputPath, text, ) output, err := cmd.CombinedOutput() if err != nil { return nil, "", fmt.Errorf("espeak-ng 合成失败: %w\n输出: %s", err, string(output)) } audioData, err := os.ReadFile(tmpOutputPath) if err != nil { return nil, "", fmt.Errorf("读取合成的音频失败: %w", err) } if len(audioData) == 0 { return nil, "", fmt.Errorf("espeak-ng 生成的音频为空") } return audioData, "wav", nil } // synthesizeFallback 生成静默 WAV 作为降级方案 // 生成 1 秒 16kHz 16-bit mono 静默 PCM WAV func (s *TTSService) synthesizeFallback() ([]byte, string, error) { // 1 秒 @ 16kHz mono 16-bit = 32000 字节采样数据 sampleRate := 16000 numChannels := 1 bitsPerSample := 16 durationSec := 1 dataSize := sampleRate * numChannels * (bitsPerSample / 8) * durationSec // WAV header 44 bytes + data wav := make([]byte, 44+dataSize) // RIFF header copy(wav[0:4], "RIFF") writeUint32LE(wav[4:8], uint32(36+dataSize)) copy(wav[8:12], "WAVE") // fmt chunk copy(wav[12:16], "fmt ") writeUint32LE(wav[16:20], 16) // chunk size writeUint16LE(wav[20:22], 1) // PCM writeUint16LE(wav[22:24], uint16(numChannels)) // channels writeUint32LE(wav[24:28], uint32(sampleRate)) // sample rate writeUint32LE(wav[28:32], uint32(sampleRate*numChannels*bitsPerSample/8)) // byte rate writeUint16LE(wav[32:34], uint16(numChannels*bitsPerSample/8)) // block align writeUint16LE(wav[34:36], uint16(bitsPerSample)) // bits per sample // data chunk copy(wav[36:40], "data") writeUint32LE(wav[40:44], uint32(dataSize)) // 采样数据全是 0 (静默) return wav, "wav", nil } func writeUint16LE(buf []byte, v uint16) { buf[0] = byte(v) buf[1] = byte(v >> 8) } func writeUint32LE(buf []byte, v uint32) { buf[0] = byte(v) buf[1] = byte(v >> 8) buf[2] = byte(v >> 16) buf[3] = byte(v >> 24) } // GetVoices 返回可用语音列表 func (s *TTSService) GetVoices() []TTSVoice { // 检查 edge-tts 是否可用,尝试获取完整语音列表 if s.edgeTTSAvailable() { cmd := exec.Command("edge-tts", "--list-voices") output, err := cmd.Output() if err == nil { voices := s.parseEdgeTTSVoices(string(output)) if len(voices) > 0 { return voices } } } return BuiltinVoices } // parseEdgeTTSVoices 解析 edge-tts --list-voices 输出 // 简单解析:查找包含 "zh-CN" 的语音 func (s *TTSService) parseEdgeTTSVoices(output string) []TTSVoice { var voices []TTSVoice for _, line := range strings.Split(output, "\n") { line = strings.TrimSpace(line) if !strings.Contains(line, "zh-CN") { continue } voice := TTSVoice{ Name: "", Gender: "Unknown", Locale: "zh-CN", } // 简单解析 "Name: zh-CN-XiaoxiaoNeural" 和 "Gender: Female" 格式 for _, field := range strings.Split(line, ",") { field = strings.TrimSpace(field) if strings.HasPrefix(field, "Name:") { voice.Name = strings.TrimSpace(strings.TrimPrefix(field, "Name:")) } if strings.HasPrefix(field, "Gender:") { voice.Gender = strings.TrimSpace(strings.TrimPrefix(field, "Gender:")) } } if voice.Name != "" { voice.DisplayName = voice.Name voices = append(voices, voice) } } if len(voices) == 0 { return nil } return voices } // GetEngineStatus 返回 TTS 引擎状态 func (s *TTSService) GetEngineStatus() map[string]interface{} { status := map[string]interface{}{ "available": s.IsAvailable(), "edge_tts": s.edgeTTSAvailable(), "espeak_ng": s.espeakAvailable(), "engine": "none", "default_voice": "zh-CN-XiaoxiaoNeural", "builtin_voices": len(BuiltinVoices), } if s.edgeTTSAvailable() { status["engine"] = "edge-tts" } else if s.espeakAvailable() { status["engine"] = "espeak-ng" } else { status["engine"] = "fallback (silent WAV)" } return status }