// Cyrene/backend/voice-service/internal/service/tts_service.go

package service

import (
	"fmt"
	"os"
	"os/exec"
	"strings"
)

// TTSVoice describes one selectable TTS voice. The struct tags give the
// field names used when a value is encoded as JSON.
type TTSVoice struct {
	// Name is the engine-level voice identifier, e.g. "zh-CN-XiaoxiaoNeural".
	Name        string `json:"name"`
	// DisplayName is the human-readable label for the voice.
	DisplayName string `json:"display_name"`
	// Gender is the voice gender as text ("Female", "Male", or "Unknown"
	// when the voice-list parser in this file could not determine it).
	Gender      string `json:"gender"`
	// Locale is the locale tag of the voice, e.g. "zh-CN".
	Locale      string `json:"locale"`
}

// BuiltinVoices is the built-in list of edge-tts Chinese (zh-CN) voices.
// GetVoices returns it whenever a list cannot be obtained from edge-tts.
var BuiltinVoices = []TTSVoice{
	{Name: "zh-CN-XiaoxiaoNeural", DisplayName: "晓晓 (女声)", Gender: "Female", Locale: "zh-CN"},
	{Name: "zh-CN-YunxiNeural", DisplayName: "云希 (男声)", Gender: "Male", Locale: "zh-CN"},
	{Name: "zh-CN-XiaoyiNeural", DisplayName: "晓伊 (女声)", Gender: "Female", Locale: "zh-CN"},
}

// TTSService is the text-to-speech service. It has no fields; its methods
// look up and run the external edge-tts / espeak-ng commands on each call.
type TTSService struct{}

// NewTTSService returns a new, ready-to-use TTS service.
func NewTTSService() *TTSService {
	return new(TTSService)
}

// IsAvailable reports whether a real TTS engine is installed.
// edge-tts is probed first and espeak-ng only if edge-tts is missing; the
// pure-Go silent fallback does not count as an available engine.
func (s *TTSService) IsAvailable() bool {
	if s.edgeTTSAvailable() {
		return true
	}
	return s.espeakAvailable()
}

// edgeTTSAvailable reports whether an "edge-tts" executable can be found
// via exec.LookPath.
func (s *TTSService) edgeTTSAvailable() bool {
	if _, err := exec.LookPath("edge-tts"); err != nil {
		return false
	}
	return true
}

// espeakAvailable reports whether an "espeak-ng" executable can be found
// via exec.LookPath.
func (s *TTSService) espeakAvailable() bool {
	if _, err := exec.LookPath("espeak-ng"); err != nil {
		return false
	}
	return true
}

// Synthesize converts text to audio using the first engine that is present.
//
// Parameters:
//   - text:  the text to speak; an empty string is rejected with an error.
//   - voice: voice name (e.g. "zh-CN-XiaoxiaoNeural"); forwarded to the engine.
//   - rate:  speaking-rate adjustment ("+0%", "+20%", "-20%", ...); only the
//     edge-tts path receives it.
//
// It returns the audio bytes, the audio format ("mp3" or "wav") and an error.
// Engines are tried in this order: edge-tts (preferred), espeak-ng, and
// finally the pure-Go fallback.
func (s *TTSService) Synthesize(text string, voice string, rate string) ([]byte, string, error) {
	if len(text) == 0 {
		return nil, "", fmt.Errorf("文字内容为空")
	}

	switch {
	case s.edgeTTSAvailable():
		// Option A: edge-tts (recommended).
		return s.synthesizeEdgeTTS(text, voice, rate)
	case s.espeakAvailable():
		// Option B: espeak-ng.
		return s.synthesizeEspeak(text, voice)
	default:
		// Option C: pure-Go fallback.
		return s.synthesizeFallback()
	}
}

// synthesizeEdgeTTS synthesizes text by running the edge-tts command.
//
// Parameters:
//   - text:  the text to speak.
//   - voice: edge-tts voice name; "zh-CN-XiaoxiaoNeural" is used when empty.
//   - rate:  rate adjustment such as "+20%"; "+0%" is used when empty.
//
// It returns the generated MP3 bytes and the format "mp3". An error is
// returned when a temporary file cannot be created or written, when the
// command fails (its combined output is included in the error), or when the
// produced audio file cannot be read or is empty.
func (s *TTSService) synthesizeEdgeTTS(text string, voice string, rate string) ([]byte, string, error) {
	if voice == "" {
		voice = "zh-CN-XiaoxiaoNeural"
	}
	if rate == "" {
		rate = "+0%"
	}

	// The text is handed to edge-tts through a file rather than on the
	// command line: text that starts with "-" would otherwise be read as an
	// option, and very long text could exceed the argument-size limit.
	tmpText, err := os.CreateTemp("/tmp", "cyrene-tts-text-*.txt")
	if err != nil {
		return nil, "", fmt.Errorf("创建临时文本文件失败: %w", err)
	}
	tmpTextPath := tmpText.Name()
	defer os.Remove(tmpTextPath) // best-effort cleanup

	if _, err := tmpText.WriteString(text); err != nil {
		tmpText.Close()
		return nil, "", fmt.Errorf("写入临时文本失败: %w", err)
	}
	// A failed Close can mean the text never reached the disk.
	if err := tmpText.Close(); err != nil {
		return nil, "", fmt.Errorf("写入临时文本失败: %w", err)
	}

	// Reserve a unique path for the audio that edge-tts will write.
	tmpOutput, err := os.CreateTemp("/tmp", "cyrene-tts-output-*.mp3")
	if err != nil {
		return nil, "", fmt.Errorf("创建临时输出文件失败: %w", err)
	}
	tmpOutputPath := tmpOutput.Name()
	tmpOutput.Close()
	defer os.Remove(tmpOutputPath) // best-effort cleanup

	// Caller-supplied values are bound with "=" so that a leading "-" in
	// them cannot be parsed as a separate option.
	cmd := exec.Command("edge-tts",
		"--voice="+voice,
		"--rate="+rate,
		"--file", tmpTextPath,
		"--write-media", tmpOutputPath,
	)

	output, err := cmd.CombinedOutput()
	if err != nil {
		return nil, "", fmt.Errorf("edge-tts 合成失败: %w\n输出: %s", err, string(output))
	}

	audioData, err := os.ReadFile(tmpOutputPath)
	if err != nil {
		return nil, "", fmt.Errorf("读取合成的音频失败: %w", err)
	}

	if len(audioData) == 0 {
		return nil, "", fmt.Errorf("edge-tts 生成的音频为空")
	}

	return audioData, "mp3", nil
}

// synthesizeEspeak synthesizes text by running the espeak-ng command.
//
// Parameters:
//   - text:  the text to speak.
//   - voice: value for espeak-ng's -v option; "zh" is used when empty.
//
// It returns the generated WAV bytes and the format "wav". An error is
// returned when the temporary output file cannot be created, when the
// command fails (its combined output is included in the error), or when the
// produced audio file cannot be read or is empty.
func (s *TTSService) synthesizeEspeak(text string, voice string) ([]byte, string, error) {
	if voice == "" {
		voice = "zh"
	}

	// Reserve a unique path for the WAV file that espeak-ng will write.
	tmpOutput, err := os.CreateTemp("/tmp", "cyrene-tts-espeak-*.wav")
	if err != nil {
		return nil, "", fmt.Errorf("创建临时输出文件失败: %w", err)
	}
	tmpOutputPath := tmpOutput.Name()
	tmpOutput.Close()
	defer os.Remove(tmpOutputPath) // best-effort cleanup

	// The text is supplied on stdin instead of as a positional argument.
	// As an argument, text beginning with "-" (for example "-w/some/path"
	// or "-f/some/file") would be parsed as an espeak-ng option.
	cmd := exec.Command("espeak-ng",
		"-v", voice,
		"-w", tmpOutputPath,
		"--stdin",
	)
	cmd.Stdin = strings.NewReader(text)

	output, err := cmd.CombinedOutput()
	if err != nil {
		return nil, "", fmt.Errorf("espeak-ng 合成失败: %w\n输出: %s", err, string(output))
	}

	audioData, err := os.ReadFile(tmpOutputPath)
	if err != nil {
		return nil, "", fmt.Errorf("读取合成的音频失败: %w", err)
	}

	if len(audioData) == 0 {
		return nil, "", fmt.Errorf("espeak-ng 生成的音频为空")
	}

	return audioData, "wav", nil
}

// synthesizeFallback builds a silent WAV clip as the last-resort result.
// The clip is 1 second of 16 kHz, 16-bit, mono PCM: a 44-byte RIFF/WAVE
// header followed by 32000 zero bytes. It always returns format "wav" and a
// nil error.
func (s *TTSService) synthesizeFallback() ([]byte, string, error) {
	const (
		headerLen  = 44
		rate       = 16000
		channels   = 1
		bitDepth   = 16
		seconds    = 1
		blockAlign = channels * bitDepth / 8
		byteRate   = rate * blockAlign
		payloadLen = byteRate * seconds
	)

	// make zero-fills the buffer, so the sample area is already silence.
	out := make([]byte, headerLen+payloadLen)

	// Four-character chunk identifiers.
	tags := []struct {
		at   int
		text string
	}{
		{0, "RIFF"},
		{8, "WAVE"},
		{12, "fmt "},
		{36, "data"},
	}
	for _, tag := range tags {
		copy(out[tag.at:tag.at+4], tag.text)
	}

	// 32-bit little-endian header fields.
	words := []struct {
		at  int
		val uint32
	}{
		{4, headerLen - 8 + payloadLen}, // RIFF chunk size: everything after the first 8 bytes
		{16, 16},                        // fmt chunk size
		{24, rate},                      // sample rate
		{28, byteRate},                  // byte rate
		{40, payloadLen},                // data chunk size
	}
	for _, w := range words {
		writeUint32LE(out[w.at:w.at+4], w.val)
	}

	// 16-bit little-endian header fields.
	halves := []struct {
		at  int
		val uint16
	}{
		{20, 1},          // audio format: PCM
		{22, channels},   // channel count
		{32, blockAlign}, // block align
		{34, bitDepth},   // bits per sample
	}
	for _, h := range halves {
		writeUint16LE(out[h.at:h.at+2], h.val)
	}

	return out, "wav", nil
}

// writeUint16LE stores v into buf[0] and buf[1] in little-endian order
// (least significant byte first). It panics if buf is shorter than 2 bytes.
func writeUint16LE(buf []byte, v uint16) {
	for i := uint(0); i < 2; i++ {
		buf[i] = byte(v >> (8 * i))
	}
}

// writeUint32LE stores v into buf[0..3] in little-endian order (least
// significant byte first). It panics if buf is shorter than 4 bytes.
func writeUint32LE(buf []byte, v uint32) {
	for i := uint(0); i < 4; i++ {
		buf[i] = byte(v >> (8 * i))
	}
}

// GetVoices returns the list of selectable voices.
// When edge-tts is installed it runs `edge-tts --list-voices` and returns
// the voices parsed from its standard output. BuiltinVoices is returned
// instead when edge-tts is missing, the command fails, or nothing could be
// parsed. The command is run without a timeout.
func (s *TTSService) GetVoices() []TTSVoice {
	if !s.edgeTTSAvailable() {
		return BuiltinVoices
	}

	raw, err := exec.Command("edge-tts", "--list-voices").Output()
	if err != nil {
		return BuiltinVoices
	}

	if parsed := s.parseEdgeTTSVoices(string(raw)); len(parsed) > 0 {
		return parsed
	}
	return BuiltinVoices
}

// parseEdgeTTSVoices extracts the zh-CN voices from the text printed by
// `edge-tts --list-voices`. It returns nil when no voice is found.
//
// Three layouts are accepted (which of them a given edge-tts release prints
// should be confirmed against the installed version):
//
//   - one record per line, comma separated:
//     Name: zh-CN-XiaoxiaoNeural, Gender: Female
//   - one field per line, records separated by blank lines:
//     Name: zh-CN-XiaoxiaoNeural
//     Gender: Female
//   - a whitespace-aligned table whose first two columns are name and gender:
//     zh-CN-XiaoxiaoNeural  Female  News, Novel  Warm
//
// A key/value record is kept when the line carrying its Name mentions
// "zh-CN"; a table row is kept when its first column starts with "zh-CN-".
// Every returned voice has Locale "zh-CN", DisplayName equal to Name, and
// Gender "Unknown" when no gender could be read.
func (s *TTSService) parseEdgeTTSVoices(output string) []TTSVoice {
	const locale = "zh-CN"

	var (
		voices  []TTSVoice
		pending TTSVoice // key/value record currently being assembled
		keep    bool     // whether pending's Name line mentioned the locale
	)
	// flush stores the pending key/value record if it is a usable zh-CN
	// voice, then resets the accumulator for the next record.
	flush := func() {
		if keep && pending.Name != "" {
			pending.DisplayName = pending.Name
			voices = append(voices, pending)
		}
		pending = TTSVoice{Gender: "Unknown", Locale: locale}
		keep = false
	}
	flush() // initialise pending with its defaults

	for _, line := range strings.Split(output, "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			// A blank line ends a field-per-line record.
			flush()
			continue
		}

		var name, gender string
		for _, field := range strings.Split(line, ",") {
			field = strings.TrimSpace(field)
			switch {
			case strings.HasPrefix(field, "Name:"):
				name = strings.TrimSpace(strings.TrimPrefix(field, "Name:"))
			case strings.HasPrefix(field, "Gender:"):
				gender = strings.TrimSpace(strings.TrimPrefix(field, "Gender:"))
			}
		}

		switch {
		case name != "":
			// A Name field always opens a new record.
			flush()
			pending.Name = name
			if gender != "" {
				pending.Gender = gender
			}
			keep = strings.Contains(line, locale)
		case gender != "":
			// Gender on its own line belongs to the record opened by the
			// preceding Name line.
			if pending.Name != "" {
				pending.Gender = gender
			}
		default:
			// Not key/value text: treat it as a table row. cols is never
			// empty because line is non-blank after trimming.
			cols := strings.Fields(line)
			if !strings.HasPrefix(cols[0], locale+"-") {
				continue // header, separator, other locale, or unrelated text
			}
			flush()
			row := TTSVoice{Name: cols[0], DisplayName: cols[0], Gender: "Unknown", Locale: locale}
			if len(cols) > 1 && (cols[1] == "Male" || cols[1] == "Female") {
				row.Gender = cols[1]
			}
			voices = append(voices, row)
		}
	}
	flush()

	return voices
}

// GetEngineStatus returns a snapshot of the TTS engine state with these keys:
//
//   - "available":      true when edge-tts or espeak-ng is installed
//   - "edge_tts":       whether edge-tts is installed
//   - "espeak_ng":      whether espeak-ng is installed
//   - "engine":         the engine Synthesize would pick: "edge-tts",
//     "espeak-ng" or "fallback (silent WAV)"
//   - "default_voice":  "zh-CN-XiaoxiaoNeural"
//   - "builtin_voices": the number of entries in BuiltinVoices
//
// Each engine is probed exactly once per call so that all fields are derived
// from the same observation.
func (s *TTSService) GetEngineStatus() map[string]interface{} {
	edge := s.edgeTTSAvailable()
	espeak := s.espeakAvailable()

	engine := "fallback (silent WAV)"
	switch {
	case edge:
		engine = "edge-tts"
	case espeak:
		engine = "espeak-ng"
	}

	return map[string]interface{}{
		"available":      edge || espeak,
		"edge_tts":       edge,
		"espeak_ng":      espeak,
		"engine":         engine,
		"default_voice":  "zh-CN-XiaoxiaoNeural",
		"builtin_voices": len(BuiltinVoices),
	}
}