feat: 第五轮开发 - 14项未来路线图功能完整实现
W1-W14 全部完成: - W1: 消息搜索 (ILIKE全文检索 + SearchModal) - W2: 对话导出 (JSON/Markdown/TXT三格式) - W3: 记忆时间线 DevTools 可视化 - W4: 通知推送系统 (WebSocket + Browser Notification API) - W5: 定时提醒 (30s轮询 + 重复提醒 + WebSocket推送) - W6: 每日简报 (08:00自动生成: 天气+新闻+提醒+AI摘要) - W7: IoT场景自动化 (规则引擎 10s轮询 + 条件评估 + 场景执行) - W8: 语音输入 (浏览器 Speech Recognition API) - W9: STT服务 (voice-service + whisper.cpp) - W10: TTS服务 (浏览器 Speech Synthesis + edge-tts三档回退) - W11: 文件管理 (上传/下载/缩略图/纯Go bilinear缩放) - W12: 知识库RAG (PostgreSQL tsvector + 文档分块 + 检索) - W13: 多模态 (图片上传+分析: Vision API + 本地Go分析回退) - W14: PWA (Service Worker + 离线页 + install prompt) 总计: 6个Go微服务 + 10+前端组件 + 10+ PostgreSQL表 + 4个后台调度器
This commit is contained in:
@@ -0,0 +1,76 @@
|
||||
package main
|
||||
|
||||
import (
	"context"
	"log"
	"net/http"
	"os"
	"os/signal"
	"syscall"
	"time"

	"github.com/yourname/cyrene-ai/voice-service/internal/config"
	"github.com/yourname/cyrene-ai/voice-service/internal/handler"
	"github.com/yourname/cyrene-ai/voice-service/internal/service"
)
|
||||
|
||||
func main() {
|
||||
log.SetFlags(log.LstdFlags | log.Lshortfile)
|
||||
log.Println("🎤 Voice-Service (STT + TTS) 启动中...")
|
||||
|
||||
// 加载配置
|
||||
cfg := config.Load()
|
||||
|
||||
log.Printf("配置: 端口=%s, WhisperBinary=%s, WhisperModel=%s, Language=%s",
|
||||
cfg.Port, cfg.WhisperBinary, cfg.WhisperModel, cfg.WhisperLanguage)
|
||||
|
||||
// 初始化 STT 服务
|
||||
sttSvc := service.NewSTTService(cfg)
|
||||
|
||||
// 检查 whisper 引擎是否可用
|
||||
if !sttSvc.IsAvailable() {
|
||||
log.Printf("⚠️ Whisper 引擎未安装 (%s),STT 功能不可用", cfg.WhisperBinary)
|
||||
log.Printf(" 请运行: bash scripts/setup-whisper.sh")
|
||||
} else {
|
||||
log.Println("✅ Whisper 引擎已就绪")
|
||||
}
|
||||
|
||||
// 初始化 TTS 服务
|
||||
ttsSvc := service.NewTTSService()
|
||||
|
||||
if !ttsSvc.IsAvailable() {
|
||||
log.Println("⚠️ TTS 引擎不可用 (请安装: pip install edge-tts)")
|
||||
} else {
|
||||
ttsStatus := ttsSvc.GetEngineStatus()
|
||||
log.Printf("✅ TTS 引擎已就绪 (引擎: %s)", ttsStatus["engine"])
|
||||
}
|
||||
|
||||
// 初始化 HTTP 处理器
|
||||
sttHandler := handler.NewSTTHandler(sttSvc, cfg)
|
||||
sttHandler.SetTTSService(ttsSvc)
|
||||
ttsHandler := handler.NewTTSHandler(ttsSvc)
|
||||
|
||||
// 注册路由
|
||||
mux := http.NewServeMux()
|
||||
sttHandler.RegisterRoutes(mux)
|
||||
ttsHandler.RegisterRoutes(mux)
|
||||
|
||||
// 启动 HTTP 服务
|
||||
srv := &http.Server{
|
||||
Addr: ":" + cfg.Port,
|
||||
Handler: mux,
|
||||
}
|
||||
|
||||
go func() {
|
||||
log.Printf("🚀 Voice-Service 已启动在端口 %s", cfg.Port)
|
||||
if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
|
||||
log.Fatalf("服务启动失败: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
// 优雅关闭
|
||||
quit := make(chan os.Signal, 1)
|
||||
signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
|
||||
<-quit
|
||||
log.Println("正在关闭 Voice-Service...")
|
||||
srv.Close()
|
||||
log.Println("Voice-Service 已关闭")
|
||||
}
|
||||
@@ -0,0 +1,3 @@
|
||||
module github.com/yourname/cyrene-ai/voice-service
|
||||
|
||||
go 1.26.2
|
||||
@@ -0,0 +1,30 @@
|
||||
package config
|
||||
|
||||
import "os"
|
||||
|
||||
// Config carries the runtime settings of the voice service.
type Config struct {
	// Port is the TCP port the HTTP server listens on.
	Port string
	// WhisperBinary is the path of the whisper executable used for STT.
	WhisperBinary string
	// WhisperModel is the path of the whisper model file.
	WhisperModel string
	// WhisperLanguage is the language used when a request names none.
	WhisperLanguage string
	// MaxAudioSize is the upload size limit, in bytes.
	MaxAudioSize int64
}
|
||||
|
||||
// Load 从环境变量加载配置
|
||||
func Load() *Config {
|
||||
return &Config{
|
||||
Port: getEnv("PORT", "8093"),
|
||||
WhisperBinary: getEnv("WHISPER_BINARY", "./whisper.cpp/main"),
|
||||
WhisperModel: getEnv("WHISPER_MODEL", "./whisper.cpp/models/ggml-small.bin"),
|
||||
WhisperLanguage: getEnv("WHISPER_LANGUAGE", "zh"),
|
||||
MaxAudioSize: 10 * 1024 * 1024, // 10MB
|
||||
}
|
||||
}
|
||||
|
||||
// getEnv returns the value of the environment variable key, or fallback when
// the variable is unset or set to the empty string.
func getEnv(key, fallback string) string {
	value := os.Getenv(key)
	if value == "" {
		return fallback
	}
	return value
}
|
||||
@@ -0,0 +1,201 @@
|
||||
package handler
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourname/cyrene-ai/voice-service/internal/config"
|
||||
"github.com/yourname/cyrene-ai/voice-service/internal/service"
|
||||
)
|
||||
|
||||
// STTHandler HTTP API 处理器
|
||||
type STTHandler struct {
|
||||
svc *service.STTService
|
||||
ttsSvc *service.TTSService
|
||||
cfg *config.Config
|
||||
}
|
||||
|
||||
// NewSTTHandler 创建 STT 处理器(可选传入 TTSService 用于组合状态)
|
||||
func NewSTTHandler(svc *service.STTService, cfg *config.Config) *STTHandler {
|
||||
return &STTHandler{svc: svc, cfg: cfg}
|
||||
}
|
||||
|
||||
// SetTTSService 设置 TTS 服务引用,用于组合状态端点
|
||||
func (h *STTHandler) SetTTSService(ttsSvc *service.TTSService) {
|
||||
h.ttsSvc = ttsSvc
|
||||
}
|
||||
|
||||
// RegisterRoutes 注册所有路由到 mux
|
||||
func (h *STTHandler) RegisterRoutes(mux *http.ServeMux) {
|
||||
mux.HandleFunc("/api/v1/transcribe", h.handleTranscribe)
|
||||
mux.HandleFunc("/api/v1/health", h.handleHealth)
|
||||
mux.HandleFunc("/api/v1/status", h.handleStatus)
|
||||
}
|
||||
|
||||
// handleTranscribe POST /api/v1/transcribe
|
||||
// 接受 multipart/form-data,字段 audio (文件) 和 language (可选)
|
||||
func (h *STTHandler) handleTranscribe(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
writeError(w, http.StatusMethodNotAllowed, "method not allowed")
|
||||
return
|
||||
}
|
||||
|
||||
// 限制上传大小
|
||||
r.Body = http.MaxBytesReader(w, r.Body, h.cfg.MaxAudioSize)
|
||||
|
||||
if err := r.ParseMultipartForm(h.cfg.MaxAudioSize); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "文件过大或解析失败,最大支持 10MB")
|
||||
return
|
||||
}
|
||||
|
||||
// 获取上传的文件
|
||||
file, header, err := r.FormFile("audio")
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, "缺少 audio 文件字段")
|
||||
return
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
// 读取文件内容
|
||||
audioData, err := io.ReadAll(file)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "读取音频文件失败")
|
||||
return
|
||||
}
|
||||
|
||||
if len(audioData) == 0 {
|
||||
writeError(w, http.StatusBadRequest, "音频文件为空")
|
||||
return
|
||||
}
|
||||
|
||||
// 获取语言参数 (可选)
|
||||
language := r.FormValue("language")
|
||||
|
||||
// 推断音频格式
|
||||
format := inferFormat(header.Filename)
|
||||
if !isSupportedFormat(format) {
|
||||
writeError(w, http.StatusBadRequest, "不支持的音频格式: "+format+",支持的格式: WAV, MP3, OGG, FLAC, M4A")
|
||||
return
|
||||
}
|
||||
|
||||
// 执行转录
|
||||
startTime := time.Now()
|
||||
text, err := h.svc.Transcribe(audioData, format, language)
|
||||
durationMs := time.Since(startTime).Milliseconds()
|
||||
|
||||
if err != nil {
|
||||
log.Printf("[stt-handler] 转录失败: %v", err)
|
||||
writeJSON(w, http.StatusInternalServerError, map[string]interface{}{
|
||||
"success": false,
|
||||
"error": err.Error(),
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
actualLang := language
|
||||
if actualLang == "" {
|
||||
actualLang = h.cfg.WhisperLanguage
|
||||
}
|
||||
|
||||
writeJSON(w, http.StatusOK, map[string]interface{}{
|
||||
"success": true,
|
||||
"text": text,
|
||||
"language": actualLang,
|
||||
"duration_ms": durationMs,
|
||||
})
|
||||
}
|
||||
|
||||
// handleHealth GET /api/v1/health
|
||||
func (h *STTHandler) handleHealth(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodGet {
|
||||
writeError(w, http.StatusMethodNotAllowed, "method not allowed")
|
||||
return
|
||||
}
|
||||
|
||||
sttStatus := h.svc.GetStatus()
|
||||
healthStatus := "ok"
|
||||
if !sttStatus["available"].(bool) {
|
||||
healthStatus = "degraded"
|
||||
}
|
||||
|
||||
resp := map[string]interface{}{
|
||||
"status": healthStatus,
|
||||
"service": "voice-service",
|
||||
"stt": sttStatus,
|
||||
}
|
||||
|
||||
// 如果有 TTS 服务,也包含 TTS 状态
|
||||
if h.ttsSvc != nil {
|
||||
resp["tts"] = h.ttsSvc.GetEngineStatus()
|
||||
}
|
||||
|
||||
writeJSON(w, http.StatusOK, resp)
|
||||
}
|
||||
|
||||
// handleStatus GET /api/v1/status
|
||||
func (h *STTHandler) handleStatus(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodGet {
|
||||
writeError(w, http.StatusMethodNotAllowed, "method not allowed")
|
||||
return
|
||||
}
|
||||
|
||||
resp := map[string]interface{}{
|
||||
"service": "voice-service",
|
||||
"stt": h.svc.GetStatus(),
|
||||
}
|
||||
|
||||
// 如果有 TTS 服务,也包含 TTS 状态
|
||||
if h.ttsSvc != nil {
|
||||
resp["tts"] = h.ttsSvc.GetEngineStatus()
|
||||
}
|
||||
|
||||
writeJSON(w, http.StatusOK, resp)
|
||||
}
|
||||
|
||||
// --- Helper functions ---
|
||||
|
||||
// writeJSON sends data as a JSON response body with the given HTTP status
// and a Content-Type of application/json.
//
// An encoding or write failure is logged instead of being silently dropped;
// it cannot be reported to the client because the status line has already
// been written at that point.
func writeJSON(w http.ResponseWriter, status int, data interface{}) {
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	if err := json.NewEncoder(w).Encode(data); err != nil {
		log.Printf("[http] 写入 JSON 响应失败: %v", err)
	}
}
|
||||
|
||||
func writeError(w http.ResponseWriter, status int, message string) {
|
||||
writeJSON(w, status, map[string]interface{}{
|
||||
"error": message,
|
||||
})
|
||||
}
|
||||
|
||||
// inferFormat derives an audio format name from the extension of filename.
//
// Known extensions are matched case-insensitively and folded onto one of
// "wav", "mp3", "ogg", "flac" or "m4a". Any other extension is returned
// lower-cased and without its leading dot, so every result is a bare format
// name (the unknown case used to come back as ".ext"). A name without an
// extension yields the empty string.
func inferFormat(filename string) string {
	ext := strings.TrimPrefix(strings.ToLower(filepath.Ext(filename)), ".")
	switch ext {
	case "wav", "wave":
		return "wav"
	case "mp3", "mpeg":
		return "mp3"
	case "ogg", "opus":
		return "ogg"
	case "flac":
		return "flac"
	case "m4a", "mp4", "aac":
		return "m4a"
	default:
		return ext
	}
}
|
||||
|
||||
// isSupportedFormat reports whether format is one of the audio format names
// the transcription endpoint accepts. The comparison is case-sensitive.
func isSupportedFormat(format string) bool {
	for _, known := range [...]string{"wav", "mp3", "ogg", "flac", "m4a"} {
		if format == known {
			return true
		}
	}
	return false
}
|
||||
@@ -0,0 +1,117 @@
|
||||
package handler
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log"
|
||||
"net/http"
|
||||
|
||||
"github.com/yourname/cyrene-ai/voice-service/internal/service"
|
||||
)
|
||||
|
||||
// TTSHandler TTS HTTP API 处理器
|
||||
type TTSHandler struct {
|
||||
svc *service.TTSService
|
||||
}
|
||||
|
||||
// NewTTSHandler 创建 TTS 处理器
|
||||
func NewTTSHandler(svc *service.TTSService) *TTSHandler {
|
||||
return &TTSHandler{svc: svc}
|
||||
}
|
||||
|
||||
// RegisterRoutes 注册 TTS 路由
|
||||
func (h *TTSHandler) RegisterRoutes(mux *http.ServeMux) {
|
||||
mux.HandleFunc("/api/v1/tts/synthesize", h.handleSynthesize)
|
||||
mux.HandleFunc("/api/v1/tts/voices", h.handleVoices)
|
||||
mux.HandleFunc("/api/v1/tts/status", h.handleStatus)
|
||||
}
|
||||
|
||||
// TTSSynthesizeRequest is the JSON body accepted by the synthesize endpoint.
type TTSSynthesizeRequest struct {
	// Text is the text to speak; the handler rejects an empty value.
	Text string `json:"text"`
	// Voice is the voice name passed to the TTS service; may be empty.
	Voice string `json:"voice"`
	// Rate is the speaking-rate adjustment passed to the TTS service; may be empty.
	Rate string `json:"rate"`
}
|
||||
|
||||
// handleSynthesize POST /api/v1/tts/synthesize
|
||||
func (h *TTSHandler) handleSynthesize(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
writeError(w, http.StatusMethodNotAllowed, "method not allowed")
|
||||
return
|
||||
}
|
||||
|
||||
// 解析 JSON 请求体
|
||||
var req TTSSynthesizeRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "请求体解析失败: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
if req.Text == "" {
|
||||
writeError(w, http.StatusBadRequest, "text 字段不能为空")
|
||||
return
|
||||
}
|
||||
|
||||
// 检查 TTS 引擎是否可用
|
||||
if !h.svc.IsAvailable() {
|
||||
log.Printf("[tts-handler] TTS 引擎不可用")
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusServiceUnavailable)
|
||||
json.NewEncoder(w).Encode(map[string]interface{}{
|
||||
"error": "TTS 引擎不可用,请安装 edge-tts (pip install edge-tts) 或 espeak-ng",
|
||||
"code": "TTS_UNAVAILABLE",
|
||||
"install": "pip install edge-tts",
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
// 调用合成
|
||||
audioData, format, err := h.svc.Synthesize(req.Text, req.Voice, req.Rate)
|
||||
if err != nil {
|
||||
log.Printf("[tts-handler] TTS 合成失败: %v", err)
|
||||
writeError(w, http.StatusInternalServerError, "TTS 合成失败: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
// 返回音频流
|
||||
contentType := "audio/mpeg"
|
||||
if format == "wav" {
|
||||
contentType = "audio/wav"
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", contentType)
|
||||
w.Header().Set("Content-Disposition", "inline; filename=synthesized."+format)
|
||||
w.WriteHeader(http.StatusOK)
|
||||
w.Write(audioData)
|
||||
}
|
||||
|
||||
// handleVoices GET /api/v1/tts/voices
|
||||
func (h *TTSHandler) handleVoices(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodGet {
|
||||
writeError(w, http.StatusMethodNotAllowed, "method not allowed")
|
||||
return
|
||||
}
|
||||
|
||||
voices := h.svc.GetVoices()
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(map[string]interface{}{
|
||||
"voices": voices,
|
||||
"count": len(voices),
|
||||
})
|
||||
}
|
||||
|
||||
// handleStatus GET /api/v1/tts/status
|
||||
func (h *TTSHandler) handleStatus(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodGet {
|
||||
writeError(w, http.StatusMethodNotAllowed, "method not allowed")
|
||||
return
|
||||
}
|
||||
|
||||
status := h.svc.GetEngineStatus()
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(map[string]interface{}{
|
||||
"service": "voice-service",
|
||||
"tts": status,
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,175 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/yourname/cyrene-ai/voice-service/internal/config"
|
||||
)
|
||||
|
||||
// SupportedLanguages lists the language codes accepted for transcription.
var SupportedLanguages = []string{
	"zh",
	"en",
	"ja",
	"ko",
	"auto",
}
|
||||
|
||||
// STTService turns audio into text by running an external whisper binary.
type STTService struct {
	// whisperBinary is the executable that is run for each transcription.
	whisperBinary string
	// whisperModel is the model file passed to the binary with -m.
	whisperModel string
	// language is used when a transcription request names no language.
	language string
}
|
||||
|
||||
// NewSTTService 创建 STT 服务
|
||||
func NewSTTService(cfg *config.Config) *STTService {
|
||||
return &STTService{
|
||||
whisperBinary: cfg.WhisperBinary,
|
||||
whisperModel: cfg.WhisperModel,
|
||||
language: cfg.WhisperLanguage,
|
||||
}
|
||||
}
|
||||
|
||||
// IsAvailable 检查 whisper binary 是否存在
|
||||
func (s *STTService) IsAvailable() bool {
|
||||
_, err := os.Stat(s.whisperBinary)
|
||||
return err == nil
|
||||
}
|
||||
|
||||
// Transcribe 将音频数据转录为文字
|
||||
// audioData: 音频文件的二进制数据
|
||||
// format: 音频格式 (wav, mp3, ogg, flac, m4a)
|
||||
// language: 转录语言 (zh, en, ja, ko, auto),为空则使用默认语言
|
||||
func (s *STTService) Transcribe(audioData []byte, format string, language string) (string, error) {
|
||||
if !s.IsAvailable() {
|
||||
return "", fmt.Errorf("STT 引擎未安装,请运行 scripts/setup-whisper.sh")
|
||||
}
|
||||
|
||||
// 如果未指定语言,使用默认语言
|
||||
if language == "" {
|
||||
language = s.language
|
||||
}
|
||||
|
||||
// 验证语言是否支持
|
||||
if !isSupportedLanguage(language) {
|
||||
return "", fmt.Errorf("不支持的语言: %s,支持的语言: %s", language, strings.Join(SupportedLanguages, ", "))
|
||||
}
|
||||
|
||||
// 将音频数据写入临时文件
|
||||
ext := normalizeExt(format)
|
||||
tmpFile, err := os.CreateTemp("/tmp", "cyrene-stt-*"+ext)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("创建临时文件失败: %w", err)
|
||||
}
|
||||
tmpPath := tmpFile.Name()
|
||||
defer os.Remove(tmpPath)
|
||||
|
||||
if _, err := tmpFile.Write(audioData); err != nil {
|
||||
tmpFile.Close()
|
||||
return "", fmt.Errorf("写入临时文件失败: %w", err)
|
||||
}
|
||||
tmpFile.Close()
|
||||
|
||||
// 如果不是 WAV 格式,尝试用 ffmpeg 转换
|
||||
inputPath := tmpPath
|
||||
if format != "wav" && format != "" {
|
||||
convertedPath := tmpPath + ".wav"
|
||||
if err := convertToWav(tmpPath, convertedPath); err == nil {
|
||||
defer os.Remove(convertedPath)
|
||||
inputPath = convertedPath
|
||||
}
|
||||
// 转换失败则仍使用原始文件(whisper.cpp 也支持其他格式)
|
||||
}
|
||||
|
||||
// 调用 whisper.cpp
|
||||
outputTxt := inputPath + ".txt"
|
||||
|
||||
cmd := exec.Command(s.whisperBinary,
|
||||
"-m", s.whisperModel,
|
||||
"-l", language,
|
||||
"-f", inputPath,
|
||||
"-otxt",
|
||||
"-of", strings.TrimSuffix(inputPath, filepath.Ext(inputPath)),
|
||||
)
|
||||
cmd.Stderr = os.Stderr
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
os.Remove(outputTxt)
|
||||
return "", fmt.Errorf("whisper 转录失败: %w", err)
|
||||
}
|
||||
|
||||
// 读取输出文本
|
||||
defer os.Remove(outputTxt)
|
||||
|
||||
txtData, err := os.ReadFile(outputTxt)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("读取转录结果失败: %w", err)
|
||||
}
|
||||
|
||||
text := strings.TrimSpace(string(txtData))
|
||||
|
||||
return text, nil
|
||||
}
|
||||
|
||||
// GetStatus 返回服务状态
|
||||
func (s *STTService) GetStatus() map[string]interface{} {
|
||||
binaryAvailable := s.IsAvailable()
|
||||
modelExists := false
|
||||
if _, err := os.Stat(s.whisperModel); err == nil {
|
||||
modelExists = true
|
||||
}
|
||||
|
||||
modelName := filepath.Base(s.whisperModel)
|
||||
|
||||
return map[string]interface{}{
|
||||
"available": binaryAvailable && modelExists,
|
||||
"binary_available": binaryAvailable,
|
||||
"model_loaded": modelExists,
|
||||
"binary_path": s.whisperBinary,
|
||||
"model_path": s.whisperModel,
|
||||
"model_name": modelName,
|
||||
"default_language": s.language,
|
||||
"supported_languages": SupportedLanguages,
|
||||
}
|
||||
}
|
||||
|
||||
// normalizeExt maps an audio format name (matched case-insensitively) to the
// file extension, dot included, used for the temporary audio file. Unknown
// and empty names map to ".wav".
func normalizeExt(format string) string {
	extByFormat := map[string]string{
		"wav":  ".wav",
		"mp3":  ".mp3",
		"mpeg": ".mp3",
		"ogg":  ".ogg",
		"opus": ".ogg",
		"flac": ".flac",
		"m4a":  ".m4a",
		"mp4":  ".m4a",
		"aac":  ".m4a",
	}
	if ext, known := extByFormat[strings.ToLower(format)]; known {
		return ext
	}
	return ".wav"
}
|
||||
|
||||
// isSupportedLanguage 检查语言是否支持
|
||||
func isSupportedLanguage(lang string) bool {
|
||||
for _, l := range SupportedLanguages {
|
||||
if l == lang {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// convertToWav runs ffmpeg to convert the audio file at inputPath into a
// 16 kHz, mono, 16-bit PCM WAV file at outputPath, overwriting any existing
// file. It returns the error from running ffmpeg, if any.
func convertToWav(inputPath, outputPath string) error {
	cmd := exec.Command("ffmpeg",
		// Global options must precede the files: ffmpeg ignores options that
		// trail the last output, so "-y" placed after outputPath had no effect.
		"-y",
		"-nostdin", // never wait for interactive input
		"-i", inputPath,
		"-ar", "16000",
		"-ac", "1",
		"-c:a", "pcm_s16le",
		outputPath,
	)
	// Stdout and Stderr stay nil, so ffmpeg's output is discarded.
	return cmd.Run()
}
|
||||
@@ -0,0 +1,294 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// TTSVoice describes one voice that can be requested for synthesis.
type TTSVoice struct {
	// Name is the voice identifier, e.g. "zh-CN-XiaoxiaoNeural".
	Name string `json:"name"`
	// DisplayName is the label shown to users.
	DisplayName string `json:"display_name"`
	// Gender is the voice gender, e.g. "Female" or "Male".
	Gender string `json:"gender"`
	// Locale is the voice locale, e.g. "zh-CN".
	Locale string `json:"locale"`
}
|
||||
|
||||
// BuiltinVoices 内置的 edge-tts 中文语音列表
|
||||
var BuiltinVoices = []TTSVoice{
|
||||
{Name: "zh-CN-XiaoxiaoNeural", DisplayName: "晓晓 (女声)", Gender: "Female", Locale: "zh-CN"},
|
||||
{Name: "zh-CN-YunxiNeural", DisplayName: "云希 (男声)", Gender: "Male", Locale: "zh-CN"},
|
||||
{Name: "zh-CN-XiaoyiNeural", DisplayName: "晓伊 (女声)", Gender: "Female", Locale: "zh-CN"},
|
||||
}
|
||||
|
||||
// TTSService turns text into audio using whichever external engine is
// installed. It holds no state.
type TTSService struct{}
|
||||
|
||||
// NewTTSService 创建 TTS 服务
|
||||
func NewTTSService() *TTSService {
|
||||
return &TTSService{}
|
||||
}
|
||||
|
||||
// IsAvailable 检查 TTS 引擎是否可用
|
||||
// 优先级: edge-tts > espeak-ng > 纯 Go fallback
|
||||
func (s *TTSService) IsAvailable() bool {
|
||||
return s.edgeTTSAvailable() || s.espeakAvailable()
|
||||
}
|
||||
|
||||
// edgeTTSAvailable 检查 edge-tts 是否可用
|
||||
func (s *TTSService) edgeTTSAvailable() bool {
|
||||
_, err := exec.LookPath("edge-tts")
|
||||
return err == nil
|
||||
}
|
||||
|
||||
// espeakAvailable 检查 espeak-ng 是否可用
|
||||
func (s *TTSService) espeakAvailable() bool {
|
||||
_, err := exec.LookPath("espeak-ng")
|
||||
return err == nil
|
||||
}
|
||||
|
||||
// Synthesize 将文字合成为音频
|
||||
// text: 要合成的文字
|
||||
// voice: 语音名称 (zh-CN-XiaoxiaoNeural 等)
|
||||
// rate: 语速调整 ("+0%", "+20%", "-20%" 等)
|
||||
// 返回: 音频数据, 音频格式 (mp3/wav), 错误
|
||||
func (s *TTSService) Synthesize(text string, voice string, rate string) ([]byte, string, error) {
|
||||
if text == "" {
|
||||
return nil, "", fmt.Errorf("文字内容为空")
|
||||
}
|
||||
|
||||
// 方案 A: edge-tts (推荐)
|
||||
if s.edgeTTSAvailable() {
|
||||
return s.synthesizeEdgeTTS(text, voice, rate)
|
||||
}
|
||||
|
||||
// 方案 B: espeak-ng
|
||||
if s.espeakAvailable() {
|
||||
return s.synthesizeEspeak(text, voice)
|
||||
}
|
||||
|
||||
// 方案 C: 纯 Go fallback
|
||||
return s.synthesizeFallback()
|
||||
}
|
||||
|
||||
// synthesizeEdgeTTS 使用 edge-tts 合成语音
|
||||
func (s *TTSService) synthesizeEdgeTTS(text string, voice string, rate string) ([]byte, string, error) {
|
||||
if voice == "" {
|
||||
voice = "zh-CN-XiaoxiaoNeural"
|
||||
}
|
||||
if rate == "" {
|
||||
rate = "+0%"
|
||||
}
|
||||
|
||||
// 写入文本到临时文件
|
||||
tmpText, err := os.CreateTemp("/tmp", "cyrene-tts-text-*.txt")
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("创建临时文本文件失败: %w", err)
|
||||
}
|
||||
tmpTextPath := tmpText.Name()
|
||||
defer os.Remove(tmpTextPath)
|
||||
|
||||
if _, err := tmpText.WriteString(text); err != nil {
|
||||
tmpText.Close()
|
||||
return nil, "", fmt.Errorf("写入临时文本失败: %w", err)
|
||||
}
|
||||
tmpText.Close()
|
||||
|
||||
// 输出音频文件
|
||||
tmpOutput, err := os.CreateTemp("/tmp", "cyrene-tts-output-*.mp3")
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("创建临时输出文件失败: %w", err)
|
||||
}
|
||||
tmpOutputPath := tmpOutput.Name()
|
||||
tmpOutput.Close()
|
||||
defer os.Remove(tmpOutputPath)
|
||||
|
||||
// 构建 edge-tts 命令
|
||||
cmd := exec.Command("edge-tts",
|
||||
"--voice", voice,
|
||||
"--rate="+rate,
|
||||
"--text", text,
|
||||
"--write-media", tmpOutputPath,
|
||||
)
|
||||
|
||||
output, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("edge-tts 合成失败: %w\n输出: %s", err, string(output))
|
||||
}
|
||||
|
||||
// 读取生成的音频
|
||||
audioData, err := os.ReadFile(tmpOutputPath)
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("读取合成的音频失败: %w", err)
|
||||
}
|
||||
|
||||
if len(audioData) == 0 {
|
||||
return nil, "", fmt.Errorf("edge-tts 生成的音频为空")
|
||||
}
|
||||
|
||||
return audioData, "mp3", nil
|
||||
}
|
||||
|
||||
// synthesizeEspeak 使用 espeak-ng 合成语音
|
||||
func (s *TTSService) synthesizeEspeak(text string, voice string) ([]byte, string, error) {
|
||||
if voice == "" {
|
||||
voice = "zh"
|
||||
}
|
||||
|
||||
// 输出 WAV 文件
|
||||
tmpOutput, err := os.CreateTemp("/tmp", "cyrene-tts-espeak-*.wav")
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("创建临时输出文件失败: %w", err)
|
||||
}
|
||||
tmpOutputPath := tmpOutput.Name()
|
||||
tmpOutput.Close()
|
||||
defer os.Remove(tmpOutputPath)
|
||||
|
||||
cmd := exec.Command("espeak-ng",
|
||||
"-v", voice,
|
||||
"-w", tmpOutputPath,
|
||||
text,
|
||||
)
|
||||
|
||||
output, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("espeak-ng 合成失败: %w\n输出: %s", err, string(output))
|
||||
}
|
||||
|
||||
audioData, err := os.ReadFile(tmpOutputPath)
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("读取合成的音频失败: %w", err)
|
||||
}
|
||||
|
||||
if len(audioData) == 0 {
|
||||
return nil, "", fmt.Errorf("espeak-ng 生成的音频为空")
|
||||
}
|
||||
|
||||
return audioData, "wav", nil
|
||||
}
|
||||
|
||||
// synthesizeFallback 生成静默 WAV 作为降级方案
|
||||
// 生成 1 秒 16kHz 16-bit mono 静默 PCM WAV
|
||||
func (s *TTSService) synthesizeFallback() ([]byte, string, error) {
|
||||
// 1 秒 @ 16kHz mono 16-bit = 32000 字节采样数据
|
||||
sampleRate := 16000
|
||||
numChannels := 1
|
||||
bitsPerSample := 16
|
||||
durationSec := 1
|
||||
|
||||
dataSize := sampleRate * numChannels * (bitsPerSample / 8) * durationSec
|
||||
// WAV header 44 bytes + data
|
||||
wav := make([]byte, 44+dataSize)
|
||||
|
||||
// RIFF header
|
||||
copy(wav[0:4], "RIFF")
|
||||
writeUint32LE(wav[4:8], uint32(36+dataSize))
|
||||
copy(wav[8:12], "WAVE")
|
||||
|
||||
// fmt chunk
|
||||
copy(wav[12:16], "fmt ")
|
||||
writeUint32LE(wav[16:20], 16) // chunk size
|
||||
writeUint16LE(wav[20:22], 1) // PCM
|
||||
writeUint16LE(wav[22:24], uint16(numChannels)) // channels
|
||||
writeUint32LE(wav[24:28], uint32(sampleRate)) // sample rate
|
||||
writeUint32LE(wav[28:32], uint32(sampleRate*numChannels*bitsPerSample/8)) // byte rate
|
||||
writeUint16LE(wav[32:34], uint16(numChannels*bitsPerSample/8)) // block align
|
||||
writeUint16LE(wav[34:36], uint16(bitsPerSample)) // bits per sample
|
||||
|
||||
// data chunk
|
||||
copy(wav[36:40], "data")
|
||||
writeUint32LE(wav[40:44], uint32(dataSize))
|
||||
// 采样数据全是 0 (静默)
|
||||
|
||||
return wav, "wav", nil
|
||||
}
|
||||
|
||||
// writeUint16LE stores v into buf[0:2] in little-endian byte order.
// buf must hold at least two bytes.
func writeUint16LE(buf []byte, v uint16) {
	for i := 0; i < 2; i++ {
		buf[i] = byte(v >> (8 * i))
	}
}
|
||||
|
||||
// writeUint32LE stores v into buf[0:4] in little-endian byte order.
// buf must hold at least four bytes.
func writeUint32LE(buf []byte, v uint32) {
	for i := 0; i < 4; i++ {
		buf[i] = byte(v >> (8 * i))
	}
}
|
||||
|
||||
// GetVoices 返回可用语音列表
|
||||
func (s *TTSService) GetVoices() []TTSVoice {
|
||||
// 检查 edge-tts 是否可用,尝试获取完整语音列表
|
||||
if s.edgeTTSAvailable() {
|
||||
cmd := exec.Command("edge-tts", "--list-voices")
|
||||
output, err := cmd.Output()
|
||||
if err == nil {
|
||||
voices := s.parseEdgeTTSVoices(string(output))
|
||||
if len(voices) > 0 {
|
||||
return voices
|
||||
}
|
||||
}
|
||||
}
|
||||
return BuiltinVoices
|
||||
}
|
||||
|
||||
// parseEdgeTTSVoices 解析 edge-tts --list-voices 输出
|
||||
// 简单解析:查找包含 "zh-CN" 的语音
|
||||
func (s *TTSService) parseEdgeTTSVoices(output string) []TTSVoice {
|
||||
var voices []TTSVoice
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if !strings.Contains(line, "zh-CN") {
|
||||
continue
|
||||
}
|
||||
|
||||
voice := TTSVoice{
|
||||
Name: "",
|
||||
Gender: "Unknown",
|
||||
Locale: "zh-CN",
|
||||
}
|
||||
|
||||
// 简单解析 "Name: zh-CN-XiaoxiaoNeural" 和 "Gender: Female" 格式
|
||||
for _, field := range strings.Split(line, ",") {
|
||||
field = strings.TrimSpace(field)
|
||||
if strings.HasPrefix(field, "Name:") {
|
||||
voice.Name = strings.TrimSpace(strings.TrimPrefix(field, "Name:"))
|
||||
}
|
||||
if strings.HasPrefix(field, "Gender:") {
|
||||
voice.Gender = strings.TrimSpace(strings.TrimPrefix(field, "Gender:"))
|
||||
}
|
||||
}
|
||||
|
||||
if voice.Name != "" {
|
||||
voice.DisplayName = voice.Name
|
||||
voices = append(voices, voice)
|
||||
}
|
||||
}
|
||||
|
||||
if len(voices) == 0 {
|
||||
return nil
|
||||
}
|
||||
return voices
|
||||
}
|
||||
|
||||
// GetEngineStatus 返回 TTS 引擎状态
|
||||
func (s *TTSService) GetEngineStatus() map[string]interface{} {
|
||||
status := map[string]interface{}{
|
||||
"available": s.IsAvailable(),
|
||||
"edge_tts": s.edgeTTSAvailable(),
|
||||
"espeak_ng": s.espeakAvailable(),
|
||||
"engine": "none",
|
||||
"default_voice": "zh-CN-XiaoxiaoNeural",
|
||||
"builtin_voices": len(BuiltinVoices),
|
||||
}
|
||||
|
||||
if s.edgeTTSAvailable() {
|
||||
status["engine"] = "edge-tts"
|
||||
} else if s.espeakAvailable() {
|
||||
status["engine"] = "espeak-ng"
|
||||
} else {
|
||||
status["engine"] = "fallback (silent WAV)"
|
||||
}
|
||||
|
||||
return status
|
||||
}
|
||||
Reference in New Issue
Block a user