package service

import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"time"

	"git.yeij.top/AskaEth/Cyrene/voice-service/internal/config"
)

// SupportedLanguages lists the language codes accepted by the STT service.
var SupportedLanguages = []string{"zh", "en", "ja", "ko", "auto"}

// STTService is the speech-to-text service.
//
// It prefers the DashScope API and uses the local Whisper binary only when
// DashScope is not available.
type STTService struct {
	whisperBinary string        // path to the local Whisper executable
	whisperModel  string        // path to the Whisper model file
	language      string        // default language used when a request specifies none
	dashscope     *DashScopeSTT // realtime ASR (qwen3-asr-flash-realtime)
}

// NewSTTService builds an STTService from cfg.
func NewSTTService(cfg *config.Config) *STTService {
	// The realtime model serves every WebSocket ASR request (both one-shot and
	// streaming). The offline model (qwen3-asr-flash-2026-02-10) is an HTTP
	// REST API and is not implemented yet.
	model := cfg.DashScopeSTTRealtime
	if model == "" {
		model = cfg.DashScopeModel
	}
	return &STTService{
		whisperBinary: cfg.WhisperBinary,
		whisperModel:  cfg.WhisperModel,
		language:      cfg.WhisperLanguage,
		dashscope:     NewDashScopeSTT(cfg.DashScopeAPIKey, model),
	}
}

// IsAvailable reports whether at least one STT engine can be used: either
// DashScope reports itself available, or the Whisper binary exists on disk.
//
// NOTE(review): only the binary is checked here, whereas GetStatus also
// requires the model file for "whisper.available" — confirm whether the two
// should agree.
func (s *STTService) IsAvailable() bool {
	if s.dashscope.IsAvailable() {
		return true
	}
	_, err := os.Stat(s.whisperBinary)
	return err == nil
}

// Transcribe converts audioData (encoded as format) into text.
//
// An empty language falls back to the service default; a language outside
// SupportedLanguages is rejected with an error. When DashScope is available it
// is used exclusively: a DashScope failure is returned to the caller and does
// not trigger a Whisper fallback. Local Whisper is used only when DashScope is
// not available at all.
func (s *STTService) Transcribe(audioData []byte, format string, language string) (string, error) {
	if language == "" {
		language = s.language
	}
	if !isSupportedLanguage(language) {
		return "", fmt.Errorf("不支持的语言: %s,支持的语言: %s",
			language, strings.Join(SupportedLanguages, ", "))
	}

	// Prefer DashScope.
	if s.dashscope.IsAvailable() {
		ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer cancel()

		text, err := s.dashscope.Transcribe(ctx, audioData, format, language)
		if err != nil {
			// Surface the concrete DashScope error instead of falling back.
			return "", fmt.Errorf("语音识别失败: %w", err)
		}
		return text, nil
	}

	// DashScope is not configured: use local Whisper.
	return s.transcribeWhisper(audioData, format, language)
}

// StartStreaming opens a persistent streaming recognition session. It requires
// DashScope; an empty language falls back to the service default.
//
// NOTE(review): ctx is cancelled as soon as this method returns. That is only
// correct if DashScopeSTT.StartStreaming uses ctx for session setup and does
// not tie the session's lifetime to it — confirm against its implementation.
// NOTE(review): unlike Transcribe, language is not validated against
// SupportedLanguages here — confirm this is intentional.
func (s *STTService) StartStreaming(format, language string) (*StreamingSession, error) {
	if !s.dashscope.IsAvailable() {
		return nil, fmt.Errorf("流式识别需要 DashScope,请配置 DASHSCOPE_API_KEY")
	}
	if language == "" {
		language = s.language
	}
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	return s.dashscope.StartStreaming(ctx, format, language)
}

// transcribeWhisper transcribes audioData with the local Whisper binary.
//
// The audio is written to a temporary file, converted to WAV with ffmpeg when
// format is neither "wav" nor empty, and passed to Whisper with text output
// enabled. The resulting .txt file is read back and returned with surrounding
// whitespace trimmed. All temporary files are removed before returning.
func (s *STTService) transcribeWhisper(audioData []byte, format string, language string) (string, error) {
	if _, err := os.Stat(s.whisperBinary); err != nil {
		return "", fmt.Errorf("STT 引擎不可用: DashScope API Key 未配置且 Whisper 未安装")
	}

	// An empty dir makes CreateTemp use the OS default temp directory instead
	// of a hard-coded "/tmp".
	tmpFile, err := os.CreateTemp("", "cyrene-stt-*"+normalizeExt(format))
	if err != nil {
		return "", fmt.Errorf("创建临时文件失败: %w", err)
	}
	tmpPath := tmpFile.Name()
	defer os.Remove(tmpPath)

	if _, err := tmpFile.Write(audioData); err != nil {
		tmpFile.Close()
		return "", fmt.Errorf("写入临时文件失败: %w", err)
	}
	// A failed Close can mean the data never reached the disk, so treat it as
	// a write failure rather than ignoring it.
	if err := tmpFile.Close(); err != nil {
		return "", fmt.Errorf("写入临时文件失败: %w", err)
	}

	inputPath := tmpPath
	if format != "wav" && format != "" {
		convertedPath := tmpPath + ".wav"
		// Registered before the conversion so that a partial file left behind
		// by a failed ffmpeg run is cleaned up as well.
		defer os.Remove(convertedPath)
		// Best effort: if the conversion fails, Whisper is still given the
		// original file.
		if err := convertToWav(tmpPath, convertedPath); err == nil {
			inputPath = convertedPath
		}
	}

	// The prefix is passed to Whisper via -of; the transcript is then read
	// from <prefix>.txt.
	outputPrefix := strings.TrimSuffix(inputPath, filepath.Ext(inputPath))
	outputTxt := outputPrefix + ".txt"
	defer os.Remove(outputTxt)

	cmd := exec.Command(s.whisperBinary,
		"-m", s.whisperModel,
		"-l", language,
		"-f", inputPath,
		"-otxt",
		"-of", outputPrefix,
	)
	cmd.Stderr = os.Stderr
	if err := cmd.Run(); err != nil {
		return "", fmt.Errorf("whisper 转录失败: %w", err)
	}

	txtData, err := os.ReadFile(outputTxt)
	if err != nil {
		return "", fmt.Errorf("读取转录结果失败: %w", err)
	}
	return strings.TrimSpace(string(txtData)), nil
}

// GetStatus returns a status snapshot of the service: overall availability,
// the DashScope status, whether the Whisper binary and model files exist, the
// default language and the supported language list.
func (s *STTService) GetStatus() map[string]interface{} {
	_, binaryErr := os.Stat(s.whisperBinary)
	binaryAvailable := binaryErr == nil

	_, modelErr := os.Stat(s.whisperModel)
	modelExists := modelErr == nil

	return map[string]interface{}{
		"available": s.IsAvailable(),
		"primary":   "dashscope",
		"dashscope": s.dashscope.GetStatus(),
		"whisper": map[string]interface{}{
			"available":        binaryAvailable && modelExists,
			"binary_available": binaryAvailable,
			"model_loaded":     modelExists,
			"model_name":       filepath.Base(s.whisperModel),
		},
		"default_language":    s.language,
		"supported_languages": SupportedLanguages,
	}
}

// normalizeExt maps an audio format name (case-insensitive) to a file
// extension including the leading dot. Unknown or empty formats map to ".wav".
func normalizeExt(format string) string {
	switch strings.ToLower(format) {
	case "wav":
		return ".wav"
	case "mp3", "mpeg":
		return ".mp3"
	case "ogg", "opus":
		return ".ogg"
	case "flac":
		return ".flac"
	case "m4a", "mp4", "aac":
		return ".m4a"
	default:
		return ".wav"
	}
}

// isSupportedLanguage reports whether lang is listed in SupportedLanguages.
// The comparison is exact (case-sensitive).
func isSupportedLanguage(lang string) bool {
	for _, l := range SupportedLanguages {
		if l == lang {
			return true
		}
	}
	return false
}

// convertToWav uses ffmpeg to convert inputPath into a 16 kHz, mono,
// 16-bit PCM WAV file at outputPath, overwriting any existing file.
// ffmpeg's output is discarded (Stdout and Stderr are left nil).
func convertToWav(inputPath, outputPath string) error {
	cmd := exec.Command("ffmpeg",
		"-y", // global option; specified first as the ffmpeg docs recommend
		"-i", inputPath,
		"-ar", "16000",
		"-ac", "1",
		"-c:a", "pcm_s16le",
		outputPath,
	)
	return cmd.Run()
}