package service

import (
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"

	"github.com/yourname/cyrene-ai/voice-service/internal/config"
)

// SupportedLanguages lists the language codes accepted by the STT service.
// "auto" is passed through to whisper unchanged, like every other entry.
var SupportedLanguages = []string{"zh", "en", "ja", "ko", "auto"}

// STTService is a speech-to-text service that shells out to a whisper.cpp
// command-line binary.
type STTService struct {
	whisperBinary string // path to the whisper CLI executable
	whisperModel  string // path to the model file passed via -m
	language      string // default language used when a request does not specify one
}

// NewSTTService builds an STTService from the whisper-related fields of cfg
// (WhisperBinary, WhisperModel and WhisperLanguage).
func NewSTTService(cfg *config.Config) *STTService {
	return &STTService{
		whisperBinary: cfg.WhisperBinary,
		whisperModel:  cfg.WhisperModel,
		language:      cfg.WhisperLanguage,
	}
}

// IsAvailable reports whether the configured whisper binary exists on disk.
// A path that resolves to a directory is treated as unavailable.
func (s *STTService) IsAvailable() bool {
	return pathIsFile(s.whisperBinary)
}

// Transcribe converts audio data to text.
//
// audioData is the raw content of an audio file. format names its encoding
// (wav, mp3, ogg, flac, m4a, ...) and is matched case-insensitively. language
// is one of SupportedLanguages; when empty, the service's default language is
// used.
//
// An error is returned when the whisper binary is missing, the language is not
// supported, audioData is empty, a temporary file cannot be written, whisper
// exits with a failure, or its output file cannot be read.
func (s *STTService) Transcribe(audioData []byte, format string, language string) (string, error) {
	if !s.IsAvailable() {
		return "", fmt.Errorf("STT 引擎未安装,请运行 scripts/setup-whisper.sh")
	}

	// Fall back to the configured default when the caller gives no language.
	if language == "" {
		language = s.language
	}
	// The language ends up on the whisper command line, so only values from
	// the allow-list are accepted.
	if !isSupportedLanguage(language) {
		return "", fmt.Errorf("不支持的语言: %s,支持的语言: %s", language, strings.Join(SupportedLanguages, ", "))
	}

	// Reject empty input up front instead of spawning external processes that
	// can only fail with a less helpful message.
	if len(audioData) == 0 {
		return "", fmt.Errorf("音频数据为空")
	}

	// Lowercase once so the WAV check below agrees with normalizeExt, which is
	// case-insensitive.
	format = strings.ToLower(format)

	tmpPath, err := writeTempAudio(audioData, normalizeExt(format))
	if err != nil {
		return "", err
	}
	defer os.Remove(tmpPath)

	// Non-WAV input is converted with ffmpeg when possible. If the conversion
	// fails the original file is handed to whisper as-is.
	inputPath := tmpPath
	if format != "wav" && format != "" {
		convertedPath := tmpPath + ".wav"
		// Registered before running ffmpeg so that a partially written output
		// from a failed conversion is cleaned up as well.
		defer os.Remove(convertedPath)
		if err := convertToWav(tmpPath, convertedPath); err == nil {
			inputPath = convertedPath
		}
	}

	// whisper is given the input path without its extension as the output
	// prefix (-of); with -otxt the transcript is expected at <prefix>.txt.
	outputPrefix := strings.TrimSuffix(inputPath, filepath.Ext(inputPath))
	outputTxt := outputPrefix + ".txt"
	// Remove the transcript file on every exit path, success or failure.
	defer os.Remove(outputTxt)

	cmd := exec.Command(s.whisperBinary,
		"-m", s.whisperModel,
		"-l", language,
		"-f", inputPath,
		"-otxt",
		"-of", outputPrefix,
	)
	cmd.Stderr = os.Stderr // surface whisper diagnostics in the service's own stderr
	if err := cmd.Run(); err != nil {
		return "", fmt.Errorf("whisper 转录失败: %w", err)
	}

	txtData, err := os.ReadFile(outputTxt)
	if err != nil {
		return "", fmt.Errorf("读取转录结果失败: %w", err)
	}
	return strings.TrimSpace(string(txtData)), nil
}

// GetStatus returns a snapshot of the service state.
//
// "binary_available" and "model_loaded" reflect only whether the respective
// file exists on disk; "available" is true when both do.
func (s *STTService) GetStatus() map[string]interface{} {
	binaryAvailable := s.IsAvailable()
	modelExists := pathIsFile(s.whisperModel)

	return map[string]interface{}{
		"available":           binaryAvailable && modelExists,
		"binary_available":    binaryAvailable,
		"model_loaded":        modelExists,
		"binary_path":         s.whisperBinary,
		"model_path":          s.whisperModel,
		"model_name":          filepath.Base(s.whisperModel),
		"default_language":    s.language,
		"supported_languages": SupportedLanguages,
	}
}

// pathIsFile reports whether path exists and is not a directory.
func pathIsFile(path string) bool {
	info, err := os.Stat(path)
	return err == nil && !info.IsDir()
}

// writeTempAudio stores audioData in a new temporary file whose name ends in
// ext and returns the file's path. The caller owns the returned file and must
// remove it; on error nothing is left behind.
func writeTempAudio(audioData []byte, ext string) (string, error) {
	// An empty dir selects the system default temporary directory.
	tmpFile, err := os.CreateTemp("", "cyrene-stt-*"+ext)
	if err != nil {
		return "", fmt.Errorf("创建临时文件失败: %w", err)
	}
	tmpPath := tmpFile.Name()

	if _, err := tmpFile.Write(audioData); err != nil {
		// Best-effort cleanup; the write error is the one worth reporting.
		_ = tmpFile.Close()
		_ = os.Remove(tmpPath)
		return "", fmt.Errorf("写入临时文件失败: %w", err)
	}
	// A failed Close can mean the data never reached the file, so it is
	// reported as a write failure.
	if err := tmpFile.Close(); err != nil {
		_ = os.Remove(tmpPath)
		return "", fmt.Errorf("写入临时文件失败: %w", err)
	}
	return tmpPath, nil
}

// normalizeExt maps an audio format name to a file extension including the
// leading dot. Matching is case-insensitive; unknown or empty formats map to
// ".wav".
func normalizeExt(format string) string {
	switch strings.ToLower(format) {
	case "wav":
		return ".wav"
	case "mp3", "mpeg":
		return ".mp3"
	case "ogg", "opus":
		return ".ogg"
	case "flac":
		return ".flac"
	case "m4a", "mp4", "aac":
		return ".m4a"
	default:
		return ".wav"
	}
}

// isSupportedLanguage reports whether lang is one of SupportedLanguages.
func isSupportedLanguage(lang string) bool {
	for _, l := range SupportedLanguages {
		if l == lang {
			return true
		}
	}
	return false
}

// convertToWav uses ffmpeg to transcode inputPath into a 16 kHz, mono,
// 16-bit PCM WAV file at outputPath, overwriting any existing file. ffmpeg's
// own output is discarded; only the exit status is reported.
func convertToWav(inputPath, outputPath string) error {
	cmd := exec.Command("ffmpeg",
		"-y", // global option: placed first so it is not parsed as a trailing option
		"-i", inputPath,
		"-ar", "16000",
		"-ac", "1",
		"-c:a", "pcm_s16le",
		outputPath,
	)
	return cmd.Run()
}