feat: 语音流式输入管线 + VAD前端集成 + 插件-工具合并清理

- 前端: VAD语音检测(@ricky0123/vad-web) + useVoiceInput双模式(流式WS/REST)
- Gateway: VoiceStreamManager代理WS流式STT到voice-service
- Voice-service: DashScope REST → Realtime WS → Whisper三级引擎 + ffmpeg转码
- 共享模块: pkg/audio(音频转换) + pkg/dashscope(ASR REST客户端)
- 清理: 移除旧plugin-manager和pkg/plugins,完成插件→工具合并
- 文档: 完善gateway-api.md和voice-service.md语音API文档
- 工具: scripts/voice/ 语音转换脚本集

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-06-06 11:50:40 +08:00
parent 258cf81b25
commit 6ef9e082a6
91 changed files with 4091 additions and 3929 deletions
@@ -0,0 +1,52 @@
package service
import (
"context"
"fmt"
"git.yeij.top/AskaEth/Cyrene/pkg/dashscope"
)
// DashScopeRESTSTT performs offline speech recognition through the DashScope REST API.
// The offline model (qwen3-asr-flash-2026-02-10) is transcribed via an HTTP REST endpoint,
// with no session negotiation and no server-side VAD, which gives lower latency and
// suits non-real-time scenarios.
type DashScopeRESTSTT struct {
// model is the ASR model name handed to the REST client on every Transcribe call.
model string
// client is the DashScope REST client, created from the API key, that carries out the requests.
client *dashscope.RESTClient
}
// NewDashScopeRESTSTT builds a DashScope REST STT client for the given API key.
// An empty model selects the default offline model "qwen3-asr-flash-2026-02-10".
func NewDashScopeRESTSTT(apiKey, model string) *DashScopeRESTSTT {
	stt := &DashScopeRESTSTT{
		model:  "qwen3-asr-flash-2026-02-10",
		client: dashscope.NewRESTClient(apiKey),
	}
	if model != "" {
		stt.model = model
	}
	return stt
}
// IsAvailable reports whether this engine can be used. The decision is
// delegated to the underlying REST client (i.e. whether an API key has been
// configured).
//
// A nil receiver, or a value that has no client because it was not created
// through NewDashScopeRESTSTT, reports false instead of panicking, so callers
// that hold an optional *DashScopeRESTSTT can probe it safely.
func (d *DashScopeRESTSTT) IsAvailable() bool {
	if d == nil || d.client == nil {
		return false
	}
	return d.client.IsAvailable()
}
// Model returns the name of the ASR model this client was configured with.
func (d *DashScopeRESTSTT) Model() string {
	name := d.model
	return name
}
// Transcribe runs offline speech recognition on audioData through the
// DashScope REST API using the configured model.
//
// It returns an error without contacting the API when IsAvailable is false.
// Otherwise the text and error produced by the REST client are returned as-is.
func (d *DashScopeRESTSTT) Transcribe(ctx context.Context, audioData []byte, format, language string) (string, error) {
	if ready := d.IsAvailable(); !ready {
		return "", fmt.Errorf("DashScope REST ASR API key 未配置")
	}
	// 16000 is presumably the sample rate in Hz expected by the REST client —
	// TODO confirm against dashscope.RESTClient.Transcribe.
	text, err := d.client.Transcribe(ctx, d.model, audioData, format, 16000, language)
	return text, err
}
// GetStatus returns a snapshot of the REST STT client's state with three keys:
// "available" (result of IsAvailable), "model" (configured model name) and
// "protocol" (always "rest").
func (d *DashScopeRESTSTT) GetStatus() map[string]interface{} {
	status := make(map[string]interface{}, 3)
	status["available"] = d.IsAvailable()
	status["model"] = d.model
	status["protocol"] = "rest"
	return status
}
@@ -0,0 +1,53 @@
package service
import (
"context"
"testing"
)
// TestDashScopeRESTSTT_Transcribe_Success checks that a client built with an
// API key and an explicit model reports itself available and echoes that model.
// NOTE(review): despite the name, this test never calls Transcribe.
func TestDashScopeRESTSTT_Transcribe_Success(t *testing.T) {
	const wantModel = "qwen3-asr-flash-2026-02-10"

	stt := NewDashScopeRESTSTT("test-key", wantModel)
	if available := stt.IsAvailable(); !available {
		t.Error("client should be available when apiKey is set")
	}
	if got := stt.Model(); got != wantModel {
		t.Errorf("unexpected model: %s", got)
	}
}
// TestDashScopeRESTSTT_NotAvailable checks that a client created with an empty
// API key reports itself as unavailable.
func TestDashScopeRESTSTT_NotAvailable(t *testing.T) {
	stt := NewDashScopeRESTSTT("", "")
	if !stt.IsAvailable() {
		return
	}
	t.Error("client should not be available without apiKey")
}
// TestDashScopeRESTSTT_DefaultModel checks that an empty model argument falls
// back to the default offline model name.
func TestDashScopeRESTSTT_DefaultModel(t *testing.T) {
	const defaultModel = "qwen3-asr-flash-2026-02-10"

	got := NewDashScopeRESTSTT("key", "").Model()
	if got != defaultModel {
		t.Errorf("expected default model, got %s", got)
	}
}
// TestDashScopeRESTSTT_Transcribe_NoAPIKey checks that Transcribe returns an
// error when the client was created without an API key.
func TestDashScopeRESTSTT_Transcribe_NoAPIKey(t *testing.T) {
	stt := NewDashScopeRESTSTT("", "")
	if _, err := stt.Transcribe(context.Background(), []byte{}, "wav", "zh"); err == nil {
		t.Error("expected error when API key is not configured")
	}
}
// TestDashScopeRESTSTT_GetStatus checks the three entries of the status map
// ("available", "model", "protocol") for a client built with a key and a
// custom model name.
func TestDashScopeRESTSTT_GetStatus(t *testing.T) {
	status := NewDashScopeRESTSTT("key", "test-model").GetStatus()

	if available := status["available"]; available != true {
		t.Error("status should be available")
	}
	if model := status["model"]; model != "test-model" {
		t.Errorf("unexpected model in status: %v", model)
	}
	if protocol := status["protocol"]; protocol != "rest" {
		t.Errorf("unexpected protocol: %v", protocol)
	}
}
@@ -5,12 +5,10 @@ import (
"encoding/base64"
"encoding/json"
"fmt"
"os"
"os/exec"
"strings"
"sync"
"time"
"git.yeij.top/AskaEth/Cyrene/pkg/audio"
"github.com/gorilla/websocket"
)
@@ -145,7 +143,7 @@ func (d *DashScopeSTT) Transcribe(ctx context.Context, audioData []byte, format
}
// 4. 规范化音频格式并发送
pcmData, err := convertToPCM16(audioData, format)
pcmData, err := audio.ConvertToPCM16(audioData, format)
if err != nil {
return "", fmt.Errorf("音频格式转换失败: %w", err)
}
@@ -447,74 +445,3 @@ func (d *DashScopeSTT) GetStatus() map[string]interface{} {
"provider": "dashscope",
}
}
// normalizeSTTFormat canonicalises an audio format string. A format that
// matches one of the known names case-insensitively is returned in lower case;
// anything else is returned exactly as given.
func normalizeSTTFormat(format string) string {
	lowered := strings.ToLower(format)
	known := [...]string{"pcm", "wav", "mp3", "mpeg", "ogg", "opus", "flac", "m4a", "mp4", "aac", "webm"}
	for _, name := range known {
		if lowered == name {
			return lowered
		}
	}
	return format
}
// convertToPCM16 converts audio data to raw PCM intended to be 16-bit,
// 16000 Hz, mono.
//
//   - "pcm" input is returned unchanged.
//   - "wav" input has its container stripped: the RIFF chunk list is walked and
//     the contents of the "data" chunk are returned, so headers that are not
//     exactly 44 bytes (extended fmt chunk, LIST/fact chunks) and chunks that
//     follow the samples no longer end up in the PCM stream.
//   - every other format is handed to transcodeToPCM.
//
// NOTE(review): the WAV path does not resample; it assumes the file already
// holds 16-bit 16 kHz mono samples — confirm with callers.
//
// The returned slice may alias data. The error is non-nil only when
// transcoding fails.
func convertToPCM16(data []byte, format string) ([]byte, error) {
	switch normFormat := normalizeSTTFormat(format); normFormat {
	case "pcm":
		return data, nil
	case "wav":
		return wavPCMPayload(data), nil
	default:
		return transcodeToPCM(data, normFormat)
	}
}

// wavPCMPayload returns the sample bytes of a WAV buffer. When the buffer is
// not a well-formed RIFF/WAVE stream with a "data" chunk it falls back to
// dropping a canonical 44-byte header (or returning the input untouched when
// it is 44 bytes or shorter).
func wavPCMPayload(data []byte) []byte {
	const (
		riffHeaderLen   = 12 // "RIFF" + 4-byte size + "WAVE"
		chunkHeaderLen  = 8  // 4-byte chunk id + 4-byte little-endian size
		canonicalHeader = 44 // header length of a minimal PCM WAV file
	)

	if len(data) >= riffHeaderLen && string(data[:4]) == "RIFF" && string(data[8:12]) == "WAVE" {
		for off := riffHeaderLen; off+chunkHeaderLen <= len(data); {
			id := string(data[off : off+4])
			size := int64(data[off+4]) | int64(data[off+5])<<8 | int64(data[off+6])<<16 | int64(data[off+7])<<24
			body := off + chunkHeaderLen
			remaining := int64(len(data) - body)

			if id == "data" {
				// Some writers leave a placeholder size (0 or an oversized
				// value) when they cannot seek back; take everything that is left.
				if size == 0 || size > remaining {
					size = remaining
				}
				return data[body : body+int(size)]
			}
			if size > remaining {
				break // truncated or corrupt chunk list
			}
			// Chunk bodies are padded to an even number of bytes.
			off = body + int(size+(size&1))
		}
	}

	if len(data) > canonicalHeader {
		return data[canonicalHeader:]
	}
	return data
}
// transcodeToPCM uses the ffmpeg executable to transcode audio data into raw
// PCM: signed 16-bit little-endian, 16000 Hz, mono.
//
// data is written to a temporary input file whose extension is format, ffmpeg
// writes to a temporary output file, and both files are removed before the
// function returns. format must not contain a path separator, otherwise the
// temporary file cannot be created and an error is returned.
//
// On failure the returned slice is nil and the error carries ffmpeg's own
// diagnostic output, when there is any, so the cause is visible to the caller.
func transcodeToPCM(data []byte, format string) ([]byte, error) {
	inFile, err := os.CreateTemp("", "cyrene-asr-in-*."+format)
	if err != nil {
		return nil, fmt.Errorf("创建输入临时文件失败: %w", err)
	}
	inPath := inFile.Name()
	defer os.Remove(inPath)

	if _, err := inFile.Write(data); err != nil {
		inFile.Close() // already failing; the write error is the one worth reporting
		return nil, fmt.Errorf("写入输入临时文件失败: %w", err)
	}
	// A failed Close can mean the data never reached the disk.
	if err := inFile.Close(); err != nil {
		return nil, fmt.Errorf("写入输入临时文件失败: %w", err)
	}

	outFile, err := os.CreateTemp("", "cyrene-asr-out-*.pcm")
	if err != nil {
		return nil, fmt.Errorf("创建输出临时文件失败: %w", err)
	}
	outPath := outFile.Name()
	outFile.Close()
	defer os.Remove(outPath)

	cmd := exec.Command("ffmpeg",
		// Global options go first: never read stdin, keep output down to
		// errors only, and overwrite the pre-created output file.
		"-nostdin",
		"-hide_banner",
		"-loglevel", "error",
		"-y",
		"-i", inPath,
		"-ar", "16000",
		"-ac", "1",
		"-c:a", "pcm_s16le",
		"-f", "s16le",
		outPath,
	)
	if output, err := cmd.CombinedOutput(); err != nil {
		if detail := strings.TrimSpace(string(output)); detail != "" {
			return nil, fmt.Errorf("音频转码失败 (ffmpeg): %w: %s", err, detail)
		}
		return nil, fmt.Errorf("音频转码失败 (ffmpeg): %w", err)
	}

	outData, err := os.ReadFile(outPath)
	if err != nil {
		return nil, fmt.Errorf("读取转码结果失败: %w", err)
	}
	return outData, nil
}
@@ -16,41 +16,59 @@ import (
var SupportedLanguages = []string{"zh", "en", "ja", "ko", "auto"}
// STTService 语音转文字服务。
// 优先使用 DashScope API,不可用时回退到本地 Whisper。
// 离线转录优先使用 DashScope REST API,失败回退 Whisper。
// 流式转录使用 DashScope Realtime WS。
type STTService struct {
whisperBinary string
whisperModel string
language string
dashscope *DashScopeSTT // 实时 ASR (qwen3-asr-flash-realtime)
whisperBinary string
whisperModel string
language string
dashscope *DashScopeSTT // 实时 ASR (qwen3-asr-flash-realtime)
dashscopeREST *DashScopeRESTSTT // 离线 ASR (qwen3-asr-flash-2026-02-10)
}
// NewSTTService 创建 STT 服务。
func NewSTTService(cfg *config.Config) *STTService {
// 实时模型用于所有 WebSocket ASR 请求(支持 one-shot 和 streaming)
// 离线模型 (qwen3-asr-flash-2026-02-10) 是 HTTP REST API,暂未实现
model := cfg.DashScopeSTTRealtime
if model == "" {
model = cfg.DashScopeModel
realtimeModel := cfg.DashScopeSTTRealtime
if realtimeModel == "" {
realtimeModel = "qwen3-asr-flash-realtime"
}
offlineModel := cfg.DashScopeModel
if offlineModel == "" {
offlineModel = "qwen3-asr-flash-2026-02-10"
}
return &STTService{
whisperBinary: cfg.WhisperBinary,
whisperModel: cfg.WhisperModel,
language: cfg.WhisperLanguage,
dashscope: NewDashScopeSTT(cfg.DashScopeAPIKey, model),
dashscope: NewDashScopeSTT(cfg.DashScopeAPIKey, realtimeModel),
dashscopeREST: NewDashScopeRESTSTT(cfg.DashScopeAPIKey, offlineModel),
}
}
// IsAvailable 检查是否有任一 STT 引擎可用。
func (s *STTService) IsAvailable() bool {
if s.dashscope.IsAvailable() {
if s.dashscopeREST.IsAvailable() || s.dashscope.IsAvailable() {
return true
}
_, err := os.Stat(s.whisperBinary)
return err == nil
return s.whisperAvailable()
}
// whisperAvailable reports whether the local Whisper engine can actually run:
// the Whisper binary and the model file must both exist on disk, and an
// "ffmpeg" executable must be found on PATH.
func (s *STTService) whisperAvailable() bool {
	for _, path := range []string{s.whisperBinary, s.whisperModel} {
		if _, err := os.Stat(path); err != nil {
			return false
		}
	}
	_, err := exec.LookPath("ffmpeg")
	return err == nil
}
// Transcribe 将音频数据转录为文字。
// 优先使用 DashScope,不可用时回退到本地 Whisper。
// 优先使用 DashScope REST 离线模型,失败回退到本地 Whisper。
func (s *STTService) Transcribe(audioData []byte, format string, language string) (string, error) {
if language == "" {
language = s.language
@@ -59,16 +77,15 @@ func (s *STTService) Transcribe(audioData []byte, format string, language string
return "", fmt.Errorf("不支持的语言: %s,支持的语言: %s", language, strings.Join(SupportedLanguages, ", "))
}
// 优先 DashScope
if s.dashscope.IsAvailable() {
// 优先 DashScope REST 离线模型(低延迟,无需 session 协商)
if s.dashscopeREST.IsAvailable() {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
text, err := s.dashscope.Transcribe(ctx, audioData, format, language)
text, err := s.dashscopeREST.Transcribe(ctx, audioData, format, language)
if err == nil {
return text, nil
}
// DashScope 失败,返回具体错误而不是回退 Whisper
return "", fmt.Errorf("语音识别失败: %w", err)
fmt.Printf("[stt] DashScope REST 失败,回退 Whisper: %v\n", err)
}
// 回退到本地 Whisper
@@ -152,15 +169,21 @@ func (s *STTService) GetStatus() map[string]interface{} {
if _, err := os.Stat(s.whisperModel); err == nil {
modelExists = true
}
ffmpegAvailable := false
if _, err := exec.LookPath("ffmpeg"); err == nil {
ffmpegAvailable = true
}
return map[string]interface{}{
"available": s.IsAvailable(),
"primary": "dashscope",
"dashscope": s.dashscope.GetStatus(),
"available": s.IsAvailable(),
"primary": "dashscope_rest",
"dashscope_rest": s.dashscopeREST.GetStatus(),
"dashscope_ws": s.dashscope.GetStatus(),
"whisper": map[string]interface{}{
"available": binaryAvailable && modelExists,
"available": s.whisperAvailable(),
"binary_available": binaryAvailable,
"model_loaded": modelExists,
"ffmpeg_available": ffmpegAvailable,
"model_name": filepath.Base(s.whisperModel),
},
"default_language": s.language,