feat: 语音流式输入管线 + VAD前端集成 + 插件-工具合并清理
- 前端: VAD语音检测(@ricky0123/vad-web) + useVoiceInput双模式(流式WS/REST) - Gateway: VoiceStreamManager代理WS流式STT到voice-service - Voice-service: DashScope REST → Realtime WS → Whisper三级引擎 + ffmpeg转码 - 共享模块: pkg/audio(音频转换) + pkg/dashscope(ASR REST客户端) - 清理: 移除旧plugin-manager和pkg/plugins,完成插件→工具合并 - 文档: 完善gateway-api.md和voice-service.md语音API文档 - 工具: scripts/voice/ 语音转换脚本集 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -19,7 +19,7 @@ RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /voice-service ./cmd/m
|
||||
# ========== 运行阶段 ==========
|
||||
FROM alpine:3.21
|
||||
|
||||
RUN apk add --no-cache ca-certificates tzdata && \
|
||||
RUN apk add --no-cache ca-certificates tzdata ffmpeg && \
|
||||
cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
|
||||
echo "Asia/Shanghai" > /etc/timezone
|
||||
|
||||
|
||||
@@ -0,0 +1,224 @@
|
||||
package main
|
||||
|
||||
import (
	"flag"
	"fmt"
	"io"
	"mime/multipart"
	"net/http"
	"os"
	"os/signal"
	"path/filepath"
	"strings"
	"time"

	"github.com/gorilla/websocket"
)
|
||||
|
||||
func main() {
|
||||
mode := flag.String("mode", "offline", "测试模式: offline (非实时) 或 realtime (实时)")
|
||||
file := flag.String("file", "", "音频文件路径 (WAV/MP3/OGG/FLAC)")
|
||||
server := flag.String("server", "http://localhost:8093", "Voice-Service 地址")
|
||||
lang := flag.String("lang", "zh", "语言代码")
|
||||
flag.Parse()
|
||||
|
||||
if *file == "" {
|
||||
fmt.Println("用法: test_asr -mode=offline -file=audio.wav [-server=http://localhost:8093]")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
switch *mode {
|
||||
case "offline":
|
||||
testOffline(*server, *file, *lang)
|
||||
case "realtime":
|
||||
testRealtime(*server, *file, *lang)
|
||||
default:
|
||||
fmt.Printf("未知模式: %s (支持: offline, realtime)\n", *mode)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
// testOffline 测试非实时语音识别 (HTTP multipart 上传)。
|
||||
func testOffline(server, filePath, lang string) {
|
||||
fmt.Printf("=== 非实时 ASR 测试 ===\n")
|
||||
fmt.Printf("服务器: %s\n", server)
|
||||
fmt.Printf("文件: %s\n", filePath)
|
||||
fmt.Printf("语言: %s\n\n", lang)
|
||||
|
||||
// 读取音频文件
|
||||
audioData, err := os.ReadFile(filePath)
|
||||
if err != nil {
|
||||
fmt.Printf("读取文件失败: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
fmt.Printf("音频大小: %d bytes\n", len(audioData))
|
||||
|
||||
// 创建 multipart 请求
|
||||
req, err := http.NewRequest("POST", server+"/api/v1/transcribe", nil)
|
||||
if err != nil {
|
||||
fmt.Printf("创建请求失败: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// 使用 multipart form
|
||||
body, contentType, err := createMultipartBody(audioData, filePath, lang)
|
||||
if err != nil {
|
||||
fmt.Printf("创建 multipart body 失败: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
req.Body = body
|
||||
req.Header.Set("Content-Type", contentType)
|
||||
|
||||
start := time.Now()
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
fmt.Printf("请求失败: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
elapsed := time.Since(start)
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
|
||||
fmt.Printf("状态码: %d\n", resp.StatusCode)
|
||||
fmt.Printf("耗时: %v\n", elapsed)
|
||||
fmt.Printf("响应:\n%s\n", string(respBody))
|
||||
|
||||
if resp.StatusCode == 200 {
|
||||
fmt.Println("\n✅ 非实时语音识别成功!")
|
||||
} else {
|
||||
fmt.Println("\n❌ 非实时语音识别失败")
|
||||
}
|
||||
}
|
||||
|
||||
// testRealtime 测试实时语音识别 (WebSocket 流式)。
|
||||
func testRealtime(server, filePath, lang string) {
|
||||
fmt.Printf("=== 实时 ASR 测试 ===\n")
|
||||
fmt.Printf("服务器: %s\n", server)
|
||||
fmt.Printf("文件: %s\n", filePath)
|
||||
fmt.Printf("语言: %s\n\n", lang)
|
||||
|
||||
// 读取音频文件
|
||||
audioData, err := os.ReadFile(filePath)
|
||||
if err != nil {
|
||||
fmt.Printf("读取文件失败: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
fmt.Printf("音频大小: %d bytes\n", len(audioData))
|
||||
|
||||
// 推断格式
|
||||
format := inferFormat(filePath)
|
||||
|
||||
// 连接 WebSocket
|
||||
wsURL := fmt.Sprintf("ws://%s/api/v1/stt/stream?format=%s&language=%s",
|
||||
server[7:], format, lang) // 去掉 http:// 前缀
|
||||
conn, _, err := websocket.DefaultDialer.Dial(wsURL, nil)
|
||||
if err != nil {
|
||||
fmt.Printf("WebSocket 连接失败: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer conn.Close()
|
||||
|
||||
fmt.Printf("WebSocket 已连接: %s\n", wsURL)
|
||||
|
||||
// 设置 interrupt 处理
|
||||
interrupt := make(chan os.Signal, 1)
|
||||
signal.Notify(interrupt, os.Interrupt)
|
||||
|
||||
// goroutine: 读取识别结果
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
defer close(done)
|
||||
for {
|
||||
_, msg, err := conn.ReadMessage()
|
||||
if err != nil {
|
||||
fmt.Printf("读取结果错误: %v\n", err)
|
||||
return
|
||||
}
|
||||
fmt.Printf("◀ 结果: %s\n", string(msg))
|
||||
}
|
||||
}()
|
||||
|
||||
// 模拟实时流式发送音频(每 100ms 发送 3200 bytes)
|
||||
chunkSize := 3200
|
||||
totalSent := 0
|
||||
start := time.Now()
|
||||
var elapsed time.Duration
|
||||
|
||||
cancelled := false
|
||||
for i := 0; i < len(audioData); i += chunkSize {
|
||||
end := i + chunkSize
|
||||
if end > len(audioData) {
|
||||
end = len(audioData)
|
||||
}
|
||||
|
||||
select {
|
||||
case <-interrupt:
|
||||
fmt.Println("\n用户中断")
|
||||
cancelled = true
|
||||
default:
|
||||
}
|
||||
if cancelled {
|
||||
break
|
||||
}
|
||||
|
||||
if err := conn.WriteMessage(websocket.BinaryMessage, audioData[i:end]); err != nil {
|
||||
fmt.Printf("发送音频失败: %v\n", err)
|
||||
break
|
||||
}
|
||||
totalSent += end - i
|
||||
fmt.Printf("▶ 发送 %d/%d bytes (%.1f%%)\n", totalSent, len(audioData),
|
||||
float64(totalSent)/float64(len(audioData))*100)
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
}
|
||||
|
||||
elapsed = time.Since(start)
|
||||
|
||||
// 发送停止消息
|
||||
conn.WriteMessage(websocket.TextMessage, []byte(`{"action":"stop"}`))
|
||||
|
||||
// 等待最后的结果
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
fmt.Printf("\n总耗时: %v, 总发送: %d bytes\n", elapsed, totalSent)
|
||||
fmt.Println("✅ 实时语音识别测试完成")
|
||||
}
|
||||
|
||||
// inferFormat maps a file name to the audio format label sent to the service.
//
// The extension of the final path element is compared case-insensitively, so
// "clip.WAV" and "clip.wav" both yield "wav". Aliases are folded onto one
// label (wave→wav, mpeg→mp3, opus→ogg, mp4/aac→m4a). A missing or unknown
// extension yields "pcm".
func inferFormat(filename string) string {
	ext := strings.ToLower(strings.TrimPrefix(filepath.Ext(filename), "."))
	switch ext {
	case "wav", "wave":
		return "wav"
	case "mp3", "mpeg":
		return "mp3"
	case "ogg", "opus":
		return "ogg"
	case "flac":
		return "flac"
	case "m4a", "mp4", "aac":
		return "m4a"
	default:
		return "pcm"
	}
}
|
||||
|
||||
// createMultipartBody returns a streaming multipart/form-data body holding
// the audio as file field "audio" (sent under the given filename) followed by
// the text field "language", together with the matching Content-Type header
// value (which carries the boundary).
//
// The body is produced by a goroutine writing into a pipe, so the audio is
// not copied into a second buffer. Any write failure is propagated to the
// reader side of the pipe instead of being dropped. The returned error is
// always nil; it is kept for interface compatibility.
func createMultipartBody(audioData []byte, filename, lang string) (io.ReadCloser, string, error) {
	pr, pw := io.Pipe()
	mw := multipart.NewWriter(pw)

	go func() {
		// CloseWithError(nil) behaves like Close: the reader sees io.EOF.
		pw.CloseWithError(writeASRForm(mw, audioData, filename, lang))
	}()

	return pr, mw.FormDataContentType(), nil
}

// writeASRForm writes the "audio" file part and the "language" field to mw
// and finishes the form with the closing boundary.
func writeASRForm(mw *multipart.Writer, audioData []byte, filename, lang string) error {
	// CreateFormFile escapes quotes and backslashes in the filename and sets
	// Content-Type: application/octet-stream on the part.
	part, err := mw.CreateFormFile("audio", filename)
	if err != nil {
		return err
	}
	if _, err := part.Write(audioData); err != nil {
		return err
	}
	if err := mw.WriteField("language", lang); err != nil {
		return err
	}
	return mw.Close()
}
|
||||
@@ -2,9 +2,15 @@ module git.yeij.top/AskaEth/Cyrene/voice-service
|
||||
|
||||
go 1.26.2
|
||||
|
||||
replace git.yeij.top/AskaEth/Cyrene/pkg/logger => ../pkg/logger
|
||||
replace (
|
||||
git.yeij.top/AskaEth/Cyrene/pkg/logger => ../pkg/logger
|
||||
git.yeij.top/AskaEth/Cyrene/pkg/audio => ../pkg/audio
|
||||
git.yeij.top/AskaEth/Cyrene/pkg/dashscope => ../pkg/dashscope
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/gorilla/websocket v1.5.3
|
||||
git.yeij.top/AskaEth/Cyrene/pkg/audio v0.0.0
|
||||
git.yeij.top/AskaEth/Cyrene/pkg/dashscope v0.0.0
|
||||
git.yeij.top/AskaEth/Cyrene/pkg/logger v0.0.0
|
||||
)
|
||||
|
||||
@@ -0,0 +1,52 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"git.yeij.top/AskaEth/Cyrene/pkg/dashscope"
|
||||
)
|
||||
|
||||
// DashScopeRESTSTT performs offline speech recognition through the DashScope REST API.
// The offline model (qwen3-asr-flash-2026-02-10) transcribes via an HTTP REST endpoint
// and needs no session negotiation or server-side VAD; per the original author's note
// this gives lower latency and suits non-realtime use.
type DashScopeRESTSTT struct {
	model  string                // model name handed to the REST client on every Transcribe call
	client *dashscope.RESTClient // underlying DashScope REST client
}
|
||||
|
||||
// NewDashScopeRESTSTT 创建 DashScope REST STT 客户端。
|
||||
func NewDashScopeRESTSTT(apiKey, model string) *DashScopeRESTSTT {
|
||||
if model == "" {
|
||||
model = "qwen3-asr-flash-2026-02-10"
|
||||
}
|
||||
return &DashScopeRESTSTT{
|
||||
model: model,
|
||||
client: dashscope.NewRESTClient(apiKey),
|
||||
}
|
||||
}
|
||||
|
||||
// IsAvailable 检查 API Key 是否已配置。
|
||||
func (d *DashScopeRESTSTT) IsAvailable() bool {
|
||||
return d.client.IsAvailable()
|
||||
}
|
||||
|
||||
// Model 返回模型名。
|
||||
func (d *DashScopeRESTSTT) Model() string { return d.model }
|
||||
|
||||
// Transcribe 使用 DashScope REST API 进行离线语音识别。
|
||||
func (d *DashScopeRESTSTT) Transcribe(ctx context.Context, audioData []byte, format, language string) (string, error) {
|
||||
if !d.IsAvailable() {
|
||||
return "", fmt.Errorf("DashScope REST ASR API key 未配置")
|
||||
}
|
||||
return d.client.Transcribe(ctx, d.model, audioData, format, 16000, language)
|
||||
}
|
||||
|
||||
// GetStatus 返回 REST STT 客户端的运行状态。
|
||||
func (d *DashScopeRESTSTT) GetStatus() map[string]interface{} {
|
||||
return map[string]interface{}{
|
||||
"available": d.IsAvailable(),
|
||||
"model": d.model,
|
||||
"protocol": "rest",
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,53 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestDashScopeRESTSTT_Transcribe_Success(t *testing.T) {
|
||||
client := NewDashScopeRESTSTT("test-key", "qwen3-asr-flash-2026-02-10")
|
||||
|
||||
if !client.IsAvailable() {
|
||||
t.Error("client should be available when apiKey is set")
|
||||
}
|
||||
if client.Model() != "qwen3-asr-flash-2026-02-10" {
|
||||
t.Errorf("unexpected model: %s", client.Model())
|
||||
}
|
||||
}
|
||||
|
||||
func TestDashScopeRESTSTT_NotAvailable(t *testing.T) {
|
||||
client := NewDashScopeRESTSTT("", "")
|
||||
if client.IsAvailable() {
|
||||
t.Error("client should not be available without apiKey")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDashScopeRESTSTT_DefaultModel(t *testing.T) {
|
||||
client := NewDashScopeRESTSTT("key", "")
|
||||
if client.Model() != "qwen3-asr-flash-2026-02-10" {
|
||||
t.Errorf("expected default model, got %s", client.Model())
|
||||
}
|
||||
}
|
||||
|
||||
func TestDashScopeRESTSTT_Transcribe_NoAPIKey(t *testing.T) {
|
||||
client := NewDashScopeRESTSTT("", "")
|
||||
_, err := client.Transcribe(context.Background(), []byte{}, "wav", "zh")
|
||||
if err == nil {
|
||||
t.Error("expected error when API key is not configured")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDashScopeRESTSTT_GetStatus(t *testing.T) {
|
||||
client := NewDashScopeRESTSTT("key", "test-model")
|
||||
status := client.GetStatus()
|
||||
if status["available"] != true {
|
||||
t.Error("status should be available")
|
||||
}
|
||||
if status["model"] != "test-model" {
|
||||
t.Errorf("unexpected model in status: %v", status["model"])
|
||||
}
|
||||
if status["protocol"] != "rest" {
|
||||
t.Errorf("unexpected protocol: %v", status["protocol"])
|
||||
}
|
||||
}
|
||||
@@ -5,12 +5,10 @@ import (
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"git.yeij.top/AskaEth/Cyrene/pkg/audio"
|
||||
"github.com/gorilla/websocket"
|
||||
)
|
||||
|
||||
@@ -145,7 +143,7 @@ func (d *DashScopeSTT) Transcribe(ctx context.Context, audioData []byte, format
|
||||
}
|
||||
|
||||
// 4. 规范化音频格式并发送
|
||||
pcmData, err := convertToPCM16(audioData, format)
|
||||
pcmData, err := audio.ConvertToPCM16(audioData, format)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("音频格式转换失败: %w", err)
|
||||
}
|
||||
@@ -447,74 +445,3 @@ func (d *DashScopeSTT) GetStatus() map[string]interface{} {
|
||||
"provider": "dashscope",
|
||||
}
|
||||
}
|
||||
|
||||
// normalizeSTTFormat lower-cases format when it names one of the recognised
// audio formats; any other value is returned exactly as given.
func normalizeSTTFormat(format string) string {
	lowered := strings.ToLower(format)
	for _, known := range []string{
		"pcm", "wav", "mp3", "mpeg", "ogg", "opus", "flac", "m4a", "mp4", "aac", "webm",
	} {
		if lowered == known {
			return lowered
		}
	}
	return format
}
|
||||
|
||||
// convertToPCM16 将音频数据转换为 16-bit PCM 16000Hz mono。
|
||||
func convertToPCM16(data []byte, format string) ([]byte, error) {
|
||||
normFormat := normalizeSTTFormat(format)
|
||||
switch normFormat {
|
||||
case "pcm":
|
||||
return data, nil
|
||||
case "wav":
|
||||
if len(data) > 44 {
|
||||
return data[44:], nil
|
||||
}
|
||||
return data, nil
|
||||
default:
|
||||
return transcodeToPCM(data, normFormat)
|
||||
}
|
||||
}
|
||||
|
||||
// transcodeToPCM converts audio data to raw signed 16-bit little-endian PCM,
// 16000 Hz, mono, by running the external ffmpeg binary.
//
// The input is written to a temporary file whose suffix is the format name
// and ffmpeg writes to a second temporary file; both are removed before the
// function returns. When ffmpeg fails, its diagnostic output is appended to
// the returned error.
func transcodeToPCM(data []byte, format string) ([]byte, error) {
	inFile, err := os.CreateTemp(os.TempDir(), "cyrene-asr-in-*."+format)
	if err != nil {
		return nil, fmt.Errorf("创建输入临时文件失败: %w", err)
	}
	inPath := inFile.Name()
	defer os.Remove(inPath)

	// Close unconditionally, then report whichever step failed first; a
	// failed Close can mean the data never reached the disk.
	_, writeErr := inFile.Write(data)
	closeErr := inFile.Close()
	if writeErr != nil {
		return nil, fmt.Errorf("写入输入临时文件失败: %w", writeErr)
	}
	if closeErr != nil {
		return nil, fmt.Errorf("写入输入临时文件失败: %w", closeErr)
	}

	// Reserve a unique output path; ffmpeg overwrites the empty file (-y).
	outFile, err := os.CreateTemp(os.TempDir(), "cyrene-asr-out-*.pcm")
	if err != nil {
		return nil, fmt.Errorf("创建输出临时文件失败: %w", err)
	}
	outPath := outFile.Name()
	outFile.Close()
	defer os.Remove(outPath)

	cmd := exec.Command("ffmpeg",
		// Global options come first, as the ffmpeg documentation recommends.
		"-nostdin",
		"-hide_banner",
		"-loglevel", "error",
		"-y",
		"-i", inPath,
		"-ar", "16000",
		"-ac", "1",
		"-c:a", "pcm_s16le",
		"-f", "s16le",
		outPath,
	)

	// The audio goes to outPath, so the combined output only carries
	// ffmpeg's diagnostics; keep them for the error message.
	if diag, err := cmd.CombinedOutput(); err != nil {
		if msg := strings.TrimSpace(string(diag)); msg != "" {
			return nil, fmt.Errorf("音频转码失败 (ffmpeg): %w: %s", err, msg)
		}
		return nil, fmt.Errorf("音频转码失败 (ffmpeg): %w", err)
	}

	outData, err := os.ReadFile(outPath)
	if err != nil {
		return nil, fmt.Errorf("读取转码结果失败: %w", err)
	}

	return outData, nil
}
|
||||
|
||||
@@ -16,41 +16,59 @@ import (
|
||||
var SupportedLanguages = []string{"zh", "en", "ja", "ko", "auto"}
|
||||
|
||||
// STTService 语音转文字服务。
|
||||
// 优先使用 DashScope API,不可用时回退到本地 Whisper。
|
||||
// 离线转录优先使用 DashScope REST API,失败回退 Whisper。
|
||||
// 流式转录使用 DashScope Realtime WS。
|
||||
type STTService struct {
|
||||
whisperBinary string
|
||||
whisperModel string
|
||||
language string
|
||||
dashscope *DashScopeSTT // 实时 ASR (qwen3-asr-flash-realtime)
|
||||
whisperBinary string
|
||||
whisperModel string
|
||||
language string
|
||||
dashscope *DashScopeSTT // 实时 ASR (qwen3-asr-flash-realtime)
|
||||
dashscopeREST *DashScopeRESTSTT // 离线 ASR (qwen3-asr-flash-2026-02-10)
|
||||
}
|
||||
|
||||
// NewSTTService 创建 STT 服务。
|
||||
func NewSTTService(cfg *config.Config) *STTService {
|
||||
// 实时模型用于所有 WebSocket ASR 请求(支持 one-shot 和 streaming)
|
||||
// 离线模型 (qwen3-asr-flash-2026-02-10) 是 HTTP REST API,暂未实现
|
||||
model := cfg.DashScopeSTTRealtime
|
||||
if model == "" {
|
||||
model = cfg.DashScopeModel
|
||||
realtimeModel := cfg.DashScopeSTTRealtime
|
||||
if realtimeModel == "" {
|
||||
realtimeModel = "qwen3-asr-flash-realtime"
|
||||
}
|
||||
offlineModel := cfg.DashScopeModel
|
||||
if offlineModel == "" {
|
||||
offlineModel = "qwen3-asr-flash-2026-02-10"
|
||||
}
|
||||
return &STTService{
|
||||
whisperBinary: cfg.WhisperBinary,
|
||||
whisperModel: cfg.WhisperModel,
|
||||
language: cfg.WhisperLanguage,
|
||||
dashscope: NewDashScopeSTT(cfg.DashScopeAPIKey, model),
|
||||
dashscope: NewDashScopeSTT(cfg.DashScopeAPIKey, realtimeModel),
|
||||
dashscopeREST: NewDashScopeRESTSTT(cfg.DashScopeAPIKey, offlineModel),
|
||||
}
|
||||
}
|
||||
|
||||
// IsAvailable 检查是否有任一 STT 引擎可用。
|
||||
func (s *STTService) IsAvailable() bool {
|
||||
if s.dashscope.IsAvailable() {
|
||||
if s.dashscopeREST.IsAvailable() || s.dashscope.IsAvailable() {
|
||||
return true
|
||||
}
|
||||
_, err := os.Stat(s.whisperBinary)
|
||||
return err == nil
|
||||
return s.whisperAvailable()
|
||||
}
|
||||
|
||||
// whisperAvailable 检查本地 Whisper 引擎是否真正可用。
|
||||
func (s *STTService) whisperAvailable() bool {
|
||||
if _, err := os.Stat(s.whisperBinary); err != nil {
|
||||
return false
|
||||
}
|
||||
if _, err := os.Stat(s.whisperModel); err != nil {
|
||||
return false
|
||||
}
|
||||
if _, err := exec.LookPath("ffmpeg"); err != nil {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// Transcribe 将音频数据转录为文字。
|
||||
// 优先使用 DashScope,不可用时回退到本地 Whisper。
|
||||
// 优先使用 DashScope REST 离线模型,失败回退到本地 Whisper。
|
||||
func (s *STTService) Transcribe(audioData []byte, format string, language string) (string, error) {
|
||||
if language == "" {
|
||||
language = s.language
|
||||
@@ -59,16 +77,15 @@ func (s *STTService) Transcribe(audioData []byte, format string, language string
|
||||
return "", fmt.Errorf("不支持的语言: %s,支持的语言: %s", language, strings.Join(SupportedLanguages, ", "))
|
||||
}
|
||||
|
||||
// 优先 DashScope
|
||||
if s.dashscope.IsAvailable() {
|
||||
// 优先 DashScope REST 离线模型(低延迟,无需 session 协商)
|
||||
if s.dashscopeREST.IsAvailable() {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
text, err := s.dashscope.Transcribe(ctx, audioData, format, language)
|
||||
text, err := s.dashscopeREST.Transcribe(ctx, audioData, format, language)
|
||||
if err == nil {
|
||||
return text, nil
|
||||
}
|
||||
// DashScope 失败,返回具体错误而不是回退到 Whisper
|
||||
return "", fmt.Errorf("语音识别失败: %w", err)
|
||||
fmt.Printf("[stt] DashScope REST 失败,回退 Whisper: %v\n", err)
|
||||
}
|
||||
|
||||
// 回退到本地 Whisper
|
||||
@@ -152,15 +169,21 @@ func (s *STTService) GetStatus() map[string]interface{} {
|
||||
if _, err := os.Stat(s.whisperModel); err == nil {
|
||||
modelExists = true
|
||||
}
|
||||
ffmpegAvailable := false
|
||||
if _, err := exec.LookPath("ffmpeg"); err == nil {
|
||||
ffmpegAvailable = true
|
||||
}
|
||||
|
||||
return map[string]interface{}{
|
||||
"available": s.IsAvailable(),
|
||||
"primary": "dashscope",
|
||||
"dashscope": s.dashscope.GetStatus(),
|
||||
"available": s.IsAvailable(),
|
||||
"primary": "dashscope_rest",
|
||||
"dashscope_rest": s.dashscopeREST.GetStatus(),
|
||||
"dashscope_ws": s.dashscope.GetStatus(),
|
||||
"whisper": map[string]interface{}{
|
||||
"available": binaryAvailable && modelExists,
|
||||
"available": s.whisperAvailable(),
|
||||
"binary_available": binaryAvailable,
|
||||
"model_loaded": modelExists,
|
||||
"ffmpeg_available": ffmpegAvailable,
|
||||
"model_name": filepath.Base(s.whisperModel),
|
||||
},
|
||||
"default_language": s.language,
|
||||
|
||||
Reference in New Issue
Block a user