Files
Cyrene/backend/voice-service/cmd/test_asr/main.go
T
AskaEth 6ef9e082a6 feat: 语音流式输入管线 + VAD前端集成 + 插件-工具合并清理
- 前端: VAD语音检测(@ricky0123/vad-web) + useVoiceInput双模式(流式WS/REST)
- Gateway: VoiceStreamManager代理WS流式STT到voice-service
- Voice-service: DashScope REST → Realtime WS → Whisper三级引擎 + ffmpeg转码
- 共享模块: pkg/audio(音频转换) + pkg/dashscope(ASR REST客户端)
- 清理: 移除旧plugin-manager和pkg/plugins,完成插件→工具合并
- 文档: 完善gateway-api.md和voice-service.md语音API文档
- 工具: scripts/voice/ 语音转换脚本集

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-06 11:50:40 +08:00

225 lines
5.5 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package main
import (
	"flag"
	"fmt"
	"io"
	"mime/multipart"
	"net/http"
	"os"
	"os/signal"
	"path/filepath"
	"strings"
	"time"

	"github.com/gorilla/websocket"
)
// main parses the command-line flags and runs the selected ASR test.
//
// Flags:
//
//	-mode    "offline" (HTTP upload) or "realtime" (WebSocket stream); default "offline"
//	-file    path of the audio file to send (required)
//	-server  base address of the voice service; default "http://localhost:8093"
//	-lang    language code passed to the service; default "zh"
//
// The process exits with status 1 when -file is missing or -mode is unknown.
func main() {
	var mode, audioPath, serverAddr, language string
	flag.StringVar(&mode, "mode", "offline", "测试模式: offline (非实时) 或 realtime (实时)")
	flag.StringVar(&audioPath, "file", "", "音频文件路径 (WAV/MP3/OGG/FLAC)")
	flag.StringVar(&serverAddr, "server", "http://localhost:8093", "Voice-Service 地址")
	flag.StringVar(&language, "lang", "zh", "语言代码")
	flag.Parse()

	// An audio file is mandatory for both modes.
	if audioPath == "" {
		fmt.Println("用法: test_asr -mode=offline -file=audio.wav [-server=http://localhost:8093]")
		os.Exit(1)
	}

	// Dispatch table: mode name -> test routine.
	runners := map[string]func(server, filePath, lang string){
		"offline":  testOffline,
		"realtime": testRealtime,
	}
	run, known := runners[mode]
	if !known {
		fmt.Printf("未知模式: %s (支持: offline, realtime)\n", mode)
		os.Exit(1)
	}
	run(serverAddr, audioPath, language)
}
// testOffline exercises the non-realtime ASR endpoint by uploading the audio
// file as an HTTP multipart form to POST {server}/api/v1/transcribe.
//
// Parameters:
//   - server:   base address of the voice service, e.g. "http://localhost:8093".
//   - filePath: path of the audio file to upload.
//   - lang:     language code sent in the "language" form field.
//
// It prints the status code, the time until the response headers arrived and
// the response body. Any failure (unreadable file, request error, unreadable
// response, or a status other than 200) terminates the process with exit
// status 1 so that scripts can detect a failed run.
func testOffline(server, filePath, lang string) {
	// Upper bound for the whole HTTP exchange so a hung server cannot block
	// the tool forever; generous because transcription of long audio is slow.
	const requestTimeout = 10 * time.Minute

	fmt.Printf("=== 非实时 ASR 测试 ===\n")
	fmt.Printf("服务器: %s\n", server)
	fmt.Printf("文件: %s\n", filePath)
	fmt.Printf("语言: %s\n\n", lang)

	// Read the audio file into memory.
	audioData, err := os.ReadFile(filePath)
	if err != nil {
		fmt.Printf("读取文件失败: %v\n", err)
		os.Exit(1)
	}
	fmt.Printf("音频大小: %d bytes\n", len(audioData))

	// Build the multipart form body first so the request is created with its
	// body in place instead of being patched afterwards.
	body, contentType, err := createMultipartBody(audioData, filePath, lang)
	if err != nil {
		fmt.Printf("创建 multipart body 失败: %v\n", err)
		os.Exit(1)
	}

	req, err := http.NewRequest(http.MethodPost, server+"/api/v1/transcribe", body)
	if err != nil {
		fmt.Printf("创建请求失败: %v\n", err)
		os.Exit(1)
	}
	req.Header.Set("Content-Type", contentType)

	client := &http.Client{Timeout: requestTimeout}
	start := time.Now()
	resp, err := client.Do(req)
	if err != nil {
		fmt.Printf("请求失败: %v\n", err)
		os.Exit(1)
	}
	elapsed := time.Since(start)

	// Close explicitly (not via defer) because the failure path below calls
	// os.Exit, which would skip deferred calls.
	respBody, readErr := io.ReadAll(resp.Body)
	resp.Body.Close()

	fmt.Printf("状态码: %d\n", resp.StatusCode)
	fmt.Printf("耗时: %v\n", elapsed)
	if readErr != nil {
		fmt.Printf("读取响应失败: %v\n", readErr)
	}
	fmt.Printf("响应:\n%s\n", string(respBody))

	if resp.StatusCode != http.StatusOK || readErr != nil {
		fmt.Println("\n❌ 非实时语音识别失败")
		os.Exit(1)
	}
	fmt.Println("\n✅ 非实时语音识别成功!")
}
// testRealtime exercises the realtime ASR endpoint by streaming the audio file
// over a WebSocket connection to {server}/api/v1/stt/stream.
//
// Parameters:
//   - server:   base address of the voice service; http://, https://, ws://,
//     wss:// and scheme-less "host:port" forms are accepted.
//   - filePath: path of the audio file to stream; its extension selects the
//     "format" query parameter via inferFormat.
//   - lang:     language code sent as the "language" query parameter.
//
// The file is sent as binary messages of 3200 bytes, one every 100ms, while a
// background goroutine prints every message received from the server. After
// the last chunk a {"action":"stop"} text message is sent and the function
// waits up to 2 seconds for remaining results (returning earlier if the
// connection is closed first). Ctrl-C stops the streaming early. The process
// exits with status 1 when the file cannot be read or the dial fails.
func testRealtime(server, filePath, lang string) {
	const (
		chunkSize     = 3200                   // bytes per binary message
		chunkInterval = 100 * time.Millisecond // pause between messages
		drainTimeout  = 2 * time.Second        // max wait for trailing results
	)

	fmt.Printf("=== 实时 ASR 测试 ===\n")
	fmt.Printf("服务器: %s\n", server)
	fmt.Printf("文件: %s\n", filePath)
	fmt.Printf("语言: %s\n\n", lang)

	// Read the audio file into memory.
	audioData, err := os.ReadFile(filePath)
	if err != nil {
		fmt.Printf("读取文件失败: %v\n", err)
		os.Exit(1)
	}
	fmt.Printf("音频大小: %d bytes\n", len(audioData))

	// Infer the audio format from the file extension.
	format := inferFormat(filePath)

	// Connect to the streaming endpoint.
	wsURL := streamURL(server, format, lang)
	conn, _, err := websocket.DefaultDialer.Dial(wsURL, nil)
	if err != nil {
		fmt.Printf("WebSocket 连接失败: %v\n", err)
		os.Exit(1)
	}
	defer conn.Close()
	fmt.Printf("WebSocket 已连接: %s\n", wsURL)

	// Let Ctrl-C abort the streaming loop; undo the registration on return.
	interrupt := make(chan os.Signal, 1)
	signal.Notify(interrupt, os.Interrupt)
	defer signal.Stop(interrupt)

	// Reader goroutine: prints every message until the connection fails or
	// is closed, then signals completion by closing done.
	done := make(chan struct{})
	go func() {
		defer close(done)
		for {
			_, msg, err := conn.ReadMessage()
			if err != nil {
				fmt.Printf("读取结果错误: %v\n", err)
				return
			}
			fmt.Printf("◀ 结果: %s\n", string(msg))
		}
	}()

	// Simulate a live stream: one chunk per interval.
	totalSent := 0
	start := time.Now()
sendLoop:
	for offset := 0; offset < len(audioData); offset += chunkSize {
		end := offset + chunkSize
		if end > len(audioData) {
			end = len(audioData)
		}

		if err := conn.WriteMessage(websocket.BinaryMessage, audioData[offset:end]); err != nil {
			fmt.Printf("发送音频失败: %v\n", err)
			break sendLoop
		}
		totalSent += end - offset
		fmt.Printf("▶ 发送 %d/%d bytes (%.1f%%)\n", totalSent, len(audioData),
			float64(totalSent)/float64(len(audioData))*100)

		// Pace the stream, but react to Ctrl-C immediately instead of only
		// after the pause has elapsed.
		select {
		case <-interrupt:
			fmt.Println("\n用户中断")
			break sendLoop
		case <-time.After(chunkInterval):
		}
	}
	elapsed := time.Since(start)

	// Tell the server that no more audio will follow.
	if err := conn.WriteMessage(websocket.TextMessage, []byte(`{"action":"stop"}`)); err != nil {
		fmt.Printf("发送停止消息失败: %v\n", err)
	}

	// Wait for trailing results, but no longer than drainTimeout and not at
	// all if the reader has already stopped.
	select {
	case <-done:
	case <-time.After(drainTimeout):
	}

	fmt.Printf("\n总耗时: %v, 总发送: %d bytes\n", elapsed, totalSent)
	fmt.Println("✅ 实时语音识别测试完成")
}

// streamURL turns the voice-service base address into the WebSocket URL of the
// streaming STT endpoint. https:// and wss:// map to wss://; http://, ws://
// and scheme-less addresses map to ws://. Trailing slashes on the address are
// dropped so the path is not doubled.
func streamURL(server, format, lang string) string {
	scheme, host := "ws", server
	switch {
	case strings.HasPrefix(server, "https://"):
		scheme, host = "wss", strings.TrimPrefix(server, "https://")
	case strings.HasPrefix(server, "wss://"):
		scheme, host = "wss", strings.TrimPrefix(server, "wss://")
	case strings.HasPrefix(server, "http://"):
		host = strings.TrimPrefix(server, "http://")
	case strings.HasPrefix(server, "ws://"):
		host = strings.TrimPrefix(server, "ws://")
	}
	host = strings.TrimRight(host, "/")
	return fmt.Sprintf("%s://%s/api/v1/stt/stream?format=%s&language=%s",
		scheme, host, format, lang)
}
// inferFormat maps a file name to the audio format identifier sent to the
// voice service, based on the text after the last '.' in filename.
//
// The comparison is case-insensitive, so "clip.WAV" and "clip.wav" both yield
// "wav". Recognized results are "wav", "mp3", "ogg", "flac" and "m4a"; a
// missing or unrecognized extension yields "pcm".
func inferFormat(filename string) string {
	ext := ""
	if dot := strings.LastIndexByte(filename, '.'); dot >= 0 {
		ext = strings.ToLower(filename[dot+1:])
	}

	switch ext {
	case "wav", "wave":
		return "wav"
	case "mp3", "mpeg":
		return "mp3"
	case "ogg", "opus":
		return "ogg"
	case "flac":
		return "flac"
	case "m4a", "mp4", "aac":
		return "m4a"
	default:
		return "pcm"
	}
}
// createMultipartBody builds a streaming multipart/form-data request body
// with two parts, in this order:
//
//   - "audio":    a file part (Content-Type application/octet-stream) holding
//     audioData; its filename is the base name of filename, so the local
//     directory layout is not sent to the server.
//   - "language": a plain form field holding lang.
//
// It returns the readable end of a pipe that a background goroutine fills,
// and the matching Content-Type header value including the boundary. Encoding
// (boundary choice, header escaping) is delegated to mime/multipart. The
// returned error is always nil; a failure while writing the form is reported
// to the reader of the returned body instead. Closing the body early makes
// the goroutine's writes fail, so it does not block forever.
func createMultipartBody(audioData []byte, filename, lang string) (io.ReadCloser, string, error) {
	pr, pw := io.Pipe()
	mw := multipart.NewWriter(pw)

	go func() {
		part, err := mw.CreateFormFile("audio", filepath.Base(filename))
		if err == nil {
			_, err = part.Write(audioData)
		}
		if err == nil {
			err = mw.WriteField("language", lang)
		}
		if err == nil {
			// Writes the terminating boundary.
			err = mw.Close()
		}
		// A nil err closes the pipe normally (reader sees io.EOF).
		pw.CloseWithError(err)
	}()

	return pr, mw.FormDataContentType(), nil
}