Files
Cyrene/backend/ai-core/internal/llm/asr.go
T
AskaEth 6ef9e082a6 feat: 语音流式输入管线 + VAD前端集成 + 插件-工具合并清理
- 前端: VAD语音检测(@ricky0123/vad-web) + useVoiceInput双模式(流式WS/REST)
- Gateway: VoiceStreamManager代理WS流式STT到voice-service
- Voice-service: DashScope REST → Realtime WS → Whisper三级引擎 + ffmpeg转码
- 共享模块: pkg/audio(音频转换) + pkg/dashscope(ASR REST客户端)
- 清理: 移除旧plugin-manager和pkg/plugins,完成插件→工具合并
- 文档: 完善gateway-api.md和voice-service.md语音API文档
- 工具: scripts/voice/ 语音转换脚本集

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-06 11:50:40 +08:00

124 lines
3.4 KiB
Go

package llm
import (
"context"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
"git.yeij.top/AskaEth/Cyrene/pkg/audio"
"git.yeij.top/AskaEth/Cyrene/pkg/dashscope"
)
// ASRProvider is the speech-to-text abstraction used by this package.
type ASRProvider interface {
	// Transcribe returns the recognized text for the audio located at
	// audioURL. language is passed through as a recognition hint.
	Transcribe(ctx context.Context, audioURL, language string) (string, error)

	// IsAvailable reports whether the provider is ready to serve requests.
	IsAvailable() bool

	// ModelName returns the identifier of the ASR model in use.
	ModelName() string
}
// DashScopeASRProvider is an ASR provider backed by the DashScope REST
// client. It downloads the audio itself before handing it to DashScope.
type DashScopeASRProvider struct {
	// model is the DashScope ASR model name sent with every request.
	model string
	// client performs the DashScope transcription calls.
	client *dashscope.RESTClient
	// http is used only to download the source audio.
	http *http.Client
}
// NewDashScopeASRProvider builds a DashScopeASRProvider authenticated with
// apiKey. An empty model selects the built-in default model. Audio downloads
// use a dedicated HTTP client with a 60 second timeout.
//
// NOTE(review): baseURL is accepted but not used by this constructor.
func NewDashScopeASRProvider(baseURL, apiKey, model string) *DashScopeASRProvider {
	const defaultModel = "qwen3-asr-flash-2026-02-10"

	provider := &DashScopeASRProvider{
		model:  defaultModel,
		client: dashscope.NewRESTClient(apiKey),
		http:   &http.Client{Timeout: 60 * time.Second},
	}
	if model != "" {
		provider.model = model
	}
	return provider
}
// IsAvailable reports whether the underlying DashScope client considers
// itself usable; the decision is delegated entirely to that client.
func (p *DashScopeASRProvider) IsAvailable() bool {
	available := p.client.IsAvailable()
	return available
}
// ModelName returns the model identifier this provider was configured with.
func (p *DashScopeASRProvider) ModelName() string {
	name := p.model
	return name
}
// downloadAudio fetches the audio at audioURL with the provider's HTTP client
// and returns the raw bytes together with the format that inferAudioFormat
// derives from the URL and the response Content-Type header.
//
// An error is returned when the request cannot be created or sent, when the
// server responds with a non-2xx status, when the body cannot be read, or when
// the body is larger than 10 MB.
func (p *DashScopeASRProvider) downloadAudio(ctx context.Context, audioURL string) ([]byte, string, error) {
	const maxAudioBytes = 10 << 20 // 10 MB limit

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, audioURL, nil)
	if err != nil {
		return nil, "", fmt.Errorf("create download request: %w", err)
	}
	resp, err := p.http.Do(req)
	if err != nil {
		return nil, "", fmt.Errorf("download failed: %w", err)
	}
	defer resp.Body.Close()

	// Client.Do does not treat 4xx/5xx as errors; without this check an error
	// page would be passed on to the transcoder as if it were audio.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return nil, "", fmt.Errorf("download failed: unexpected HTTP status %s", resp.Status)
	}

	// Read one byte past the limit so an oversized body is reported instead
	// of being silently truncated into a corrupt audio file.
	data, err := io.ReadAll(io.LimitReader(resp.Body, maxAudioBytes+1))
	if err != nil {
		return nil, "", fmt.Errorf("read audio data: %w", err)
	}
	if len(data) > maxAudioBytes {
		return nil, "", fmt.Errorf("read audio data: body exceeds %d byte limit", maxAudioBytes)
	}

	format := inferAudioFormat(audioURL, resp.Header.Get("Content-Type"))
	return data, format, nil
}
// inferAudioFormat returns a short audio format name ("amr", "wav", "mp3",
// ...) for a downloaded audio resource.
//
// The extension of the URL path wins when it is one of the recognized
// formats. Otherwise the Content-Type header is matched case-insensitively
// by substring. When neither source identifies the format, "amr" is returned.
func inferAudioFormat(urlStr, contentType string) string {
	if u, err := url.Parse(urlStr); err == nil {
		if idx := strings.LastIndex(u.Path, "."); idx >= 0 {
			switch ext := strings.ToLower(u.Path[idx+1:]); ext {
			case "amr", "wav", "mp3", "ogg", "flac", "m4a", "aac", "opus", "webm", "pcm":
				return ext
			}
		}
	}

	// Media types are case-insensitive, so normalize before matching.
	ct := strings.ToLower(contentType)
	switch {
	case strings.Contains(ct, "amr"):
		return "amr"
	case strings.Contains(ct, "wav"):
		return "wav"
	case strings.Contains(ct, "audio/mpeg"), strings.Contains(ct, "mp3"):
		return "mp3"
	case strings.Contains(ct, "webm"):
		// Must precede the "opus" check: "audio/webm; codecs=opus" is a
		// WebM container, not Ogg.
		return "webm"
	case strings.Contains(ct, "flac"):
		return "flac"
	case strings.Contains(ct, "aac"):
		return "aac"
	case strings.Contains(ct, "audio/mp4"), strings.Contains(ct, "m4a"):
		return "m4a"
	case strings.Contains(ct, "audio/ogg"), strings.Contains(ct, "opus"):
		return "ogg"
	}
	return "amr" // default for QQ voice messages
}
// Transcribe downloads the audio at audioURL, transcodes it to PCM and sends
// it to DashScope for recognition using the provider's model.
//
// An empty or "auto" language is replaced by "zh". An error is returned when
// the provider is not available, or when the download, the transcode, or the
// DashScope call fails.
func (p *DashScopeASRProvider) Transcribe(ctx context.Context, audioURL, language string) (string, error) {
	if !p.IsAvailable() {
		return "", fmt.Errorf("DashScope ASR API key not configured")
	}

	raw, srcFormat, err := p.downloadAudio(ctx, audioURL)
	if err != nil {
		return "", fmt.Errorf("download audio: %w", err)
	}

	// Transcode to 16 kHz mono PCM to improve recognition compatibility.
	samples, err := audio.ConvertToPCM16(raw, srcFormat)
	if err != nil {
		return "", fmt.Errorf("audio transcode: %w", err)
	}

	switch language {
	case "", "auto":
		language = "zh"
	}
	return p.client.Transcribe(ctx, p.model, samples, "pcm", 16000, language)
}