6ef9e082a6
- 前端: VAD语音检测(@ricky0123/vad-web) + useVoiceInput双模式(流式WS/REST) - Gateway: VoiceStreamManager代理WS流式STT到voice-service - Voice-service: DashScope REST → Realtime WS → Whisper三级引擎 + ffmpeg转码 - 共享模块: pkg/audio(音频转换) + pkg/dashscope(ASR REST客户端) - 清理: 移除旧plugin-manager和pkg/plugins,完成插件→工具合并 - 文档: 完善gateway-api.md和voice-service.md语音API文档 - 工具: scripts/voice/ 语音转换脚本集 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
124 lines
3.4 KiB
Go
124 lines
3.4 KiB
Go
package llm
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"strings"
|
|
"time"
|
|
|
|
"git.yeij.top/AskaEth/Cyrene/pkg/audio"
|
|
"git.yeij.top/AskaEth/Cyrene/pkg/dashscope"
|
|
)
|
|
|
|
// ASRProvider is the speech-to-text abstraction used by this package.
//
// Implementations turn the audio located at a URL into text and expose a
// readiness check plus the name of the model they run.
type ASRProvider interface {
	// ModelName returns the name of the ASR model behind the provider.
	ModelName() string

	// IsAvailable reports whether the provider can currently be used.
	IsAvailable() bool

	// Transcribe recognizes the speech in the audio at audioURL and returns
	// the resulting text. language is a hint for the recognizer.
	Transcribe(ctx context.Context, audioURL, language string) (string, error)
}
|
|
|
|
// DashScopeASRProvider uses DashScope Paraformer API for offline speech recognition.
|
|
type DashScopeASRProvider struct {
|
|
model string
|
|
client *dashscope.RESTClient
|
|
http *http.Client
|
|
}
|
|
|
|
// NewDashScopeASRProvider creates a DashScope ASR provider.
|
|
func NewDashScopeASRProvider(baseURL, apiKey, model string) *DashScopeASRProvider {
|
|
if model == "" {
|
|
model = "qwen3-asr-flash-2026-02-10"
|
|
}
|
|
return &DashScopeASRProvider{
|
|
model: model,
|
|
client: dashscope.NewRESTClient(apiKey),
|
|
http: &http.Client{Timeout: 60 * time.Second},
|
|
}
|
|
}
|
|
|
|
// IsAvailable returns true if the API key is configured.
|
|
func (p *DashScopeASRProvider) IsAvailable() bool {
|
|
return p.client.IsAvailable()
|
|
}
|
|
|
|
// ModelName returns the ASR model name.
|
|
func (p *DashScopeASRProvider) ModelName() string {
|
|
return p.model
|
|
}
|
|
|
|
// downloadAudio fetches audio data from a URL and returns the bytes with inferred format.
|
|
func (p *DashScopeASRProvider) downloadAudio(ctx context.Context, audioURL string) ([]byte, string, error) {
|
|
req, err := http.NewRequestWithContext(ctx, "GET", audioURL, nil)
|
|
if err != nil {
|
|
return nil, "", fmt.Errorf("create download request: %w", err)
|
|
}
|
|
|
|
resp, err := p.http.Do(req)
|
|
if err != nil {
|
|
return nil, "", fmt.Errorf("download failed: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
data, err := io.ReadAll(io.LimitReader(resp.Body, 10<<20)) // 10 MB limit
|
|
if err != nil {
|
|
return nil, "", fmt.Errorf("read audio data: %w", err)
|
|
}
|
|
|
|
format := inferAudioFormat(audioURL, resp.Header.Get("Content-Type"))
|
|
return data, format, nil
|
|
}
|
|
|
|
// inferAudioFormat determines the audio format of a download.
//
// The file extension of the URL path wins when it is one of the recognized
// formats (compared case-insensitively). Otherwise the Content-Type header is
// inspected, also case-insensitively because media type names are not case
// sensitive. When neither source identifies a format, "amr" is returned as
// the default for QQ voice messages.
func inferAudioFormat(urlStr, contentType string) string {
	if u, err := url.Parse(urlStr); err == nil {
		if idx := strings.LastIndex(u.Path, "."); idx >= 0 {
			switch ext := strings.ToLower(u.Path[idx+1:]); ext {
			case "amr", "wav", "mp3", "ogg", "flac", "m4a", "aac", "opus", "webm", "pcm":
				return ext
			}
		}
	}

	ct := strings.ToLower(contentType)
	switch {
	case strings.Contains(ct, "amr"):
		return "amr"
	case strings.Contains(ct, "wav"):
		return "wav"
	case strings.Contains(ct, "audio/mpeg"), strings.Contains(ct, "mp3"):
		return "mp3"
	// Containers are matched before codec names so that values such as
	// "audio/ogg; codecs=flac" or "audio/webm;codecs=opus" resolve to the
	// container rather than to the codec carried inside it.
	case strings.Contains(ct, "audio/ogg"):
		return "ogg"
	case strings.Contains(ct, "webm"):
		return "webm"
	case strings.Contains(ct, "flac"):
		return "flac"
	case strings.Contains(ct, "audio/mp4"), strings.Contains(ct, "m4a"):
		return "m4a"
	case strings.Contains(ct, "aac"):
		return "aac"
	case strings.Contains(ct, "opus"):
		// A bare Opus media type is reported as "ogg".
		return "ogg"
	}

	return "amr" // default for QQ voice messages
}
|
|
|
|
func (p *DashScopeASRProvider) Transcribe(ctx context.Context, audioURL, language string) (string, error) {
|
|
if !p.IsAvailable() {
|
|
return "", fmt.Errorf("DashScope ASR API key not configured")
|
|
}
|
|
|
|
audioData, format, err := p.downloadAudio(ctx, audioURL)
|
|
if err != nil {
|
|
return "", fmt.Errorf("download audio: %w", err)
|
|
}
|
|
|
|
// 转码为 16kHz mono PCM,提升识别兼容性
|
|
pcmData, err := audio.ConvertToPCM16(audioData, format)
|
|
if err != nil {
|
|
return "", fmt.Errorf("audio transcode: %w", err)
|
|
}
|
|
|
|
if language == "" || language == "auto" {
|
|
language = "zh"
|
|
}
|
|
|
|
return p.client.Transcribe(ctx, p.model, pcmData, "pcm", 16000, language)
|
|
}
|