Files
Cyrene/backend/pkg/dashscope/asr_rest.go
T
AskaEth 6ef9e082a6 feat: 语音流式输入管线 + VAD前端集成 + 插件-工具合并清理
- 前端: VAD语音检测(@ricky0123/vad-web) + useVoiceInput双模式(流式WS/REST)
- Gateway: VoiceStreamManager代理WS流式STT到voice-service
- Voice-service: DashScope REST → Realtime WS → Whisper三级引擎 + ffmpeg转码
- 共享模块: pkg/audio(音频转换) + pkg/dashscope(ASR REST客户端)
- 清理: 移除旧plugin-manager和pkg/plugins,完成插件→工具合并
- 文档: 完善gateway-api.md和voice-service.md语音API文档
- 工具: scripts/voice/ 语音转换脚本集

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-06 11:50:40 +08:00

128 lines
3.1 KiB
Go

package dashscope
import (
"bytes"
"context"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"net/http"
"time"
)
// ---- 共享类型 ----
// ASRRequest is the JSON request body sent to the DashScope ASR REST API.
type ASRRequest struct {
// Model identifies the recognition model; encoded as "model".
Model string `json:"model"`
// Input carries the audio to be recognized; encoded as "input".
Input ASRInput `json:"input"`
// Parameters holds the recognition options; encoded as "parameters".
Parameters ASRParams `json:"parameters"`
}
// ASRInput is the audio input portion of an ASRRequest.
type ASRInput struct {
// Audio is the audio payload as a string. Transcribe in this file fills it
// with a "data:audio/<format>;base64,<data>" URI.
Audio string `json:"audio"`
}
// ASRParams holds the recognition parameters of an ASRRequest. Every field
// is tagged omitempty, so zero values are left out of the encoded JSON.
type ASRParams struct {
// Format is the audio format name.
Format string `json:"format,omitempty"`
// SampleRate is the audio sample rate.
// NOTE(review): presumably in Hz — confirm against the API documentation.
SampleRate int `json:"sample_rate,omitempty"`
// Language is the recognition language; Transcribe substitutes "zh" when
// the caller passes "" or "auto".
Language string `json:"language,omitempty"`
}
// ASRResponse is the JSON response body returned by the DashScope ASR REST API.
type ASRResponse struct {
// Output holds the recognition result.
Output struct {
// Text is the recognized text; Transcribe returns it to the caller.
Text string `json:"text"`
} `json:"output"`
// Usage holds the usage figures reported in the response.
Usage struct {
TotalTokens int `json:"total_tokens"`
} `json:"usage"`
// RequestID is the request identifier reported in the response.
RequestID string `json:"request_id"`
// Code and Message describe a failure. Transcribe treats any non-empty
// Code other than "0" as an error and includes Message in that error.
Code string `json:"code,omitempty"`
Message string `json:"message,omitempty"`
}
// ---- 共享客户端 ----
// RESTClient wraps the HTTP communication with the DashScope REST API.
type RESTClient struct {
// apiKey is sent as the Bearer token in the Authorization header; an
// empty value means the client is not configured (see IsAvailable).
apiKey string
// client is the HTTP client used to execute requests.
client *http.Client
}
// NewRESTClient returns a RESTClient that stores apiKey and sends its
// requests through a dedicated http.Client with a 60-second timeout.
func NewRESTClient(apiKey string) *RESTClient {
	httpClient := &http.Client{Timeout: 60 * time.Second}

	rc := new(RESTClient)
	rc.apiKey = apiKey
	rc.client = httpClient
	return rc
}
// IsAvailable reports whether an API key has been configured, i.e. whether
// the stored key is non-empty.
func (c *RESTClient) IsAvailable() bool {
	return len(c.apiKey) > 0
}
// Transcribe sends audioData to the DashScope ASR REST endpoint and returns
// the recognized text.
//
// The audio is base64-encoded and embedded in the request as a
// "data:audio/<format>;base64,<data>" URI; format, sampleRate and language
// are also forwarded as request parameters. An empty language or "auto" is
// replaced with "zh". The audio itself is not inspected or validated here;
// the original note on this method says audioData should be 16 kHz mono PCM.
//
// ctx is attached to the outgoing HTTP request, so cancelling it aborts the
// call.
//
// An error is returned when no API key is configured, when the request
// cannot be built or sent, when the response body cannot be read, when the
// response carries a non-empty code other than "0", when the server answers
// with a non-2xx HTTP status, or when a 2xx body is not valid JSON. On error
// the returned string is always empty.
func (c *RESTClient) Transcribe(ctx context.Context, model string, audioData []byte, format string, sampleRate int, language string) (string, error) {
	const (
		endpoint = "https://dashscope.aliyuncs.com/api/v1/services/audio/asr/asr"
		// maxErrBody caps how much of an unexpected response body is quoted
		// in an error message.
		maxErrBody = 512
	)

	if !c.IsAvailable() {
		return "", fmt.Errorf("DashScope ASR API key not configured")
	}
	if language == "" || language == "auto" {
		language = "zh"
	}

	audioB64 := base64.StdEncoding.EncodeToString(audioData)
	reqBody := ASRRequest{
		Model: model,
		Input: ASRInput{
			Audio: fmt.Sprintf("data:audio/%s;base64,%s", format, audioB64),
		},
		Parameters: ASRParams{
			Format:     format,
			SampleRate: sampleRate,
			Language:   language,
		},
	}
	bodyBytes, err := json.Marshal(reqBody)
	if err != nil {
		return "", fmt.Errorf("marshal ASR request: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(bodyBytes))
	if err != nil {
		return "", fmt.Errorf("create ASR request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+c.apiKey)

	resp, err := c.client.Do(req)
	if err != nil {
		return "", fmt.Errorf("ASR request failed: %w", err)
	}
	defer resp.Body.Close()

	respBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("read ASR response: %w", err)
	}

	var asrResp ASRResponse
	parseErr := json.Unmarshal(respBytes, &asrResp)

	// Prefer the API's own code/message when the body carries one; it is
	// more specific than the bare HTTP status.
	if parseErr == nil && asrResp.Code != "" && asrResp.Code != "0" {
		return "", fmt.Errorf("ASR error: %s (code=%s)", asrResp.Message, asrResp.Code)
	}

	// http.Client.Do does not treat non-2xx statuses as errors, so check
	// explicitly: otherwise an error response without a "code" field would
	// be reported as a successful, empty transcription.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		snippet := bytes.TrimSpace(respBytes)
		if len(snippet) > maxErrBody {
			snippet = snippet[:maxErrBody]
		}
		return "", fmt.Errorf("ASR request failed: HTTP %d: %q", resp.StatusCode, snippet)
	}

	if parseErr != nil {
		return "", fmt.Errorf("parse ASR response: %w", parseErr)
	}
	return asrResp.Output.Text, nil
}