6ef9e082a6
- 前端: VAD语音检测(@ricky0123/vad-web) + useVoiceInput双模式(流式WS/REST) - Gateway: VoiceStreamManager代理WS流式STT到voice-service - Voice-service: DashScope REST → Realtime WS → Whisper三级引擎 + ffmpeg转码 - 共享模块: pkg/audio(音频转换) + pkg/dashscope(ASR REST客户端) - 清理: 移除旧plugin-manager和pkg/plugins,完成插件→工具合并 - 文档: 完善gateway-api.md和voice-service.md语音API文档 - 工具: scripts/voice/ 语音转换脚本集 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
128 lines
3.1 KiB
Go
128 lines
3.1 KiB
Go
package dashscope
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/base64"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"time"
|
|
)
|
|
|
|
// ---- 共享类型 ----
|
|
|
|
// ASRRequest DashScope ASR REST API 请求体。
|
|
type ASRRequest struct {
|
|
Model string `json:"model"`
|
|
Input ASRInput `json:"input"`
|
|
Parameters ASRParams `json:"parameters"`
|
|
}
|
|
|
|
// ASRInput is the audio payload of an ASRRequest.
//
// Audio is serialized as "audio"; Transcribe fills it with a base64 data URI.
type ASRInput struct {
	Audio string `json:"audio"`
}
|
|
|
|
// ASRParams holds the recognition parameters of an ASRRequest.
//
// Each field is omitted from the JSON output when it has its zero value:
// Format ("format"), SampleRate ("sample_rate") and Language ("language").
type ASRParams struct {
	Format     string `json:"format,omitempty"`
	SampleRate int    `json:"sample_rate,omitempty"`
	Language   string `json:"language,omitempty"`
}
|
|
|
|
// ASRResponse is the JSON response body of the DashScope ASR REST API.
//
// Output.Text ("output.text") is the recognized text that Transcribe returns.
// Usage.TotalTokens ("usage.total_tokens") and RequestID ("request_id") are
// decoded as-is. Code and Message are optional; Transcribe reports an error
// when Code is neither empty nor "0".
type ASRResponse struct {
	Output struct {
		Text string `json:"text"`
	} `json:"output"`
	Usage struct {
		TotalTokens int `json:"total_tokens"`
	} `json:"usage"`
	RequestID string `json:"request_id"`
	Code      string `json:"code,omitempty"`
	Message   string `json:"message,omitempty"`
}
|
|
|
|
// ---- 共享客户端 ----
|
|
|
|
// RESTClient wraps the HTTP communication with the DashScope REST API.
//
// apiKey is sent as the Bearer token on each request and client is the HTTP
// client used to perform the calls. Use NewRESTClient to construct one.
type RESTClient struct {
	apiKey string
	client *http.Client
}
|
|
|
|
// NewRESTClient 创建 REST 客户端。
|
|
func NewRESTClient(apiKey string) *RESTClient {
|
|
return &RESTClient{
|
|
apiKey: apiKey,
|
|
client: &http.Client{Timeout: 60 * time.Second},
|
|
}
|
|
}
|
|
|
|
// IsAvailable 检查 API Key 是否已配置。
|
|
func (c *RESTClient) IsAvailable() bool {
|
|
return c.apiKey != ""
|
|
}
|
|
|
|
// Transcribe 调用 DashScope ASR REST API 进行语音识别。
|
|
// audioData 应为 PCM 16kHz mono 格式。
|
|
func (c *RESTClient) Transcribe(ctx context.Context, model string, audioData []byte, format string, sampleRate int, language string) (string, error) {
|
|
if !c.IsAvailable() {
|
|
return "", fmt.Errorf("DashScope ASR API key not configured")
|
|
}
|
|
if language == "" || language == "auto" {
|
|
language = "zh"
|
|
}
|
|
|
|
audioB64 := base64.StdEncoding.EncodeToString(audioData)
|
|
|
|
reqBody := ASRRequest{
|
|
Model: model,
|
|
Input: ASRInput{
|
|
Audio: fmt.Sprintf("data:audio/%s;base64,%s", format, audioB64),
|
|
},
|
|
Parameters: ASRParams{
|
|
Format: format,
|
|
SampleRate: sampleRate,
|
|
Language: language,
|
|
},
|
|
}
|
|
|
|
bodyBytes, err := json.Marshal(reqBody)
|
|
if err != nil {
|
|
return "", fmt.Errorf("marshal ASR request: %w", err)
|
|
}
|
|
|
|
url := "https://dashscope.aliyuncs.com/api/v1/services/audio/asr/asr"
|
|
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(bodyBytes))
|
|
if err != nil {
|
|
return "", fmt.Errorf("create ASR request: %w", err)
|
|
}
|
|
req.Header.Set("Content-Type", "application/json")
|
|
req.Header.Set("Authorization", "Bearer "+c.apiKey)
|
|
|
|
resp, err := c.client.Do(req)
|
|
if err != nil {
|
|
return "", fmt.Errorf("ASR request failed: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
respBytes, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return "", fmt.Errorf("read ASR response: %w", err)
|
|
}
|
|
|
|
var asrResp ASRResponse
|
|
if err := json.Unmarshal(respBytes, &asrResp); err != nil {
|
|
return "", fmt.Errorf("parse ASR response: %w", err)
|
|
}
|
|
|
|
if asrResp.Code != "" && asrResp.Code != "0" {
|
|
return "", fmt.Errorf("ASR error: %s (code=%s)", asrResp.Message, asrResp.Code)
|
|
}
|
|
|
|
return asrResp.Output.Text, nil
|
|
}
|