feat: ASR语音转写管线 + 群聊身份混淆修复
- 新增ASR语音识别管线: QQ语音→下载音频→qwen3-asr-flash转录→注入用户消息 - 模型名称全部从models.json路由获取,无硬编码 - 修复群聊中AI将非管理员用户误称为管理员昵称(叶酱)的问题 - 助手回复缓存时标注[回复 昵称 (UID)],防止对话历史中身份混淆 - 群聊上下文指令改为肯定性表述,移除具体名称提及 - trace面板时间戳改为YYYY-MM-DD HH:MM:SS格式,耗时统一显示为秒 - 修复Go time.Duration纳秒值在前端显示问题(Duration/1e6转毫秒) - 新增video_tool插件模板 - 优化OpenAI adapter reasoning_content处理 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,196 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ASRProvider is the contract implemented by speech-to-text backends.
type ASRProvider interface {
	// ModelName returns the name of the model the provider uses.
	ModelName() string

	// IsAvailable reports whether the provider is ready to be used.
	IsAvailable() bool

	// Transcribe turns the audio referenced by audioURL into text. It
	// returns the transcript, or an error when transcription fails.
	Transcribe(ctx context.Context, audioURL string) (string, error)
}
|
||||
|
||||
// DashScopeASRProvider is a speech-recognition provider that talks to the
// DashScope HTTP API.
type DashScopeASRProvider struct {
	// apiKey is sent as the Bearer token on ASR requests, baseURL is the
	// provider base URL the ASR endpoint is derived from, and model is the
	// model name placed in each request.
	apiKey, baseURL, model string

	// client performs both the audio download and the ASR request.
	client *http.Client
}
|
||||
|
||||
// NewDashScopeASRProvider creates a DashScope ASR provider.
|
||||
func NewDashScopeASRProvider(baseURL, apiKey, model string) *DashScopeASRProvider {
|
||||
if model == "" {
|
||||
model = "qwen3-asr-flash-2026-02-10"
|
||||
}
|
||||
return &DashScopeASRProvider{
|
||||
apiKey: apiKey,
|
||||
baseURL: baseURL,
|
||||
model: model,
|
||||
client: &http.Client{Timeout: 60 * time.Second},
|
||||
}
|
||||
}
|
||||
|
||||
// IsAvailable returns true if the API key is configured.
|
||||
func (p *DashScopeASRProvider) IsAvailable() bool {
|
||||
return p.apiKey != ""
|
||||
}
|
||||
|
||||
// ModelName returns the ASR model name.
|
||||
func (p *DashScopeASRProvider) ModelName() string {
|
||||
return p.model
|
||||
}
|
||||
|
||||
// JSON payload types for the request sent to the ASR endpoint.
type (
	// asrRequest is the top-level request body: the model name, the audio
	// input and the recognition parameters.
	asrRequest struct {
		Model      string    `json:"model"`
		Input      asrInput  `json:"input"`
		Parameters asrParams `json:"parameters"`
	}

	// asrInput carries the audio to recognize as a single string.
	asrInput struct {
		Audio string `json:"audio"`
	}

	// asrParams holds optional recognition settings; zero-valued fields are
	// omitted from the encoded JSON.
	asrParams struct {
		Format     string `json:"format,omitempty"`
		SampleRate int    `json:"sample_rate,omitempty"`
		Language   string `json:"language,omitempty"`
	}
)
|
||||
|
||||
// asrResponse mirrors the JSON body decoded from the ASR endpoint's reply.
type asrResponse struct {
	// Output.Text is the recognized text.
	Output struct{ Text string `json:"text"` } `json:"output"`
	// Usage.TotalTokens is the token count reported in the reply.
	Usage struct{ TotalTokens int `json:"total_tokens"` } `json:"usage"`

	RequestID string `json:"request_id"`

	// Code and Message describe an error reported in the reply body; both
	// are omitted from encoded JSON when empty.
	Code    string `json:"code,omitempty"`
	Message string `json:"message,omitempty"`
}
|
||||
|
||||
// downloadAudio fetches audio data from a URL and returns the bytes with inferred format.
|
||||
func (p *DashScopeASRProvider) downloadAudio(ctx context.Context, audioURL string) ([]byte, string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", audioURL, nil)
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("create download request: %w", err)
|
||||
}
|
||||
|
||||
resp, err := p.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("download failed: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
data, err := io.ReadAll(io.LimitReader(resp.Body, 10<<20)) // 10 MB limit
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("read audio data: %w", err)
|
||||
}
|
||||
|
||||
format := inferAudioFormat(audioURL, resp.Header.Get("Content-Type"))
|
||||
return data, format, nil
|
||||
}
|
||||
|
||||
// inferAudioFormat picks an audio format name for a downloaded file.
//
// The extension of the URL path wins when it is one of the known formats
// (compared case-insensitively). Otherwise the Content-Type header is matched
// case-insensitively, since MIME types are not case-sensitive. When neither
// source identifies a format, "amr" is returned as the default for QQ voice
// messages.
func inferAudioFormat(urlStr, contentType string) string {
	// Try the URL path extension first; the query string is ignored.
	if u, err := url.Parse(urlStr); err == nil {
		if idx := strings.LastIndex(u.Path, "."); idx >= 0 {
			switch ext := strings.ToLower(u.Path[idx+1:]); ext {
			case "amr", "wav", "mp3", "ogg", "flac", "m4a", "aac", "opus", "webm", "pcm":
				return ext
			}
		}
	}

	// Fallback: use the Content-Type header.
	ct := strings.ToLower(contentType)
	switch {
	case strings.Contains(ct, "amr"):
		return "amr"
	case strings.Contains(ct, "wav"):
		return "wav"
	case strings.Contains(ct, "audio/mpeg"), strings.Contains(ct, "mp3"):
		return "mp3"
	case strings.Contains(ct, "webm"):
		// Checked before the opus rule: "audio/webm; codecs=opus" is a WebM
		// container, not Ogg.
		return "webm"
	case strings.Contains(ct, "audio/ogg"), strings.Contains(ct, "opus"):
		return "ogg"
	case strings.Contains(ct, "flac"):
		return "flac"
	case strings.Contains(ct, "aac"):
		return "aac"
	case strings.Contains(ct, "m4a"), strings.Contains(ct, "audio/mp4"):
		return "m4a"
	}

	return "amr" // default for QQ voice messages
}
|
||||
// asrEndpoint derives the DashScope ASR REST endpoint from the provider base URL.
//
// When baseURL parses with both a scheme and a host, any path on it is
// discarded and the ASR path is attached to scheme://host. url.Parse also
// accepts inputs with no scheme or host (for example "" or "host/path"); for
// those, and for unparsable input, the ASR path is appended to baseURL with
// trailing slashes removed instead of producing a "://"-prefixed URL.
func asrEndpoint(baseURL string) string {
	const asrPath = "/api/v1/services/audio/asr/asr"

	if u, err := url.Parse(baseURL); err == nil && u.Scheme != "" && u.Host != "" {
		return u.Scheme + "://" + u.Host + asrPath
	}
	return strings.TrimRight(baseURL, "/") + asrPath
}
|
||||
func (p *DashScopeASRProvider) Transcribe(ctx context.Context, audioURL string) (string, error) {
|
||||
if !p.IsAvailable() {
|
||||
return "", fmt.Errorf("DashScope ASR API key not configured")
|
||||
}
|
||||
|
||||
audioData, format, err := p.downloadAudio(ctx, audioURL)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("download audio: %w", err)
|
||||
}
|
||||
|
||||
audioB64 := base64.StdEncoding.EncodeToString(audioData)
|
||||
|
||||
reqBody := asrRequest{
|
||||
Model: p.model,
|
||||
Input: asrInput{
|
||||
Audio: fmt.Sprintf("data:audio/%s;base64,%s", format, audioB64),
|
||||
},
|
||||
Parameters: asrParams{
|
||||
Format: format,
|
||||
Language: "zh",
|
||||
},
|
||||
}
|
||||
bodyBytes, err := json.Marshal(reqBody)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("marshal ASR request: %w", err)
|
||||
}
|
||||
|
||||
asrURL := asrEndpoint(p.baseURL)
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", asrURL, bytes.NewReader(bodyBytes))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("create ASR request: %w", err)
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Authorization", "Bearer "+p.apiKey)
|
||||
|
||||
resp, err := p.client.Do(req)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("ASR request failed: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
respBytes, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("read ASR response: %w", err)
|
||||
}
|
||||
|
||||
var asrResp asrResponse
|
||||
if err := json.Unmarshal(respBytes, &asrResp); err != nil {
|
||||
return "", fmt.Errorf("parse ASR response: %w", err)
|
||||
}
|
||||
|
||||
if asrResp.Code != "" && asrResp.Code != "0" {
|
||||
return "", fmt.Errorf("ASR error: %s (code=%s)", asrResp.Message, asrResp.Code)
|
||||
}
|
||||
|
||||
return asrResp.Output.Text, nil
|
||||
}
|
||||
Reference in New Issue
Block a user