feat: ASR语音转写管线 + 群聊身份混淆修复

- 新增ASR语音识别管线: QQ语音→下载音频→qwen3-asr-flash转录→注入用户消息
- 模型名称全部从models.json路由获取,无硬编码
- 修复群聊中AI将非管理员用户误称为管理员昵称(叶酱)的问题
  - 助手回复缓存时标注[回复 昵称 (UID)],防止对话历史中身份混淆
  - 群聊上下文指令改为肯定性表述,移除具体名称提及
- trace面板时间戳改为YYYY-MM-DD HH:MM:SS格式,耗时统一显示为秒
- 修复Go time.Duration纳秒值在前端显示问题(Duration/1e6转毫秒)
- 新增video_tool插件模板
- 优化OpenAI adapter reasoning_content处理

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-31 16:46:47 +08:00
parent d112fdd540
commit a9c79d7887
16 changed files with 780 additions and 67 deletions
+196
View File
@@ -0,0 +1,196 @@
package llm
import (
"bytes"
"context"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
)
// ASRProvider is the speech-to-text abstraction of this package: an
// implementation turns the audio referenced by a URL into text.
type ASRProvider interface {
// Transcribe returns the recognized text for the audio located at audioURL.
// ctx is handed to the implementation so the caller can cancel the work.
Transcribe(ctx context.Context, audioURL string) (string, error)
// IsAvailable reports whether the provider is usable.
IsAvailable() bool
// ModelName returns the name of the model used for transcription.
ModelName() string
}
// DashScopeASRProvider is an ASRProvider that sends audio to a DashScope
// speech-recognition REST endpoint over HTTP.
type DashScopeASRProvider struct {
// apiKey is sent as a Bearer token; an empty key marks the provider as unavailable.
apiKey string
// baseURL is the provider base URL from which the ASR endpoint is derived.
baseURL string
// model is the ASR model name placed in every request.
model string
// client performs both the audio download and the ASR call.
client *http.Client
}
// NewDashScopeASRProvider builds a DashScopeASRProvider for the given base URL
// and API key. An empty model selects "qwen3-asr-flash-2026-02-10". The
// provider's HTTP client is created with a 60-second timeout.
func NewDashScopeASRProvider(baseURL, apiKey, model string) *DashScopeASRProvider {
	provider := &DashScopeASRProvider{
		apiKey:  apiKey,
		baseURL: baseURL,
		model:   "qwen3-asr-flash-2026-02-10",
		client:  &http.Client{Timeout: 60 * time.Second},
	}
	// A caller-supplied model always wins over the built-in default.
	if model != "" {
		provider.model = model
	}
	return provider
}
// IsAvailable reports whether an API key has been configured; Transcribe
// refuses to run without one.
func (p *DashScopeASRProvider) IsAvailable() bool {
	return len(p.apiKey) > 0
}
// ModelName reports the ASR model this provider sends with each request.
func (p *DashScopeASRProvider) ModelName() string {
	return p.model
}
// asrRequest is the JSON body posted to the ASR endpoint.
type asrRequest struct {
// Model is the ASR model name.
Model string `json:"model"`
// Input carries the audio payload.
Input asrInput `json:"input"`
// Parameters carries optional recognition hints.
Parameters asrParams `json:"parameters"`
}
// asrInput is the "input" object of an ASR request.
type asrInput struct {
// Audio holds the audio payload; Transcribe fills it with a base64 data URI
// of the form "data:audio/<format>;base64,<data>".
Audio string `json:"audio"`
}
// asrParams is the "parameters" object of an ASR request. Every field is
// omitted from the JSON when it holds its zero value.
type asrParams struct {
// Format is the audio format name (e.g. "amr", "wav").
Format string `json:"format,omitempty"`
// SampleRate is not set by Transcribe in this file, so it is normally omitted.
SampleRate int `json:"sample_rate,omitempty"`
// Language is the language hint; Transcribe sends "zh".
Language string `json:"language,omitempty"`
}
// asrResponse is the JSON body decoded from the ASR endpoint's reply.
type asrResponse struct {
// Output.Text is the transcript returned to the caller on success.
Output struct {
Text string `json:"text"`
} `json:"output"`
// Usage is decoded but not read by the code in this file.
Usage struct {
TotalTokens int `json:"total_tokens"`
} `json:"usage"`
// RequestID is decoded but not read by the code in this file.
RequestID string `json:"request_id"`
// Code and Message describe a failure: Transcribe treats any Code other than
// "" or "0" as an error and reports Message alongside it.
Code string `json:"code,omitempty"`
Message string `json:"message,omitempty"`
}
// downloadAudio fetches the audio at audioURL and returns its bytes together
// with the format inferred from the URL extension or the Content-Type header.
//
// An error is returned when the request cannot be built or sent, when the
// server answers with a non-2xx status (so an HTML error page is never passed
// on as audio), when the body cannot be read, when the body is empty, or when
// it is larger than 10 MB.
func (p *DashScopeASRProvider) downloadAudio(ctx context.Context, audioURL string) ([]byte, string, error) {
	const maxAudioBytes = 10 << 20 // 10 MB limit

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, audioURL, nil)
	if err != nil {
		return nil, "", fmt.Errorf("create download request: %w", err)
	}
	resp, err := p.client.Do(req)
	if err != nil {
		return nil, "", fmt.Errorf("download failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return nil, "", fmt.Errorf("download failed: unexpected status %s", resp.Status)
	}

	// Read one byte past the limit so an oversized body is detected instead of
	// being silently truncated into corrupt audio.
	data, err := io.ReadAll(io.LimitReader(resp.Body, maxAudioBytes+1))
	if err != nil {
		return nil, "", fmt.Errorf("read audio data: %w", err)
	}
	if len(data) > maxAudioBytes {
		return nil, "", fmt.Errorf("audio exceeds %d byte limit", maxAudioBytes)
	}
	if len(data) == 0 {
		return nil, "", fmt.Errorf("audio response body is empty")
	}

	format := inferAudioFormat(audioURL, resp.Header.Get("Content-Type"))
	return data, format, nil
}
// inferAudioFormat determines the audio format name for a download.
//
// The extension of the URL path is tried first (case-insensitively) and is
// returned as-is when it is one of the known audio extensions. Otherwise the
// Content-Type header is inspected; the match is case-insensitive because
// media type names are not case-sensitive. When neither source identifies the
// format, "amr" is returned as the default for QQ voice messages.
func inferAudioFormat(urlStr, contentType string) string {
	// Try the URL extension first; a URL that fails to parse just skips this step.
	if u, err := url.Parse(urlStr); err == nil {
		if dot := strings.LastIndex(u.Path, "."); dot >= 0 {
			switch ext := strings.ToLower(u.Path[dot+1:]); ext {
			case "amr", "wav", "mp3", "ogg", "flac", "m4a", "aac", "opus", "webm", "pcm":
				return ext
			}
		}
	}

	// Fallback: use the Content-Type header.
	ct := strings.ToLower(contentType)
	switch {
	case strings.Contains(ct, "amr"):
		return "amr"
	case strings.Contains(ct, "wav"):
		return "wav"
	case strings.Contains(ct, "audio/mpeg"), strings.Contains(ct, "mp3"):
		return "mp3"
	case strings.Contains(ct, "audio/ogg"), strings.Contains(ct, "opus"):
		return "ogg"
	}
	return "amr" // default for QQ voice messages
}
// asrEndpoint derives the DashScope ASR REST endpoint from the provider base URL.
//
// When baseURL parses with both a scheme and a host, only those two parts are
// kept and any path on baseURL is discarded. Otherwise (empty or schemeless
// input, which url.Parse accepts without error) the ASR path is appended to
// baseURL with trailing slashes removed, instead of producing a malformed
// ":///..." address.
func asrEndpoint(baseURL string) string {
	const asrPath = "/api/v1/services/audio/asr/asr"

	if u, err := url.Parse(baseURL); err == nil && u.Scheme != "" && u.Host != "" {
		return u.Scheme + "://" + u.Host + asrPath
	}
	return strings.TrimRight(baseURL, "/") + asrPath
}
// Transcribe downloads the audio at audioURL, posts it to the ASR endpoint as
// a base64 data URI, and returns the recognized text.
//
// An error is returned when no API key is configured, when the download
// fails, when the request cannot be built or sent, when the endpoint answers
// with a non-2xx status, when the reply is not valid JSON, or when the reply
// carries an error code (any code other than "" or "0").
func (p *DashScopeASRProvider) Transcribe(ctx context.Context, audioURL string) (string, error) {
	if !p.IsAvailable() {
		return "", fmt.Errorf("DashScope ASR API key not configured")
	}

	audioData, format, err := p.downloadAudio(ctx, audioURL)
	if err != nil {
		return "", fmt.Errorf("download audio: %w", err)
	}

	audioB64 := base64.StdEncoding.EncodeToString(audioData)
	reqBody := asrRequest{
		Model: p.model,
		Input: asrInput{
			Audio: fmt.Sprintf("data:audio/%s;base64,%s", format, audioB64),
		},
		Parameters: asrParams{
			Format:   format,
			Language: "zh",
		},
	}
	bodyBytes, err := json.Marshal(reqBody)
	if err != nil {
		return "", fmt.Errorf("marshal ASR request: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, asrEndpoint(p.baseURL), bytes.NewReader(bodyBytes))
	if err != nil {
		return "", fmt.Errorf("create ASR request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+p.apiKey)

	resp, err := p.client.Do(req)
	if err != nil {
		return "", fmt.Errorf("ASR request failed: %w", err)
	}
	defer resp.Body.Close()

	respBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("read ASR response: %w", err)
	}
	statusOK := resp.StatusCode >= http.StatusOK && resp.StatusCode < http.StatusMultipleChoices

	var asrResp asrResponse
	if err := json.Unmarshal(respBytes, &asrResp); err != nil {
		// A gateway or proxy error page is not JSON; report the HTTP status
		// rather than a misleading parse error.
		if !statusOK {
			return "", fmt.Errorf("ASR request failed: unexpected status %s", resp.Status)
		}
		return "", fmt.Errorf("parse ASR response: %w", err)
	}
	// Prefer the service's own error description when it supplied one.
	if asrResp.Code != "" && asrResp.Code != "0" {
		return "", fmt.Errorf("ASR error: %s (code=%s)", asrResp.Message, asrResp.Code)
	}
	// A failing status without an error code must not be returned as an empty
	// transcript with a nil error.
	if !statusOK {
		return "", fmt.Errorf("ASR request failed: unexpected status %s", resp.Status)
	}
	return asrResp.Output.Text, nil
}