package llm

import (
	"bytes"
	"context"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
	"time"
)

// ASRProvider handles speech-to-text transcription.
//
// Transcribe takes the URL of an audio resource and returns the recognized
// text. IsAvailable reports whether the provider can be used, and ModelName
// returns the identifier of the model the provider is configured with.
type ASRProvider interface {
	Transcribe(ctx context.Context, audioURL string) (string, error)
	IsAvailable() bool
	ModelName() string
}

// DashScopeASRProvider is an ASRProvider backed by the DashScope speech
// recognition REST API.
//
// NOTE(review): the original comment described this as the "Paraformer API for
// offline speech recognition", while the constructor defaults to a
// "qwen3-asr-flash" model — confirm which API family is intended.
type DashScopeASRProvider struct {
	apiKey  string       // bearer token; an empty value marks the provider as unavailable
	baseURL string       // provider base URL as passed to the constructor
	model   string       // model identifier placed in each request
	client  *http.Client // HTTP client used by the provider's methods
}

// NewDashScopeASRProvider creates a DashScope ASR provider.
//
// An empty model falls back to "qwen3-asr-flash-2026-02-10". The provider gets
// its own http.Client with a 60-second timeout.
func NewDashScopeASRProvider(baseURL, apiKey, model string) *DashScopeASRProvider {
	if model == "" {
		model = "qwen3-asr-flash-2026-02-10"
	}
	return &DashScopeASRProvider{
		apiKey:  apiKey,
		baseURL: baseURL,
		model:   model,
		client:  &http.Client{Timeout: 60 * time.Second},
	}
}

// IsAvailable returns true if the API key is configured, i.e. non-empty.
// It does not contact the service.
func (p *DashScopeASRProvider) IsAvailable() bool {
	return p.apiKey != ""
}

// ModelName returns the ASR model name the provider was constructed with
// (or the default chosen by the constructor).
func (p *DashScopeASRProvider) ModelName() string {
	return p.model
}

// asrRequest is the JSON body sent to the ASR endpoint.
type asrRequest struct {
	Model      string    `json:"model"`
	Input      asrInput  `json:"input"`
	Parameters asrParams `json:"parameters"`
}

// asrInput carries the audio payload of an asrRequest as a single string.
type asrInput struct {
	Audio string `json:"audio"`
}

// asrParams holds optional recognition parameters; zero-valued fields are
// omitted from the encoded JSON.
type asrParams struct {
	Format     string `json:"format,omitempty"`
	SampleRate int    `json:"sample_rate,omitempty"`
	Language   string `json:"language,omitempty"`
}

// asrResponse mirrors the JSON fields read back from the ASR endpoint:
// the recognized text, token usage, the request ID, and an optional
// code/message pair.
type asrResponse struct {
	Output struct {
		Text string `json:"text"`
	} `json:"output"`
	Usage struct {
		TotalTokens int `json:"total_tokens"`
	} `json:"usage"`
	RequestID string `json:"request_id"`
	Code      string `json:"code,omitempty"`
	Message   string `json:"message,omitempty"`
}

// downloadAudio fetches audio data from a URL and returns the bytes with inferred format.
func (p *DashScopeASRProvider) downloadAudio(ctx context.Context, audioURL string) ([]byte, string, error) { req, err := http.NewRequestWithContext(ctx, "GET", audioURL, nil) if err != nil { return nil, "", fmt.Errorf("create download request: %w", err) } resp, err := p.client.Do(req) if err != nil { return nil, "", fmt.Errorf("download failed: %w", err) } defer resp.Body.Close() data, err := io.ReadAll(io.LimitReader(resp.Body, 10<<20)) // 10 MB limit if err != nil { return nil, "", fmt.Errorf("read audio data: %w", err) } format := inferAudioFormat(audioURL, resp.Header.Get("Content-Type")) return data, format, nil } // inferAudioFormat determines the audio format from URL extension or Content-Type header. func inferAudioFormat(urlStr, contentType string) string { // Try URL extension first u, err := url.Parse(urlStr) if err == nil { path := u.Path if idx := strings.LastIndex(path, "."); idx >= 0 { ext := strings.ToLower(path[idx+1:]) switch ext { case "amr", "wav", "mp3", "ogg", "flac", "m4a", "aac", "opus", "webm", "pcm": return ext } } } // Fallback: use Content-Type if strings.Contains(contentType, "audio/amr") || strings.Contains(contentType, "amr") { return "amr" } if strings.Contains(contentType, "audio/wav") || strings.Contains(contentType, "wav") { return "wav" } if strings.Contains(contentType, "audio/mpeg") || strings.Contains(contentType, "mp3") { return "mp3" } if strings.Contains(contentType, "audio/ogg") || strings.Contains(contentType, "opus") { return "ogg" } return "amr" // default for QQ voice messages } // asrEndpoint derives the DashScope ASR REST endpoint from the provider base URL. 
func asrEndpoint(baseURL string) string { if u, err := url.Parse(baseURL); err == nil { return fmt.Sprintf("%s://%s/api/v1/services/audio/asr/asr", u.Scheme, u.Host) } return strings.TrimRight(baseURL, "/") + "/api/v1/services/audio/asr/asr" } func (p *DashScopeASRProvider) Transcribe(ctx context.Context, audioURL string) (string, error) { if !p.IsAvailable() { return "", fmt.Errorf("DashScope ASR API key not configured") } audioData, format, err := p.downloadAudio(ctx, audioURL) if err != nil { return "", fmt.Errorf("download audio: %w", err) } audioB64 := base64.StdEncoding.EncodeToString(audioData) reqBody := asrRequest{ Model: p.model, Input: asrInput{ Audio: fmt.Sprintf("data:audio/%s;base64,%s", format, audioB64), }, Parameters: asrParams{ Format: format, Language: "zh", }, } bodyBytes, err := json.Marshal(reqBody) if err != nil { return "", fmt.Errorf("marshal ASR request: %w", err) } asrURL := asrEndpoint(p.baseURL) req, err := http.NewRequestWithContext(ctx, "POST", asrURL, bytes.NewReader(bodyBytes)) if err != nil { return "", fmt.Errorf("create ASR request: %w", err) } req.Header.Set("Content-Type", "application/json") req.Header.Set("Authorization", "Bearer "+p.apiKey) resp, err := p.client.Do(req) if err != nil { return "", fmt.Errorf("ASR request failed: %w", err) } defer resp.Body.Close() respBytes, err := io.ReadAll(resp.Body) if err != nil { return "", fmt.Errorf("read ASR response: %w", err) } var asrResp asrResponse if err := json.Unmarshal(respBytes, &asrResp); err != nil { return "", fmt.Errorf("parse ASR response: %w", err) } if asrResp.Code != "" && asrResp.Code != "0" { return "", fmt.Errorf("ASR error: %s (code=%s)", asrResp.Message, asrResp.Code) } return asrResp.Output.Text, nil }