refactor: 认证系统重构 + DevTools CLI 重写 + 文档全面更新

- auth: Login 简化为管理员始终通过 .env 验证,GetProfile 修正 admin DB 查询
- devtools: .sh/.bat 同步重写为完整 CLI (start/stop/status/logs/build/db:*)
- docs: 新增 devtools.md,重写 Deploy.md (三种方式+Windows说明),更新 README/gateway-api
- voice-service: DashScope 实时流式 STT 支持
- gateway: Phase 6 多模型配置 + 多端客户端管理 + WebSocket 增强

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-24 14:55:47 +08:00
parent 83e94d9e97
commit 7eb5e984c2
18 changed files with 2405 additions and 677 deletions
@@ -2,15 +2,21 @@ package service
import (
"context"
"encoding/base64"
"encoding/json"
"fmt"
"os"
"os/exec"
"strings"
"sync"
"time"
"github.com/gorilla/websocket"
)
// DashScopeSTT 使用阿里云百炼 Gummy 模型进行语音识别。
// WebSocket API: wss://dashscope.aliyuncs.com/api-ws/v1/inference
// DashScopeSTT 使用阿里云百炼 Qwen ASR 模型进行语音识别。
// 实时模型 (qwen3-asr-flash-realtime) 通过 WebSocket realtime 端点进行流式识别,
// 基于 session/VAD 协议(类似 OpenAI Realtime API)。
type DashScopeSTT struct {
apiKey string
model string
@@ -20,7 +26,7 @@ type DashScopeSTT struct {
// NewDashScopeSTT 创建 DashScope STT 客户端。
func NewDashScopeSTT(apiKey, model string) *DashScopeSTT {
if model == "" {
model = "gummy-chat-v1"
model = "qwen3-asr-flash-realtime"
}
return &DashScopeSTT{
apiKey: apiKey,
@@ -34,232 +40,402 @@ func (d *DashScopeSTT) IsAvailable() bool {
return d.apiKey != ""
}
// sttMessage 定义 STT WebSocket 协议消息格式
type sttMessage struct {
Header sttHeader `json:"header"`
Payload sttPayload `json:"payload"`
// Model reports the name of the DashScope ASR model this client is configured to use.
func (d *DashScopeSTT) Model() string {
	return d.model
}
// --- Realtime 端点协议消息类型 ---
// rtClientMsg is a client-to-server event sent over the realtime WebSocket.
// Type is always serialized; every other field is omitted from the JSON when
// it holds its zero value.
type rtClientMsg struct {
	// EventID is an optional client-chosen identifier for the event.
	EventID string `json:"event_id,omitempty"`
	// Type is the event name, e.g. "session.update", "input_audio_buffer.append"
	// or "session.finish".
	Type string `json:"type"`
	// Session carries the session settings sent with a "session.update" event.
	Session interface{} `json:"session,omitempty"`
	// Audio is the base64-encoded audio chunk sent with an
	// "input_audio_buffer.append" event.
	Audio string `json:"audio,omitempty"`
}
type sttHeader struct {
Streaming string `json:"streaming"`
TaskID string `json:"task_id"`
Action string `json:"action"`
// rtServerMsg is a server-to-client event received from the realtime WebSocket.
// One struct is used for every event type; which fields are populated depends
// on Type, so readers must switch on Type before looking at the payload.
type rtServerMsg struct {
	EventID string `json:"event_id,omitempty"`
	// Type is the event name, e.g. "session.created", "response.done" or "error".
	Type string `json:"type"`
	// Session is kept undecoded; nothing in this file inspects it.
	Session json.RawMessage `json:"session,omitempty"`
	// Error is the failure detail that accompanies an "error" event. It may be
	// nil even when Type is "error", so callers nil-check it.
	Error *rtError `json:"error,omitempty"`
	// Delta is the incremental text of a "response.audio_transcript.delta" event.
	Delta string `json:"delta,omitempty"`
	// Response is consulted on "response.done" as a fallback source of the
	// transcript when no text was collected from earlier events.
	Response *struct {
		Output []struct {
			Transcript string `json:"transcript,omitempty"`
		} `json:"output,omitempty"`
	} `json:"response,omitempty"`
	// Transcript is the top-level final text of a
	// "conversation.item.input_audio_transcription.completed" event.
	Transcript string `json:"transcript,omitempty"`
	// Item is the alternative location of the final text on a
	// "conversation.item.input_audio_transcription.completed" event; it is
	// read only when Transcript is empty.
	Item *struct {
		Content []struct {
			Transcript string `json:"transcript,omitempty"`
		} `json:"content,omitempty"`
	} `json:"item,omitempty"`
}
type sttPayload struct {
Model string `json:"model"`
TaskGroup string `json:"task_group"`
Task string `json:"task"`
Function string `json:"function"`
Input map[string]interface{} `json:"input,omitempty"`
Parameters sttParameters `json:"parameters"`
Output map[string]interface{} `json:"output,omitempty"`
}
type sttParameters struct {
SampleRate int `json:"sample_rate"`
Format string `json:"format"`
TranscriptionEnabled bool `json:"transcription_enabled"`
TranslationEnabled bool `json:"translation_enabled"`
SourceLanguage string `json:"source_language,omitempty"`
MaxEndSilence int `json:"max_end_silence,omitempty"`
}
// sttServerMsg 服务端返回的消息格式。
type sttServerMsg struct {
Header sttServerHeader `json:"header"`
Payload sttServerPayload `json:"payload"`
}
type sttServerHeader struct {
TaskID string `json:"task_id"`
Event string `json:"event"`
}
type sttServerPayload struct {
Output map[string]interface{} `json:"output,omitempty"`
Usage map[string]interface{} `json:"usage,omitempty"`
Error sttError `json:"error,omitempty"`
}
type sttError struct {
// rtError is the error payload attached to a server "error" event.
// Only Message is surfaced to callers in this file; the other fields are
// decoded but not otherwise used here.
type rtError struct {
	Type    string `json:"type"`
	Code    string `json:"code"`
	Message string `json:"message"`
	Param   string `json:"param,omitempty"`
}
// Transcribe 将音频数据发送到 DashScope 进行识别,返回识别文本。
// 使用 realtime 端点,通过 Server VAD 自动检测语音并触发转录。
func (d *DashScopeSTT) Transcribe(ctx context.Context, audioData []byte, format string, language string) (string, error) {
if !d.IsAvailable() {
return "", fmt.Errorf("DashScope API Key 未配置")
}
dialer := websocket.Dialer{
HandshakeTimeout: 10 * time.Second,
url := fmt.Sprintf("wss://dashscope.aliyuncs.com/api-ws/v1/realtime?model=%s", d.model)
header := map[string][]string{
"Authorization": {"Bearer " + d.apiKey},
}
header := make(map[string][]string)
header["Authorization"] = []string{"Bearer " + d.apiKey}
conn, _, err := dialer.DialContext(ctx, "wss://dashscope.aliyuncs.com/api-ws/v1/inference", header)
dialer := websocket.Dialer{HandshakeTimeout: 10 * time.Second}
conn, _, err := dialer.DialContext(ctx, url, header)
if err != nil {
return "", fmt.Errorf("连接 DashScope STT 失败: %w", err)
}
defer conn.Close()
conn.SetReadDeadline(time.Now().Add(d.timeout))
// 1. session.created
conn.SetReadDeadline(time.Now().Add(10 * time.Second))
var msg rtServerMsg
if err := conn.ReadJSON(&msg); err != nil {
return "", fmt.Errorf("等待 session.created 失败: %w", err)
}
if msg.Type != "session.created" {
return "", fmt.Errorf("预期 session.created 但收到: %s", msg.Type)
}
taskID := fmt.Sprintf("cyrene-stt-%d", time.Now().UnixNano())
// 规范化音频格式
normFormat := normalizeSTTFormat(format)
// 2. session.update
if language == "" || language == "auto" {
language = "zh"
}
// 发送 run-task
startMsg := sttMessage{
Header: sttHeader{
Streaming: "duplex",
TaskID: taskID,
Action: "run-task",
},
Payload: sttPayload{
Model: d.model,
TaskGroup: "audio",
Task: "asr",
Function: "recognition",
Parameters: sttParameters{
SampleRate: 16000,
Format: normFormat,
TranscriptionEnabled: true,
TranslationEnabled: false,
SourceLanguage: language,
updateMsg := rtClientMsg{
Type: "session.update",
Session: map[string]interface{}{
"modalities": []string{"text"},
"input_audio_format": "pcm",
"sample_rate": 16000,
"input_audio_transcription": map[string]interface{}{
"language": language,
},
"turn_detection": map[string]interface{}{
"type": "server_vad",
},
},
}
if err := conn.WriteJSON(startMsg); err != nil {
return "", fmt.Errorf("发送 run-task 失败: %w", err)
conn.SetWriteDeadline(time.Now().Add(10 * time.Second))
if err := conn.WriteJSON(updateMsg); err != nil {
return "", fmt.Errorf("发送 session.update 失败: %w", err)
}
// 等待 task-started
var textResult string
var mu sync.Mutex
started := make(chan struct{})
errc := make(chan error, 1)
done := make(chan struct{})
// 3. session.updated
conn.SetReadDeadline(time.Now().Add(10 * time.Second))
if err := conn.ReadJSON(&msg); err != nil {
return "", fmt.Errorf("等待 session.updated 失败: %w", err)
}
if msg.Type == "error" && msg.Error != nil {
return "", fmt.Errorf("session.update 失败: %s", msg.Error.Message)
}
// 4. 规范化音频格式并发送
pcmData, err := convertToPCM16(audioData, format)
if err != nil {
return "", fmt.Errorf("音频格式转换失败: %w", err)
}
chunkSize := 3200
for i := 0; i < len(pcmData); i += chunkSize {
end := i + chunkSize
if end > len(pcmData) {
end = len(pcmData)
}
chunkB64 := base64.StdEncoding.EncodeToString(pcmData[i:end])
audioMsg := rtClientMsg{
Type: "input_audio_buffer.append",
Audio: chunkB64,
}
conn.SetWriteDeadline(time.Now().Add(10 * time.Second))
if err := conn.WriteJSON(audioMsg); err != nil {
return "", fmt.Errorf("发送音频数据失败: %w", err)
}
}
// 5. 等待转录结果
// 用 goroutine + channel 避免 gorilla/websocket 超时后重复读取 panic
type readResult struct {
msg rtServerMsg
err error
}
msgCh := make(chan readResult, 1)
readDone := make(chan struct{})
defer close(readDone)
go func() {
defer close(done)
startedClosed := false
for {
var msg sttServerMsg
if err := conn.ReadJSON(&msg); err != nil {
select {
case errc <- fmt.Errorf("读取响应失败: %w", err):
default:
}
select {
case <-readDone:
return
default:
}
var m rtServerMsg
err := conn.ReadJSON(&m)
select {
case msgCh <- readResult{m, err}:
case <-readDone:
return
}
switch msg.Header.Event {
case "task-started":
if !startedClosed {
close(started)
startedClosed = true
}
case "result-generated":
if out, ok := msg.Payload.Output["transcription"]; ok {
if transMap, ok := out.(map[string]interface{}); ok {
if text, ok := transMap["text"].(string); ok {
mu.Lock()
textResult = text
mu.Unlock()
}
}
}
case "task-finished":
return
case "task-failed":
errMsg := msg.Payload.Error.Message
if errMsg == "" {
errMsg = "未知错误"
}
select {
case errc <- fmt.Errorf("DashScope 识别失败: %s (code=%s)", errMsg, msg.Payload.Error.Code):
default:
}
if err != nil {
return
}
}
}()
// 等待 task-started 或错误
select {
case <-started:
case err := <-errc:
return "", err
case <-ctx.Done():
return "", ctx.Err()
}
var textResult string
silenceTimeout := 3 * time.Second
timer := time.NewTimer(60 * time.Second)
defer timer.Stop()
// 发送音频数据(分块发送,每块 ~10KB)
chunkSize := 10240
for i := 0; i < len(audioData); i += chunkSize {
end := i + chunkSize
if end > len(audioData) {
end = len(audioData)
}
conn.SetWriteDeadline(time.Now().Add(10 * time.Second))
if err := conn.WriteMessage(websocket.BinaryMessage, audioData[i:end]); err != nil {
return "", fmt.Errorf("发送音频数据失败: %w", err)
}
}
for {
select {
case result := <-msgCh:
if result.err != nil {
if websocket.IsUnexpectedCloseError(result.err) {
return "", fmt.Errorf("连接异常关闭: %w", result.err)
}
return textResult, nil
}
// 发送 finish-task
finishMsg := sttMessage{
Header: sttHeader{
Streaming: "duplex",
TaskID: taskID,
Action: "finish-task",
},
}
if err := conn.WriteJSON(finishMsg); err != nil {
return "", fmt.Errorf("发送 finish-task 失败: %w", err)
}
msg := result.msg
// 等待完成
select {
case <-done:
mu.Lock()
text := textResult
mu.Unlock()
if text == "" {
return "", fmt.Errorf("未收到识别结果")
switch msg.Type {
case "conversation.item.input_audio_transcription.completed":
if msg.Transcript != "" {
if textResult != "" {
textResult += "\n"
}
textResult += msg.Transcript
}
if textResult == "" && msg.Item != nil {
for _, c := range msg.Item.Content {
if c.Transcript != "" {
textResult = c.Transcript
}
}
}
case "response.audio_transcript.delta":
if msg.Delta != "" {
textResult += msg.Delta
}
case "response.done":
if textResult == "" && msg.Response != nil {
for _, o := range msg.Response.Output {
if o.Transcript != "" {
textResult += o.Transcript
}
}
}
if textResult != "" {
return textResult, nil
}
case "error":
if msg.Error != nil {
return "", fmt.Errorf("DashScope 识别失败: %s", msg.Error.Message)
}
return "", fmt.Errorf("DashScope 返回未知错误")
}
if textResult != "" {
timer.Reset(silenceTimeout)
}
case <-timer.C:
return textResult, nil
}
return text, nil
case err := <-errc:
return "", err
case <-ctx.Done():
return "", ctx.Err()
}
}
// normalizeSTTFormat 将音频格式映射到 DashScope 支持的格式名。
func normalizeSTTFormat(format string) string {
switch format {
case "wav":
return "wav"
case "mp3", "mpeg":
return "mp3"
case "ogg", "opus":
return "ogg"
case "flac":
return "flac"
case "m4a", "aac", "mp4":
return "aac"
default:
return "pcm"
// --- 流式识别 (StreamingSession) ---
// StreamingSession holds one long-lived DashScope realtime WebSocket
// connection used for live speech recognition.
type StreamingSession struct {
	// conn is the established WebSocket connection to DashScope.
	conn *websocket.Conn
	// results delivers recognition output (and errors) produced by readLoop.
	results chan StreamingResult
	// done is closed by readLoop when it returns.
	done chan struct{}
	// mu guards closed and is held by SendAudio while it writes to conn.
	mu sync.Mutex
	// closed is set once Close has been called; SendAudio refuses to write afterwards.
	closed bool
}
// StreamingResult is a single item delivered on a StreamingSession's result
// channel: either recognized text or an error description.
type StreamingResult struct {
	// Text is the recognized text: an incremental delta when IsFinal is false,
	// a completed transcript when IsFinal is true.
	Text string `json:"text"`
	// IsFinal is true for completed-transcription events and false for deltas.
	IsFinal bool `json:"is_final"`
	// Error is non-empty when reading from the server failed or the server
	// reported an error; Text is empty in that case.
	Error string `json:"error,omitempty"`
}
// StartStreaming opens a DashScope realtime WebSocket connection, performs the
// session handshake and returns a StreamingSession whose read loop is already
// running.
//
// Handshake: wait for "session.created", send "session.update" (text modality,
// PCM input at 16000 Hz, server-side VAD, requested language) and read the
// server's reply to it.
//
// ctx is only passed to the WebSocket dial. format is currently unused: the
// session is always configured for raw PCM input. An empty or "auto" language
// is sent as "zh".
//
// It returns an error if the API key is missing, the dial fails, any handshake
// step fails or times out (10 s per step), or the server answers
// "session.update" with an "error" event. The connection is closed on every
// failure path.
func (d *DashScopeSTT) StartStreaming(ctx context.Context, format, language string) (*StreamingSession, error) {
	if !d.IsAvailable() {
		return nil, fmt.Errorf("DashScope API Key 未配置")
	}

	url := fmt.Sprintf("wss://dashscope.aliyuncs.com/api-ws/v1/realtime?model=%s", d.model)
	header := map[string][]string{
		"Authorization": {"Bearer " + d.apiKey},
	}

	dialer := websocket.Dialer{HandshakeTimeout: 10 * time.Second}
	conn, _, err := dialer.DialContext(ctx, url, header)
	if err != nil {
		return nil, fmt.Errorf("连接 DashScope STT 失败: %w", err)
	}

	// Single cleanup point: unless the handshake completes, drop the connection.
	established := false
	defer func() {
		if !established {
			conn.Close()
		}
	}()

	// 1. session.created
	conn.SetReadDeadline(time.Now().Add(10 * time.Second))
	var created rtServerMsg
	if err := conn.ReadJSON(&created); err != nil {
		return nil, fmt.Errorf("等待 session.created 失败: %w", err)
	}
	if created.Type != "session.created" {
		return nil, fmt.Errorf("预期 session.created 但收到: %s", created.Type)
	}

	// 2. session.update
	if language == "" || language == "auto" {
		language = "zh"
	}
	updateMsg := rtClientMsg{
		Type: "session.update",
		Session: map[string]interface{}{
			"modalities":         []string{"text"},
			"input_audio_format": "pcm",
			"sample_rate":        16000,
			"input_audio_transcription": map[string]interface{}{
				"language": language,
			},
			"turn_detection": map[string]interface{}{
				"type": "server_vad",
			},
		},
	}
	conn.SetWriteDeadline(time.Now().Add(10 * time.Second))
	if err := conn.WriteJSON(updateMsg); err != nil {
		return nil, fmt.Errorf("发送 session.update 失败: %w", err)
	}

	// 3. session.updated
	// Decode into a fresh value so no field from the previous event lingers.
	conn.SetReadDeadline(time.Now().Add(10 * time.Second))
	var updated rtServerMsg
	if err := conn.ReadJSON(&updated); err != nil {
		return nil, fmt.Errorf("等待 session.updated 失败: %w", err)
	}
	if updated.Type == "error" {
		// An error event without a payload is still a failure.
		reason := "未知错误"
		if updated.Error != nil {
			reason = updated.Error.Message
		}
		return nil, fmt.Errorf("session.update 失败: %s", reason)
	}

	// The handshake deadline stays armed on the connection for all later reads.
	// Clear it, otherwise the read loop fails with an i/o timeout roughly ten
	// seconds after setup even though the session is healthy.
	if err := conn.SetReadDeadline(time.Time{}); err != nil {
		return nil, fmt.Errorf("连接 DashScope STT 失败: %w", err)
	}

	session := &StreamingSession{
		conn:    conn,
		results: make(chan StreamingResult, 64),
		done:    make(chan struct{}),
	}
	established = true
	go session.readLoop()
	return session, nil
}
// SendAudio forwards one frame of audio to DashScope as an
// "input_audio_buffer.append" event with a base64-encoded payload.
// data must be 16-bit little-endian PCM, 16000 Hz, mono.
//
// The session mutex is held for the whole call, so concurrent callers are
// serialized. It returns an error if the session is already closed or if the
// write fails or exceeds its 10 s deadline.
func (s *StreamingSession) SendAudio(data []byte) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	if s.closed {
		return fmt.Errorf("session 已关闭")
	}

	frame := rtClientMsg{
		Type:  "input_audio_buffer.append",
		Audio: base64.StdEncoding.EncodeToString(data),
	}

	writeBy := time.Now().Add(10 * time.Second)
	s.conn.SetWriteDeadline(writeBy)
	return s.conn.WriteJSON(frame)
}
// Results exposes the receive-only side of the channel on which recognition
// results and errors are delivered.
func (s *StreamingSession) Results() <-chan StreamingResult {
	var stream <-chan StreamingResult = s.results
	return stream
}
// Close ends the session: it asks the server to finish, waits for the read
// loop to stop, closes the result channel and closes the WebSocket.
//
// Close is idempotent; calls after the first return nil immediately. The
// returned error is the one reported by closing the WebSocket.
//
// Ordering matters: the result channel is only closed after the read loop has
// exited. Closing it while the loop is still alive would make the loop's next
// send panic with "send on closed channel". That happens when the server does
// not hang up within the grace period and the loop then reports the read error
// caused by closing the socket.
func (s *StreamingSession) Close() error {
	s.mu.Lock()
	if s.closed {
		s.mu.Unlock()
		return nil
	}
	s.closed = true
	s.mu.Unlock()

	// Best effort: if the finish request cannot be written, the connection is
	// torn down below anyway, so the write errors are deliberately ignored.
	_ = s.conn.SetWriteDeadline(time.Now().Add(5 * time.Second))
	_ = s.conn.WriteJSON(rtClientMsg{Type: "session.finish"})

	// Give the server a grace period to flush final results and hang up.
	grace := time.NewTimer(5 * time.Second)
	defer grace.Stop()
	select {
	case <-s.done:
		close(s.results)
		return s.conn.Close()
	case <-grace.C:
	}

	// The read loop is still running. Closing the socket makes its pending
	// read fail; keep draining results so the loop can never stay blocked on a
	// full channel, and close the channel only once the loop is gone.
	err := s.conn.Close()
	for {
		select {
		case <-s.done:
			close(s.results)
			return err
		case <-s.results:
			// Discarded: the session is shutting down.
		}
	}
}
// readLoop pumps server events from the WebSocket and translates them into
// StreamingResult values on s.results. It closes s.done when it returns.
//
// It stops after the first read failure or server "error" event, each of which
// is reported as a result with Error set. Transcript deltas are forwarded with
// IsFinal=false and completed transcriptions with IsFinal=true; every other
// event type is ignored.
func (s *StreamingSession) readLoop() {
	defer close(s.done)

	for {
		var event rtServerMsg
		readErr := s.conn.ReadJSON(&event)
		if readErr != nil {
			s.results <- StreamingResult{Error: fmt.Sprintf("读取响应失败: %v", readErr)}
			return
		}

		// A server-side failure ends the loop.
		if event.Type == "error" {
			detail := "未知错误"
			if event.Error != nil {
				detail = event.Error.Message
			}
			s.results <- StreamingResult{Error: fmt.Sprintf("DashScope 识别失败: %s", detail)}
			return
		}

		// Incremental text.
		if event.Type == "response.audio_transcript.delta" {
			s.results <- StreamingResult{Text: event.Delta, IsFinal: false}
			continue
		}

		// Everything except a completed transcription is bookkeeping and skipped.
		if event.Type != "conversation.item.input_audio_transcription.completed" {
			continue
		}

		// Prefer the top-level transcript; otherwise fall back to the item content.
		if event.Transcript != "" {
			s.results <- StreamingResult{Text: event.Transcript, IsFinal: true}
			continue
		}
		if event.Item == nil {
			continue
		}
		for _, part := range event.Item.Content {
			if part.Transcript == "" {
				continue
			}
			s.results <- StreamingResult{Text: part.Transcript, IsFinal: true}
		}
	}
}
@@ -271,3 +447,74 @@ func (d *DashScopeSTT) GetStatus() map[string]interface{} {
"provider": "dashscope",
}
}
// normalizeSTTFormat canonicalizes an audio format name. Names that match a
// known format case-insensitively are returned in lower case; any other value
// is returned exactly as given.
func normalizeSTTFormat(format string) string {
	lowered := strings.ToLower(format)
	known := []string{"pcm", "wav", "mp3", "mpeg", "ogg", "opus", "flac", "m4a", "mp4", "aac", "webm"}
	for _, name := range known {
		if lowered == name {
			return lowered
		}
	}
	return format
}
// convertToPCM16 turns audio in the given format into raw PCM intended for the
// realtime endpoint (16-bit, 16000 Hz, mono).
//
//   - "pcm" input is returned unchanged.
//   - "wav" input has its container stripped: the RIFF chunks are walked and the
//     payload of the "data" chunk is returned. If the header cannot be parsed,
//     the first 44 bytes are dropped as a last resort (the size of a minimal
//     WAV header), and inputs of 44 bytes or fewer are returned unchanged.
//   - every other format is handed to transcodeToPCM.
//
// NOTE(review): the WAV path does not resample. A WAV file that is not already
// 16-bit/16 kHz/mono is passed through with its original sample layout.
//
// The returned slice may alias data. The only errors are those reported by
// transcodeToPCM.
func convertToPCM16(data []byte, format string) ([]byte, error) {
	normFormat := normalizeSTTFormat(format)
	switch normFormat {
	case "pcm":
		return data, nil
	case "wav":
		if pcm, ok := wavDataChunk(data); ok {
			return pcm, nil
		}
		// Not a parseable RIFF/WAVE file: fall back to skipping a minimal header.
		if len(data) > 44 {
			return data[44:], nil
		}
		return data, nil
	default:
		return transcodeToPCM(data, normFormat)
	}
}

// wavDataChunk returns the payload of the "data" chunk of a RIFF/WAVE file, or
// false when data is not a well-formed WAVE container or has no "data" chunk.
func wavDataChunk(data []byte) ([]byte, bool) {
	if len(data) < 12 || string(data[0:4]) != "RIFF" || string(data[8:12]) != "WAVE" {
		return nil, false
	}

	offset := 12
	for offset+8 <= len(data) {
		id := string(data[offset : offset+4])
		// Chunk sizes are 32-bit little-endian.
		size := int64(uint32(data[offset+4]) |
			uint32(data[offset+5])<<8 |
			uint32(data[offset+6])<<16 |
			uint32(data[offset+7])<<24)
		body := int64(offset) + 8

		if id == "data" {
			end := body + size
			// Writers that stream their output leave the size as 0 or as an
			// out-of-range placeholder; in that case take everything that follows.
			if size == 0 || end > int64(len(data)) {
				end = int64(len(data))
			}
			return data[body:end], true
		}

		// Chunks are padded to an even number of bytes.
		next := body + size + size%2
		if next > int64(len(data)) {
			return nil, false
		}
		offset = int(next)
	}
	return nil, false
}
// transcodeToPCM converts audio to raw PCM (signed 16-bit little-endian,
// 16000 Hz, mono) by running the external ffmpeg binary found on PATH.
//
// data is written to a temporary input file whose extension is format, ffmpeg
// writes to a temporary output file, and both files are removed before the
// function returns.
//
// It returns an error when a temporary file cannot be created or written, when
// ffmpeg is missing, fails or exceeds the time limit, or when the output cannot
// be read. ffmpeg's own diagnostics are appended to the error when available.
func transcodeToPCM(data []byte, format string) ([]byte, error) {
	// Upper bound for one conversion so a stuck ffmpeg cannot block the caller forever.
	const ffmpegTimeout = 60 * time.Second

	inFile, err := os.CreateTemp(os.TempDir(), "cyrene-asr-in-*."+format)
	if err != nil {
		return nil, fmt.Errorf("创建输入临时文件失败: %w", err)
	}
	inPath := inFile.Name()
	defer os.Remove(inPath)

	// Always close the file, and treat a failed close as a failed write:
	// buffered data may not have reached the disk.
	_, writeErr := inFile.Write(data)
	closeErr := inFile.Close()
	if writeErr != nil {
		return nil, fmt.Errorf("写入输入临时文件失败: %w", writeErr)
	}
	if closeErr != nil {
		return nil, fmt.Errorf("写入输入临时文件失败: %w", closeErr)
	}

	outFile, err := os.CreateTemp(os.TempDir(), "cyrene-asr-out-*.pcm")
	if err != nil {
		return nil, fmt.Errorf("创建输出临时文件失败: %w", err)
	}
	outPath := outFile.Name()
	outFile.Close()
	defer os.Remove(outPath)

	ctx, cancel := context.WithTimeout(context.Background(), ffmpegTimeout)
	defer cancel()

	// -y comes first so it is not a trailing option; it is required because the
	// output file already exists. The log level is lowered so that stderr only
	// carries the actual failure reason.
	cmd := exec.CommandContext(ctx, "ffmpeg",
		"-y",
		"-hide_banner",
		"-loglevel", "error",
		"-i", inPath,
		"-ar", "16000",
		"-ac", "1",
		"-c:a", "pcm_s16le",
		"-f", "s16le",
		outPath,
	)
	var stderr strings.Builder
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		if ctx.Err() != nil {
			return nil, fmt.Errorf("音频转码失败 (ffmpeg): %w", ctx.Err())
		}
		if detail := strings.TrimSpace(stderr.String()); detail != "" {
			return nil, fmt.Errorf("音频转码失败 (ffmpeg): %w: %s", err, detail)
		}
		return nil, fmt.Errorf("音频转码失败 (ffmpeg): %w", err)
	}

	outData, err := os.ReadFile(outPath)
	if err != nil {
		return nil, fmt.Errorf("读取转码结果失败: %w", err)
	}
	return outData, nil
}
@@ -16,21 +16,27 @@ import (
var SupportedLanguages = []string{"zh", "en", "ja", "ko", "auto"}
// STTService 语音转文字服务。
// 优先使用 DashScope Gummy API,不可用时回退到本地 Whisper。
// 优先使用 DashScope API,不可用时回退到本地 Whisper。
type STTService struct {
whisperBinary string
whisperModel string
language string
dashscope *DashScopeSTT
dashscope *DashScopeSTT // 实时 ASR (qwen3-asr-flash-realtime)
}
// NewSTTService 创建 STT 服务。
func NewSTTService(cfg *config.Config) *STTService {
// 实时模型用于所有 WebSocket ASR 请求(支持 one-shot 和 streaming
// 离线模型 (qwen3-asr-flash-2026-02-10) 是 HTTP REST API,暂未实现
model := cfg.DashScopeSTTRealtime
if model == "" {
model = cfg.DashScopeModel
}
return &STTService{
whisperBinary: cfg.WhisperBinary,
whisperModel: cfg.WhisperModel,
language: cfg.WhisperLanguage,
dashscope: NewDashScopeSTT(cfg.DashScopeAPIKey, cfg.DashScopeModel),
dashscope: NewDashScopeSTT(cfg.DashScopeAPIKey, model),
}
}
@@ -58,15 +64,30 @@ func (s *STTService) Transcribe(audioData []byte, format string, language string
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
text, err := s.dashscope.Transcribe(ctx, audioData, format, language)
if err == nil && text != "" {
if err == nil {
return text, nil
}
// DashScope 失败,返回具体错误而不是回退到 Whisper
return "", fmt.Errorf("语音识别失败: %w", err)
}
// 回退到本地 Whisper
return s.transcribeWhisper(audioData, format, language)
}
// StartStreaming opens a long-lived streaming recognition session.
//
// Streaming is only available through DashScope, so an error is returned when
// no API key is configured. An empty language falls back to the service's
// configured default. Connection setup is bounded by a 15 s timeout.
func (s *STTService) StartStreaming(format, language string) (*StreamingSession, error) {
	engine := s.dashscope
	if !engine.IsAvailable() {
		return nil, fmt.Errorf("流式识别需要 DashScope,请配置 DASHSCOPE_API_KEY")
	}

	lang := language
	if lang == "" {
		lang = s.language
	}

	dialCtx, stop := context.WithTimeout(context.Background(), 15*time.Second)
	defer stop()

	return engine.StartStreaming(dialCtx, format, lang)
}
// transcribeWhisper 使用本地 Whisper 引擎转录。
func (s *STTService) transcribeWhisper(audioData []byte, format string, language string) (string, error) {
if _, err := os.Stat(s.whisperBinary); err != nil {