feat: 第五轮开发 - 14项未来路线图功能完整实现

W1-W14 全部完成:
- W1: 消息搜索 (ILIKE全文检索 + SearchModal)
- W2: 对话导出 (JSON/Markdown/TXT三格式)
- W3: 记忆时间线 DevTools 可视化
- W4: 通知推送系统 (WebSocket + Browser Notification API)
- W5: 定时提醒 (30s轮询 + 重复提醒 + WebSocket推送)
- W6: 每日简报 (08:00自动生成: 天气+新闻+提醒+AI摘要)
- W7: IoT场景自动化 (规则引擎 10s轮询 + 条件评估 + 场景执行)
- W8: 语音输入 (浏览器 Speech Recognition API)
- W9: STT服务 (voice-service + whisper.cpp)
- W10: TTS服务 (浏览器 Speech Synthesis + edge-tts三档回退)
- W11: 文件管理 (上传/下载/缩略图/纯Go bilinear缩放)
- W12: 知识库RAG (PostgreSQL tsvector + 文档分块 + 检索)
- W13: 多模态 (图片上传+分析: Vision API + 本地Go分析回退)
- W14: PWA (Service Worker + 离线页 + install prompt)

总计: 6个Go微服务 + 10+前端组件 + 10+ PostgreSQL表 + 4个后台调度器
This commit is contained in:
2026-05-19 12:01:09 +08:00
parent 78e3f450c2
commit bcf4d4e621
69 changed files with 14599 additions and 150 deletions
@@ -0,0 +1,175 @@
package service
import (
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"github.com/yourname/cyrene-ai/voice-service/internal/config"
)
// SupportedLanguages lists the language codes the STT service accepts.
// "auto" is accepted alongside the explicit language codes.
var SupportedLanguages = []string{
	"zh",
	"en",
	"ja",
	"ko",
	"auto",
}
// STTService is the speech-to-text service. It stores the settings needed to
// invoke the whisper command-line binary.
type STTService struct {
	// whisperBinary is the whisper executable that gets run.
	whisperBinary string
	// whisperModel is the model file handed to whisper with -m.
	whisperModel string
	// language is the default language used when a call does not supply one.
	language string
}
// NewSTTService builds an STTService from the whisper settings carried by cfg:
// the binary path, the model path and the default language.
func NewSTTService(cfg *config.Config) *STTService {
	svc := new(STTService)
	svc.whisperBinary = cfg.WhisperBinary
	svc.whisperModel = cfg.WhisperModel
	svc.language = cfg.WhisperLanguage
	return svc
}
// IsAvailable reports whether the configured whisper binary can be executed.
//
// The check uses exec.LookPath, the same resolution exec.Command applies when
// Transcribe launches the binary: a value containing a path separator is tried
// directly, a bare name is searched on PATH, and directories or non-executable
// files are rejected. A plain existence check would accept files that cannot
// be run and reject a binary that is installed on PATH.
func (s *STTService) IsAvailable() bool {
	_, err := exec.LookPath(s.whisperBinary)
	return err == nil
}
// Transcribe converts audio bytes to text by running the whisper binary.
//
// Parameters:
//   - audioData: raw contents of the audio file.
//   - format: audio format name (wav, mp3, ogg, flac, m4a), matched
//     case-insensitively. It selects the temp-file extension and decides
//     whether an ffmpeg conversion to WAV is attempted first.
//   - language: one of SupportedLanguages; empty selects the service default.
//
// It returns the whitespace-trimmed transcript. An error is returned when the
// engine is not installed, the language is unsupported, the temp file cannot
// be written, the whisper process fails, or its output file cannot be read.
func (s *STTService) Transcribe(audioData []byte, format string, language string) (string, error) {
	if !s.IsAvailable() {
		return "", fmt.Errorf("STT 引擎未安装,请运行 scripts/setup-whisper.sh")
	}

	// Fall back to the configured default language.
	if language == "" {
		language = s.language
	}
	if !isSupportedLanguage(language) {
		return "", fmt.Errorf("不支持的语言: %s,支持的语言: %s", language, strings.Join(SupportedLanguages, ", "))
	}

	// whisper reads its input from disk, so persist the audio to a temp file.
	tmpFile, err := os.CreateTemp("/tmp", "cyrene-stt-*"+normalizeExt(format))
	if err != nil {
		return "", fmt.Errorf("创建临时文件失败: %w", err)
	}
	tmpPath := tmpFile.Name()
	defer os.Remove(tmpPath)

	if _, err := tmpFile.Write(audioData); err != nil {
		tmpFile.Close()
		return "", fmt.Errorf("写入临时文件失败: %w", err)
	}
	// A failed Close can mean the data never reached disk; treat it as a
	// write failure instead of handing whisper a truncated file.
	if err := tmpFile.Close(); err != nil {
		return "", fmt.Errorf("写入临时文件失败: %w", err)
	}

	// For anything other than WAV, try an ffmpeg conversion first. If it
	// fails, the original file is passed on unchanged (best effort).
	inputPath := tmpPath
	if f := strings.ToLower(format); f != "wav" && f != "" {
		convertedPath := tmpPath + ".wav"
		// Registered before the conversion runs so that a partial file left
		// by a failed ffmpeg run is cleaned up as well.
		defer os.Remove(convertedPath)
		if err := convertToWav(tmpPath, convertedPath); err == nil {
			inputPath = convertedPath
		}
	}

	// -of takes the output path WITHOUT extension and -otxt makes whisper
	// append ".txt" to it, so the transcript must be read from
	// outputBase+".txt" rather than inputPath+".txt".
	outputBase := strings.TrimSuffix(inputPath, filepath.Ext(inputPath))
	outputTxt := outputBase + ".txt"
	defer os.Remove(outputTxt)

	cmd := exec.Command(s.whisperBinary,
		"-m", s.whisperModel,
		"-l", language,
		"-f", inputPath,
		"-otxt",
		"-of", outputBase,
	)
	cmd.Stderr = os.Stderr
	if err := cmd.Run(); err != nil {
		return "", fmt.Errorf("whisper 转录失败: %w", err)
	}

	txtData, err := os.ReadFile(outputTxt)
	if err != nil {
		return "", fmt.Errorf("读取转录结果失败: %w", err)
	}
	return strings.TrimSpace(string(txtData)), nil
}
// GetStatus returns a snapshot of the STT service state as a string-keyed map:
// whether the binary and the model file are present, their configured paths,
// the model file's base name, the default language and the supported
// languages. "available" is true only when both binary and model are present.
func (s *STTService) GetStatus() map[string]interface{} {
	hasBinary := s.IsAvailable()

	// The model counts as loaded when its file can be stat'ed.
	_, statErr := os.Stat(s.whisperModel)
	hasModel := statErr == nil

	status := make(map[string]interface{}, 8)
	status["available"] = hasBinary && hasModel
	status["binary_available"] = hasBinary
	status["model_loaded"] = hasModel
	status["binary_path"] = s.whisperBinary
	status["model_path"] = s.whisperModel
	status["model_name"] = filepath.Base(s.whisperModel)
	status["default_language"] = s.language
	status["supported_languages"] = SupportedLanguages
	return status
}
// normalizeExt maps an audio format name (case-insensitive) to the file
// extension used for the temp file. Aliases share an extension (mpeg -> .mp3,
// opus -> .ogg, mp4/aac -> .m4a); unknown or empty names yield ".wav".
func normalizeExt(format string) string {
	extByFormat := map[string]string{
		"wav":  ".wav",
		"mp3":  ".mp3",
		"mpeg": ".mp3",
		"ogg":  ".ogg",
		"opus": ".ogg",
		"flac": ".flac",
		"m4a":  ".m4a",
		"mp4":  ".m4a",
		"aac":  ".m4a",
	}
	if ext, known := extByFormat[strings.ToLower(format)]; known {
		return ext
	}
	return ".wav"
}
// isSupportedLanguage reports whether lang is an exact (case-sensitive) match
// for one of the entries in SupportedLanguages.
func isSupportedLanguage(lang string) bool {
	found := false
	for i := 0; i < len(SupportedLanguages) && !found; i++ {
		found = SupportedLanguages[i] == lang
	}
	return found
}
// convertToWav runs ffmpeg to transcode inputPath into a WAV file at
// outputPath, requesting a 16000 Hz sample rate, one channel and the
// pcm_s16le codec, and overwriting any existing output (-y). ffmpeg's stderr
// is discarded; the returned error is whatever running the command yields.
func convertToWav(inputPath, outputPath string) error {
	args := []string{
		"-i", inputPath,
		"-ar", "16000",
		"-ac", "1",
		"-c:a", "pcm_s16le",
		outputPath,
		"-y",
	}
	ffmpeg := exec.Command("ffmpeg", args...)
	ffmpeg.Stderr = nil
	return ffmpeg.Run()
}
@@ -0,0 +1,294 @@
package service
import (
"fmt"
"os"
"os/exec"
"strings"
)
// TTSVoice describes a single voice the TTS service can offer. It serialises
// to JSON with the keys name, display_name, gender and locale.
type TTSVoice struct {
	// Name is the engine's voice identifier, e.g. "zh-CN-XiaoxiaoNeural".
	Name string `json:"name"`
	// DisplayName is the human-readable label for the voice.
	DisplayName string `json:"display_name"`
	// Gender is the voice gender as reported for the voice.
	Gender string `json:"gender"`
	// Locale is the locale tag of the voice, e.g. "zh-CN".
	Locale string `json:"locale"`
}
// BuiltinVoices is the built-in list of zh-CN edge-tts voices. GetVoices
// returns it whenever a live voice listing is not available.
var BuiltinVoices = []TTSVoice{
	{
		Name:        "zh-CN-XiaoxiaoNeural",
		DisplayName: "晓晓 (女声)",
		Gender:      "Female",
		Locale:      "zh-CN",
	},
	{
		Name:        "zh-CN-YunxiNeural",
		DisplayName: "云希 (男声)",
		Gender:      "Male",
		Locale:      "zh-CN",
	},
	{
		Name:        "zh-CN-XiaoyiNeural",
		DisplayName: "晓伊 (女声)",
		Gender:      "Female",
		Locale:      "zh-CN",
	},
}
// TTSService is the text-to-speech service. It holds no state; each method
// checks which synthesis engine is installed at the time it is called.
type TTSService struct{}
// NewTTSService returns a ready-to-use TTSService.
func NewTTSService() *TTSService {
	return new(TTSService)
}
// IsAvailable reports whether a real TTS engine is installed, i.e. edge-tts
// or espeak-ng. Engine priority is edge-tts > espeak-ng > pure-Go fallback;
// the fallback alone does not count as available.
func (s *TTSService) IsAvailable() bool {
	if s.edgeTTSAvailable() {
		return true
	}
	return s.espeakAvailable()
}
// edgeTTSAvailable reports whether an "edge-tts" executable can be found on PATH.
func (s *TTSService) edgeTTSAvailable() bool {
	if _, err := exec.LookPath("edge-tts"); err != nil {
		return false
	}
	return true
}
// espeakAvailable reports whether an "espeak-ng" executable can be found on PATH.
func (s *TTSService) espeakAvailable() bool {
	if _, err := exec.LookPath("espeak-ng"); err != nil {
		return false
	}
	return true
}
// Synthesize turns text into audio using the best engine that is installed.
//
// Parameters:
//   - text: the text to speak; an empty string is rejected with an error.
//   - voice: voice name (e.g. zh-CN-XiaoxiaoNeural), forwarded to the engine.
//   - rate: speaking-rate adjustment such as "+0%", "+20%" or "-20%"; only
//     the edge-tts path receives it.
//
// It returns the audio bytes, the audio format ("mp3" or "wav") and an error.
// Engines are tried in order: edge-tts, then espeak-ng, then the pure-Go
// fallback.
func (s *TTSService) Synthesize(text string, voice string, rate string) ([]byte, string, error) {
	switch {
	case text == "":
		return nil, "", fmt.Errorf("文字内容为空")
	case s.edgeTTSAvailable():
		// Option A: edge-tts (preferred).
		return s.synthesizeEdgeTTS(text, voice, rate)
	case s.espeakAvailable():
		// Option B: espeak-ng.
		return s.synthesizeEspeak(text, voice)
	default:
		// Option C: pure-Go fallback.
		return s.synthesizeFallback()
	}
}
// synthesizeEdgeTTS synthesises text with the edge-tts command-line tool.
//
// An empty voice defaults to "zh-CN-XiaoxiaoNeural" and an empty rate to
// "+0%". The text is handed to edge-tts through a temp file (--file) rather
// than on the command line, so text that begins with "-" is not mistaken for
// an option and long text is not limited by the argument-length limit.
//
// It returns the MP3 bytes and the format "mp3". An error is returned when a
// temp file cannot be created or written, edge-tts fails (its combined output
// is included in the error), or the produced audio is unreadable or empty.
func (s *TTSService) synthesizeEdgeTTS(text string, voice string, rate string) ([]byte, string, error) {
	if voice == "" {
		voice = "zh-CN-XiaoxiaoNeural"
	}
	if rate == "" {
		rate = "+0%"
	}

	// Write the text to a temp file that edge-tts reads via --file.
	tmpText, err := os.CreateTemp("/tmp", "cyrene-tts-text-*.txt")
	if err != nil {
		return nil, "", fmt.Errorf("创建临时文本文件失败: %w", err)
	}
	tmpTextPath := tmpText.Name()
	defer os.Remove(tmpTextPath)

	if _, err := tmpText.WriteString(text); err != nil {
		tmpText.Close()
		return nil, "", fmt.Errorf("写入临时文本失败: %w", err)
	}
	// edge-tts reads this file, so a failed Close (unflushed data) is fatal.
	if err := tmpText.Close(); err != nil {
		return nil, "", fmt.Errorf("写入临时文本失败: %w", err)
	}

	// Reserve a unique path for the audio that edge-tts will write.
	tmpOutput, err := os.CreateTemp("/tmp", "cyrene-tts-output-*.mp3")
	if err != nil {
		return nil, "", fmt.Errorf("创建临时输出文件失败: %w", err)
	}
	tmpOutputPath := tmpOutput.Name()
	tmpOutput.Close()
	defer os.Remove(tmpOutputPath)

	// The --opt=value form keeps values that start with "-" (e.g. "-20%")
	// from being parsed as separate options.
	cmd := exec.Command("edge-tts",
		"--voice="+voice,
		"--rate="+rate,
		"--file", tmpTextPath,
		"--write-media", tmpOutputPath,
	)
	output, err := cmd.CombinedOutput()
	if err != nil {
		return nil, "", fmt.Errorf("edge-tts 合成失败: %w\n输出: %s", err, string(output))
	}

	audioData, err := os.ReadFile(tmpOutputPath)
	if err != nil {
		return nil, "", fmt.Errorf("读取合成的音频失败: %w", err)
	}
	if len(audioData) == 0 {
		return nil, "", fmt.Errorf("edge-tts 生成的音频为空")
	}
	return audioData, "mp3", nil
}
// synthesizeEspeak synthesises text with the espeak-ng command-line tool.
//
// An empty voice defaults to "zh". The text is passed after a "--" option
// terminator so that text beginning with "-" (for example "-5℃") is spoken
// instead of being parsed as espeak-ng options.
//
// It returns the WAV bytes and the format "wav". An error is returned when
// the temp file cannot be created, espeak-ng fails (its combined output is
// included in the error), or the produced audio is unreadable or empty.
//
// NOTE(review): Synthesize forwards the caller's voice unchanged, so an
// edge-tts style name could reach espeak-ng here — confirm callers pass
// voice names that espeak-ng understands.
func (s *TTSService) synthesizeEspeak(text string, voice string) ([]byte, string, error) {
	if voice == "" {
		voice = "zh"
	}

	// Reserve a unique path for the WAV file that espeak-ng will write.
	tmpOutput, err := os.CreateTemp("/tmp", "cyrene-tts-espeak-*.wav")
	if err != nil {
		return nil, "", fmt.Errorf("创建临时输出文件失败: %w", err)
	}
	tmpOutputPath := tmpOutput.Name()
	tmpOutput.Close()
	defer os.Remove(tmpOutputPath)

	cmd := exec.Command("espeak-ng",
		"-v", voice,
		"-w", tmpOutputPath,
		"--", // end of options: everything after this is text to speak
		text,
	)
	output, err := cmd.CombinedOutput()
	if err != nil {
		return nil, "", fmt.Errorf("espeak-ng 合成失败: %w\n输出: %s", err, string(output))
	}

	audioData, err := os.ReadFile(tmpOutputPath)
	if err != nil {
		return nil, "", fmt.Errorf("读取合成的音频失败: %w", err)
	}
	if len(audioData) == 0 {
		return nil, "", fmt.Errorf("espeak-ng 生成的音频为空")
	}
	return audioData, "wav", nil
}
// synthesizeFallback produces a silent WAV as the last-resort result:
// one second of 16 kHz, 16-bit, mono PCM. It always succeeds and returns the
// bytes with the format "wav".
func (s *TTSService) synthesizeFallback() ([]byte, string, error) {
	const (
		headerLen  = 44 // RIFF header + fmt chunk + data chunk header
		rate       = 16000
		channels   = 1
		sampleBits = 16
		seconds    = 1

		// 1 s @ 16 kHz mono 16-bit = 32000 bytes of sample data.
		payloadLen = rate * channels * (sampleBits / 8) * seconds
	)

	// make zero-fills the buffer, so the sample area is already silence.
	out := make([]byte, headerLen+payloadLen)

	// RIFF header.
	copy(out[0:4], "RIFF")
	writeUint32LE(out[4:8], uint32(36+payloadLen))
	copy(out[8:12], "WAVE")

	// fmt chunk.
	copy(out[12:16], "fmt ")
	writeUint32LE(out[16:20], 16)                               // chunk size
	writeUint16LE(out[20:22], 1)                                // format tag: PCM
	writeUint16LE(out[22:24], uint16(channels))                 // channel count
	writeUint32LE(out[24:28], uint32(rate))                     // sample rate
	writeUint32LE(out[28:32], uint32(rate*channels*sampleBits/8)) // byte rate
	writeUint16LE(out[32:34], uint16(channels*sampleBits/8))    // block align
	writeUint16LE(out[34:36], uint16(sampleBits))               // bits per sample

	// data chunk header; the samples that follow stay zero.
	copy(out[36:40], "data")
	writeUint32LE(out[40:44], uint32(payloadLen))

	return out, "wav", nil
}
// writeUint16LE stores v into the first two bytes of buf in little-endian
// order (least significant byte first). buf must hold at least two bytes.
func writeUint16LE(buf []byte, v uint16) {
	for i := uint(0); i < 2; i++ {
		buf[i] = byte(v >> (8 * i))
	}
}
// writeUint32LE stores v into the first four bytes of buf in little-endian
// order (least significant byte first). buf must hold at least four bytes.
func writeUint32LE(buf []byte, v uint32) {
	for i := uint(0); i < 4; i++ {
		buf[i] = byte(v >> (8 * i))
	}
}
// GetVoices returns the list of usable voices. When edge-tts is installed it
// runs "edge-tts --list-voices" and returns the parsed voices; if edge-tts is
// missing, the command fails, or parsing yields nothing, BuiltinVoices is
// returned instead.
func (s *TTSService) GetVoices() []TTSVoice {
	if !s.edgeTTSAvailable() {
		return BuiltinVoices
	}

	listing, err := exec.Command("edge-tts", "--list-voices").Output()
	if err != nil {
		return BuiltinVoices
	}

	if parsed := s.parseEdgeTTSVoices(string(listing)); len(parsed) > 0 {
		return parsed
	}
	return BuiltinVoices
}
// parseEdgeTTSVoices extracts the zh-CN voices from the text printed by
// "edge-tts --list-voices".
//
// Three layouts are recognised:
//   - one line per voice with comma-separated pairs:
//     "Name: zh-CN-XiaoxiaoNeural, Gender: Female"
//   - one key per line, records separated by blank lines:
//     "Name: zh-CN-XiaoxiaoNeural" followed by "Gender: Female"
//   - a table whose first column is the voice name and whose second column
//     is the gender: "zh-CN-XiaoxiaoNeural  Female  ..."
//
// Each voice gets Locale "zh-CN", DisplayName equal to its Name, and Gender
// "Unknown" when no gender is found. The result is nil when no voice matches.
func (s *TTSService) parseEdgeTTSVoices(output string) []TTSVoice {
	const locale = "zh-CN"

	var voices []TTSVoice
	// pending is the index of the voice that may still receive its gender
	// from a following "Gender:" line; -1 means there is none.
	pending := -1

	for _, raw := range strings.Split(output, "\n") {
		line := strings.TrimSpace(raw)
		if line == "" {
			// A blank line ends the current key-per-line record.
			pending = -1
			continue
		}

		name, gender := parseEdgeTTSVoiceLine(line, locale)
		switch {
		case name != "":
			pending = -1
			if !strings.Contains(line, locale) {
				// A voice of another locale; its gender line must be skipped too.
				continue
			}
			voice := TTSVoice{
				Name:        name,
				DisplayName: name,
				Gender:      "Unknown",
				Locale:      locale,
			}
			if gender != "" {
				voice.Gender = gender
			}
			voices = append(voices, voice)
			if gender == "" {
				pending = len(voices) - 1
			}
		case gender != "" && pending >= 0:
			// Gender on its own line belongs to the voice named just before.
			voices[pending].Gender = gender
			pending = -1
		}
	}
	return voices
}

// parseEdgeTTSVoiceLine reads the voice name and gender from one trimmed line
// of voice-list output. It first looks for "Name:" / "Gender:" pairs among the
// comma-separated fields; if neither is present it treats the line as a table
// row whose first column starts with locale+"-". Missing values are "".
func parseEdgeTTSVoiceLine(line, locale string) (string, string) {
	var name, gender string
	for _, field := range strings.Split(line, ",") {
		field = strings.TrimSpace(field)
		if strings.HasPrefix(field, "Name:") {
			name = strings.TrimSpace(strings.TrimPrefix(field, "Name:"))
		}
		if strings.HasPrefix(field, "Gender:") {
			gender = strings.TrimSpace(strings.TrimPrefix(field, "Gender:"))
		}
	}
	if name != "" || gender != "" {
		return name, gender
	}

	// Table layout: whitespace-separated columns, name first, gender second.
	cols := strings.Fields(line)
	if len(cols) > 0 && strings.HasPrefix(cols[0], locale+"-") {
		name = cols[0]
		if len(cols) > 1 {
			gender = cols[1]
		}
	}
	return name, gender
}
// GetEngineStatus returns a snapshot of the TTS engine state as a
// string-keyed map: whether a real engine is available, which of edge-tts and
// espeak-ng were found, the engine Synthesize would pick ("edge-tts",
// "espeak-ng" or "fallback (silent WAV)"), the default voice, and the number
// of built-in voices.
//
// Each engine is probed exactly once so that all fields describe the same
// moment and cannot contradict each other.
func (s *TTSService) GetEngineStatus() map[string]interface{} {
	hasEdge := s.edgeTTSAvailable()
	hasEspeak := s.espeakAvailable()

	// Same priority order as Synthesize.
	engine := "fallback (silent WAV)"
	switch {
	case hasEdge:
		engine = "edge-tts"
	case hasEspeak:
		engine = "espeak-ng"
	}

	return map[string]interface{}{
		// Mirrors IsAvailable: true when either real engine is installed.
		"available":      hasEdge || hasEspeak,
		"edge_tts":       hasEdge,
		"espeak_ng":      hasEspeak,
		"engine":         engine,
		"default_voice":  "zh-CN-XiaoxiaoNeural",
		"builtin_voices": len(BuiltinVoices),
	}
}