diff --git a/.env.example b/.env.example index 941b23c..dbd76d2 100644 --- a/.env.example +++ b/.env.example @@ -1,27 +1,51 @@ # MinIO 配置 -VOICEFLOW_MINIO_ENDPOINT='localhost:9000' # MinIO 服务地址 -VOICEFLOW_MINIO_ACCESS_KEY='minioadmin' # MinIO 访问密钥 -VOICEFLOW_MINIO_SECRET_KEY='minioadmin' # MinIO 密钥 +VOICEFLOW_MINIO_ENDPOINT='s3.api..cc' # MinIO 服务地址 +VOICEFLOW_MINIO_ACCESS_KEY='' # MinIO 访问密钥 +VOICEFLOW_MINIO_SECRET_KEY='' # MinIO 密钥 # Azure 配置 -VOICEFLOW_AZURE_STT_KEY='your_azure_stt_key' # Azure 语音转文本密钥 -VOICEFLOW_AZURE_TTS_KEY='your_azure_tts_key' # Azure 文本转语音密钥 -VOICEFLOW_AZURE_REGION='eastus' # Azure 服务区域 +VOICEFLOW_AZURE_STT_KEY='' # Azure STT 密钥 +VOICEFLOW_AZURE_TTS_KEY='' # Azure TTS 密钥 +VOICEFLOW_AZURE_SPEECH_KEY='' # Azure 语音密钥 +VOICEFLOW_AZURE_REGION='japaneast' # Azure 区域 + +# AWS 配置 +VOICEFLOW_AWS_SECRET_ACCESS_KEY='' # AWS 秘密访问密钥 +VOICEFLOW_AWS_ACCESS_KEY_ID='' # AWS 访问密钥 ID # Google 配置 -VOICEFLOW_GOOGLE_STT_KEY='your_google_stt_key' # Google 语音转文本密钥 -VOICEFLOW_GOOGLE_TTS_KEY='your_google_tts_key' # Google 文本转语音密钥 +VOICEFLOW_GOOGLE_STT_KEY='' # Google STT 密钥 +VOICEFLOW_GOOGLE_TTS_KEY='' # Google TTS 密钥 # OpenAI 配置 -VOICEFLOW_OPENAI_API_KEY='your_openai_api_key' # OpenAI API 密钥 +VOICEFLOW_OPENAI_API_KEY='' # OpenAI API 密钥 +VOICEFLOW_OPENAI_BASE_URL='' # OpenAI 基础 URL # AssemblyAI 配置 -VOICEFLOW_ASSEMBLYAI_API_KEY='your_assemblyai_api_key' # AssemblyAI API 密钥 +VOICEFLOW_ASSEMBLYAI_API_KEY='' # AssemblyAI API 密钥 -# 语音服务端口配置 -VOICEFLOW_SERVER_PORT=80 # VoiceFlow 服务端口, 默认是 80 +# VOLCENGINE STT 配置 +VOICEFLOW_VOLCENGINE_STT_WS_URL='wss://openspeech.bytedance.com/api/v3/sauc/bigmodel' # STT WebSocket URL +VOICEFLOW_VOLCENGINE_STT_UID='test' # STT 用户ID +VOICEFLOW_VOLCENGINE_STT_RATE='16000' # STT 采样率 +VOICEFLOW_VOLCENGINE_STT_FORMAT='pcm' # STT 音频格式 +VOICEFLOW_VOLCENGINE_STT_BITS='16' # STT 位深度 +VOICEFLOW_VOLCENGINE_STT_CHANNEL='1' # STT 声道数 +VOICEFLOW_VOLCENGINE_STT_CODEC='pcm' # STT 编码格式 +VOICEFLOW_VOLCENGINE_STT_ACCESS_KEY='' # STT 访问密钥 
+VOICEFLOW_VOLCENGINE_STT_APP_KEY='' # STT 应用密钥 +VOICEFLOW_VOLCENGINE_STT_RESOURCE_ID='volc.bigasr.sauc.duration' # STT 资源ID -# VOLCENGINE 配置 -VOICEFLOW_VOLCENGINE_ACCESS_KEY='' -VOICEFLOW_VOLCENGINE_APP_KEY='' -VOICEFLOW_VOLCENGINE_WS_URL='wss://openspeech.bytedance.com/api/v3/sauc/bigmode \ No newline at end of file +# VOLCENGINE TTS 配置 +VOICEFLOW_VOLCENGINE_TTS_WS_URL='wss://openspeech.bytedance.com/api/v1/tts/ws_binary' # TTS WebSocket URL +VOICEFLOW_VOLCENGINE_TTS_APP_ID='' # TTS 应用ID +VOICEFLOW_VOLCENGINE_TTS_TOKEN='' # TTS 令牌 +VOICEFLOW_VOLCENGINE_TTS_CLUSTER='volcano_tts' # TTS 集群名称 +VOICEFLOW_VOLCENGINE_TTS_VOICE_TYPE='zh_female_1' # TTS 音色类型 +VOICEFLOW_VOLCENGINE_TTS_ENCODING='mp3' # TTS 音频编码 +VOICEFLOW_VOLCENGINE_TTS_SPEED_RATIO='1.0' # TTS 语速比例 +VOICEFLOW_VOLCENGINE_TTS_VOLUME_RATIO='1.0' # TTS 音量比例 +VOICEFLOW_VOLCENGINE_TTS_PITCH_RATIO='1.0' # TTS 音调比例 + +# 语音服务端口配置 +VOICEFLOW_SERVER_PORT=18080 # 语音服务端口 diff --git a/cmd/voiceflow/realtime.go b/cmd/voiceflow/realtime.go new file mode 100644 index 0000000..a997e32 --- /dev/null +++ b/cmd/voiceflow/realtime.go @@ -0,0 +1,27 @@ +// cmd/voiceflow/realtime.go +package main + +import ( + "fmt" + "github.com/spf13/cobra" + "github.com/telepace/voiceflow/pkg/voiceprocessor" +) + +var realtimeCmd = &cobra.Command{ + Use: "realtime", + Short: "在终端中实时监听语音并翻译", + RunE: runRealtime, +} + +func init() { + rootCmd.AddCommand(realtimeCmd) +} + +func runRealtime(cmd *cobra.Command, args []string) error { + fmt.Println("启动实时语音监听...") + err := voiceprocessor.StartRealtime() + if err != nil { + return err + } + return nil +} diff --git a/cmd/voiceflow/root.go b/cmd/voiceflow/root.go index 0fe4f27..a4d6e3e 100644 --- a/cmd/voiceflow/root.go +++ b/cmd/voiceflow/root.go @@ -1,18 +1,21 @@ -// root.go +// cmd/voiceflow/root.go package main import ( "context" "embed" "fmt" - "github.com/joho/godotenv" - "github.com/telepace/voiceflow/pkg/config" "io/fs" + "io/ioutil" "net/http" "os" "strings" "time" + "github.com/joho/godotenv" 
+ "github.com/telepace/voiceflow/pkg/config" + "github.com/telepace/voiceflow/pkg/sttservice" + "github.com/spf13/cobra" "github.com/spf13/viper" @@ -64,9 +67,20 @@ var rootCmd = &cobra.Command{ RunE: run, } +// 添加新的子命令 transcribe +var transcribeCmd = &cobra.Command{ + Use: "transcribe", + Short: "Transcribe an audio file using STT service", + Long: `Transcribe an audio file by specifying its path and using the configured STT service.`, + RunE: runTranscribe, +} + func run(cmd *cobra.Command, args []string) error { ctx := context.Background() + if err := ensureDirectories(); err != nil { + logger.Fatalf("Failed to ensure directories: %v", err) + } // Load configuration cfg, err := config.GetConfig() if err != nil { @@ -102,7 +116,7 @@ func run(cmd *cobra.Command, args []string) error { // Set up HTTP server mux := http.NewServeMux() if err := setupFileServers(mux); err != nil { - return fmt.Errorf("failed to setup file servers: %w", err) + logger.Fatalf("Failed to setup file servers: %v", err) } // Initialize WebSocket server @@ -158,6 +172,8 @@ func Execute() { } } +var transcribeFile string + func init() { cobra.OnInitialize(initConfig) @@ -180,6 +196,13 @@ func init() { // 绑定到 viper viper.BindPFlags(rootCmd.PersistentFlags()) + + // 配置 transcribe 子命令的标志 + transcribeCmd.Flags().StringVarP(&transcribeFile, "file", "f", "", "Path to the audio file to transcribe") + transcribeCmd.MarkFlagRequired("file") // 标记为必需 + + // 将 transcribe 子命令添加到 rootCmd + rootCmd.AddCommand(transcribeCmd) } func initConfig() { @@ -187,7 +210,8 @@ func initConfig() { if err := godotenv.Load(); err != nil { logger.Warn("No .env file found or failed to load, proceeding without it") } else { - logger.Info(".env file loaded") + envPath, _ := os.Getwd() + logger.Info(fmt.Sprintf(".env file loaded from: %s/.env", envPath)) } if cfgFile != "" { @@ -228,8 +252,72 @@ func setDefaults() { viper.SetDefault("logging.compress", true) viper.SetDefault("logging.report_caller", true) + // AWS 默认配置 + 
viper.SetDefault("aws.region", "us-east-2") + // 其他服务配置... viper.SetDefault("web.port", 18090) viper.SetDefault("minio.enabled", true) viper.SetDefault("minio.endpoint", "localhost:9000") } + +// runTranscribe 处理 transcribe 子命令的逻辑 +func runTranscribe(cmd *cobra.Command, args []string) error { + ctx := context.Background() + + // 初始化配置 + if err := ensureDirectories(); err != nil { + logger.Fatalf("Failed to ensure directories: %v", err) + } + + cfg, err := config.GetConfig() + if err != nil { + return fmt.Errorf("failed to get config: %w", err) + } + + // 初始化日志 + logCfg := logger.Config{ + Level: cfg.Logging.Level, + Format: cfg.Logging.Format, + Filename: cfg.Logging.Filename, + MaxSize: cfg.Logging.MaxSize, + MaxBackups: cfg.Logging.MaxBackups, + MaxAge: cfg.Logging.MaxAge, + Compress: cfg.Logging.Compress, + ReportCaller: cfg.Logging.ReportCaller, + } + + fields := logger.StandardFields{ + ServiceID: "voiceflow", + InstanceID: fmt.Sprintf("instance-%d", time.Now().Unix()), + } + + if err := logger.Init(logCfg, fields); err != nil { + return fmt.Errorf("failed to initialize logger: %w", err) + } + + // 记录启动信息 + logger.InfoContextf(ctx, "Starting VoiceFlow transcribe command with config: %+v", cfg) + + // 初始化服务 + serverpkg.InitServices() + + // 读取音频文件 + audioData, err := ioutil.ReadFile(transcribeFile) + if err != nil { + logger.Errorf("Failed to read audio file: %v", err) + return fmt.Errorf("failed to read audio file: %w", err) + } + + // 调用 STT 服务进行转录 + transcript, err := sttservice.Recognize(audioData) + if err != nil { + logger.Errorf("STT Recognize error: %v", err) + return fmt.Errorf("STT Recognize error: %w", err) + } + + // 输出转录结果 + fmt.Printf("Transcript:\n%s\n", transcript) + + return nil +} diff --git a/cmd/voiceflow/transcribe.go b/cmd/voiceflow/transcribe.go new file mode 100644 index 0000000..2737aad --- /dev/null +++ b/cmd/voiceflow/transcribe.go @@ -0,0 +1,29 @@ +// cmd/voiceflow/transcribe.go +package main + +//import ( +// "fmt" +// 
"github.com/spf13/cobra" +// "github.com/telepace/voiceflow/pkg/voiceprocessor" +//) +// +//var transcribeCmd = &cobra.Command{ +// Use: "transcribe [音频文件路径]", +// Short: "转录并翻译指定的音频文件", +// Args: cobra.ExactArgs(1), +// RunE: runTranscribe, +//} +// +//func init() { +// rootCmd.AddCommand(transcribeCmd) +//} +// +//func runTranscribe(cmd *cobra.Command, args []string) error { +// audioFile := args[0] +// fmt.Printf("正在转录音频文件:%s\n", audioFile) +// err := voiceprocessor.TranscribeFile(audioFile) +// if err != nil { +// return err +// } +// return nil +//} diff --git a/cmd/voiceflow/voiceflow.go b/cmd/voiceflow/voiceflow.go index 736ef31..d10934e 100644 --- a/cmd/voiceflow/voiceflow.go +++ b/cmd/voiceflow/voiceflow.go @@ -1,3 +1,5 @@ +// cmd/voiceflow/voiceflow.go + package main func main() { diff --git a/cmd/voiceflow/web/script.js b/cmd/voiceflow/web/script.js index 249387c..7726896 100644 --- a/cmd/voiceflow/web/script.js +++ b/cmd/voiceflow/web/script.js @@ -8,12 +8,44 @@ ws.onopen = () => { appendSystemMessage('提示:您可以长按麦克风按钮 & 长按 键盘 V 进行录音'); }; -ws.onmessage = (event) => { - const data = JSON.parse(event.data); - if (data.text) { - appendMessage('助手', data.text); - } else if (data.audio_url) { - appendAudioMessage('助手', data.audio_url); +ws.onmessage = function(event) { + if (typeof event.data === 'string') { + const response = JSON.parse(event.data); + console.log('收到 WebSocket 响应:', response); + + if (response.type) { + // 处理带有 type 字段的消息(语音识别等) + switch(response.type) { + case 'audio_stored': + appendAudioMessage('你', response.audio_url); + break; + + case 'recognition_complete': + appendMessage('你', response.text); + break; + + case 'recognition_error': + appendSystemMessage(`识别错误: ${response.error}`); + break; + + case 'tts_complete': + // 移除"正在生成语音..."的系统消息 + const systemMessages = document.querySelectorAll('.message.system'); + systemMessages.forEach(msg => { + if (msg.textContent === '正在生成语音...') { + msg.remove(); + } + }); + + // 添加 AI 的文本和音频消息 + 
appendMessage('AI', response.text); + appendAudioMessage('AI', response.audio_url); + break; + + default: + console.log('Unknown message type:', response.type); + } + } } }; @@ -21,6 +53,28 @@ ws.onerror = (error) => { console.error('WebSocket 错误:', error); }; +// 添加重连逻辑 +let reconnectAttempts = 0; +const maxReconnectAttempts = 5; + +ws.onclose = (event) => { + console.log('WebSocket connection closed:', event); + + if (reconnectAttempts < maxReconnectAttempts) { + reconnectAttempts++; + const timeout = Math.min(1000 * Math.pow(2, reconnectAttempts), 10000); + + appendSystemMessage(`连接已断开,${timeout/1000}秒后尝试重新连接...`); + + setTimeout(() => { + ws = new WebSocket(WEBSOCKET_URL); + // 重新绑定事件处理器 + }, timeout); + } else { + appendSystemMessage('连接已断开,请刷新页面重试'); + } +}; + const chatWindow = document.getElementById('chat-window'); const textInput = document.getElementById('text-input'); const sendTextBtn = document.getElementById('send-text-btn'); @@ -48,9 +102,25 @@ let audioChunks = []; let isRecording = false; let mediaStream = null; +// 添加会话 ID 生成函数 +function generateSessionId() { + return 'session_' + Date.now() + '_' + Math.random().toString(36).substr(2, 9); +} + +let currentSessionId = null; + function startRecording() { if (isRecording) return; - + + // 生成新的会话 ID + currentSessionId = generateSessionId(); + + // 发送开始信号 + ws.send(JSON.stringify({ + type: "audio_start", + session_id: currentSessionId + })); + navigator.mediaDevices.getUserMedia({ audio: true }) .then(stream => { isRecording = true; @@ -58,32 +128,65 @@ function startRecording() { recordVoiceBtn.classList.add('recording'); mediaRecorder = new MediaRecorder(stream); - mediaRecorder.start(); + const timeslice = 250; + + mediaRecorder.start(timeslice); mediaRecorder.ondataavailable = e => { - audioChunks.push(e.data); + if (e.data && e.data.size > 0) { + // 直接发送二进制数据 + ws.send(e.data); + } }; mediaRecorder.onstop = () => { - const audioBlob = new Blob(audioChunks, { type: 'audio/webm' }); - audioChunks 
= []; - sendAudioMessage(audioBlob); isRecording = false; recordVoiceBtn.classList.remove('recording'); - // 停止所有音频轨道,释放麦克风 + // 停止所有音频轨道 mediaStream.getTracks().forEach(track => track.stop()); mediaStream = null; + + // 发送结束信号 + ws.send(JSON.stringify({ + type: "audio_end", + session_id: currentSessionId + })); + + currentSessionId = null; }; }) .catch(err => { - console.error('麦克风访问错误:', err); + console.error('获取麦克风权限失败:', err); + appendSystemMessage('错误:无法访问麦克风'); }); } +function sendAudioChunk(audioBlob, sessionId) { + const reader = new FileReader(); + reader.onload = () => { + // 发送二进制数据前,先发送元数据 + ws.send(JSON.stringify({ + type: 'audio_metadata', + session_id: sessionId, + is_start: true + })); + + // 然后发送音频数据 + ws.send(reader.result); + }; + reader.readAsArrayBuffer(audioBlob); +} + function stopRecording() { if (mediaRecorder && isRecording) { mediaRecorder.stop(); + // 发送结束信号时包含会话 ID + ws.send(JSON.stringify({ + type: 'audio_end', + session_id: currentSessionId + })); + currentSessionId = null; } } @@ -125,8 +228,17 @@ uploadAudioInput.addEventListener('change', () => { }); function sendTextMessage(text) { - // 通过 WebSocket 发送文字消息 - ws.send(JSON.stringify({ text: text })); + // 显示发送的消息 + appendMessage('你', text); + + // 通过 WebSocket 发送文字消息 + ws.send(JSON.stringify({ + text: text, + require_tts: true + })); + + // 可以添加一个加载提示 + appendSystemMessage('正在生成语音...'); } function sendAudioMessage(audioBlob) { @@ -141,7 +253,54 @@ function sendAudioMessage(audioBlob) { reader.readAsArrayBuffer(audioBlob); } +let partialMessageDiv; + +function updatePartialMessage(user, text) { + if (!partialMessageDiv) { + partialMessageDiv = document.createElement('div'); + partialMessageDiv.classList.add('message'); + + const userSpan = document.createElement('span'); + userSpan.classList.add('user'); + userSpan.textContent = `${user}: `; + + const textSpan = document.createElement('span'); + textSpan.classList.add('partial-text'); + + partialMessageDiv.appendChild(userSpan); + 
partialMessageDiv.appendChild(textSpan); + chatWindow.appendChild(partialMessageDiv); + } + + const textSpan = partialMessageDiv.querySelector('.partial-text'); + textSpan.textContent = text; + chatWindow.scrollTop = chatWindow.scrollHeight; +} + +// 当录音结束时,清除部分消息 +function clearPartialMessage() { + if (partialMessageDiv) { + partialMessageDiv.remove(); + partialMessageDiv = null; + } +} + +// 修改录音停止的函数,添加清除部分消息的逻辑 +function stopRecording() { + if (mediaRecorder && isRecording) { + mediaRecorder.stop(); + clearPartialMessage(); + } +} + + +// 当最终文本到达时,替换部分转录文本 function appendMessage(user, text) { + if (partialMessageDiv) { + partialMessageDiv.remove(); + partialMessageDiv = null; + } + // 继续现有代码,添加消息 const messageDiv = document.createElement('div'); messageDiv.classList.add('message'); @@ -169,6 +328,17 @@ function appendAudioMessage(user, audioUrl) { const audio = document.createElement('audio'); audio.src = audioUrl; audio.controls = true; + + // 添加音频加载错误处理 + audio.onerror = function() { + console.error('音频加载失败:', audioUrl); + appendSystemMessage('音频加载失败,请重试'); + }; + + // 添加音频加载成功处理 + audio.onloadeddata = function() { + console.log('音频加载成功:', audioUrl); + }; messageDiv.appendChild(userSpan); messageDiv.appendChild(audio); @@ -178,12 +348,8 @@ function appendAudioMessage(user, audioUrl) { function appendSystemMessage(text) { const messageDiv = document.createElement('div'); - messageDiv.classList.add('message', 'system-message'); - - const textSpan = document.createElement('span'); - textSpan.textContent = text; - - messageDiv.appendChild(textSpan); + messageDiv.className = 'message system'; + messageDiv.textContent = text; chatWindow.appendChild(messageDiv); chatWindow.scrollTop = chatWindow.scrollHeight; } \ No newline at end of file diff --git a/cmd/voiceflow/web/styles.css b/cmd/voiceflow/web/styles.css index 331ae7c..8952f30 100644 --- a/cmd/voiceflow/web/styles.css +++ b/cmd/voiceflow/web/styles.css @@ -30,18 +30,28 @@ body { } .message { - margin-bottom: 
10px; + margin-bottom: 15px; + padding: 10px; + border-radius: 8px; + background-color: #fff; + box-shadow: 0 1px 2px rgba(0,0,0,0.1); } .message .user { font-weight: bold; - margin-right: 5px; + color: #007bff; +} + +.message audio { + margin-top: 5px; + width: 100%; } .system-message { text-align: center; color: #6c757d; - font-size: 14px; + font-style: italic; + margin: 10px 0; } .chat-input { diff --git a/configs/config.yaml b/configs/config.yaml index 7a02cb0..2b8114a 100644 --- a/configs/config.yaml +++ b/configs/config.yaml @@ -1,33 +1,42 @@ server: - port: 80 + port: 18080 enable_tls: false minio: enabled: true - endpoint: '' - access_key: "" - secret_key: "" - bucket_name: 'telepace-pipeline' + endpoint: "localhost:9000" + access_key: "your_access_key" + secret_key: "your_secret_key" + bucket_name: "telepace-pipeline" secure: true + storage_path: "voiceflow/audio/" stt: - provider: volcengine # 可选值:azure、google、local、assemblyai、volcengine + # 可选值:azure、google、local、assemblyai、volcengine、 aws + provider: assemblyai tts: - provider: google # 可选值:azure、google、local + # 可选值:azure、google、local、volcengine + provider: volcengine llm: - provider: openai # 可选值:openai、local + # 可选值:openai、local + provider: openai azure: - stt_key: "your_azure_stt_key" - tts_key: "your_azure_tts_key" + stt_key: "" + tts_key: "" region: "eastus" google: stt_key: "your_google_stt_key" tts_key: "your_google_tts_key" +aws: + region: "us-east-1" + access_key_id: '' + secret_access_key: '' + openai: api_key: "" # OPENAI_BASE_URL='https://api.lqqq.cc/v1' # Global provider @@ -37,19 +46,36 @@ openai: base_url: "" volcengine: - access_key: '' - app_key: '' - ws_url: '' - uid: "test" - rate: 16000 - format: "pcm" - bits: 16 - channel: 1 - codec: "pcm" + # 语音识别(STT)配置 + stt: + ws_url: '' + uid: "test" + rate: 16000 + format: "pcm" + bits: 16 + channel: 1 + codec: "pcm" + access_key: '' + app_key: '' + # 小时版:volc.bigasr.sauc.duration + # 并发版:volc.bigasr.sauc.concurrent + resource_id: 
'volc.bigasr.sauc.duration' + + # 语音合成(TTS)配置 + tts: + ws_url: "wss://openspeech.bytedance.com/api/v1/tts/ws_binary" + app_id: "your_app_id" + token: "your_token" + cluster: "volcano_tts" + voice_type: "zh_female_sajiaonvyou_moon_bigtts" + encoding: "mp3" + speed_ratio: 1.0 + volume_ratio: 1.0 + pitch_ratio: 1.0 # 日志配置 logging: - # 日志级别(选项:debug(调试)、info(信息)、warn(警告)、error(错误)、fatal(致命错误)) + # 日志级别(选项:debug(调试)、info(信息)、warn(警告)、error(错误)、fatal(致命错误)) level: "info" # 日志格式(选项:json(JSON 格式)、text(文本格式)) format: "text" @@ -64,7 +90,7 @@ logging: # 是否压缩旧日志文件 compress: true # 是否在日志中包含调用者信息 - report_caller: false + report_caller: true assemblyai: - api_key: "" \ No newline at end of file + api_key: \ No newline at end of file diff --git a/go.mod b/go.mod index ae337a1..4d118ca 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,8 @@ module github.com/telepace/voiceflow go 1.22.5 require ( - github.com/AssemblyAI/assemblyai-go-sdk v1.8.1 + github.com/Microsoft/cognitive-services-speech-sdk-go v1.33.0 + github.com/aws/aws-sdk-go v1.55.5 github.com/go-audio/audio v1.0.0 github.com/go-audio/wav v1.1.0 github.com/google/uuid v1.6.0 @@ -20,16 +21,15 @@ require ( ) require ( - github.com/cenkalti/backoff v2.2.1+incompatible // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/dustin/go-humanize v1.0.1 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/go-audio/riff v1.0.0 // indirect github.com/go-ini/ini v1.67.0 // indirect github.com/goccy/go-json v0.10.3 // indirect - github.com/google/go-querystring v1.1.0 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/jmespath/go-jmespath v0.4.0 // indirect github.com/klauspost/compress v1.17.11 // indirect github.com/klauspost/cpuid/v2 v2.2.8 // indirect github.com/magiconair/properties v1.8.7 // indirect @@ -40,6 +40,7 @@ require ( github.com/rs/xid v1.6.0 // indirect github.com/sagikazarmark/locafero 
v0.4.0 // indirect github.com/sagikazarmark/slog-shim v0.1.0 // indirect + github.com/satori/go.uuid v1.2.0 // indirect github.com/sourcegraph/conc v0.3.0 // indirect github.com/spf13/afero v1.11.0 // indirect github.com/spf13/cast v1.6.0 // indirect @@ -54,5 +55,4 @@ require ( golang.org/x/text v0.19.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - nhooyr.io/websocket v1.8.7 // indirect ) diff --git a/go.sum b/go.sum index 8327df3..b5454a2 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,7 @@ -github.com/AssemblyAI/assemblyai-go-sdk v1.8.1 h1:5mhpeEWEHQtuJZ7eKjoZrjvYG5tXzH2lsrJ14xnIEGM= -github.com/AssemblyAI/assemblyai-go-sdk v1.8.1/go.mod h1:ytTvsjAVL+nXZnzBfDagQ/LxDQaKL9W/eTiCo3ZuPJA= -github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4= -github.com/cenkalti/backoff v2.2.1+incompatible/go.mod h1:90ReRw6GdpyfrHakVjL/QHaoyV4aDUVVkXQJJJ3NXXM= +github.com/Microsoft/cognitive-services-speech-sdk-go v1.33.0 h1:xPYSgs3nGr5J9ucPfzItDTj2jpQkz5OUPZM4z//8xUg= +github.com/Microsoft/cognitive-services-speech-sdk-go v1.33.0/go.mod h1:ct4bG95K1Lu/c5y60PVGI1XOjo9aAcl80DD5dvu6zsg= +github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU= +github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -13,10 +13,6 @@ github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHk github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= 
-github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE= -github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI= -github.com/gin-gonic/gin v1.6.3 h1:ahKqKTFpO5KTPHxWZjEdPScmYaGtLo8Y4DMHoEsnp14= -github.com/gin-gonic/gin v1.6.3/go.mod h1:75u5sXoLsGZoRN5Sgbi1eraJ4GU3++wFwWzhwvtwp4M= github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4= github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs= github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA= @@ -25,49 +21,26 @@ github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g= github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE= github.com/go-ini/ini v1.67.0 h1:z6ZrTEZqSWOTyH2FlglNbNgARyHG8oLW9gMELqKr06A= github.com/go-ini/ini v1.67.0/go.mod h1:ByCAeIL28uOIIG0E3PJtZPDL8WnHpFKFOtgjp+3Ies8= -github.com/go-playground/assert/v2 v2.0.1/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= -github.com/go-playground/locales v0.13.0 h1:HyWk6mgj5qFqCT5fjGBuRArbVDfE4hi8+e8ceBS/t7Q= -github.com/go-playground/locales v0.13.0/go.mod h1:taPMhCMXrRLJO55olJkUXHZBHCxTMfnGwq/HNwmWNS8= -github.com/go-playground/universal-translator v0.17.0 h1:icxd5fm+REJzpZx7ZfpaD876Lmtgy7VtROAbHHXk8no= -github.com/go-playground/universal-translator v0.17.0/go.mod h1:UkSxE5sNxxRwHyU+Scu5vgOQjsIJAF8j9muTVoKLVtA= -github.com/go-playground/validator/v10 v10.2.0 h1:KgJ0snyC2R9VXYN2rneOtQcw5aHQB1Vv0sFl1UcHBOY= -github.com/go-playground/validator/v10 v10.2.0/go.mod h1:uOYAAleCW8F/7oMFd6aG0GOhaH6EGOAJShg8Id5JGkI= -github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee h1:s+21KNqlpePfkah2I+gwHF8xmJWRjooY+5248k6m4A0= -github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee/go.mod h1:L0fX3K22YWvt/FAX9NnzrNzcI4wNYi9Yku4O0LKYflo= -github.com/gobwas/pool v0.2.0 h1:QEmUOlnSjWtnpRGHF3SauEiOsy82Cup83Vf2LcMlnc8= -github.com/gobwas/pool v0.2.0/go.mod 
h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= -github.com/gobwas/ws v1.0.2 h1:CoAavW/wd/kulfZmSIBt6p24n4j7tHgNVCjsfHVNUbo= -github.com/gobwas/ws v1.0.2/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM= github.com/goccy/go-json v0.10.3 h1:KZ5WoDbxAIgm2HNbYckL0se1fHD6rz5j4ywS6ebzDqA= github.com/goccy/go-json v0.10.3/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= -github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= -github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= -github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= -github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= -github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/go-querystring v1.1.0 h1:AnCroh3fv4ZBgVIf1Iwtovgjaw/GiKJo8M8yD/fhyJ8= -github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17icRSOU623lUBU= -github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gordonklaus/portaudio v0.0.0-20230709114228-aafa478834f5 h1:5AlozfqaVjGYGhms2OsdUyfdJME76E6rx5MdGpjzZpc= github.com/gordonklaus/portaudio v0.0.0-20230709114228-aafa478834f5/go.mod h1:WY8R6YKlI2ZI3UyzFk7P6yGSuS+hFwNtEzrexRyD7Es= -github.com/gorilla/websocket v1.4.1/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= github.com/gorilla/websocket v1.5.3/go.mod 
h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= +github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= +github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= +github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= -github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= -github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= -github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/klauspost/compress v1.10.3/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs= github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= github.com/klauspost/cpuid/v2 v2.0.1/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= @@ -77,25 +50,14 @@ github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/leodido/go-urn v1.2.0 
h1:hpXL4XnriNwQ/ABnpepYM/1vCLWNDfUNts8dX3xTG6Y= -github.com/leodido/go-urn v1.2.0/go.mod h1:+8+nEpDfqqsY+g338gtMEUOtuK+4dEMhiQEgxpxOKII= github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= -github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= -github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng= -github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34= github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM= github.com/minio/minio-go/v7 v7.0.78 h1:LqW2zy52fxnI4gg8C2oZviTaKHcBV36scS+RzJnxUFs= github.com/minio/minio-go/v7 v7.0.78/go.mod h1:84gmIilaX4zcvAWWzJ5Z1WI5axN+hAbM5w25xf8xvC0= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= -github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= -github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs= github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc= 
github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM= @@ -112,6 +74,8 @@ github.com/sagikazarmark/locafero v0.4.0 h1:HApY1R9zGo4DBgr7dqsTH/JJxLTTsOt7u6ke github.com/sagikazarmark/locafero v0.4.0/go.mod h1:Pe1W6UlPYUk/+wc/6KFhbORCfqzgYEpgQ3O5fPuL3H4= github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE= github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ= +github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww= +github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo= @@ -131,7 +95,6 @@ github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSS github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= @@ -140,10 +103,6 @@ github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsT github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= 
-github.com/ugorji/go v1.1.7 h1:/68gy2h+1mWMrwZFeD1kQialdSzAb432dtpeJ42ovdo= -github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw= -github.com/ugorji/go/codec v1.1.7 h1:2SvQaVZ1ouYrrKKwoSk2pzd4A9evlKJb9oTL+OaLUSs= -github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY= go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= @@ -154,19 +113,12 @@ golang.org/x/exp v0.0.0-20230905200255-921286631fa9 h1:GoHiUyI/Tp2nVkLI2mCxVkOjs golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k= golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4= golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU= -golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= -golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/protobuf v1.33.0 
h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= -google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -174,11 +126,8 @@ gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= -gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -nhooyr.io/websocket v1.8.7 h1:usjR2uOr/zjjkVMy0lW+PPohFok7PCow5sDjLgX4P4g= -nhooyr.io/websocket v1.8.7/go.mod h1:B70DZP8IakI65RVQ51MsWP/8jndNma26DVA/nFSCgW0= diff --git a/go.work b/go.work new file mode 100644 index 0000000..0489f1a --- /dev/null +++ b/go.work @@ -0,0 +1,6 @@ +go 1.22.5 + +use ( + . 
+ ./test/assemblyai +) \ No newline at end of file diff --git a/go.work.sum b/go.work.sum new file mode 100644 index 0000000..ba96c88 --- /dev/null +++ b/go.work.sum @@ -0,0 +1,68 @@ +cloud.google.com/go v0.112.1/go.mod h1:+Vbu+Y1UU+I1rjmzeMOb/8RfkKJK2Gyxi1X6jJCZLo4= +cloud.google.com/go/compute v1.24.0/go.mod h1:kw1/T+h/+tK2LJK0wiPPx1intgdAM3j/g3hFDlscY40= +cloud.google.com/go/compute/metadata v0.2.3/go.mod h1:VAV5nSsACxMJvgaAuX6Pk2AawlZn8kiOGuCv6gTkwuA= +cloud.google.com/go/firestore v1.15.0/go.mod h1:GWOxFXcv8GZUtYpWHw/w6IuYNux/BtmeVTMmjrm4yhk= +cloud.google.com/go/iam v1.1.5/go.mod h1:rB6P/Ic3mykPbFio+vo7403drjlgvoWfYpJhMXEbzv8= +cloud.google.com/go/longrunning v0.5.5/go.mod h1:WV2LAxD8/rg5Z1cNW6FJ/ZpX4E4VnDnoTk0yawPBB7s= +cloud.google.com/go/storage v1.35.1/go.mod h1:M6M/3V/D3KpzMTJyPOR/HU6n2Si5QdaXYEsng2xgOs8= +github.com/armon/go-metrics v0.4.1/go.mod h1:E6amYzXo6aW1tqzoZGT755KkbgrJsSdpwZ+3JqfkOG4= +github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= +github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/fatih/color v1.14.1/go.mod h1:2oHN61fhTpgcxD3TSWCgKDiH1+x4OiDVVGH8WlgGZGg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/s2a-go v0.1.7/go.mod h1:50CgR4k1jNlWBu4UfS4AcfhVe1r6pdZPygJ3R8F0Qdw= +github.com/googleapis/enterprise-certificate-proxy v0.3.2/go.mod h1:VLSiSSBs/ksPL8kq3OBOQ6WRI2QnaFynd1DCjZ62+V0= +github.com/googleapis/gax-go/v2 
v2.12.3/go.mod h1:AKloxT6GtNbaLm8QTNSidHUVsHYcBHwWRvkNFJUQcS4= +github.com/googleapis/google-cloud-go-testing v0.0.0-20210719221736-1c9a4c676720/go.mod h1:dvDLG8qkwmyD9a/MJJN3XJcT3xFxOKAvTZGvuZmac9g= +github.com/hashicorp/consul/api v1.28.2/go.mod h1:KyzqzgMEya+IZPcD65YFoOVAgPpbfERu4I/tzG6/ueE= +github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= +github.com/hashicorp/go-hclog v1.5.0/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M= +github.com/hashicorp/go-immutable-radix v1.3.1/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60= +github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= +github.com/hashicorp/go-rootcerts v1.0.2/go.mod h1:pqUvnprVnM5bf7AOirdbb01K4ccR319Vf4pU3K5EGc8= +github.com/hashicorp/golang-lru v0.5.4/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= +github.com/hashicorp/serf v0.10.1/go.mod h1:yL2t6BqATOLGc5HF7qbFkTfXoPIY0WZdWHfEvMqbG+4= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg= +github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= +github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/nats-io/nats.go v1.34.0/go.mod h1:Ubdu4Nh9exXdSz0RVWRFBbRfrbSxOYd26oF0wkWclB8= +github.com/nats-io/nkeys v0.4.7/go.mod h1:kqXRgRDPlGy7nGaEDMuYzmiJCIAAWDK0IMBtDmGD0nc= +github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= 
+github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/sftp v1.13.6/go.mod h1:tz1ryNURKu77RL+GuCzmoJYxQczL3wLNNpPWagdg4Qk= +github.com/sagikazarmark/crypt v0.19.0/go.mod h1:c6vimRziqqERhtSe0MhIvzE1w54FrCHtrXb5NH/ja78= +go.etcd.io/etcd/api/v3 v3.5.12/go.mod h1:Ot+o0SWSyT6uHhA56al1oCED0JImsRiU9Dc26+C2a+4= +go.etcd.io/etcd/client/pkg/v3 v3.5.12/go.mod h1:seTzl2d9APP8R5Y2hFL3NVlD6qC/dOT+3kvrqPyTas4= +go.etcd.io/etcd/client/v2 v2.305.12/go.mod h1:aQ/yhsxMu+Oht1FOupSr60oBvcS9cKXHrzBpDsPTf9E= +go.etcd.io/etcd/client/v3 v3.5.12/go.mod h1:tSbBCakoWmmddL+BKVAJHa9km+O/E+bumDe9mSbPiqw= +go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0/go.mod h1:Mjt1i1INqiaoZOMGR1RIUJN+i3ChKoFRqzrRQhlkbs0= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0/go.mod h1:p8pYQP+m5XfbZm9fxtSKAbM6oIllS7s2AfxrChvc7iw= +go.opentelemetry.io/otel v1.24.0/go.mod h1:W7b9Ozg4nkF5tWI5zsXkaKKDjdVjpD4oAt9Qi/MArHo= +go.opentelemetry.io/otel/metric v1.24.0/go.mod h1:VYhLe1rFfxuTXLgj4CBiyz+9WYBA8pNGJgDcSFRKBco= +go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw5uPdbs3UCjNU= +go.uber.org/zap v1.21.0/go.mod h1:wjWOCqI0f2ZZrJF/UufIOkiC8ii6tm1iqIsLo76RfJw= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/oauth2 v0.18.0/go.mod h1:Wf7knwG0MPoWIMMBgFlEaSUDaKskp0dCfrlJRJXbBi8= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= +golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8= +google.golang.org/api 
v0.171.0/go.mod h1:Hnq5AHm4OTMt2BUVjael2CWZFD6vksJdWCWiUAmjC9o= +google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= +google.golang.org/genproto v0.0.0-20240213162025-012b6fc9bca9/go.mod h1:mqHbVIp48Muh7Ywss/AD6I5kNVKZMmAa/QEW58Gxp2s= +google.golang.org/genproto/googleapis/api v0.0.0-20240311132316-a219d84964c2/go.mod h1:O1cOfN1Cy6QEYr7VxtjOyP5AdAuR0aJ/MYZaaof623Y= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240314234333-6e1732d8331c/go.mod h1:WtryC6hu0hhx87FDGxWCDptyssuo68sk10vYjF+T9fY= +google.golang.org/grpc v1.62.1/go.mod h1:IWTG0VlJLCh1SkC58F7np9ka9mx/WNkjl4PGJaiq+QE= +google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= diff --git a/internal/llm/openai/openai.go b/internal/llm/openai/openai.go index 0783ec6..220254e 100644 --- a/internal/llm/openai/openai.go +++ b/internal/llm/openai/openai.go @@ -3,12 +3,13 @@ package openai import ( "bytes" "encoding/json" - "errors" "fmt" - "github.com/telepace/voiceflow/pkg/config" - "github.com/telepace/voiceflow/pkg/logger" "io/ioutil" "net/http" + "time" + + "github.com/telepace/voiceflow/pkg/config" + "github.com/telepace/voiceflow/pkg/logger" ) // OpenAILLM 结构体存储 OpenAI 交互所需的信息 @@ -31,54 +32,63 @@ func NewOpenAILLM() *OpenAILLM { // GetResponse 调用 OpenAI API,获取对话模型的回复 func (o *OpenAILLM) GetResponse(prompt string) (string, error) { - // 定义请求体结构 - requestBody, err := json.Marshal(map[string]interface{}{ - "model": "text-davinci-003", // 或者其他模型 - "prompt": prompt, - "max_tokens": 150, - }) + if o.apiKey == "" { + return "", fmt.Errorf("OpenAI API key not configured") + } + + url := o.endpoint + + requestBody := map[string]interface{}{ + "model": "gpt-3.5-turbo", + "messages": []map[string]string{ + { + "role": "user", + "content": prompt, + }, + }, + "temperature": 0.7, + } + + jsonData, err := json.Marshal(requestBody) if err != nil { - return "", err + return "", fmt.Errorf("failed to marshal request: %w", err) } - // 创建请求 - req, err 
:= http.NewRequest("POST", o.endpoint, bytes.NewBuffer(requestBody)) + req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonData)) if err != nil { - return "", err + return "", fmt.Errorf("failed to create request: %w", err) } - req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", o.apiKey)) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+o.apiKey) - // 发送请求 - client := &http.Client{} + client := &http.Client{Timeout: 30 * time.Second} resp, err := client.Do(req) if err != nil { - return "", err + return "", fmt.Errorf("failed to send request: %w", err) } defer resp.Body.Close() - // 处理响应 if resp.StatusCode != http.StatusOK { - return "", errors.New("failed to get response from OpenAI") + body, _ := ioutil.ReadAll(resp.Body) + return "", fmt.Errorf("OpenAI API error (status %d): %s", resp.StatusCode, string(body)) } - body, err := ioutil.ReadAll(resp.Body) - if err != nil { - return "", err + var result struct { + Choices []struct { + Message struct { + Content string `json:"content"` + } `json:"message"` + } `json:"choices"` } - // 解析响应 - var result map[string]interface{} - if err := json.Unmarshal(body, &result); err != nil { - return "", err + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return "", fmt.Errorf("failed to decode response: %w", err) } - // 返回模型生成的文本 - if choices, ok := result["choices"].([]interface{}); ok && len(choices) > 0 { - if choice, ok := choices[0].(map[string]interface{}); ok { - return choice["text"].(string), nil - } + if len(result.Choices) == 0 { + return "", fmt.Errorf("no response from OpenAI") } - return "", errors.New("invalid response format from OpenAI") + return result.Choices[0].Message.Content, nil } diff --git a/internal/models/models.go b/internal/models/models.go index 9cb8a91..3e4e1a3 100644 --- a/internal/models/models.go +++ b/internal/models/models.go @@ -1,21 +1,22 @@ +// models.go package models import ( - "time" + "time" ) type 
Session struct { - ID string `json:"id"` - UserID string `json:"user_id"` - CreatedAt time.Time `json:"created_at"` - UpdatedAt time.Time `json:"updated_at"` + ID string `json:"id"` + UserID string `json:"user_id"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` } type Message struct { - ID string `json:"id"` - SessionID string `json:"session_id"` - Sender string `json:"sender"` // "user" 或 "assistant" - Content string `json:"content"` - AudioURL string `json:"audio_url"` - CreatedAt time.Time `json:"created_at"` -} \ No newline at end of file + ID string `json:"id"` + SessionID string `json:"session_id"` + Sender string `json:"sender"` // "user" 或 "assistant" + Content string `json:"content"` + AudioURL string `json:"audio_url"` + CreatedAt time.Time `json:"created_at"` +} diff --git a/internal/server/handlers.go b/internal/server/handlers.go index 71e3081..de6c9f4 100644 --- a/internal/server/handlers.go +++ b/internal/server/handlers.go @@ -3,13 +3,13 @@ package server import ( "encoding/json" - "github.com/telepace/voiceflow/pkg/config" - "github.com/telepace/voiceflow/pkg/logger" "net/http" "sync" + "github.com/telepace/voiceflow/pkg/config" + "github.com/telepace/voiceflow/pkg/logger" + "github.com/gorilla/websocket" - "github.com/telepace/voiceflow/internal/llm" "github.com/telepace/voiceflow/internal/storage" "github.com/telepace/voiceflow/internal/stt" "github.com/telepace/voiceflow/internal/tts" @@ -17,10 +17,10 @@ import ( var ( // 服务实例和锁 - serviceLock sync.RWMutex - sttService stt.Service - ttsService tts.Service - llmService llm.Service + serviceLock sync.RWMutex + sttService stt.Service + ttsService tts.Service + // llmService llm.Service storageService storage.Service ) @@ -31,90 +31,102 @@ func InitServices() { logger.Fatalf("配置初始化失败: %v", err) } sttService = stt.NewService(cfg.STT.Provider) - //ttsService = tts.NewService(cfg.TTS.Provider) - //llmService = llm.NewService(cfg.LLM.Provider) - //storageService = 
storage.NewService() + ttsService = tts.NewService(cfg.TTS.Provider) + // llmService = llm.NewService(cfg.LLM.Provider) + storageService = storage.NewService() } -func (s *Server) handleConnections(w http.ResponseWriter, r *http.Request) { - - // 检查服务实例是否为空 - if s == nil { - logger.Error("Server instance is nil in handleConnections") - http.Error(w, "Internal Server Error", http.StatusInternalServerError) - return - } else { - logger.Infof("Server instance is not nil in handleConnections: %v", s) - } +// 修改消息结构 +type TextMessage struct { + Text string `json:"text"` + RequireTTS bool `json:"require_tts"` +} - // 升级 WebSocket 连接 +func (s *Server) handleConnections(w http.ResponseWriter, r *http.Request) { ws, err := s.upgrader.Upgrade(w, r, nil) if err != nil { - logger.Errorf("WebSocket Upgrade error: %v", err) + logger.Error("WebSocket upgrade error: %v", err) return } defer ws.Close() + // 创建会话管理器 + sessionManager := NewSessionManager() + for { mt, data, err := ws.ReadMessage() if err != nil { - logger.Errorf("Read error: %v", err) + if websocket.IsCloseError(err, websocket.CloseGoingAway, websocket.CloseNormalClosure) { + logger.Info("WebSocket connection closed normally") + } else { + logger.Error("WebSocket read error", "error", err) + } break } - // 获取最新的服务实例 - serviceLock.RLock() - currentSTTService := sttService - currentTTSService := ttsService - currentLLMService := llmService - currentStorageService := storageService - serviceLock.RUnlock() - - if mt == websocket.TextMessage { - logger.Debug("Received text message") - // 处理文字消息 - var msg map[string]string + switch mt { + case websocket.TextMessage: + var msg map[string]interface{} if err := json.Unmarshal(data, &msg); err != nil { - logger.Error("JSON parse error: %v", err) + logger.Error("解析消息失败", "error", err) continue } - text := msg["text"] - // 调用 TTS 服务,将文字转换为语音 - audioData, err := currentTTSService.Synthesize(text) - if err != nil { - logger.Error("TTS error: %v", err) - continue + // 
检查是否是音频相关的控制消息 + if msgType, ok := msg["type"].(string); ok { + switch msgType { + case "audio_start": + sessionID, _ := msg["session_id"].(string) + sessionManager.StartSession(sessionID) + case "audio_end": + sessionID, _ := msg["session_id"].(string) + if err := sessionManager.EndSession(sessionID, ws); err != nil { + logger.Error("处理会话结束失败", "error", err) + } + } + } else { + // 处理普通文本消息 + text, _ := msg["text"].(string) + requireTTS, _ := msg["require_tts"].(bool) + + if requireTTS { + // 调用 TTS 服务 + audio, err := ttsService.Synthesize(text) + if err != nil { + logger.Error("语音合成失败", "error", err) + continue + } + + // 存储音频文件 + audioURL, err := storageService.StoreAudio(audio) + if err != nil { + logger.Error("存储音频失败", "error", err) + continue + } + + // 发送响应给客户端 + response := map[string]interface{}{ + "type": "tts_complete", + "text": text, + "audio_url": audioURL, + } + + if err := ws.WriteJSON(response); err != nil { + logger.Error("发送响应失败", "error", err) + } + } } - // 存储音频并获取 URL - audioURL, err := currentStorageService.StoreAudio(audioData) - if err != nil { - logger.Error("Storage error: %v", err) + case websocket.BinaryMessage: + currentSession := sessionManager.GetCurrentSession() + if currentSession == "" { + logger.Error("收到二进制数据但没有活动会话") continue } - // 返回音频 URL 给前端 - ws.WriteJSON(map[string]string{"audio_url": audioURL}) - } else if mt == websocket.BinaryMessage { - logger.Debug("Received binary message") - // 处理音频消息 - // 使用 STT 服务将语音转换为文字 - text, err := currentSTTService.Recognize(data) - if err != nil { - logger.Errorf("STT error: %v", err) - continue + if err := sessionManager.AppendAudioData(currentSession, data); err != nil { + logger.Error("追加音频数据失败", "error", err) } - - // 调用 LLM 服务获取响应 - responseText, err := currentLLMService.GetResponse(text) - if err != nil { - logger.Errorf("LLM error: %v", err) - continue - } - - // 返回文本响应给前端 - ws.WriteJSON(map[string]string{"text": responseText}) } } } diff --git 
a/internal/server/message/binary_handler.go b/internal/server/message/binary_handler.go new file mode 100644 index 0000000..4718ce1 --- /dev/null +++ b/internal/server/message/binary_handler.go @@ -0,0 +1,131 @@ +package message + +import ( + "bytes" + "fmt" + "sync" + + "github.com/gorilla/websocket" + "github.com/telepace/voiceflow/internal/storage" + "github.com/telepace/voiceflow/internal/stt" + "github.com/telepace/voiceflow/internal/tts" +) + +type BinaryMessageHandler struct { + stt stt.Service + tts tts.Service + storage storage.Service + audioBuffers map[string]*bytes.Buffer + bufferMutex sync.RWMutex +} + +func NewBinaryMessageHandler(stt stt.Service, tts tts.Service, storage storage.Service) *BinaryMessageHandler { + return &BinaryMessageHandler{ + stt: stt, + tts: tts, + storage: storage, + audioBuffers: make(map[string]*bytes.Buffer), + bufferMutex: sync.RWMutex{}, + } +} + +// HandleStart 处理音频开始信号 +func (h *BinaryMessageHandler) HandleStart(sessionID string) error { + h.bufferMutex.Lock() + defer h.bufferMutex.Unlock() + + if _, exists := h.audioBuffers[sessionID]; exists { + return fmt.Errorf("session already exists: %s", sessionID) + } + + h.audioBuffers[sessionID] = &bytes.Buffer{} + return nil +} + +// HandleAudioData 处理音频二进制数据 +func (h *BinaryMessageHandler) HandleAudioData(sessionID string, data []byte) error { + h.bufferMutex.Lock() + defer h.bufferMutex.Unlock() + + buffer, exists := h.audioBuffers[sessionID] + if !exists { + return fmt.Errorf("no active session found: %s", sessionID) + } + + _, err := buffer.Write(data) + if err != nil { + return fmt.Errorf("failed to write audio data: %w", err) + } + + return nil +} + +// HandleEnd 处理音频结束信号 +func (h *BinaryMessageHandler) HandleEnd(conn *websocket.Conn, sessionID string) error { + // 1. 获取并清理音频数据 + audioData, err := h.getAndCleanAudioData(sessionID) + if err != nil { + return fmt.Errorf("failed to get audio data: %w", err) + } + + // 2. 
存储到 MinIO + audioURL, err := h.storage.StoreAudio(audioData) + if err != nil { + return fmt.Errorf("failed to store audio: %w", err) + } + + // 3. 立即发送存储成功的响应 + err = conn.WriteJSON(map[string]interface{}{ + "type": "audio_stored", + "session_id": sessionID, + "audio_url": audioURL, + }) + if err != nil { + return fmt.Errorf("failed to send audio storage response: %w", err) + } + + // 4. 异步进行语音识别 + go func() { + text, err := h.stt.Recognize(audioData) + if err != nil { + // 发送错误响应 + conn.WriteJSON(map[string]interface{}{ + "type": "recognition_error", + "session_id": sessionID, + "error": err.Error(), + }) + return + } + + // 发送识别结果 + conn.WriteJSON(map[string]interface{}{ + "type": "recognition_complete", + "session_id": sessionID, + "text": text, + }) + }() + + return nil +} + +// getAndCleanAudioData 获取并清理音频数据 +func (h *BinaryMessageHandler) getAndCleanAudioData(sessionID string) ([]byte, error) { + h.bufferMutex.Lock() + defer h.bufferMutex.Unlock() + + buffer, exists := h.audioBuffers[sessionID] + if !exists { + return nil, fmt.Errorf("no audio data found for session: %s", sessionID) + } + + audioData := buffer.Bytes() + delete(h.audioBuffers, sessionID) + return audioData, nil +} + +// CleanupSession 清理指定会话的资源 +func (h *BinaryMessageHandler) CleanupSession(sessionID string) { + h.bufferMutex.Lock() + defer h.bufferMutex.Unlock() + delete(h.audioBuffers, sessionID) +} diff --git a/internal/server/message/text_handler.go b/internal/server/message/text_handler.go new file mode 100644 index 0000000..51f6a54 --- /dev/null +++ b/internal/server/message/text_handler.go @@ -0,0 +1,45 @@ +package message + +import ( + "github.com/gorilla/websocket" + "github.com/telepace/voiceflow/internal/storage" + "github.com/telepace/voiceflow/internal/tts" + "fmt" +) + +type TextMessageHandler struct { + tts tts.Service + storage storage.Service +} + +func NewTextMessageHandler(tts tts.Service, storage storage.Service) *TextMessageHandler { + return &TextMessageHandler{ + tts: 
tts, + storage: storage, + } +} + +func (h *TextMessageHandler) Handle(conn *websocket.Conn, msg *TextMessage) error { + // 如果需要TTS,直接合成语音 + if msg.RequireTTS { + audio, err := h.tts.Synthesize(msg.Text) + if err != nil { + return fmt.Errorf("failed to synthesize speech: %w", err) + } + + audioURL, err := h.storage.StoreAudio(audio) + if err != nil { + return fmt.Errorf("failed to store audio: %w", err) + } + + return conn.WriteJSON(map[string]string{ + "text": msg.Text, + "audio_url": audioURL, + }) + } + + // 如果不需要TTS,直接返回文本 + return conn.WriteJSON(map[string]string{ + "text": msg.Text, + }) +} diff --git a/internal/server/message/types.go b/internal/server/message/types.go new file mode 100644 index 0000000..8d3fbad --- /dev/null +++ b/internal/server/message/types.go @@ -0,0 +1,42 @@ +package message + +// MessageType 定义消息类型 +type MessageType int + +const ( + TextType MessageType = iota + BinaryType + SignalType +) + +// SignalMessage 定义信号消息 +type SignalMessage struct { + Type string `json:"type"` // 信号类型:"end" 等 + Session string `json:"session"` // 会话ID,用于关联音频片段 +} + +// TextMessage 保持不变 +type TextMessage struct { + Text string `json:"text"` + RequireTTS bool `json:"require_tts"` +} + +// BinaryMessage 添加会话信息 +type BinaryMessage struct { + Data []byte +} + +type AudioStartMessage struct { + Type string `json:"type"` // "audio_start" + SessionID string `json:"session_id"` +} + +type AudioEndMessage struct { + Type string `json:"type"` // "audio_end" + SessionID string `json:"session_id"` +} + +type MessageHandler interface { + HandleTextMessage(msg *TextMessage) error + HandleBinaryMessage(msg *BinaryMessage) error +} diff --git a/internal/server/middleware/error.go b/internal/server/middleware/error.go new file mode 100644 index 0000000..48e3f96 --- /dev/null +++ b/internal/server/middleware/error.go @@ -0,0 +1,24 @@ +package middleware + +import ( + "github.com/gorilla/websocket" +) + +type ErrorResponse struct { + Error string `json:"error"` + Code int 
`json:"code"` +} + +func ErrorHandler(handler func(*websocket.Conn, interface{}) error) func(*websocket.Conn, interface{}) error { + return func(conn *websocket.Conn, msg interface{}) error { + if err := handler(conn, msg); err != nil { + response := ErrorResponse{ + Error: err.Error(), + Code: 500, + } + return conn.WriteJSON(response) + } + return nil + + } +} diff --git a/internal/server/server.go b/internal/server/server.go index ae057f4..a481d2c 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -1,10 +1,11 @@ -// server.go +// internal/server/server.go package server import ( - "github.com/telepace/voiceflow/pkg/logger" "net/http" + "github.com/telepace/voiceflow/pkg/logger" + "github.com/gorilla/websocket" ) diff --git a/internal/server/session_manager.go b/internal/server/session_manager.go new file mode 100644 index 0000000..64b16fe --- /dev/null +++ b/internal/server/session_manager.go @@ -0,0 +1,101 @@ +package server + +import ( + "bytes" + "fmt" + "sync" + + "github.com/gorilla/websocket" +) + +type SessionManager struct { + sessions map[string]*bytes.Buffer + currentSession string + mu sync.RWMutex +} + +func NewSessionManager() *SessionManager { + return &SessionManager{ + sessions: make(map[string]*bytes.Buffer), + } +} + +func (sm *SessionManager) StartSession(sessionID string) { + sm.mu.Lock() + defer sm.mu.Unlock() + + sm.sessions[sessionID] = &bytes.Buffer{} + sm.currentSession = sessionID +} + +func (sm *SessionManager) GetCurrentSession() string { + sm.mu.RLock() + defer sm.mu.RUnlock() + return sm.currentSession +} + +func (sm *SessionManager) AppendAudioData(sessionID string, data []byte) error { + sm.mu.Lock() + defer sm.mu.Unlock() + + buffer, exists := sm.sessions[sessionID] + if !exists { + return fmt.Errorf("session not found: %s", sessionID) + } + + _, err := buffer.Write(data) + return err +} + +func (sm *SessionManager) EndSession(sessionID string, ws *websocket.Conn) error { + sm.mu.Lock() + defer 
sm.mu.Unlock() + + buffer, exists := sm.sessions[sessionID] + if !exists { + return fmt.Errorf("session not found: %s", sessionID) + } + + // 获取音频数据 + audioData := buffer.Bytes() + + // 1. 首先存储音频文件 + audioURL, err := storageService.StoreAudio(audioData) + if err != nil { + return fmt.Errorf("failed to store audio: %w", err) + } + + // 2. 发送音频存储成功的消息 + if err := ws.WriteJSON(map[string]interface{}{ + "type": "audio_stored", + "audio_url": audioURL, + }); err != nil { + return fmt.Errorf("failed to send audio storage response: %w", err) + } + + // 3. 进行语音识别 + text, err := sttService.Recognize(audioData) + if err != nil { + // 发送识别错误消息 + return ws.WriteJSON(map[string]interface{}{ + "type": "recognition_error", + "error": err.Error(), + }) + } + + // 4. 发送识别完成的消息 + if err := ws.WriteJSON(map[string]interface{}{ + "type": "recognition_complete", + "text": text, + }); err != nil { + return fmt.Errorf("failed to send recognition result: %w", err) + } + + // 5. 清理会话数据 + delete(sm.sessions, sessionID) + if sm.currentSession == sessionID { + sm.currentSession = "" + } + + return nil +} diff --git a/internal/storage/local.go b/internal/storage/local.go index 3ad2411..d75e0e8 100644 --- a/internal/storage/local.go +++ b/internal/storage/local.go @@ -1,3 +1,4 @@ +// local.go package storage import ( @@ -6,6 +7,8 @@ import ( "os" "path/filepath" "time" + "github.com/telepace/voiceflow/pkg/config" + "github.com/telepace/voiceflow/pkg/logger" ) type LocalStorageService struct { @@ -14,8 +17,12 @@ type LocalStorageService struct { // NewLocalStorageService 创建并返回一个新的本地存储服务 func NewLocalStorageService() *LocalStorageService { + cfg, err := config.GetConfig() + if err != nil { + logger.Fatalf("配置初始化失败: %v", err) + } return &LocalStorageService{ - storagePath: "./audio_files", // 设置本地存储路径 + storagePath: cfg.MinIO.StoragePath, // 使用相同的配置路径或单独配置 } } diff --git a/internal/storage/minio.go b/internal/storage/minio.go index 6650473..3112f47 100644 --- a/internal/storage/minio.go +++ 
b/internal/storage/minio.go @@ -1,23 +1,26 @@ +// storage.go package storage import ( "context" "fmt" - "github.com/google/uuid" // 用于生成唯一文件名 - "github.com/minio/minio-go/v7" - "github.com/minio/minio-go/v7/pkg/credentials" - "github.com/telepace/voiceflow/pkg/config" - "github.com/telepace/voiceflow/pkg/logger" "io/ioutil" "log" "net/url" "os" "time" + + "github.com/google/uuid" // 用于生成唯一文件名 + "github.com/minio/minio-go/v7" + "github.com/minio/minio-go/v7/pkg/credentials" + "github.com/telepace/voiceflow/pkg/config" + "github.com/telepace/voiceflow/pkg/logger" ) type MinIOService struct { - client *minio.Client - bucketName string + client *minio.Client + bucketName string + storagePath string } // NewMinIOService 创建并返回 MinIO 客户端 @@ -37,8 +40,9 @@ func NewMinIOService() *MinIOService { } return &MinIOService{ - client: minioClient, - bucketName: cfg.MinIO.BucketName, + client: minioClient, + bucketName: cfg.MinIO.BucketName, + storagePath: cfg.MinIO.StoragePath, } } @@ -60,8 +64,8 @@ func (m *MinIOService) StoreAudio(audioData []byte) (string, error) { return "", fmt.Errorf("error writing audio to temp file: %v", err) } - // 生成唯一文件名 - objectName := uuid.New().String() + ".wav" + // 生成唯一文件名,并添加存储路径前缀 + objectName := m.storagePath + uuid.New().String() + ".wav" // 上传文件到 MinIO info, err := m.client.FPutObject(ctx, m.bucketName, objectName, tempFile.Name(), minio.PutObjectOptions{ diff --git a/internal/stt/assemblyai/assemblyai.go b/internal/stt/assemblyai/assemblyai.go index 2e41611..6ed02ba 100644 --- a/internal/stt/assemblyai/assemblyai.go +++ b/internal/stt/assemblyai/assemblyai.go @@ -5,12 +5,12 @@ import ( "bytes" "encoding/json" "fmt" - "github.com/go-audio/audio" - "github.com/go-audio/wav" - "github.com/telepace/voiceflow/pkg/config" - "github.com/telepace/voiceflow/pkg/logger" "io" "net/http" + "time" + + "github.com/telepace/voiceflow/pkg/config" + "github.com/telepace/voiceflow/pkg/logger" ) const WAVE_FORMAT_PCM = 1 @@ -46,29 +46,41 @@ func (a 
*AssemblyAI) Recognize(audioData []byte) (string, error) { if err != nil { return "", fmt.Errorf("failed to transcribe audio: %v", err) } + + // 打印转录文本 + logger.Infof("Transcription result: %s", transcriptText) + return transcriptText, nil } func (a *AssemblyAI) uploadAudioData(audioData []byte) (string, error) { - uploadURL := "https://api.assemblyai.com/v2/upload" + url := "https://api.assemblyai.com/v2/upload" - req, err := http.NewRequest("POST", uploadURL, bytes.NewReader(audioData)) + req, err := http.NewRequest("POST", url, bytes.NewReader(audioData)) if err != nil { - return "", err + return "", fmt.Errorf("failed to create request: %v", err) } req.Header.Set("Authorization", a.apiKey) - req.Header.Set("Content-Type", "application/octet-stream") + req.Header.Set("Content-Type", "audio/wav") + req.Header.Set("Transfer-Encoding", "chunked") + + logger.Infof("Uploading audio data, size: %d bytes", len(audioData)) + + client := &http.Client{ + Timeout: 30 * time.Second, + } - client := &http.Client{} resp, err := client.Do(req) if err != nil { - return "", err + return "", fmt.Errorf("failed to upload: %v", err) } defer resp.Body.Close() + body, _ := io.ReadAll(resp.Body) + logger.Infof("Upload response status: %d, body: %s", resp.StatusCode, string(body)) + if resp.StatusCode != http.StatusOK { - body, _ := io.ReadAll(resp.Body) return "", fmt.Errorf("upload failed with status %d: %s", resp.StatusCode, string(body)) } @@ -76,8 +88,8 @@ func (a *AssemblyAI) uploadAudioData(audioData []byte) (string, error) { UploadURL string `json:"upload_url"` } - if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { - return "", err + if err := json.Unmarshal(body, &result); err != nil { + return "", fmt.Errorf("failed to decode response: %v", err) } return result.UploadURL, nil @@ -86,12 +98,13 @@ func (a *AssemblyAI) uploadAudioData(audioData []byte) (string, error) { func (a *AssemblyAI) requestTranscription(uploadURL string) (string, error) { transcriptURL := 
"https://api.assemblyai.com/v2/transcript" + logger.Infof("Sending transcription request for audio URL: %s", uploadURL) + requestBody := map[string]interface{}{ - "audio_url": uploadURL, - "language_code": "en_us", - "punctuate": true, - "format_text": true, - "wait_for_completion": true, + "audio_url": uploadURL, + // "language_code": "zh", + "punctuate": true, + "format_text": true, } requestBodyBytes, err := json.Marshal(requestBody) @@ -122,54 +135,96 @@ func (a *AssemblyAI) requestTranscription(uploadURL string) (string, error) { var result struct { ID string `json:"id"` Status string `json:"status"` - Text string `json:"text"` - Error string `json:"error"` } if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { return "", err } - if result.Status != "completed" { - return "", fmt.Errorf("transcription failed with status %s: %s", result.Status, result.Error) - } - - return result.Text, nil + // 轮询等待转录完成 + pollURL := fmt.Sprintf("%s/%s", transcriptURL, result.ID) + for i := 0; i < 30; i++ { // 最多等待30次,每次3秒 + time.Sleep(3 * time.Second) + + req, err := http.NewRequest("GET", pollURL, nil) + if err != nil { + return "", err + } + req.Header.Set("Authorization", a.apiKey) + + resp, err := client.Do(req) + if err != nil { + return "", err + } + + var pollResult struct { + Status string `json:"status"` + Text string `json:"text"` + Error string `json:"error"` + } + + if err := json.NewDecoder(resp.Body).Decode(&pollResult); err != nil { + resp.Body.Close() + return "", err + } + resp.Body.Close() + + logger.Infof("Transcription status: %s", pollResult.Status) + + switch pollResult.Status { + case "completed": + // 打印最终转录文本 + logger.Infof("Final transcription text: %s", pollResult.Text) + return pollResult.Text, nil + case "error": + return "", fmt.Errorf("transcription failed: %s", pollResult.Error) + case "processing", "queued": + continue + default: + return "", fmt.Errorf("unknown status: %s", pollResult.Status) + } + } + + return "", 
fmt.Errorf("transcription timeout after 90 seconds") } func wrapPCMDataToWAV(pcmData []byte) ([]byte, error) { + outBuffer := bytes.NewBuffer(nil) + const ( - sampleRate = 16000 - bitDepth = 16 - numChannels = 1 + sampleRate = 16000 + numChannels = 1 + bitsPerSample = 16 ) - buf := &BufferWriteSeeker{} + outBuffer.WriteString("RIFF") + writeInt32(outBuffer, uint32(len(pcmData)+36)) + outBuffer.WriteString("WAVE") - encoder := wav.NewEncoder(buf, sampleRate, bitDepth, numChannels, WAVE_FORMAT_PCM) + outBuffer.WriteString("fmt ") + writeInt32(outBuffer, 16) + writeInt16(outBuffer, 1) + writeInt16(outBuffer, numChannels) + writeInt32(outBuffer, sampleRate) + writeInt32(outBuffer, sampleRate*numChannels*bitsPerSample/8) + writeInt16(outBuffer, numChannels*bitsPerSample/8) + writeInt16(outBuffer, bitsPerSample) - // 将 PCM 数据转换为 audio.IntBuffer - intBuf := &audio.IntBuffer{ - Data: make([]int, len(pcmData)/2), - Format: &audio.Format{SampleRate: sampleRate, NumChannels: numChannels}, - SourceBitDepth: bitDepth, - } - - // 假设 PCM 数据是 16 位有符号整数(小端序) - for i := 0; i+1 < len(pcmData); i += 2 { - sample := int16(pcmData[i]) | int16(pcmData[i+1])<<8 - intBuf.Data[i/2] = int(sample) - } + outBuffer.WriteString("data") + writeInt32(outBuffer, uint32(len(pcmData))) + outBuffer.Write(pcmData) - // 写入缓冲区 - if err := encoder.Write(intBuf); err != nil { - return nil, err - } + return outBuffer.Bytes(), nil +} - // 关闭编码器以刷新数据 - if err := encoder.Close(); err != nil { - return nil, err - } +func writeInt16(w *bytes.Buffer, value uint16) { + w.WriteByte(byte(value)) + w.WriteByte(byte(value >> 8)) +} - return buf.Bytes(), nil +func writeInt32(w *bytes.Buffer, value uint32) { + w.WriteByte(byte(value)) + w.WriteByte(byte(value >> 8)) + w.WriteByte(byte(value >> 16)) + w.WriteByte(byte(value >> 24)) } diff --git a/internal/stt/aws/aws.go b/internal/stt/aws/aws.go new file mode 100644 index 0000000..ff5988a --- /dev/null +++ b/internal/stt/aws/aws.go @@ -0,0 +1,156 @@ +// 
internal/stt/aws/aws.go +package aws + +import ( + "context" + "fmt" + "github.com/telepace/voiceflow/pkg/sttservice" + "time" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/credentials" + "github.com/aws/aws-sdk-go/aws/session" + transcribe "github.com/aws/aws-sdk-go/service/transcribestreamingservice" + "github.com/aws/aws-sdk-go/service/transcribestreamingservice/transcribestreamingserviceiface" + "github.com/telepace/voiceflow/pkg/config" +) + +type Service struct { + client transcribestreamingserviceiface.TranscribeStreamingServiceAPI + config *config.AWSConfig +} + +// 确保 Service 实现了 sttservice.Service 接口 +var _ sttservice.Service = (*Service)(nil) + +// NewService 创建新的 AWS STT 服务 +func NewService(cfg *config.AWSConfig) (sttservice.Service, error) { + awsConfig := &aws.Config{ + Region: aws.String(cfg.Region), + Credentials: credentials.NewStaticCredentials(cfg.AccessKeyID, cfg.SecretAccessKey, ""), + } + sess, err := session.NewSession(awsConfig) + if err != nil { + return nil, fmt.Errorf("无法创建 AWS 会话:%v", err) + } + client := transcribe.New(sess) + return &Service{ + client: client, + config: cfg, + }, nil +} + +// Recognize 实现了 stt.Service 接口的 Recognize 方法 +func (s *Service) Recognize(audioData []byte) (string, error) { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) + defer cancel() + + input := &transcribe.StartStreamTranscriptionInput{ + LanguageCode: aws.String("en-US"), + MediaEncoding: aws.String("pcm"), + MediaSampleRateHertz: aws.Int64(16000), + } + + output, err := s.client.StartStreamTranscriptionWithContext(ctx, input) + if err != nil { + return "", fmt.Errorf("无法开始转录流:%v", err) + } + + eventStream := output.GetStream() + + // 发送音频数据 + go func() { + defer eventStream.Close() + err := eventStream.Send(ctx, &transcribe.AudioEvent{ + AudioChunk: audioData, + }) + if err != nil { + fmt.Printf("发送音频数据时出错:%v\n", err) + return + } + // 发送完成后关闭发送方向的流 + eventStream.Close() + }() + + // 接收转录结果 + var 
transcript string + for event := range eventStream.Events() { + switch e := event.(type) { + case *transcribe.TranscriptEvent: + results := e.Transcript.Results + for _, result := range results { + if !aws.BoolValue(result.IsPartial) { + for _, alt := range result.Alternatives { + transcript += aws.StringValue(alt.Transcript) + } + } + } + } + } + + if err := eventStream.Err(); err != nil { + return "", fmt.Errorf("转录错误:%v", err) + } + + return transcript, nil +} + +// StreamRecognize 实现了 stt.Service 接口的 StreamRecognize 方法 +func (s *Service) StreamRecognize(ctx context.Context, audioDataChan <-chan []byte, transcriptChan chan<- string) error { + input := &transcribe.StartStreamTranscriptionInput{ + LanguageCode: aws.String("en-US"), + MediaEncoding: aws.String("pcm"), + MediaSampleRateHertz: aws.Int64(16000), + } + + output, err := s.client.StartStreamTranscriptionWithContext(ctx, input) + if err != nil { + return fmt.Errorf("无法开始转录流:%v", err) + } + + eventStream := output.GetStream() + + // 发送音频数据的协程 + go func() { + defer eventStream.Close() + for { + select { + case audioChunk, ok := <-audioDataChan: + if !ok { + // 音频数据通道已关闭,结束发送 + return + } + err := eventStream.Send(ctx, &transcribe.AudioEvent{ + AudioChunk: audioChunk, + }) + if err != nil { + fmt.Printf("发送音频块时出错:%v\n", err) + return + } + case <-ctx.Done(): + return + } + } + }() + + // 接收转录结果 + for event := range eventStream.Events() { + switch e := event.(type) { + case *transcribe.TranscriptEvent: + results := e.Transcript.Results + for _, result := range results { + for _, alt := range result.Alternatives { + transcript := aws.StringValue(alt.Transcript) + // 发送部分转录结果 + transcriptChan <- transcript + } + } + } + } + + if err := eventStream.Err(); err != nil { + return fmt.Errorf("转录错误:%v", err) + } + + return nil +} diff --git a/internal/stt/volcengine/huoshan.go b/internal/stt/volcengine/huoshan.go deleted file mode 100644 index 920876b..0000000 --- a/internal/stt/volcengine/huoshan.go +++ /dev/null @@ 
-1,338 +0,0 @@ -// volcengine.go -package volcengine - -import ( - "bytes" - "compress/gzip" - "encoding/binary" - "encoding/json" - "fmt" - "github.com/google/uuid" - "github.com/gorilla/websocket" - "github.com/telepace/voiceflow/pkg/config" - "github.com/telepace/voiceflow/pkg/logger" - "io" - "net/http" -) - -type STT struct { - wsURL string - uid string - rate int - format string - bits int - channel int - codec string - accessKey string - appKey string -} - -func NewVolcengineSTT() *STT { - cfg, err := config.GetConfig() - if err != nil { - logger.Fatalf("配置初始化失败: %v", err) - } - return &STT{ - wsURL: cfg.Volcengine.WsURL, - uid: cfg.Volcengine.UID, - rate: cfg.Volcengine.Rate, - format: cfg.Volcengine.Format, - bits: cfg.Volcengine.Bits, - channel: cfg.Volcengine.Channel, - codec: cfg.Volcengine.Codec, - accessKey: cfg.Volcengine.AccessKey, - appKey: cfg.Volcengine.AppKey, - } -} - -func (s *STT) Recognize(audioData []byte) (string, error) { - reqID := uuid.New().String() - connectorID := uuid.New().String() - - header := http.Header{} - header.Set("X-Api-Resource-Id", "volc.bigasr.sauc.duration") - header.Set("X-Api-Access-Key", s.accessKey) - header.Set("X-Api-App-Key", s.appKey) - header.Set("X-Api-Connect-Id", connectorID) - - dialer := websocket.DefaultDialer - conn, resp, err := dialer.Dial(s.wsURL, header) - if err != nil { - logger.Error("WebSocket 连接错误:", "reqID:", reqID, "ws URL:", s.wsURL, "api Key:", s.accessKey, err) - return "", err - } - defer conn.Close() - - // 检查并打印 X-Api-Connect-Id 和 X-Tt-Logid - if connectID := resp.Header.Get("X-Api-Connect-Id"); connectID != "" { - logger.Infof("连接追踪ID: X-Api-Connect-Id = %s", connectID) - } - if logID := resp.Header.Get("X-Tt-Logid"); logID != "" { - logger.Infof("服务端返回的logid: X-Tt-Logid = %s", logID) - } - - // 构建并发送初始请求 - req := map[string]interface{}{ - "user": map[string]interface{}{ - "uid": s.uid, - }, - "audio": map[string]interface{}{ - "format": s.format, - "rate": s.rate, - "bits": s.bits, - 
"channel": s.channel, - "codec": s.codec, - }, - } - - payloadBytes, err := json.Marshal(req) - if err != nil { - return "", err - } - - compressedPayload, err := gzipCompress(payloadBytes) - if err != nil { - return "", err - } - - err = s.sendMessage(conn, FULL_CLIENT_REQUEST, POS_SEQUENCE, JSON_SERIALIZATION, compressedPayload, 1) - if err != nil { - logger.Errorf("发送初始消息错误: %v", err) - return "", err - } - - // 处理响应 - var finalText string - for { - _, respData, err := conn.ReadMessage() - if err != nil { - if websocket.IsCloseError(err, websocket.CloseNormalClosure) { - logger.Info("WebSocket 连接正常关闭") - break - } else { - logger.Errorf("读取响应错误: %v", err) - return "", err - } - } - - result, err := parseResponse(respData) - if err != nil { - logger.Errorf("解析响应错误: %v", err) - return "", err - } - - if payloadMsg, ok := result["payload_msg"]; ok { - if payloadMap, ok := payloadMsg.(map[string]interface{}); ok { - if resultMap, ok := payloadMap["result"].(map[string]interface{}); ok { - if text, ok := resultMap["text"].(string); ok { - logger.Infof("识别结果: %s", text) - finalText = text - } - } - } - } - - if isLast, ok := result["is_last_package"].(bool); ok && isLast { - break - } - } - - if finalText == "" { - return "", fmt.Errorf("未在响应中找到识别结果") - } - return finalText, nil -} - -func (s *STT) sendMessage(conn *websocket.Conn, messageType, flags, serialization byte, payload []byte, sequence int32) error { - header := generateHeader(messageType, flags, serialization, GZIP_COMPRESSION, 0x00) - beforePayload := generateBeforePayload(sequence) - payloadSize := make([]byte, 4) - binary.BigEndian.PutUint32(payloadSize, uint32(len(payload))) - - fullMessage := bytes.NewBuffer(header) - fullMessage.Write(beforePayload) - fullMessage.Write(payloadSize) - fullMessage.Write(payload) - - return conn.WriteMessage(websocket.BinaryMessage, fullMessage.Bytes()) -} - -// gzipCompress 压缩数据 -func gzipCompress(data []byte) ([]byte, error) { - var buf bytes.Buffer - gz := 
gzip.NewWriter(&buf) - if _, err := gz.Write(data); err != nil { - return nil, err - } - if err := gz.Close(); err != nil { - return nil, err - } - return buf.Bytes(), nil -} // 定义协议相关的常量和函数 - -const ( - PROTOCOL_VERSION byte = 0x01 - DEFAULT_HEADER_SIZE byte = 0x01 - - // 消息类型 - FULL_CLIENT_REQUEST byte = 0x01 - AUDIO_ONLY_REQUEST byte = 0x02 - FULL_SERVER_RESPONSE byte = 0x09 - SERVER_ACK byte = 0x0B - SERVER_ERROR_RESPONSE byte = 0x0F - - POS_SEQUENCE byte = 0x01 - NEG_SEQUENCE byte = 0x02 - NEG_WITH_SEQUENCE byte = 0x03 - - // 序列化方法 - NO_SERIALIZATION byte = 0x00 - JSON_SERIALIZATION byte = 0x01 - - // 压缩类型 - NO_COMPRESSION byte = 0x00 - GZIP_COMPRESSION byte = 0x01 -) - -func generateHeader( - messageType byte, - messageTypeSpecificFlags byte, - serialMethod byte, - compressionType byte, - reservedData byte, -) []byte { - protocolVersion := PROTOCOL_VERSION - headerSize := DEFAULT_HEADER_SIZE - header := []byte{ - (protocolVersion << 4) | headerSize, - (messageType << 4) | messageTypeSpecificFlags, - (serialMethod << 4) | compressionType, - reservedData, - } - return header -} - -func generateBeforePayload(sequence int32) []byte { - buf := new(bytes.Buffer) - err := binary.Write(buf, binary.BigEndian, sequence) - if err != nil { - logger.Errorf("Error in generateBeforePayload: %v", err) - return nil - } - return buf.Bytes() -} - -func parseResponse(data []byte) (map[string]interface{}, error) { - if len(data) < 4 { - return nil, fmt.Errorf("响应数据过短") - } - //protocolVersion := data[0] >> 4 - headerSize := data[0] & 0x0F - messageType := data[1] >> 4 - messageTypeSpecificFlags := data[1] & 0x0F - serializationMethod := data[2] >> 4 - compressionType := data[2] & 0x0F - // reserved := data[3] - - payloadData := data[headerSize*4:] - - result := make(map[string]interface{}) - result["is_last_package"] = false - - if messageTypeSpecificFlags&0x01 != 0 { - // 带序列号的帧 - if len(payloadData) < 4 { - return nil, fmt.Errorf("payload 长度不足以包含序列号") - } - var seq int32 - buf 
:= bytes.NewReader(payloadData[:4]) - err := binary.Read(buf, binary.BigEndian, &seq) - if err != nil { - return nil, err - } - result["payload_sequence"] = seq - payloadData = payloadData[4:] - } - - if messageTypeSpecificFlags&0x02 != 0 { - // 最后一个包 - result["is_last_package"] = true - } - - var payloadMsg []byte - var payloadSize uint32 - if messageType == FULL_SERVER_RESPONSE { - if len(payloadData) < 4 { - return nil, fmt.Errorf("payload 长度不足以包含大小信息") - } - payloadSize = binary.BigEndian.Uint32(payloadData[:4]) - payloadMsg = payloadData[4:] - } else if messageType == SERVER_ACK { - if len(payloadData) < 4 { - return nil, fmt.Errorf("payload 长度不足以包含序列号") - } - var seq int32 - buf := bytes.NewReader(payloadData[:4]) - err := binary.Read(buf, binary.BigEndian, &seq) - if err != nil { - return nil, err - } - result["seq"] = seq - if len(payloadData) >= 8 { - payloadSize = binary.BigEndian.Uint32(payloadData[4:8]) - payloadMsg = payloadData[8:] - } - } else if messageType == SERVER_ERROR_RESPONSE { - if len(payloadData) < 8 { - return nil, fmt.Errorf("payload 长度不足以包含错误代码和大小信息") - } - code := binary.BigEndian.Uint32(payloadData[:4]) - result["code"] = code - payloadSize = binary.BigEndian.Uint32(payloadData[4:8]) - payloadMsg = payloadData[8:] - } - - if payloadMsg != nil { - if compressionType == GZIP_COMPRESSION { - gr, err := gzip.NewReader(bytes.NewReader(payloadMsg)) - if err != nil { - return nil, err - } - decompressedData, err := io.ReadAll(gr) - gr.Close() - if err != nil { - return nil, err - } - payloadMsg = decompressedData - } - - if serializationMethod == JSON_SERIALIZATION { - var payloadObj interface{} - if err := json.Unmarshal(payloadMsg, &payloadObj); err != nil { - return nil, err - } - result["payload_msg"] = payloadObj - } else if serializationMethod != NO_SERIALIZATION { - result["payload_msg"] = string(payloadMsg) - } - result["payload_size"] = payloadSize - } - - // 打印解析后的响应内容 - logger.Infof("解析后的响应内容: %+v", result) - - return result, nil 
-} - -func sliceData(data []byte, chunkSize int) [][]byte { - var chunks [][]byte - dataLen := len(data) - for i := 0; i < dataLen; i += chunkSize { - end := i + chunkSize - if end > dataLen { - end = dataLen - } - chunks = append(chunks, data[i:end]) - } - return chunks -} diff --git a/internal/stt/volcengine/volcengine.go b/internal/stt/volcengine/volcengine.go new file mode 100644 index 0000000..a56a984 --- /dev/null +++ b/internal/stt/volcengine/volcengine.go @@ -0,0 +1,396 @@ +// volcengine.go +package volcengine + +import ( + "encoding/binary" + "encoding/json" + "fmt" + "net/http" + "time" + + "github.com/google/uuid" + "github.com/gorilla/websocket" + "github.com/telepace/voiceflow/pkg/config" + "github.com/telepace/voiceflow/pkg/logger" +) + +type STT struct { + wsURL string + uid string + rate int + format string + bits int + channel int + codec string + accessKey string + appKey string + resourceID string +} + +func NewVolcengineSTT() *STT { + cfg, err := config.GetConfig() + if err != nil { + logger.Fatalf("配置初始化失败: %v", err) + } + + sttCfg := cfg.Volcengine.STT + return &STT{ + wsURL: sttCfg.WsURL, + uid: sttCfg.UID, + rate: sttCfg.Rate, + format: sttCfg.Format, + bits: sttCfg.Bits, + channel: sttCfg.Channel, + codec: sttCfg.Codec, + accessKey: sttCfg.AccessKey, + appKey: sttCfg.AppKey, + resourceID: sttCfg.ResourceID, + } +} + +func (s *STT) Recognize(audioData []byte) (string, error) { + //reqID := uuid.New().String() + connectID := uuid.New().String() + + header := http.Header{} + header.Set("X-Api-Access-Key", s.accessKey) + header.Set("X-Api-App-Key", s.appKey) + header.Set("X-Api-Resource-Id", s.resourceID) + //header.Set("X-Api-Request-Id", reqID) + header.Set("X-Api-Connect-Id", connectID) + + logger.Infof("Connecting to WebSocket URL: %s", s.wsURL) + logger.Infof("Request Headers: %v", header) + + dialer := websocket.DefaultDialer + conn, resp, err := dialer.Dial(s.wsURL, header) + if err != nil { + logger.Errorf("WebSocket 连接错误: %v", err) + 
return "", err + } + defer conn.Close() + + // 检查并打印 X-Api-Connect-Id 和 X-Tt-Logid + if connectID := resp.Header.Get("X-Api-Connect-Id"); connectID != "" { + logger.Infof("连接追踪ID: X-Api-Connect-Id = %s", connectID) + } + if logID := resp.Header.Get("X-Tt-Logid"); logID != "" { + logger.Infof("服务端返回的 logid: X-Tt-Logid = %s", logID) + } + + // 构建并发送初始请求 + req := map[string]interface{}{ + "user": map[string]interface{}{ + "uid": s.uid, + }, + "audio": map[string]interface{}{ + "format": s.format, + "rate": s.rate, + "bits": s.bits, + "channel": s.channel, + "codec": s.codec, + "language": "zh-CN", + }, + "request": map[string]interface{}{ + "model_name": "bigmodel", + "enable_itn": false, + "enable_punc": false, + "enable_ddc": false, + "show_utterances": false, + "result_type": "full", + }, + } + + payloadBytes, err := json.Marshal(req) + if err != nil { + return "", err + } + + // 不使用压缩,直接发送 + fullClientRequest := generateHeader( + FULL_CLIENT_REQUEST, + NOT_LAST_PACKAGE_NO_SEQUENCE, + JSON_SERIALIZATION, + NO_COMPRESSION, + 0x00, + ) + payloadSize := make([]byte, 4) + binary.BigEndian.PutUint32(payloadSize, uint32(len(payloadBytes))) + + message := append(fullClientRequest, payloadSize...) + message = append(message, payloadBytes...) 
+ + err = conn.WriteMessage(websocket.BinaryMessage, message) + if err != nil { + logger.Errorf("发送初始消息错误: %v", err) + return "", err + } + + // 接收服务器的初始响应 + _, resData, err := conn.ReadMessage() + if err != nil { + logger.Errorf("读取响应错误: %v", err) + return "", err + } + + result, err := parseResponse(resData) + if err != nil { + logger.Errorf("解析响应错误: %v", err) + return "", err + } + + if errCode, ok := result["error_code"]; ok { + logger.Errorf("服务器返回错误码 %v: %v", errCode, result["error_msg"]) + return "", fmt.Errorf("服务器返回错误码 %v: %v", errCode, result["error_msg"]) + } + + logger.Infof("初始响应: %+v", result) + + // 发送音频数据 + // 将音频数据按照块大小分片发送 + chunkSize := 3200 // 根据需求调整,每个包的音频时长约 100ms(16kHz 采样率,16 位深度,单声道) + audioChunks := sliceData(audioData, chunkSize) + + for i, chunk := range audioChunks { + isLast := i == len(audioChunks)-1 + + flags := NOT_LAST_PACKAGE_NO_SEQUENCE + if isLast { + flags = LAST_PACKAGE_NO_SEQUENCE + } + + audioRequest := generateHeader( + AUDIO_ONLY_REQUEST, + flags, + NO_SERIALIZATION, + NO_COMPRESSION, + 0x00, + ) + + payloadSize := make([]byte, 4) + binary.BigEndian.PutUint32(payloadSize, uint32(len(chunk))) + + message := append(audioRequest, payloadSize...) + message = append(message, chunk...) 
+ + err = conn.WriteMessage(websocket.BinaryMessage, message) + if err != nil { + logger.Errorf("发送音频数据错误: %v", err) + return "", err + } + + logger.Debugf("发送音频数据包 %d", i+1) + + // 接收服务器响应 + if !isLast { + // 非最后一包,尝试读取中间结果 + conn.SetReadDeadline(time.Now().Add(500 * time.Millisecond)) + _, resData, err = conn.ReadMessage() + if err != nil { + if websocket.IsUnexpectedCloseError(err) { + logger.Errorf("读取响应错误: %v", err) + return "", err + } else { + // 超时或非致命错误,继续发送 + continue + } + } + + result, err = parseResponse(resData) + if err != nil { + logger.Errorf("解析响应错误: %v", err) + continue + } + + if errCode, ok := result["error_code"]; ok { + logger.Errorf("服务器返回错误码 %v: %v", errCode, result["error_msg"]) + return "", fmt.Errorf("服务器返回错误码 %v: %v", errCode, result["error_msg"]) + } + + logger.Infof("中间响应: %+v", result) + } + } + + // 接收服务器的最终响应 + var finalText string + for { + _, resData, err := conn.ReadMessage() + if err != nil { + if websocket.IsCloseError(err, websocket.CloseNormalClosure) { + logger.Info("WebSocket 连接正常关闭") + break + } else { + logger.Errorf("读取响应错误: %v", err) + return "", err + } + } + + result, err = parseResponse(resData) + if err != nil { + logger.Errorf("解析响应错误: %v", err) + return "", err + } + + if errCode, ok := result["error_code"]; ok { + logger.Errorf("服务器返回错误码 %v: %v", errCode, result["error_msg"]) + return "", fmt.Errorf("服务器返回错误码 %v: %v", errCode, result["error_msg"]) + } + + logger.Infof("收到响应: %+v", result) + + if payloadMsg, ok := result["payload_msg"]; ok { + if payloadMap, ok := payloadMsg.(map[string]interface{}); ok { + if resultMap, ok := payloadMap["result"].(map[string]interface{}); ok { + if text, ok := resultMap["text"].(string); ok { + finalText = text + logger.Infof("识别结果: %s", text) + } + } + } + } + + if isLast, ok := result["is_last_package"].(bool); ok && isLast { + break + } + } + + if finalText == "" { + return "", fmt.Errorf("未在响应中找到识别结果") + } + return finalText, nil +} + +// 定义协议相关的常量和函数 +const ( + 
PROTOCOL_VERSION byte = 0x01 + DEFAULT_HEADER_SIZE byte = 0x01 + + // 消息类型 + FULL_CLIENT_REQUEST byte = 0x01 + AUDIO_ONLY_REQUEST byte = 0x02 + FULL_SERVER_RESPONSE byte = 0x09 + SERVER_ERROR_RESPONSE byte = 0x0F + + // Message Type Specific Flags + NOT_LAST_PACKAGE_NO_SEQUENCE byte = 0x00 + LAST_PACKAGE_NO_SEQUENCE byte = 0x02 + + // 序列化方法 + NO_SERIALIZATION byte = 0x00 + JSON_SERIALIZATION byte = 0x01 + + // 压缩类型 + NO_COMPRESSION byte = 0x00 + GZIP_COMPRESSION byte = 0x01 +) + +func generateHeader( + messageType byte, + messageTypeSpecificFlags byte, + serialMethod byte, + compressionType byte, + reservedData byte, +) []byte { + protocolVersion := PROTOCOL_VERSION + headerSize := DEFAULT_HEADER_SIZE + header := []byte{ + (protocolVersion << 4) | headerSize, + (messageType << 4) | messageTypeSpecificFlags, + (serialMethod << 4) | compressionType, + reservedData, + } + return header +} + +func parseResponse(data []byte) (map[string]interface{}, error) { + if len(data) < 4 { + return nil, fmt.Errorf("响应数据过短") + } + //protocolVersion := data[0] >> 4 + headerSize := data[0] & 0x0F + messageType := data[1] >> 4 + messageTypeSpecificFlags := data[1] & 0x0F + serializationMethod := data[2] >> 4 + compressionType := data[2] & 0x0F + // reserved := data[3] + + headerLength := int(headerSize) * 4 + if len(data) < headerLength { + return nil, fmt.Errorf("数据长度不足以包含完整的头部") + } + + payload := data[headerLength:] + + result := make(map[string]interface{}) + + if messageType == FULL_SERVER_RESPONSE { + if len(payload) < 8 { + return nil, fmt.Errorf("payload 长度不足以包含序列号和大小信息") + } + sequence := binary.BigEndian.Uint32(payload[0:4]) + payloadSize := binary.BigEndian.Uint32(payload[4:8]) + + if len(payload) < int(8+payloadSize) { + return nil, fmt.Errorf("payload 长度不足以包含完整的消息") + } + + payloadMsg := payload[8 : 8+payloadSize] + + if compressionType == GZIP_COMPRESSION { + // 本例中不使用压缩,保留此代码以备后用 + } + + if serializationMethod == JSON_SERIALIZATION { + var payloadObj interface{} + if 
err := json.Unmarshal(payloadMsg, &payloadObj); err != nil { + return nil, err + } + result["payload_msg"] = payloadObj + } else if serializationMethod != NO_SERIALIZATION { + result["payload_msg"] = string(payloadMsg) + } + + result["sequence"] = sequence + + if messageTypeSpecificFlags&0x02 != 0 { + result["is_last_package"] = true + } else { + result["is_last_package"] = false + } + } else if messageType == SERVER_ERROR_RESPONSE { + // 解析错误响应 + if len(payload) < 8 { + return nil, fmt.Errorf("payload 长度不足以包含错误代码和大小信息") + } + errorCode := binary.BigEndian.Uint32(payload[:4]) + errorMsgSize := binary.BigEndian.Uint32(payload[4:8]) + if len(payload) < int(8+errorMsgSize) { + return nil, fmt.Errorf("payload 长度不足以包含完整的错误消息") + } + errorMsg := payload[8 : 8+errorMsgSize] + + if compressionType == GZIP_COMPRESSION { + // 本例中不使用压缩,保留此代码以备后用 + } + + result["error_code"] = errorCode + result["error_msg"] = string(errorMsg) + } else { + logger.Warn("收到未知的消息类型") + } + + return result, nil +} + +func sliceData(data []byte, chunkSize int) [][]byte { + var chunks [][]byte + dataLen := len(data) + for i := 0; i < dataLen; i += chunkSize { + end := i + chunkSize + if end > dataLen { + end = dataLen + } + chunks = append(chunks, data[i:end]) + } + return chunks +} diff --git a/internal/tts/assemblyai/assemblyai.go b/internal/tts/assemblyai/assemblyai.go deleted file mode 100644 index 5e4b796..0000000 --- a/internal/tts/assemblyai/assemblyai.go +++ /dev/null @@ -1 +0,0 @@ -package assemblyai diff --git a/internal/tts/aws/aws.go b/internal/tts/aws/aws.go new file mode 100644 index 0000000..afe7628 --- /dev/null +++ b/internal/tts/aws/aws.go @@ -0,0 +1,2 @@ +// internal/tts/aws/aws.go +package aws diff --git a/internal/tts/tts.go b/internal/tts/tts.go index dc8e87c..5b7f0ab 100644 --- a/internal/tts/tts.go +++ b/internal/tts/tts.go @@ -1,9 +1,12 @@ +// internal/tts/tts.go + package tts import ( "github.com/telepace/voiceflow/internal/tts/azure" 
"github.com/telepace/voiceflow/internal/tts/google" "github.com/telepace/voiceflow/internal/tts/local" + "github.com/telepace/voiceflow/internal/tts/volcengine" "github.com/telepace/voiceflow/pkg/logger" ) @@ -20,6 +23,8 @@ func NewService(provider string) Service { return azure.NewAzureTTS() // 调用 Azure TTS 实现 case "google": return google.NewGoogleTTS() // 调用 Google TTS 实现 + case "volcengine": + return volcengine.NewVolcengineTTS() case "local": return local.NewLocalTTS() // 调用本地 TTS 实现 default: diff --git a/internal/tts/volcengine/volcengine.go b/internal/tts/volcengine/volcengine.go new file mode 100644 index 0000000..625db42 --- /dev/null +++ b/internal/tts/volcengine/volcengine.go @@ -0,0 +1,240 @@ +package volcengine + +import ( + "bytes" + "compress/gzip" + "encoding/binary" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "time" + + "github.com/gorilla/websocket" + "github.com/telepace/voiceflow/pkg/config" + "github.com/telepace/voiceflow/pkg/logger" +) + +type VolcengineTTS struct { + wsURL string + appID string + token string + cluster string + voiceType string + encoding string + speedRatio float64 + volume float64 + pitch float64 +} + +func NewVolcengineTTS() *VolcengineTTS { + cfg, err := config.GetConfig() + if err != nil { + logger.Fatalf("配置初始化失败: %v", err) + } + + ttsCfg := cfg.Volcengine.TTS + return &VolcengineTTS{ + wsURL: ttsCfg.WsURL, + appID: ttsCfg.AppID, + token: ttsCfg.Token, + cluster: ttsCfg.Cluster, + voiceType: ttsCfg.VoiceType, + encoding: ttsCfg.Encoding, + speedRatio: ttsCfg.SpeedRatio, + volume: ttsCfg.VolumeRatio, + pitch: ttsCfg.PitchRatio, + } +} + +func (v *VolcengineTTS) Synthesize(text string) ([]byte, error) { + // 构建 WebSocket URL + u, err := url.Parse(v.wsURL) + if err != nil { + return nil, fmt.Errorf("invalid WebSocket URL: %v", err) + } + + // 设置请求头 + header := http.Header{ + "Authorization": []string{fmt.Sprintf("Bearer;%s", v.token)}, + } + + // 建立 WebSocket 连接 + conn, _, err := 
websocket.DefaultDialer.Dial(u.String(), header) + if err != nil { + return nil, fmt.Errorf("WebSocket连接失败: %v", err) + } + defer conn.Close() + + // 修改请求参数 + params := map[string]map[string]interface{}{ + "app": { + "appid": v.appID, + "token": v.token, + "cluster": v.cluster, + }, + "user": { + "uid": fmt.Sprintf("user_%d", time.Now().UnixNano()), + }, + "audio": { + "voice_type": v.voiceType, + "encoding": v.encoding, + "speed_ratio": v.speedRatio, + "volume_ratio": v.volume, + "pitch_ratio": v.pitch, + }, + "request": { + "reqid": generateReqID(), + "text": text, + "text_type": "plain", + "operation": "submit", + }, + } + + // 序列化并压缩请求数据 + jsonData, err := json.Marshal(params) + if err != nil { + return nil, fmt.Errorf("JSON序列化失败: %v", err) + } + + compressedData := gzipCompress(jsonData) + + // 构建二进制消息头 + message := buildMessage(compressedData) + + // 发送请求 + if err := conn.WriteMessage(websocket.BinaryMessage, message); err != nil { + return nil, fmt.Errorf("发送请求失败: %v", err) + } + + // 修改响应处理 + var audioBuffer bytes.Buffer + for { + _, message, err := conn.ReadMessage() + if err != nil { + if websocket.IsCloseError(err, websocket.CloseNormalClosure) { + break + } + return nil, fmt.Errorf("读取响应失败: %v", err) + } + + // 解析响应 + resp, err := parseResponse(message) + if err != nil { + return nil, fmt.Errorf("解析响应失败: %v", err) + } + + // 检查错误 + if resp.Code != 0 { + return nil, fmt.Errorf("服务端错误(code=%d): %s", resp.Code, resp.Message) + } + + // 如果有音频数据,追加到 buffer + if len(resp.Audio) > 0 { + audioBuffer.Write(resp.Audio) + } + + // 如果是最后一包数据,退出循环 + if resp.IsLast { + break + } + } + + return audioBuffer.Bytes(), nil +} + +// 工具函数 +func generateReqID() string { + return fmt.Sprintf("req_%d", time.Now().UnixNano()) +} + +func gzipCompress(input []byte) []byte { + var b bytes.Buffer + w := gzip.NewWriter(&b) + w.Write(input) + w.Close() + return b.Bytes() +} + +func buildMessage(payload []byte) []byte { + header := []byte{0x11, 0x10, 0x11, 0x00} // 默认消息头 + payloadSize 
:= make([]byte, 4) + binary.BigEndian.PutUint32(payloadSize, uint32(len(payload))) + + message := make([]byte, 0, len(header)+len(payloadSize)+len(payload)) + message = append(message, header...) + message = append(message, payloadSize...) + message = append(message, payload...) + + return message +} + +// 添加响应结构 +type Response struct { + Code int `json:"code"` + Message string `json:"message"` + Audio []byte + IsLast bool +} + +func parseResponse(res []byte) (*Response, error) { + if len(res) < 4 { + return nil, fmt.Errorf("响应数据长度不足") + } + + // 解析二进制协议头 + // protoVersion := res[0] >> 4 + headSize := res[0] & 0x0f + messageType := res[1] >> 4 + messageTypeSpecificFlags := res[1] & 0x0f + // serializationMethod := res[2] >> 4 + messageCompression := res[2] & 0x0f + payload := res[headSize*4:] + + resp := &Response{} + + // audio-only server response + if messageType == 0xb { + if messageTypeSpecificFlags == 0 { + return resp, nil + } + + sequenceNumber := int32(binary.BigEndian.Uint32(payload[0:4])) + // payloadSize := int32(binary.BigEndian.Uint32(payload[4:8])) + resp.Audio = append(resp.Audio, payload[8:]...) 
+ + if sequenceNumber < 0 { + resp.IsLast = true + } + return resp, nil + } + + // error response + if messageType == 0xf { + code := int32(binary.BigEndian.Uint32(payload[0:4])) + errMsg := payload[8:] + if messageCompression == 1 { + var err error + errMsg, err = gzipDecompress(errMsg) + if err != nil { + return nil, fmt.Errorf("解压错误消息失败: %v", err) + } + } + resp.Code = int(code) + resp.Message = string(errMsg) + return resp, fmt.Errorf("服务端错误(code=%d): %s", code, errMsg) + } + + return nil, fmt.Errorf("未知的消息类型: %d", messageType) +} + +func gzipDecompress(input []byte) ([]byte, error) { + reader, err := gzip.NewReader(bytes.NewReader(input)) + if err != nil { + return nil, err + } + defer reader.Close() + + return io.ReadAll(reader) +} diff --git a/pkg/config/config.go b/pkg/config/config.go index de51d6f..5d1511b 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -3,20 +3,28 @@ package config import ( "fmt" - "github.com/spf13/viper" "sync" + + "github.com/spf13/viper" ) type VolcengineConfig struct { - AccessKey string `mapstructure:"access_key"` - AppKey string `mapstructure:"app_key"` - WsURL string `mapstructure:"ws_url"` - UID string `yaml:"uid"` - Rate int `yaml:"rate"` - Format string `yaml:"format"` - Bits int `yaml:"bits"` - Channel int `yaml:"channel"` - Codec string `yaml:"codec"` + AccessKey string `mapstructure:"access_key"` + AppKey string `mapstructure:"app_key"` + WsURL string `mapstructure:"ws_url"` + ResourceID string `mapstructure:"resource_id"` + UID string `yaml:"uid"` + Rate int `yaml:"rate"` + Format string `yaml:"format"` + Bits int `yaml:"bits"` + Channel int `yaml:"channel"` + Codec string `yaml:"codec"` +} + +type AWSConfig struct { + AccessKeyID string `mapstructure:"access_key_id"` + SecretAccessKey string `mapstructure:"secret_access_key"` + Region string `yaml:"region"` } type Config struct { @@ -52,8 +60,34 @@ type Config struct { STTKey string `mapstructure:"stt_key"` Region string } - Volcengine VolcengineConfig 
`yaml:"volcengine"` - MinIO struct { + AWS AWSConfig `yaml:"aws"` + Volcengine struct { + STT struct { + WsURL string `mapstructure:"ws_url"` + UID string `mapstructure:"uid"` + Rate int `mapstructure:"rate"` + Format string `mapstructure:"format"` + Bits int `mapstructure:"bits"` + Channel int `mapstructure:"channel"` + Codec string `mapstructure:"codec"` + AccessKey string `mapstructure:"access_key"` + AppKey string `mapstructure:"app_key"` + ResourceID string `mapstructure:"resource_id"` + } `mapstructure:"stt"` + + TTS struct { + WsURL string `mapstructure:"ws_url"` + AppID string `mapstructure:"app_id"` + Token string `mapstructure:"token"` + Cluster string `mapstructure:"cluster"` + VoiceType string `mapstructure:"voice_type"` + Encoding string `mapstructure:"encoding"` + SpeedRatio float64 `mapstructure:"speed_ratio"` + VolumeRatio float64 `mapstructure:"volume_ratio"` + PitchRatio float64 `mapstructure:"pitch_ratio"` + } `mapstructure:"tts"` + } `mapstructure:"volcengine"` + MinIO struct { Enabled bool `mapstructure:"enabled"` BucketName string `mapstructure:"bucket_name"` Endpoint string `mapstructure:"endpoint"` @@ -61,6 +95,7 @@ type Config struct { SecretKey string `mapstructure:"secret_key"` UseSSL bool `mapstructure:"use_ssl"` Secure bool `mapstructure:"secure"` + StoragePath string `mapstructure:"storage_path"` } Logging struct { Level string diff --git a/pkg/sttservice/service.go b/pkg/sttservice/service.go new file mode 100644 index 0000000..14a8179 --- /dev/null +++ b/pkg/sttservice/service.go @@ -0,0 +1,28 @@ +// pkg/sttservice/service.go +package sttservice + +import ( + "context" + "fmt" +) + +type Service interface { + Recognize(audioData []byte) (string, error) + StreamRecognize(ctx context.Context, audioDataChan <-chan []byte, transcriptChan chan<- string) error +} + +// 需要一个全局的 STT 服务实例 +var sttInstance Service + +// 提供一个方法来设置 STT 服务实例 +func SetService(s Service) { + sttInstance = s +} + +// 提供全局可调用的 Recognize 方法 +func Recognize(audioData 
[]byte) (string, error) { + if sttInstance == nil { + return "", fmt.Errorf("STT 服务未初始化") + } + return sttInstance.Recognize(audioData) +} diff --git a/pkg/voiceprocessor/voiceprocessor.go b/pkg/voiceprocessor/voiceprocessor.go new file mode 100644 index 0000000..76f6b89 --- /dev/null +++ b/pkg/voiceprocessor/voiceprocessor.go @@ -0,0 +1,39 @@ +// pkg/voiceprocessor/voiceprocessor.go +package voiceprocessor + +import ( + "fmt" + "github.com/telepace/voiceflow/pkg/sttservice" + "os" +) + +func StartRealtime() error { + // 实现实时语音监听和翻译的逻辑 + fmt.Println("实时语音处理已启动。") + // 例如,使用麦克风输入并处理音频流 + // 这里可以调用 sttservice 中的 StreamRecognize 方法 + return nil +} + +func TranscribeFile(audioFile string) error { + // 检查文件是否存在 + if _, err := os.Stat(audioFile); os.IsNotExist(err) { + return fmt.Errorf("文件不存在:%s", audioFile) + } + + // 读取音频文件数据 + audioData, err := os.ReadFile(audioFile) + if err != nil { + return fmt.Errorf("无法读取音频文件:%v", err) + } + + // 调用 STT 服务进行转录 + transcript, err := sttservice.Recognize(audioData) + if err != nil { + return fmt.Errorf("转录失败:%v", err) + } + + // 输出转录结果 + fmt.Printf("转录结果:\n%s\n", transcript) + return nil +} diff --git a/test/assemblyai/.env.example b/test/assemblyai/.env.example new file mode 100644 index 0000000..1daebcd --- /dev/null +++ b/test/assemblyai/.env.example @@ -0,0 +1 @@ +ASSEMBLYAI_API_KEY='' diff --git a/test/assemblyai/README.md b/test/assemblyai/README.md new file mode 100644 index 0000000..d7b9f5e --- /dev/null +++ b/test/assemblyai/README.md @@ -0,0 +1,22 @@ +# AssemblyAI 测试 + +## 安装依赖 + +```bash +go mod tidy +``` + +## 配置 + +```bash +cp .env.example .env +``` + +并且设置环境变量 + + +## 运行 + +```bash +go run . 
+``` diff --git a/test/assemblyai/go.mod b/test/assemblyai/go.mod new file mode 100644 index 0000000..30f2610 --- /dev/null +++ b/test/assemblyai/go.mod @@ -0,0 +1,15 @@ +module github.com/telepace/voiceflow/test/assemblyai + +go 1.22.5 + +require ( + github.com/AssemblyAI/assemblyai-go-sdk v1.9.0 + github.com/gordonklaus/portaudio v0.0.0-20230709114228-aafa478834f5 + github.com/joho/godotenv v1.5.1 +) + +require ( + github.com/cenkalti/backoff v2.2.1+incompatible // indirect + github.com/coder/websocket v1.8.12 // indirect + github.com/google/go-querystring v1.1.0 // indirect +) diff --git a/test/assemblyai/go.sum b/test/assemblyai/go.sum new file mode 100644 index 0000000..5e4ea93 --- /dev/null +++ b/test/assemblyai/go.sum @@ -0,0 +1,24 @@ +github.com/AssemblyAI/assemblyai-go-sdk v1.9.0 h1:0/bViC5xeTZF7V1paOWlB7ftn+q8S43FBtA+fuVX9lY= +github.com/AssemblyAI/assemblyai-go-sdk v1.9.0/go.mod h1:dwv8jDdg+UKPU9ClZzhQNXIVj3Yw68IaTVRuyKRLigw= +github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4= +github.com/cenkalti/backoff v2.2.1+incompatible/go.mod h1:90ReRw6GdpyfrHakVjL/QHaoyV4aDUVVkXQJJJ3NXXM= +github.com/coder/websocket v1.8.12 h1:5bUXkEPPIbewrnkU8LTCLVaxi4N4J8ahufH2vlo4NAo= +github.com/coder/websocket v1.8.12/go.mod h1:LNVeNrXQZfe5qhS9ALED3uA+l5pPqvwXg3CKoDBB2gs= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= +github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-querystring v1.1.0 h1:AnCroh3fv4ZBgVIf1Iwtovgjaw/GiKJo8M8yD/fhyJ8= +github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17icRSOU623lUBU= +github.com/gordonklaus/portaudio 
v0.0.0-20230709114228-aafa478834f5 h1:5AlozfqaVjGYGhms2OsdUyfdJME76E6rx5MdGpjzZpc= +github.com/gordonklaus/portaudio v0.0.0-20230709114228-aafa478834f5/go.mod h1:WY8R6YKlI2ZI3UyzFk7P6yGSuS+hFwNtEzrexRyD7Es= +github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= +github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/test/assemblyai/main.go b/test/assemblyai/main.go index 19288e5..0e16a8b 100644 --- a/test/assemblyai/main.go +++ b/test/assemblyai/main.go @@ -65,7 +65,7 @@ func main() { slog.Info("connected to real-time API", "sample_rate", sampleRate, "frames_per_buffer", framesPerBuffer) - rec, err := newRecorder(sampleRate, framesPerBuffer) + rec, err := NewRecorder(sampleRate, framesPerBuffer) checkErr(err) err = rec.Start() diff --git a/test/assemblyai/recorder.go b/test/assemblyai/recorder.go index 5a31bcd..341f017 100644 --- a/test/assemblyai/recorder.go +++ b/test/assemblyai/recorder.go @@ -12,7 +12,7 @@ type recorder struct { buffer []int16 } -func newRecorder(sampleRate int, framesPerBuffer int) (*recorder, error) { +func NewRecorder(sampleRate int, framesPerBuffer int) (*recorder, error) { buffer := make([]int16, framesPerBuffer) stream, err := portaudio.OpenDefaultStream(1, 0, float64(sampleRate), framesPerBuffer, buffer) diff --git a/test/aws/main.go b/test/aws/main.go new file 
mode 100644 index 0000000..86283d0 --- /dev/null +++ b/test/aws/main.go @@ -0,0 +1,165 @@ +package main + +import ( + "context" + "fmt" + "log" + "os" + "os/signal" + "sync" + "syscall" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/session" + transcribe "github.com/aws/aws-sdk-go/service/transcribestreamingservice" + "github.com/gordonklaus/portaudio" +) + +func main() { + // 创建 AWS 会话 + sess, err := session.NewSession(&aws.Config{ + Region: aws.String("us-east-2"), // 根据您的实际区域修改 + }) + if err != nil { + log.Fatal("无法创建 AWS 会话:", err) + } + + // 创建 AWS Transcribe Streaming 客户端 + client := transcribe.New(sess) + + // 初始化 PortAudio + err = portaudio.Initialize() + if err != nil { + log.Fatal("无法初始化 PortAudio:", err) + } + defer portaudio.Terminate() + + // 音频流参数 + const sampleRate = 16000 + const channels = 1 + const framesPerBuffer = 512 // 设置较小的缓冲区 + + // 创建音频数据通道,带缓冲区防止阻塞 + audioChan := make(chan []int16, 100) + + // 创建 PortAudio 输入流,使用回调函数 + stream, err := portaudio.OpenDefaultStream(channels, 0, sampleRate, framesPerBuffer, func(in []int16) { + // 复制输入数据 + data := make([]int16, len(in)) + copy(data, in) + // 将数据发送到通道,如果通道已满则丢弃数据以防止阻塞 + select { + case audioChan <- data: + default: + // 通道已满,丢弃数据 + } + }) + if err != nil { + log.Fatal("无法打开音频流:", err) + } + defer stream.Close() + + // 启动音频流 + err = stream.Start() + if err != nil { + log.Fatal("无法启动音频流:", err) + } + defer stream.Stop() + + fmt.Println("请开始说话... 
按下 Ctrl+C 结束") + + // 设置 AWS Transcribe Streaming 输入参数 + input := &transcribe.StartStreamTranscriptionInput{ + LanguageCode: aws.String("zh-CN"), + MediaEncoding: aws.String("pcm"), + MediaSampleRateHertz: aws.Int64(sampleRate), + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // 开始转录流 + output, err := client.StartStreamTranscriptionWithContext(ctx, input) + if err != nil { + log.Fatal("无法开始转录流:", err) + } + + eventStream := output.GetStream() + + // 处理系统信号,支持优雅退出 + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + + // 创建 WaitGroup 等待 Goroutine 完成 + var wg sync.WaitGroup + wg.Add(2) + + // Goroutine:从通道读取音频数据并发送到 AWS Transcribe + go func() { + defer wg.Done() + for { + select { + case data := <-audioChan: + // 将 []int16 转换为 []byte + audioBytes := int16SliceToByteSlice(data) + // 发送音频数据到 AWS Transcribe + err := eventStream.Send(ctx, &transcribe.AudioEvent{ + AudioChunk: audioBytes, + }) + if err != nil { + log.Println("发送音频事件失败:", err) + cancel() + return + } + case <-ctx.Done(): + return + } + } + }() + + // Goroutine:接收并处理转录结果 + go func() { + defer wg.Done() + for event := range eventStream.Events() { + switch e := event.(type) { + case *transcribe.TranscriptEvent: + results := e.Transcript.Results + for _, result := range results { + if !aws.BoolValue(result.IsPartial) { + for _, alt := range result.Alternatives { + fmt.Println("转录结果:", aws.StringValue(alt.Transcript)) + } + } + } + default: + // 处理其他事件 + } + } + if err := eventStream.Err(); err != nil { + log.Println("事件流出错:", err) + cancel() + } + }() + + // 等待退出信号 + <-sigChan + fmt.Println("录音结束") + + // 取消上下文,停止 Goroutine + cancel() + + // 等待 Goroutine 完成 + wg.Wait() + + // 关闭事件流 + eventStream.Close() +} + +func int16SliceToByteSlice(data []int16) []byte { + buf := make([]byte, len(data)*2) + for i, v := range data { + buf[i*2] = byte(v) + buf[i*2+1] = byte(v >> 8) + } + return buf +} diff --git a/test/azure/main.go 
b/test/azure/main.go new file mode 100644 index 0000000..d93f83c --- /dev/null +++ b/test/azure/main.go @@ -0,0 +1,102 @@ +package main + +import ( + "bufio" + "fmt" + "os" + "strings" + "time" + + "github.com/Microsoft/cognitive-services-speech-sdk-go/audio" + "github.com/Microsoft/cognitive-services-speech-sdk-go/common" + "github.com/Microsoft/cognitive-services-speech-sdk-go/speech" +) + +func synthesizeStartedHandler(event speech.SpeechSynthesisEventArgs) { + defer event.Close() + fmt.Println("Synthesis started.") +} + +func synthesizingHandler(event speech.SpeechSynthesisEventArgs) { + defer event.Close() + fmt.Printf("Synthesizing, audio chunk size %d.\n", len(event.Result.AudioData)) +} + +func synthesizedHandler(event speech.SpeechSynthesisEventArgs) { + defer event.Close() + fmt.Printf("Synthesized, audio length %d.\n", len(event.Result.AudioData)) +} + +func cancelledHandler(event speech.SpeechSynthesisEventArgs) { + defer event.Close() + fmt.Println("Received a cancellation.") +} + +func main() { + // This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION" + speechKey := os.Getenv("SPEECH_KEY") + speechRegion := os.Getenv("SPEECH_REGION") + + audioConfig, err := audio.NewAudioConfigFromDefaultSpeakerOutput() + if err != nil { + fmt.Println("Got an error: ", err) + return + } + defer audioConfig.Close() + speechConfig, err := speech.NewSpeechConfigFromSubscription(speechKey, speechRegion) + if err != nil { + fmt.Println("Got an error: ", err) + return + } + defer speechConfig.Close() + + speechConfig.SetSpeechSynthesisVoiceName("en-US-AvaMultilingualNeural") + + speechSynthesizer, err := speech.NewSpeechSynthesizerFromConfig(speechConfig, audioConfig) + if err != nil { + fmt.Println("Got an error: ", err) + return + } + defer speechSynthesizer.Close() + + speechSynthesizer.SynthesisStarted(synthesizeStartedHandler) + speechSynthesizer.Synthesizing(synthesizingHandler) + speechSynthesizer.SynthesisCompleted(synthesizedHandler) + 
speechSynthesizer.SynthesisCanceled(cancelledHandler) + + for { + fmt.Printf("Enter some text that you want to speak, or enter empty text to exit.\n> ") + text, _ := bufio.NewReader(os.Stdin).ReadString('\n') + text = strings.TrimSuffix(text, "\n") + if len(text) == 0 { + break + } + + task := speechSynthesizer.SpeakTextAsync(text) + var outcome speech.SpeechSynthesisOutcome + select { + case outcome = <-task: + case <-time.After(60 * time.Second): + fmt.Println("Timed out") + return + } + defer outcome.Close() + if outcome.Error != nil { + fmt.Println("Got an error: ", outcome.Error) + return + } + + if outcome.Result.Reason == common.SynthesizingAudioCompleted { + fmt.Printf("Speech synthesized to speaker for text [%s].\n", text) + } else { + cancellation, _ := speech.NewCancellationDetailsFromSpeechSynthesisResult(outcome.Result) + fmt.Printf("CANCELED: Reason=%d.\n", cancellation.Reason) + + if cancellation.Reason == common.Error { + fmt.Printf("CANCELED: ErrorCode=%d\nCANCELED: ErrorDetails=[%s]\nCANCELED: Did you set the speech resource key and region values?\n", + cancellation.ErrorCode, + cancellation.ErrorDetails) + } + } + } +} diff --git a/test/test.mp3 b/test/test.mp3 new file mode 100644 index 0000000..bee2fd4 Binary files /dev/null and b/test/test.mp3 differ diff --git a/test/volcengine/tts_websocket_demo.go b/test/volcengine/tts_websocket_demo.go new file mode 100644 index 0000000..7c46f3c --- /dev/null +++ b/test/volcengine/tts_websocket_demo.go @@ -0,0 +1,279 @@ +package main + +import ( + "bytes" + "compress/gzip" + "encoding/binary" + "encoding/json" + "errors" + "fmt" + "io/ioutil" + "net/http" + "net/url" + "os" + + "github.com/gorilla/websocket" + uuid "github.com/satori/go.uuid" +) + +var ( + enumMessageType = map[byte]string{ + 11: "audio-only server response", + 12: "frontend server response", + 15: "error message from server", + } + enumMessageTypeSpecificFlags = map[byte]string{ + 0: "no sequence number", + 1: "sequence number > 0", + 2: 
"last message from server (seq < 0)", + 3: "sequence number < 0", + } + enumMessageSerializationMethods = map[byte]string{ + 0: "no serialization", + 1: "JSON", + 15: "custom type", + } + enumMessageCompression = map[byte]string{ + 0: "no compression", + 1: "gzip", + 15: "custom compression method", + } +) + +const ( + optQuery string = "query" + optSubmit string = "submit" +) + +var addr = "openspeech.bytedance.com" +var u = url.URL{Scheme: "wss", Host: addr, Path: "/api/v1/tts/ws_binary"} + +var appid = os.Getenv("VOICEFLOW_VOLCENGINE_TTS_APP_ID") +var token = os.Getenv("VOICEFLOW_VOLCENGINE_TTS_TOKEN") +var header = http.Header{"Authorization": []string{fmt.Sprintf("Bearer;%s", token)}} + +type synResp struct { + Audio []byte + IsLast bool +} + +// version: b0001 (4 bits) +// header size: b0001 (4 bits) +// message type: b0001 (Full client request) (4bits) +// message type specific flags: b0000 (none) (4bits) +// message serialization method: b0001 (JSON) (4 bits) +// message compression: b0001 (gzip) (4bits) +// reserved data: 0x00 (1 byte) +var defaultHeader = []byte{0x11, 0x10, 0x11, 0x00} + +func setupInput(text, voiceType, opt string) []byte { + reqID := uuid.Must(uuid.NewV4(), nil).String() + params := make(map[string]map[string]interface{}) + params["app"] = make(map[string]interface{}) + // 平台上查看具体appid + params["app"]["appid"] = appid + params["app"]["token"] = "N81BXUVgn7fJ2ySnNlHKEd2WLMVKmM" // 根据 API 文档设置正确的值 + // 平台上查看具体集群名称 + params["app"]["cluster"] = "volcano_tts" + params["user"] = make(map[string]interface{}) + params["user"]["uid"] = "uid" + params["audio"] = make(map[string]interface{}) + params["audio"]["voice_type"] = voiceType + params["audio"]["encoding"] = "mp3" + params["audio"]["speed_ratio"] = 1.0 + params["audio"]["volume_ratio"] = 1.0 + params["audio"]["pitch_ratio"] = 1.0 + params["request"] = make(map[string]interface{}) + params["request"]["reqid"] = reqID + params["request"]["text"] = text + params["request"]["text_type"] = 
"plain" + params["request"]["operation"] = opt + fmt.Println("Request parameters:", params) + resStr, _ := json.Marshal(params) + return resStr +} + +func gzipCompress(input []byte) []byte { + var b bytes.Buffer + w := gzip.NewWriter(&b) + w.Write(input) + w.Close() + return b.Bytes() +} + +func gzipDecompress(input []byte) []byte { + b := bytes.NewBuffer(input) + r, _ := gzip.NewReader(b) + out, _ := ioutil.ReadAll(r) + r.Close() + return out +} + +func parseResponse(res []byte) (resp synResp, err error) { + protoVersion := res[0] >> 4 + headSize := res[0] & 0x0f + messageType := res[1] >> 4 + messageTypeSpecificFlags := res[1] & 0x0f + serializationMethod := res[2] >> 4 + messageCompression := res[2] & 0x0f + reserve := res[3] + headerExtensions := res[4 : headSize*4] + payload := res[headSize*4:] + + fmt.Printf(" Protocol version: %x - version %d\n", + protoVersion, protoVersion) + fmt.Printf(" Header size: %x - %d bytes\n", + headSize, headSize*4) + fmt.Printf(" Message type: %x - %s\n", messageType, + enumMessageType[messageType]) + fmt.Printf(" Message type specific flags: %x - %s\n", messageTypeSpecificFlags, + enumMessageTypeSpecificFlags[messageTypeSpecificFlags]) + fmt.Printf("Message serialization method: %x - %s\n", + serializationMethod, enumMessageSerializationMethods[serializationMethod]) + fmt.Printf(" Message compression: %x - %s\n", + messageCompression, enumMessageCompression[messageCompression]) + fmt.Printf(" Reserved: %d\n", reserve) + if headSize != 1 { + fmt.Printf(" Header extensions: %s\n", + headerExtensions) + } + // audio-only server response + if messageType == 0xb { + // no sequence number as ACK + if messageTypeSpecificFlags == 0 { + fmt.Println(" Payload size: 0") + } else { + sequenceNumber := int32(binary.BigEndian.Uint32(payload[0:4])) + payloadSize := int32(binary.BigEndian.Uint32(payload[4:8])) + payload = payload[8:] + resp.Audio = append(resp.Audio, payload...) 
+ fmt.Printf(" Sequence number: %d\n", + sequenceNumber) + fmt.Printf(" Payload size: %d\n", payloadSize) + if sequenceNumber < 0 { + resp.IsLast = true + } + } + } else if messageType == 0xf { + code := int32(binary.BigEndian.Uint32(payload[0:4])) + errMsg := payload[8:] + if messageCompression == 1 { + errMsg = gzipDecompress(errMsg) + } + fmt.Printf(" Error code: %d\n", code) + fmt.Printf(" Error msg: %s\n", string(errMsg)) + err = errors.New(string(errMsg)) + return + } else if messageType == 0xc { + // msgSize = int32(binary.BigEndian.Uint32(payload[0:4])) + payload = payload[4:] + if messageCompression == 1 { + payload = gzipDecompress(payload) + } + fmt.Printf(" Frontend message: %s\n", string(payload)) + } else { + fmt.Printf(" wrong message type:%d\n", messageType) + err = errors.New("wrong message type") + return + } + return +} + +// 一次性合成 +func nonStreamSynth(text, voiceType, outFile string) { + input := setupInput(text, voiceType, optQuery) + fmt.Println(string(input)) + input = gzipCompress(input) + payloadSize := len(input) + payloadArr := make([]byte, 4) + binary.BigEndian.PutUint32(payloadArr, uint32(payloadSize)) + clientRequest := make([]byte, len(defaultHeader)) + copy(clientRequest, defaultHeader) + clientRequest = append(clientRequest, payloadArr...) + clientRequest = append(clientRequest, input...) 
+ c, _, err := websocket.DefaultDialer.Dial(u.String(), header) + if err != nil { + fmt.Println("dial err:", err) + return + } + defer c.Close() + err = c.WriteMessage(websocket.BinaryMessage, clientRequest) + if err != nil { + fmt.Println("write message fail, err:", err.Error()) + return + } + _, message, err := c.ReadMessage() + if err != nil { + fmt.Println("read message fail, err:", err.Error()) + return + } + resp, err := parseResponse(message) + if err != nil { + fmt.Println("parse response fail, err:", err.Error()) + return + } + err = ioutil.WriteFile(outFile, resp.Audio, 0644) + if err != nil { + fmt.Println("write audio to fail fail, err:", err.Error()) + return + } +} + +// 流式合成 +func streamSynth(text, voiceType, outFile string) { + input := setupInput(text, voiceType, optSubmit) + fmt.Println(string(input)) + input = gzipCompress(input) + payloadSize := len(input) + payloadArr := make([]byte, 4) + binary.BigEndian.PutUint32(payloadArr, uint32(payloadSize)) + clientRequest := make([]byte, len(defaultHeader)) + copy(clientRequest, defaultHeader) + clientRequest = append(clientRequest, payloadArr...) + clientRequest = append(clientRequest, input...) + c, _, err := websocket.DefaultDialer.Dial(u.String(), header) + if err != nil { + fmt.Println("dial err:", err) + return + } + defer c.Close() + err = c.WriteMessage(websocket.BinaryMessage, clientRequest) + if err != nil { + fmt.Println("write message fail, err:", err.Error()) + return + } + var audio []byte + for { + var message []byte + _, message, err := c.ReadMessage() + if err != nil { + fmt.Println("read message fail, err:", err.Error()) + break + } + resp, err := parseResponse(message) + if err != nil { + fmt.Println("parse response fail, err:", err.Error()) + break + } + audio = append(audio, resp.Audio...) 
+ if resp.IsLast { + break + } + } + if err != nil { + fmt.Println("stream synthesis fail, err:", err.Error()) + return + } + err = ioutil.WriteFile(outFile, audio, 0644) + if err != nil { + fmt.Println("write audio to fail fail, err:", err.Error()) + return + } +} + +func main() { + fmt.Println("appid:", appid) + fmt.Println("token:", token) + // 此处替换成需要调用的音色 + streamSynth("我想测试下语音合成的效果", "zh_male_beijingxiaoye_moon_bigtts", "test.mp3") +} diff --git a/user.md b/user.md new file mode 100644 index 0000000..794caf2 --- /dev/null +++ b/user.md @@ -0,0 +1,203 @@ +### 代码分析 + +### API 和 WebSocket 接口文档 + +#### 1. HTTP API 接口 + +##### 1.1 配置更新接口 + +- **URL**:`/config` +- **方法**:`POST` +- **描述**:用于更新服务器的服务配置,包括指定使用的服务类型和提供商。 +- **请求头**: + - `Content-Type: application/json` +- **请求体**: + + ```json + { + "service": "string", // 服务名称,例如 "STT", "TTS", "LLM" + "provider": "string" // 提供商名称,例如 "Google", "AWS", "Azure" + } + ``` + +- **成功响应**: + - **状态码**:`200 OK` + - **响应体**: + + ``` + Configuration updated + ``` + +- **错误响应**: + - **状态码**:`400 Bad Request` + - **响应体**: + + ``` + Invalid request body + ``` + + +#### 2. WebSocket 接口 + +##### 2.1 建立连接 + +- **URL**:`ws://<服务器地址>/ws` +- **协议**:WebSocket +- **描述**:客户端通过WebSocket与服务器建立连接,以进行实时的双向通信,包括文本处理和音频数据传输。 + +##### 2.2 消息类型 + +WebSocket连接支持两种类型的消息: + +1. **文本消息**: + - **格式**:JSON对象 + - **用途**:发送需要处理的文本,服务器会返回处理结果和相关音频URL。 + - **示例消息**: + + ```json + { + "text": "你好,今天的天气怎么样?" + } + ``` + + - **服务器响应**: + - **格式**:JSON对象,包含处理后的文本和音频文件的URL。 + - **示例响应**: + + ```json + { + "text": "今天天气晴朗,气温适中。", + "audio_url": "http://example.com/audio/12345.mp3" + } + ``` + +2. 
**二进制消息**: + - **格式**:二进制音频数据 + - **用途**:发送音频流,服务器将进行语音转文字(STT),并返回转录结果。 + - **服务器响应**: + - **格式**:JSON对象,包含转录的文本或结束事件。 + - **示例响应**(转录中): + + ```json + { + "event": "result", + "result": { + "Text": "这是转录的内容。" + }, + "code": 0, + "message": "这是转录的内容。" + } + ``` + + - **示例响应**(结束): + + ```json + { + "event": "end", + "code": 0, + "message": "" + } + ``` + +##### 2.3 消息流程 + +1. **文本处理流程**: + - 客户端发送包含`text`字段的JSON消息。 + - 服务器接收后,调用LLM服务生成响应文本。 + - 使用TTS服务合成音频,并将音频存储后返回音频URL。 + - 服务器通过WebSocket发送包含响应文本和音频URL的JSON消息给客户端。 + +2. **音频转录流程**: + - 客户端发送二进制音频数据。 + - 服务器接收音频数据并传递给STT服务进行转录。 + - 服务器通过WebSocket发送转录结果的JSON消息给客户端。 + - 当音频数据传输完成,服务器发送结束事件的JSON消息。 + + +#### 3. 示例 + +##### 3.1 使用WebSocket进行文本交互 + +**客户端发送**: + +```json +{ + "text": "请告诉我一个笑话。" +} +``` + +**服务器响应**: + +```json +{ + "text": "当然,为什么程序员喜欢在夜晚工作?因为晚上调试错误更容易!", + "audio_url": "http://example.com/audio/67890.mp3" +} +``` + +##### 3.2 使用WebSocket进行音频转录 + +**客户端发送**:二进制音频数据(例如录制的语音) + +**服务器响应**: + +```json +{ + "event": "result", + "code": 0, + "result": [ + { + "definite": true, + "end_time": 860, + "start_time": 0, + "text": "这是", + "words": [ + { + "blank_duration": 0, + "end_time": 1020, + "start_time": 860, + "text": "这" + }, + { + "blank_duration": 0, + "end_time": 1180, + "start_time": 1020, + "text": "是" + } + ], + "word_size": 2 + }, + { + "definite": true, + "end_time": 1705, + "start_time": 0, + "text": "这是字节跳动,", + "words": [ + { + "blank_duration": 0, + "end_time": 860, + "start_time": 740, + "text": "这" + }, + { + "blank_duration": 0, + "end_time": 1020, + "start_time": 860, + "text": "是" + } + ], + "word_size": 2 + } + ] +} +``` + +**当音频传输结束**: + +```json +{ + "event": "end", + "code": 0, + "message": "" +} +```