直接生成opus格式数据

jiangtao · jiangtao · commit b75b860cc89c · 2025-02-26T21:50:32.000+08:00
diff --git a/core/connection.py b/core/connection.py
@@ -77,7 +77,6 @@ def __init__(self, config: Dict[str, Any], _vad, _asr, _llm, _tts):
         for cmd in self.cmd_exit:
             if len(cmd) > self.max_cmd_length:
                 self.max_cmd_length = len(cmd)
-        self.lock = threading.Lock()
         
         self.private_config = None
         self.auth_code_gen = AuthCodeGenerator.get_instance()
@@ -299,7 +298,6 @@ def _priority_thread(self):
                     self.logger.bind(tag=TAG).error(f"TTS 任务出错: {e}")
                     continue
                 if not self.client_abort:
-                    with self.lock:
                         #sleep_time = max(duration, 1 if len(text) <= 3 else len(text) * 0.2)
                         # 使用实例锁来确保顺序传输
                         self.logger.bind(tag=TAG).info(f"发送TTS语音: {text}, 时长:{duration}, sleep_time:{duration}")
diff --git a/core/providers/tts/base.py b/core/providers/tts/base.py
@@ -42,13 +42,52 @@ def to_tts(self, text):
     async def text_to_speak(self, text, output_file):
         pass
 
-    def wav_to_opus_data(self, wav_file_path):
-        # 使用pydub加载PCM文件
-        # 获取文件后缀名
-        file_type = os.path.splitext(wav_file_path)[1]
+    def get_opus_data(self, file_path):
+        """Read the file at file_path and return (list of frame byte strings, duration); returns ([], 0) on any error."""
+        try:
+            # Load the whole file into memory
+            with open(file_path, 'rb') as f:
+                opus_data = f.read()
+            
+            # Duration is supplied by the get_audio_duration hook (original note: taken from the API response)
+            duration = self.get_audio_duration(file_path)
+            
+            # NOTE(review): the loop below assumes [2-byte little-endian length][frame] records - confirm the file is not an Ogg container
+            frame_size = 960  # NOTE(review): never read below; the original comment called this the standard opus frame size
+            opus_datas = []
+            
+            # Walk the buffer one length-prefixed record at a time
+            current_pos = 0
+            while current_pos < len(opus_data):
+                # Frame length is stored in the next 2 bytes (little-endian)
+                frame_length = int.from_bytes(opus_data[current_pos:current_pos + 2], 'little')
+                current_pos += 2
+                
+                # Slice out the frame payload; a truncated tail yields a short (possibly empty) frame
+                frame_data = opus_data[current_pos:current_pos + frame_length]
+                opus_datas.append(frame_data)
+                current_pos += frame_length
+            
+            return opus_datas, duration
+            
+        except Exception as e:
+            logger.bind(tag=TAG).error(f"处理opus文件失败: {e}")
+            return [], 0
+    
+    @abstractmethod
+    def get_audio_duration(self, file_path):
+        """Abstract hook: return the audio duration for file_path; concrete provider classes supply the implementation."""
+        pass
+
+    def wav_to_opus_data(self, file_path):
+        """保持原有接口兼容"""
+        if file_path.endswith('.opus'):
+            return self.get_opus_data(file_path)
+
+        file_type = os.path.splitext(file_path)[1]
         if file_type:
             file_type = file_type.lstrip('.')
-        audio = AudioSegment.from_file(wav_file_path, format=file_type)
+        audio = AudioSegment.from_file(file_path, format=file_type)
 
         duration = len(audio) / 1000.0
 
diff --git a/core/providers/tts/doubao.py b/core/providers/tts/doubao.py
@@ -6,7 +6,7 @@
 from datetime import datetime
 from core.providers.tts.base import TTSProviderBase
 
-
+TAG = __name__
 class TTSProvider(TTSProviderBase):
     def __init__(self, config, delete_audio_file):
         super().__init__(config, delete_audio_file)
@@ -19,7 +19,7 @@ def __init__(self, config, delete_audio_file):
         self.api_url = f"https://{self.host}/api/v1/tts"
         self.header = {"Authorization": f"Bearer;{self.access_token}"}
 
-    def generate_filename(self, extension=".wav"):
+    def generate_filename(self, extension=".opus"):
         return os.path.join(self.output_file, f"tts-{datetime.now().date()}@{uuid.uuid4().hex}{extension}")
 
     async def text_to_speak(self, text, output_file):
@@ -34,7 +34,7 @@ async def text_to_speak(self, text, output_file):
             },
             "audio": {
                 "voice_type": self.voice,
-                "encoding": "wav",
+                "encoding": "ogg_opus",
                 "speed_ratio": 1.0,
                 "volume_ratio": 1.0,
                 "pitch_ratio": 1.0,
@@ -52,6 +52,27 @@ async def text_to_speak(self, text, output_file):
 
         resp = requests.post(self.api_url, json.dumps(request_json), headers=self.header)
         if "data" in resp.json():
+            duration = resp.json()["addition"]["duration"]
             data = resp.json()["data"]
-            file_to_save = open(output_file, "wb")
-            file_to_save.write(base64.b64decode(data))
+            
+            # 保存音频数据
+            with open(output_file, "wb") as f:
+                f.write(base64.b64decode(data))
+            
+            # 保存duration信息到同名的.duration文件
+            duration_file = output_file + '.duration'
+            with open(duration_file, "w") as f:
+                f.write(str(duration))
+                
+            self.logger.bind(tag=TAG).info(f"音频文件生成成功: {text}")
+
+    def get_audio_duration(self, file_path):
+        """Read the audio duration from the sidecar file file_path + '.duration'; logs and returns 0 on any failure."""
+        try:
+            duration_file = file_path + '.duration'
+            with open(duration_file, "r") as f:
+                duration = float(f.read().strip()) / 1000  # convert to seconds (assumes the stored value is milliseconds - TODO confirm)
+            return duration
+        except Exception as e:
+            self.logger.bind(tag=TAG).error(f"读取音频时长失败: {e}")
+            return 0
diff --git a/tts.py b/tts.py
@@ -0,0 +1,59 @@
+#coding=utf-8
+'''Standalone smoke test for the TTS HTTP API: posts one request, prints the reported duration, saves the audio.
+requires Python 3.6 or later
+pip install requests
+'''
+import base64
+import json
+import os
+import traceback
+import uuid
+import requests
+
+# Fill in the appid, access_token and cluster issued by the platform.
+appid = "3981743413"
+access_token = os.environ.get("VOLC_TTS_ACCESS_TOKEN", "")  # read from the environment; never commit the real token
+cluster = "volcano_tts"
+
+voice_type = "zh_male_yangguangqingnian_moon_bigtts"
+host = "openspeech.bytedance.com"
+api_url = f"https://{host}/api/v1/tts"
+
+header = {"Authorization": f"Bearer;{access_token}"}
+
+request_json = {
+    "app": {
+        "appid": appid,
+        "token": "access_token",  # NOTE(review): literal placeholder string, not the variable - confirm the API ignores it
+        "cluster": cluster
+    },
+    "user": {
+        "uid": "388808087185088"
+    },
+    "audio": {
+        "voice_type": voice_type,
+        "encoding": "pcm",
+        "speed_ratio": 1.0,
+        "volume_ratio": 1.0,
+        "pitch_ratio": 1.0,
+    },
+    "request": {
+        "reqid": str(uuid.uuid4()),
+        "text": "字节跳动语音合成",
+        "text_type": "plain",
+        "operation": "query",
+        "with_frontend": 1,
+        "frontend_type": "unitTson"
+    }
+}
+
+if __name__ == '__main__':
+    try:
+        resp = requests.post(api_url, json.dumps(request_json), headers=header)
+        payload = resp.json()  # parse the response body once instead of on every access
+        print(f"resp body: \n{payload['addition']['duration']}")  # single quotes inside keep this valid before Python 3.12
+        if "data" in payload:
+            with open("test_submit.mp3", "wb") as file_to_save:  # NOTE(review): request asks for "pcm" despite the .mp3 name
+                file_to_save.write(base64.b64decode(payload["data"]))
+    except Exception:
+        traceback.print_exc()  # the original e.with_traceback() raised TypeError (missing required argument)