Skip to content

Commit b75b860

Browse files
author
jiangtao
committed
直接生成opus格式数据
1 parent 96a3555 commit b75b860

File tree

4 files changed

+129
-12
lines changed

4 files changed

+129
-12
lines changed

core/connection.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@ def __init__(self, config: Dict[str, Any], _vad, _asr, _llm, _tts):
7777
for cmd in self.cmd_exit:
7878
if len(cmd) > self.max_cmd_length:
7979
self.max_cmd_length = len(cmd)
80-
self.lock = threading.Lock()
8180

8281
self.private_config = None
8382
self.auth_code_gen = AuthCodeGenerator.get_instance()
@@ -299,7 +298,6 @@ def _priority_thread(self):
299298
self.logger.bind(tag=TAG).error(f"TTS 任务出错: {e}")
300299
continue
301300
if not self.client_abort:
302-
with self.lock:
303301
#sleep_time = max(duration, 1 if len(text) <= 3 else len(text) * 0.2)
304302
# 使用实例锁来确保顺序传输
305303
self.logger.bind(tag=TAG).info(f"发送TTS语音: {text}, 时长:{duration}, sleep_time:{duration}")

core/providers/tts/base.py

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,52 @@ def to_tts(self, text):
4242
async def text_to_speak(self, text, output_file):
    """Synthesize ``text`` into the audio file ``output_file``.

    This base implementation does nothing and returns ``None``; the
    provider subclasses supply the actual synthesis.
    """
    pass
4444

45-
def wav_to_opus_data(self, wav_file_path):
46-
# 使用pydub加载PCM文件
47-
# 获取文件后缀名
48-
file_type = os.path.splitext(wav_file_path)[1]
45+
def get_opus_data(self, file_path):
    """Read pre-encoded Opus audio from ``file_path`` and split it into packets.

    Two on-disk layouts are understood:

    * An Ogg container (the file starts with the ``OggS`` capture pattern),
      which is what an ``ogg_opus`` encoding request produces. Pages are
      walked using their segment tables, packets are reassembled across
      page boundaries, and the ``OpusHead`` / ``OpusTags`` header packets
      are dropped so only audio packets remain.
    * The legacy raw layout: repeated records of a 2-byte little-endian
      length followed by that many bytes of packet data.

    A truncated trailing page or record is discarded instead of being
    returned as a partial packet.

    NOTE(review): packets are returned exactly as encoded by the provider;
    confirm the consumer accepts that frame duration.

    Args:
        file_path: Path of the Opus file to read.

    Returns:
        ``(packets, duration)``: a list of ``bytes`` packets and whatever
        ``self.get_audio_duration(file_path)`` returned. On any error the
        failure is logged and ``([], 0)`` is returned.
    """

    def split_ogg_packets(blob):
        # Walk Ogg pages: 27-byte fixed header, then a segment (lacing)
        # table, then the page body.
        packets = []
        partial = bytearray()
        pos = 0
        total = len(blob)
        while pos + 27 <= total and blob[pos:pos + 4] == b"OggS":
            segment_count = blob[pos + 26]
            body_pos = pos + 27 + segment_count
            lacing = blob[pos + 27:body_pos]
            if len(lacing) < segment_count or body_pos + sum(lacing) > total:
                break  # truncated page: stop rather than emit a partial packet
            for seg_len in lacing:
                partial += blob[body_pos:body_pos + seg_len]
                body_pos += seg_len
                # A lacing value below 255 terminates the current packet;
                # 255 means the packet continues in the next segment/page.
                if seg_len < 255:
                    packets.append(bytes(partial))
                    partial = bytearray()
            pos = body_pos
        # The identification and comment headers are metadata, not audio.
        return [
            pkt for pkt in packets
            if pkt and not pkt.startswith((b"OpusHead", b"OpusTags"))
        ]

    def split_length_prefixed(blob):
        frames = []
        pos = 0
        total = len(blob)
        while pos + 2 <= total:
            frame_length = int.from_bytes(blob[pos:pos + 2], 'little')
            start = pos + 2
            end = start + frame_length
            if end > total:
                break  # truncated record: drop it
            frames.append(blob[start:end])
            pos = end
        return frames

    try:
        with open(file_path, 'rb') as f:
            opus_data = f.read()

        # Duration is supplied by the concrete provider.
        duration = self.get_audio_duration(file_path)

        if opus_data.startswith(b"OggS"):
            opus_datas = split_ogg_packets(opus_data)
        else:
            opus_datas = split_length_prefixed(opus_data)

        return opus_datas, duration

    except Exception as e:
        logger.bind(tag=TAG).error(f"处理opus文件失败: {e}")
        return [], 0
76+
77+
@abstractmethod
def get_audio_duration(self, file_path):
    """Return the duration of the audio stored at ``file_path``.

    Abstract hook: each concrete provider supplies the implementation.
    ``get_opus_data`` passes the returned value through unchanged as the
    clip duration.
    """
    pass
81+
82+
def wav_to_opus_data(self, file_path):
83+
"""保持原有接口兼容"""
84+
if file_path.endswith('.opus'):
85+
return self.get_opus_data(file_path)
86+
87+
file_type = os.path.splitext(file_path)[1]
4988
if file_type:
5089
file_type = file_type.lstrip('.')
51-
audio = AudioSegment.from_file(wav_file_path, format=file_type)
90+
audio = AudioSegment.from_file(file_path, format=file_type)
5291

5392
duration = len(audio) / 1000.0
5493

core/providers/tts/doubao.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from datetime import datetime
77
from core.providers.tts.base import TTSProviderBase
88

9-
9+
TAG = __name__
1010
class TTSProvider(TTSProviderBase):
1111
def __init__(self, config, delete_audio_file):
1212
super().__init__(config, delete_audio_file)
@@ -19,7 +19,7 @@ def __init__(self, config, delete_audio_file):
1919
self.api_url = f"https://{self.host}/api/v1/tts"
2020
self.header = {"Authorization": f"Bearer;{self.access_token}"}
2121

22-
def generate_filename(self, extension=".wav"):
22+
def generate_filename(self, extension=".opus"):
    """Build a unique output path inside ``self.output_file``.

    The file name has the form ``tts-<today's date>@<uuid4 hex><extension>``.

    Args:
        extension: File suffix including the leading dot.

    Returns:
        The joined path as a string.
    """
    return os.path.join(self.output_file, f"tts-{datetime.now().date()}@{uuid.uuid4().hex}{extension}")
2424

2525
async def text_to_speak(self, text, output_file):
@@ -34,7 +34,7 @@ async def text_to_speak(self, text, output_file):
3434
},
3535
"audio": {
3636
"voice_type": self.voice,
37-
"encoding": "wav",
37+
"encoding": "ogg_opus",
3838
"speed_ratio": 1.0,
3939
"volume_ratio": 1.0,
4040
"pitch_ratio": 1.0,
@@ -52,6 +52,27 @@ async def text_to_speak(self, text, output_file):
5252

5353
resp = requests.post(self.api_url, json.dumps(request_json), headers=self.header)
5454
if "data" in resp.json():
55+
duration = resp.json()["addition"]["duration"]
5556
data = resp.json()["data"]
56-
file_to_save = open(output_file, "wb")
57-
file_to_save.write(base64.b64decode(data))
57+
58+
# 保存音频数据
59+
with open(output_file, "wb") as f:
60+
f.write(base64.b64decode(data))
61+
62+
# 保存duration信息到同名的.duration文件
63+
duration_file = output_file + '.duration'
64+
with open(duration_file, "w") as f:
65+
f.write(str(duration))
66+
67+
self.logger.bind(tag=TAG).info(f"音频文件生成成功: {text}")
68+
69+
def get_audio_duration(self, file_path):
    """Read the clip duration from the ``<file_path>.duration`` sidecar file.

    The sidecar holds a single number in milliseconds, which is converted
    to seconds here.

    Args:
        file_path: Path of the audio file; ``.duration`` is appended to it
            to locate the sidecar.

    Returns:
        The duration in seconds as a float, or ``0`` when the sidecar
        cannot be read or parsed (the error is logged).
    """
    try:
        duration_file = file_path + '.duration'
        with open(duration_file, "r") as f:
            duration = float(f.read().strip()) / 1000  # milliseconds -> seconds
        return duration
    except Exception as e:
        self.logger.bind(tag=TAG).error(f"读取音频时长失败: {e}")
        return 0

tts.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#coding=utf-8

'''
requires Python 3.6 or later
pip install requests
'''
import base64
import json
import os
import traceback
import uuid

import requests

# Fill in the appid, access_token and cluster issued by the platform.
# Credentials are read from the environment so they never land in version
# control; any value that was committed previously should be rotated.
appid = os.environ.get("VOLC_TTS_APPID", "")
access_token = os.environ.get("VOLC_TTS_ACCESS_TOKEN", "")
cluster = "volcano_tts"

voice_type = "zh_male_yangguangqingnian_moon_bigtts"
host = "openspeech.bytedance.com"
api_url = f"https://{host}/api/v1/tts"

header = {"Authorization": f"Bearer;{access_token}"}

request_json = {
    "app": {
        "appid": appid,
        # NOTE(review): this is the literal string, not the variable;
        # kept as in the original — confirm against the API documentation.
        "token": "access_token",
        "cluster": cluster
    },
    "user": {
        "uid": "388808087185088"
    },
    "audio": {
        "voice_type": voice_type,
        "encoding": "pcm",
        "speed_ratio": 1.0,
        "volume_ratio": 1.0,
        "pitch_ratio": 1.0,
    },
    "request": {
        "reqid": str(uuid.uuid4()),
        "text": "字节跳动语音合成",
        "text_type": "plain",
        "operation": "query",
        "with_frontend": 1,
        "frontend_type": "unitTson"

    }
}

# Name the output after the requested encoding so the extension matches
# the bytes actually written.
output_path = "test_submit." + request_json["audio"]["encoding"]


def main():
    """Send one synthesis request, print the reported duration, save the audio."""
    if not (appid and access_token):
        raise SystemExit(
            "Set VOLC_TTS_APPID and VOLC_TTS_ACCESS_TOKEN before running this script."
        )
    try:
        resp = requests.post(
            api_url, json.dumps(request_json), headers=header, timeout=30
        )
        # Parse the body once; error responses may omit "addition" / "data".
        body = resp.json()
        duration = (body.get("addition") or {}).get("duration")
        print(f"resp body: \n{duration}")
        if "data" in body:
            with open(output_path, "wb") as file_to_save:
                file_to_save.write(base64.b64decode(body["data"]))
        else:
            print(f"no audio data in response: {body}")
    except Exception:
        # Show the full traceback without aborting with a second error.
        traceback.print_exc()


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)