unified_python/tts/音频上传.py

77 lines
2.7 KiB
Python
Raw Normal View History

2025-08-06 17:26:07 +08:00
import base64
import os
import requests
# Base URL of the speech service; the endpoint paths below are appended to it.
host = "https://openspeech.bytedance.com"
def train(appid, token, audio_path, spk_id, batch_size=1):
    """Upload audio files to the voice-clone upload endpoint, in batches.

    Args:
        appid: Application id sent in the request body.
        token: Access token placed in the ``Authorization`` header.
        audio_path: Either a directory, in which case every ``.wav``, ``.mp3``,
            ``.flac`` or ``.m4a`` file directly inside it is uploaded, or a
            path that is treated as a single audio file.
        spk_id: Speaker id sent as ``speaker_id`` in the request body.
        batch_size: Number of files sent per request (default 1).

    Raises:
        ValueError: If ``batch_size`` is less than 1.
        Exception: If the server answers with a non-200 status code.
    """
    # Fail fast: a zero step makes range() raise an opaque error, and a
    # negative one would silently upload nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    url = host + "/api/v1/mega_tts/audio/upload"
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer;" + token,
        "Resource-Id": "volc.megatts.voiceclone",
    }

    # Collect every audio file to upload.
    audio_extensions = ('.wav', '.mp3', '.flac', '.m4a')
    if os.path.isdir(audio_path):
        # os.listdir() returns entries in arbitrary order; sort so that the
        # batch composition is reproducible between runs.
        audio_files = []
        for file_name in sorted(os.listdir(audio_path)):
            file_path = os.path.join(audio_path, file_name)
            if os.path.isfile(file_path) and file_name.lower().endswith(audio_extensions):
                audio_files.append(file_path)
    else:
        audio_files = [audio_path]

    # Report how many files were found.
    print(f"总共找到 {len(audio_files)} 个音频文件")

    # Upload batch by batch.
    batch_starts = range(0, len(audio_files), batch_size)
    for batch_no, start in enumerate(batch_starts, start=1):
        audios = []
        for file_path in audio_files[start:start + batch_size]:
            encoded_data, audio_format = encode_audio_file(file_path)
            audios.append({"audio_bytes": encoded_data, "audio_format": audio_format})

        data = {
            "appid": appid,
            "speaker_id": spk_id,
            "audios": audios,
            "source": 2,
            "language": 0,
            "model_type": 1,
        }
        print(f"正在上传批次 {batch_no},包含 {len(audios)} 个文件...")
        response = requests.post(url, json=data, headers=headers)
        print("status code = ", response.status_code)
        if response.status_code != 200:
            raise Exception("train请求错误:" + response.text)
        print("headers = ", response.headers)
        print(response.json())
        print("-" * 50)  # separator so each batch's result is easy to spot
def get_status(appid, token, spk_id):
    """Query the status endpoint for a speaker and print the response.

    Args:
        appid: Application id sent in the request body.
        token: Access token placed in the ``Authorization`` header.
        spk_id: Speaker id sent as ``speaker_id`` in the request body.

    Returns:
        The decoded JSON body of the response (also printed).

    Raises:
        Exception: If the server answers with a non-200 status code.
    """
    url = host + "/api/v1/mega_tts/status"
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer;" + token,
        "Resource-Id": "volc.megatts.voiceclone",
    }
    body = {"appid": appid, "speaker_id": spk_id}
    response = requests.post(url, headers=headers, json=body)
    # Same error handling as train(): surface the server's message instead of
    # failing later while decoding a non-JSON error body.
    if response.status_code != 200:
        raise Exception("get_status请求错误:" + response.text)
    result = response.json()
    print(result)
    return result
def encode_audio_file(file_path):
    """Read an audio file and return its Base64 text plus its format name.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        A ``(encoded_data, audio_format)`` tuple: the file content as a
        Base64 ``str``, and the file extension without the leading dot,
        lower-cased.
    """
    with open(file_path, 'rb') as audio_file:
        audio_data = audio_file.read()
    encoded_data = base64.b64encode(audio_data).decode("utf-8")
    # The extension doubles as the audio format. train() matches extensions
    # case-insensitively, so normalise here too ("clip.WAV" -> "wav").
    audio_format = os.path.splitext(file_path)[1][1:].lower()
    return encoded_data, audio_format
if __name__ == "__main__":
    # NOTE(review): credentials should not live in source control. They can
    # now be supplied through the environment; the literals below are kept
    # only as fallbacks so existing runs behave the same. Rotate this token
    # and drop the fallbacks once the environment variables are in use.
    appid = os.environ.get("VOLC_APPID", "9407991441")
    token = os.environ.get("VOLC_TOKEN", "VBI4pixTt-GaARTdacAAdQPrHMY333Di")
    spk_id = os.environ.get("VOLC_SPEAKER_ID", "S_xQVFJrvA1")

    # Upload the audio first, then query the status for the same speaker.
    train(appid=appid, token=token, audio_path="./马保国语音包", spk_id=spk_id)
    get_status(appid=appid, token=token, spk_id=spk_id)