# unified_python/tts/音频上传.py

import base64
import os
import requests


# Base URL that train() and get_status() prepend to their endpoint paths.
host = "https://openspeech.bytedance.com"


def train(appid, token, audio_path, spk_id, batch_size=1, timeout=60):
    """Upload audio samples for a speaker to the upload endpoint, in batches.

    Args:
        appid: Application id, sent as ``appid`` in the request body.
        token: Access token, sent in the ``Authorization`` header.
        audio_path: Either a directory, whose ``.wav``/``.mp3``/``.flac``/
            ``.m4a`` files are uploaded (non-recursive, matched
            case-insensitively), or any other path, which is treated as a
            single audio file.
        spk_id: Speaker id, sent as ``speaker_id`` in the request body.
        batch_size: Number of files sent per request. Must be >= 1.
        timeout: Timeout in seconds handed to ``requests.post`` so that a
            stalled connection cannot block the upload forever.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: If the server answers with a status code other than 200;
            the remaining batches are not uploaded.
    """
    # Fail fast: a zero step makes range() raise an unclear error, and a
    # negative step would silently upload nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    url = host + "/api/v1/mega_tts/audio/upload"
    # NOTE(review): "Bearer;" (with a semicolon) is the prefix used by every
    # request in this file - presumably the format the service expects;
    # confirm against the API documentation before changing it.
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer;" + token,
        "Resource-Id": "volc.megatts.voiceclone",
    }

    # Collect the audio files to upload.
    audio_extensions = ('.wav', '.mp3', '.flac', '.m4a')
    if os.path.isdir(audio_path):
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # batch contents reproducible between runs.
        candidates = (
            os.path.join(audio_path, file_name)
            for file_name in sorted(os.listdir(audio_path))
            if file_name.lower().endswith(audio_extensions)
        )
        audio_files = [path for path in candidates if os.path.isfile(path)]
    else:
        audio_files = [audio_path]

    # Report how many files were found.
    print(f"总共找到 {len(audio_files)} 个音频文件")

    # Upload batch by batch.
    batch_starts = range(0, len(audio_files), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        audios = []
        for file_path in audio_files[start:start + batch_size]:
            encoded_data, audio_format = encode_audio_file(file_path)
            audios.append({"audio_bytes": encoded_data, "audio_format": audio_format})

        # NOTE(review): the meaning of the source/language/model_type codes is
        # not visible in this file - check the service documentation.
        data = {"appid": appid, "speaker_id": spk_id, "audios": audios, "source": 2, "language": 0, "model_type": 1}
        print(f"正在上传批次 {batch_number}，包含 {len(audios)} 个文件...")

        response = requests.post(url, json=data, headers=headers, timeout=timeout)
        print("status code = ", response.status_code)
        if response.status_code != 200:
            raise Exception("train请求错误:" + response.text)
        print("headers = ", response.headers)
        print(response.json())
        print("-" * 50)  # Separator between the per-batch results.


def get_status(appid, token, spk_id, timeout=60):
    """Query the status endpoint for a speaker, print the reply and return it.

    Args:
        appid: Application id, sent as ``appid`` in the request body.
        token: Access token, sent in the ``Authorization`` header.
        spk_id: Speaker id, sent as ``speaker_id`` in the request body.
        timeout: Timeout in seconds handed to ``requests.post`` so that a
            stalled connection cannot block the caller forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the server answers with a status code other than 200
            (same convention as ``train``).
    """
    url = host + "/api/v1/mega_tts/status"
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer;" + token,
        "Resource-Id": "volc.megatts.voiceclone",
    }
    body = {"appid": appid, "speaker_id": spk_id}
    response = requests.post(url, headers=headers, json=body, timeout=timeout)
    # Surface HTTP failures explicitly instead of letting a non-JSON error
    # page blow up inside response.json().
    if response.status_code != 200:
        raise Exception("get_status请求错误:" + response.text)
    result = response.json()
    print(result)
    return result


def encode_audio_file(file_path):
    """Read a file and return its base64 text together with its format name.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        A ``(encoded_data, audio_format)`` tuple: the file content encoded as
        a base64 string, and the file extension without the leading dot,
        lower-cased (an empty string when the path has no extension).
    """
    with open(file_path, 'rb') as audio_file:
        audio_data = audio_file.read()
    encoded_data = base64.b64encode(audio_data).decode("utf-8")
    # The extension doubles as the audio format. It is lower-cased because
    # callers match extensions case-insensitively, so "clip.WAV" must be
    # reported as "wav" rather than "WAV".
    audio_format = os.path.splitext(file_path)[1][1:].lower()
    return encoded_data, audio_format


if __name__ == "__main__":
    # Script entry point: upload the samples, then query the speaker status.
    # NOTE(review): the credentials below are hard-coded in source; consider
    # loading them from the environment or a config file kept out of version
    # control.
    credentials = {
        "appid": "9407991441",
        "token": "VBI4pixTt-GaARTdacAAdQPrHMY333Di",
        "spk_id": "S_xQVFJrvA1",
    }
    train(audio_path="./马保国语音包", **credentials)
    get_status(**credentials)