* support basic TTS inference * Agent (#648) * agent * rm fastapi * routes * dry run: tts * api_invoke_cahta * .gradio ignore * small fix * Fix llama generate * add lots * add agent * fix agent * fix agent * fix route * fix compile * Add fixed timbre * Fix duplicated audio * Fix * remove unused * Improve ui * okok * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Update Agent Webui and doc * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Lengyue <lengyue@lengyue.me> Co-authored-by: spicysama <a2983352531@outlook.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
96 lines
2.6 KiB
Python
96 lines
2.6 KiB
Python
import os
|
|
from argparse import ArgumentParser
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
import ormsgpack
|
|
|
|
from tools.schema import ServeReferenceAudio, ServeTTSRequest
|
|
|
|
# API token used for the Bearer "Authorization" header; taken from the
# FISH_API_KEY environment variable, with a placeholder as the fallback.
api_key = os.getenv("FISH_API_KEY", "YOUR_API_KEY")
|
|
|
|
|
|
def audio_request():
    """Request a streamed TTS synthesis and save the audio to ``hello.wav``.

    Builds a ``ServeTTSRequest`` carrying one reference sample read from
    ``lengyue.wav`` / ``lengyue.lab`` (relative to the current working
    directory), POSTs it msgpack-encoded to ``http://127.0.0.1:8080/v1/tts``
    and writes the streamed response bytes to ``hello.wav``.

    Raises:
        OSError: if a reference file cannot be read or the output file cannot
            be written.
        httpx.HTTPStatusError: if the server answers with a 4xx/5xx status.
    """
    # Path.read_bytes / read_text close the files deterministically, instead
    # of leaving the handles of a bare open(...).read() to the garbage
    # collector.
    reference = ServeReferenceAudio(
        audio=Path("lengyue.wav").read_bytes(),
        text=Path("lengyue.lab").read_text(encoding="utf-8"),
    )

    # priority: ref_id > references
    request = ServeTTSRequest(
        text="你说的对, 但是原神是一款由米哈游自主研发的开放世界手游.",
        # reference_id="114514",
        references=[reference],
        streaming=True,
    )

    # Looked up at call time (not import time) so a key exported after the
    # module was imported is still honoured. Named `token` to avoid shadowing
    # the module-level `api_key`.
    token = os.environ.get("FISH_API_KEY", "YOUR_API_KEY")

    with httpx.Client() as client:
        with client.stream(
            "POST",
            "http://127.0.0.1:8080/v1/tts",
            content=ormsgpack.packb(request, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
            headers={
                "authorization": f"Bearer {token}",
                "content-type": "application/msgpack",
            },
            timeout=None,
        ) as response:
            # Fail before touching the output file: otherwise an error payload
            # would be written to hello.wav as if it were audio.
            response.raise_for_status()

            with open("hello.wav", "wb") as f:
                for chunk in response.iter_bytes():
                    f.write(chunk)
|
|
|
|
|
|
def asr_request(audio_path: Path):
    """Send an audio file to the Fish Audio ASR endpoint and print the result.

    The file is uploaded msgpack-encoded to ``https://api.fish.audio/v1/asr``.
    The transcribed text, the audio duration and every returned segment (text,
    start and end time) are printed to stdout.

    Args:
        audio_path: Location of the audio file to transcribe.

    Raises:
        OSError: if the audio file cannot be read.
        httpx.HTTPStatusError: if the API answers with a 4xx/5xx status.
    """
    # Read the audio file
    audio_data = Path(audio_path).read_bytes()

    # Prepare the request data
    request_data = {
        "audio": audio_data,
        "language": "en",  # Optional: specify the language
        "ignore_timestamps": False,  # Optional: set to True to ignore precise timestamps
    }

    # Send the request
    with httpx.Client() as client:
        response = client.post(
            "https://api.fish.audio/v1/asr",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/msgpack",
            },
            content=ormsgpack.packb(request_data),
        )

    # Surface HTTP failures (e.g. an invalid API key) as a clear error rather
    # than a KeyError / JSON decode error while parsing the body below.
    response.raise_for_status()

    # Parse the response
    result = response.json()

    print(f"Transcribed text: {result['text']}")
    print(f"Audio duration: {result['duration']} seconds")

    for segment in result["segments"]:
        print(f"Segment: {segment['text']}")
        print(f"Start time: {segment['start']}, End time: {segment['end']}")
|
|
|
|
|
|
def parse_args():
    """Parse the command line and return the resulting namespace.

    The only option is ``--audio_path``, converted to a ``Path``; it defaults
    to ``audio/ref/trump.mp3``.
    """
    cli = ArgumentParser()
    cli.add_argument(
        "--audio_path",
        default="audio/ref/trump.mp3",
        type=Path,
    )
    namespace = cli.parse_args()
    return namespace
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: transcribe the file selected with --audio_path.
    asr_request(parse_args().audio_path)
|