fish-speech/tools/run_webui.py
Picus303 62eae262c2
Make WebUI and API code cleaner (+ 1.5 fixes) (#703)
* rename webui.py to run_webui.py

* remove unused imports

* remove unused code

* move inference code and fix all warnings

* move web app code

* make code easier to read

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* remove unused function

* remove msgpack_api.py

* rename API files

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* finish updating the doc with the new file names

* finish updating the doc with the new file names

* fix CPU use in the API

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* refactor WebUI inference into a class with submodules

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* re-enable streaming in webui inference code

* generalize inference code in webui

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* minor fix

* make a unique inference engine class

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* minor fix

* cleaning code

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* implement new structure of the API (not working)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* refactor API

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* minor fixes

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* reimplement chat endpoint

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-12-07 14:13:19 +08:00

import os
from argparse import ArgumentParser
from pathlib import Path

import pyrootutils
import torch
from loguru import logger

pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)

from tools.inference_engine import TTSInferenceEngine
from tools.llama.generate import launch_thread_safe_queue
from tools.schema import ServeTTSRequest
from tools.vqgan.inference import load_model as load_decoder_model
from tools.webui import build_app
from tools.webui.inference import get_inference_wrapper

# Make einx happy
os.environ["EINX_FILTER_TRACEBACK"] = "false"


def parse_args():
    parser = ArgumentParser()
    parser.add_argument(
        "--llama-checkpoint-path",
        type=Path,
        default="checkpoints/fish-speech-1.5",
    )
    parser.add_argument(
        "--decoder-checkpoint-path",
        type=Path,
        default="checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
    )
    parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq")
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--half", action="store_true")
    parser.add_argument("--compile", action="store_true")
    parser.add_argument("--max-gradio-length", type=int, default=0)
    parser.add_argument("--theme", type=str, default="light")

    return parser.parse_args()
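
# Example invocation (illustrative only; the values shown are simply the defaults
# defined above, and --half / --compile are optional flags):
#
#   python tools/run_webui.py \
#       --llama-checkpoint-path checkpoints/fish-speech-1.5 \
#       --decoder-config-name firefly_gan_vq \
#       --theme light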

if __name__ == "__main__":
    args = parse_args()
    args.precision = torch.half if args.half else torch.bfloat16

    # Check if CUDA is available
    if not torch.cuda.is_available():
        logger.info("CUDA is not available, running on CPU.")
        args.device = "cpu"

    logger.info("Loading Llama model...")
    llama_queue = launch_thread_safe_queue(
        checkpoint_path=args.llama_checkpoint_path,
        device=args.device,
        precision=args.precision,
        compile=args.compile,
    )

    logger.info("Loading VQ-GAN model...")
    decoder_model = load_decoder_model(
        config_name=args.decoder_config_name,
        checkpoint_path=args.decoder_checkpoint_path,
        device=args.device,
    )

    logger.info("Decoder model loaded, warming up...")

    # Create the inference engine
    inference_engine = TTSInferenceEngine(
        llama_queue=llama_queue,
        decoder_model=decoder_model,
        compile=args.compile,
        precision=args.precision,
    )

    # Dry run to check if the model is loaded correctly and avoid the first-time latency
    list(
        inference_engine.inference(
            ServeTTSRequest(
                text="Hello world.",
                references=[],
                reference_id=None,
                max_new_tokens=0,
                chunk_length=200,
                top_p=0.7,
                repetition_penalty=1.5,
                temperature=0.7,
                format="wav",
            )
        )
    )

    logger.info("Warming up done, launching the web UI...")

    # Get the inference function with the immutable arguments
    inference_fct = get_inference_wrapper(inference_engine)

    app = build_app(inference_fct, args.theme)
    app.launch(show_api=True)
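
For reference, the engine assembled above can also be driven without the Gradio front end. The sketch below is illustrative only: it reuses the inference_engine instance and the ServeTTSRequest schema from this script, the field values are arbitrary, and the exact type of the yielded results is defined in tools.inference_engine rather than here.

    # Minimal sketch, assuming the objects constructed above are in scope. The request
    # mirrors the warm-up call; max_new_tokens=1024 is an arbitrary illustrative value.
    request = ServeTTSRequest(
        text="A quick smoke test outside the web UI.",
        references=[],
        reference_id=None,
        max_new_tokens=1024,
        chunk_length=200,
        top_p=0.7,
        repetition_penalty=1.5,
        temperature=0.7,
        format="wav",
    )

    # inference() is a generator (the warm-up wraps it in list(...)), so results are
    # produced chunk by chunk and can be consumed lazily.
    for result in inference_engine.inference(request):
        print(type(result))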