import os
from typing import Optional

import hydra
import lightning as L
import pyrootutils
import torch
from lightning import Callback, LightningDataModule, LightningModule, Trainer
from lightning.pytorch.loggers import Logger
from omegaconf import DictConfig, OmegaConf

import fish_speech.utils as utils

os.environ.pop("SLURM_NTASKS", None)
os.environ.pop("SLURM_JOB_NAME", None)
os.environ.pop("SLURM_NTASKS_PER_NODE", None)
# locate and register the project root (adds it to PYTHONPATH)
pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
# Allow TF32 on Ampere GPUs
torch.set_float32_matmul_precision("high")
torch.backends.cudnn.allow_tf32 = True
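# "high" lets float32 matmuls use TF32 tensor cores on Ampere+ GPUs; the cuDNN flag
# enables the same for convolutions, trading a little precision for speed.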
# register eval resolver
OmegaConf.register_new_resolver("eval", eval)
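# With this resolver, config values can be computed inline; e.g. a hypothetical
# field `dim: ${eval:'512 * 2'}` would resolve to 1024 when the config is loaded.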
log = utils.RankedLogger(__name__, rank_zero_only=True)


@utils.task_wrapper
def train(cfg: DictConfig) -> tuple[dict, dict]:
"""Trains the model. Can additionally evaluate on a testset, using best weights obtained during
training.
This method is wrapped in optional @task_wrapper decorator, that controls the behavior during
failure. Useful for multiruns, saving info about the crash, etc.
Args:
cfg (DictConfig): Configuration composed by Hydra.
Returns:
Tuple[dict, dict]: Dict with metrics and dict with all instantiated objects.
""" # noqa: E501
    # set seed for random number generators in pytorch, numpy and python.random
    if cfg.get("seed"):
        L.seed_everything(cfg.seed, workers=False)

    if cfg.get("deterministic"):
        torch.use_deterministic_algorithms(True)
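    # Each component below is instantiated from its Hydra config node via `_target_`.
    # A minimal, hypothetical datamodule entry (names are illustrative only):
    #   data:
    #     _target_: my_project.data.MyDataModule
    #     batch_size: 8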
log.info(f"Instantiating datamodule <{cfg.data._target_}>")
datamodule: LightningDataModule = hydra.utils.instantiate(cfg.data)
log.info(f"Instantiating model <{cfg.model._target_}>")
model: LightningModule = hydra.utils.instantiate(cfg.model)
log.info("Instantiating callbacks...")
callbacks: list[Callback] = utils.instantiate_callbacks(cfg.get("callbacks"))
log.info("Instantiating loggers...")
logger: list[Logger] = utils.instantiate_loggers(cfg.get("logger"))
log.info(f"Instantiating trainer <{cfg.trainer._target_}>")
trainer: Trainer = hydra.utils.instantiate(
cfg.trainer, callbacks=callbacks, logger=logger
)
object_dict = {
"cfg": cfg,
"datamodule": datamodule,
"model": model,
"callbacks": callbacks,
"logger": logger,
"trainer": trainer,
}
if logger:
log.info("Logging hyperparameters!")
utils.log_hyperparameters(object_dict)
2023-10-11 05:14:33 -04:00
2023-10-31 05:38:20 +00:00
if cfg.get("train"):
log.info("Starting training!")
ckpt_path = cfg.get("ckpt_path")
2023-12-12 05:16:15 +00:00
auto_resume = False
2023-10-31 05:38:20 +00:00
if ckpt_path is None:
ckpt_path = utils.get_latest_checkpoint(cfg.paths.ckpt_dir)
2023-12-12 05:16:15 +00:00
auto_resume = True
2023-10-31 05:38:20 +00:00
if ckpt_path is not None:
log.info(f"Resuming from checkpoint: {ckpt_path}")
2023-12-12 05:16:15 +00:00
# resume weights only is disabled for auto-resume
if cfg.get("resume_weights_only") and auto_resume is False:
2023-10-31 05:38:20 +00:00
log.info("Resuming weights only!")
ckpt = torch.load(ckpt_path, map_location=model.device)
2023-12-12 05:16:15 +00:00
if "state_dict" in ckpt:
ckpt = ckpt["state_dict"]
err = model.load_state_dict(ckpt, strict=False)
log.info(f"Error loading state dict: {err}")
2023-10-31 05:38:20 +00:00
ckpt_path = None
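            # With ckpt_path cleared, trainer.fit() below starts a fresh run (step 0,
            # new optimizer/scheduler state); only the model weights were restored.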
        trainer.fit(model=model, datamodule=datamodule, ckpt_path=ckpt_path)

    train_metrics = trainer.callback_metrics
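    # trainer.callback_metrics holds the latest values logged via LightningModule.log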
if cfg.get("test"):
log.info("Starting testing!")
ckpt_path = trainer.checkpoint_callback.best_model_path
if ckpt_path == "":
log.warning("Best ckpt not found! Using current weights for testing...")
ckpt_path = cfg.get("ckpt_path")
trainer.test(model=model, datamodule=datamodule, ckpt_path=ckpt_path)
log.info(f"Best ckpt path: {ckpt_path}")
test_metrics = trainer.callback_metrics
# merge train and test metrics
metric_dict = {**train_metrics, **test_metrics}
return metric_dict, object_dict
2023-10-14 04:01:18 -04:00
2023-10-11 03:14:22 -04:00
@hydra.main(
version_base="1.3", config_path="./configs", config_name="llama_pretrain.yaml"
)
2023-10-31 05:38:20 +00:00
def main(cfg: DictConfig) -> Optional[float]:
    # train the model
    train(cfg)


if __name__ == "__main__":
main()
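# Typical invocation goes through Hydra's CLI; the script path and override keys
# below are illustrative and depend on the actual config tree, e.g.:
#   python fish_speech/train.py trainer.max_steps=1000 data.batch_size=4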