WIP: Japanese Hubert
This commit is contained in:
parent
4faee09aa1
commit
7e8e31a8c0
@ -20,7 +20,6 @@ export const MergeLabRow = (_props: MergeLabRowProps) => {
|
||||
}, "")
|
||||
}, [appState.serverSetting.serverSetting.modelSlots])
|
||||
|
||||
console.log("newSlotChangeKey", newSlotChangeKey)
|
||||
useEffect(() => {
|
||||
// PyTorchモデルだけフィルタリング
|
||||
const models = appState.serverSetting.serverSetting.modelSlots.filter(x => { return x.pyTorchModelFile && x.pyTorchModelFile.length > 0 })
|
||||
|
@ -53,6 +53,9 @@ def setupArgParser():
|
||||
parser.add_argument(
|
||||
"--hubert_base", type=str, help="path to hubert_base model(pytorch)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hubert_base_jp", type=str, help="path to hubert_base_jp model(pytorch)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hubert_soft", type=str, help="path to hubert_soft model(pytorch)"
|
||||
)
|
||||
@ -109,6 +112,7 @@ if __name__ == "MMVCServerSIO":
|
||||
content_vec_500_onnx=args.content_vec_500_onnx,
|
||||
content_vec_500_onnx_on=args.content_vec_500_onnx_on,
|
||||
hubert_base=args.hubert_base,
|
||||
hubert_base_jp=args.hubert_base_jp,
|
||||
hubert_soft=args.hubert_soft,
|
||||
nsf_hifigan=args.nsf_hifigan,
|
||||
)
|
||||
|
@ -1,3 +1,4 @@
|
||||
from enum import Enum
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
@ -63,3 +64,9 @@ def getFrontendPath():
|
||||
else "../client/demo/dist"
|
||||
)
|
||||
return frontend_path
|
||||
|
||||
|
||||
class EnumEmbedderTypes(Enum):
    """Closed set of feature-embedder types selectable for voice conversion."""

    hubert = "hubert"
    contentvec = "contentvec"
    hubert_jp = "hubert_jp"  # Japanese hubert variant (presumably loaded via --hubert_base_jp; confirm against server args)
|
||||
|
@ -7,6 +7,8 @@ from voice_changer.RVC.MergeModelRequest import MergeModelRequest
|
||||
from voice_changer.RVC.ModelWrapper import ModelWrapper
|
||||
from Exceptions import NoModeLoadedException
|
||||
from voice_changer.RVC.RVCSettings import RVCSettings
|
||||
from voice_changer.RVC.embedder.Embedder import Embedder
|
||||
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
|
||||
from voice_changer.utils.LoadModelParams import FilePaths, LoadModelParams
|
||||
from voice_changer.utils.VoiceChangerModel import AudioInOut
|
||||
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
|
||||
@ -16,7 +18,7 @@ from typing import cast
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from fairseq import checkpoint_utils
|
||||
# from fairseq import checkpoint_utils
|
||||
import traceback
|
||||
import faiss
|
||||
|
||||
@ -56,6 +58,7 @@ providers = [
|
||||
|
||||
class RVC:
|
||||
audio_buffer: AudioInOut | None = None
|
||||
embedder: Embedder | None = None
|
||||
|
||||
def __init__(self, params: VoiceChangerParams):
|
||||
self.initialLoad = True
|
||||
@ -119,21 +122,24 @@ class RVC:
|
||||
asdict(self.settings.modelSlots[tmp_slot]),
|
||||
)
|
||||
# hubertロード
|
||||
try:
|
||||
hubert_path = self.params.hubert_base
|
||||
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
|
||||
[hubert_path],
|
||||
suffix="",
|
||||
)
|
||||
model = models[0]
|
||||
model.eval()
|
||||
if self.is_half:
|
||||
model = model.half()
|
||||
self.hubert_model = model
|
||||
# try:
|
||||
# hubert_path = self.params.hubert_base
|
||||
# hubert_path_jp = self.params.hubert_base_jp
|
||||
# print(hubert_path, hubert_path_jp)
|
||||
|
||||
except Exception as e:
|
||||
print("EXCEPTION during loading hubert/contentvec model", e)
|
||||
print(" hubert_path:", hubert_path)
|
||||
# models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
|
||||
# [hubert_path],
|
||||
# suffix="",
|
||||
# )
|
||||
# model = models[0]
|
||||
# model.eval()
|
||||
# if self.is_half:
|
||||
# model = model.half()
|
||||
# self.hubert_model = model
|
||||
|
||||
# except Exception as e:
|
||||
# print("EXCEPTION during loading hubert/contentvec model", e)
|
||||
# print(" hubert_path:", hubert_path)
|
||||
|
||||
# 初回のみロード
|
||||
if self.initialLoad or tmp_slot == self.currentSlot:
|
||||
@ -256,6 +262,7 @@ class RVC:
|
||||
|
||||
self.next_trans = self.settings.modelSlots[slot].defaultTrans
|
||||
self.next_samplingRate = self.settings.modelSlots[slot].samplingRate
|
||||
self.next_embedder = self.settings.modelSlots[slot].embedder
|
||||
self.next_framework = (
|
||||
"ONNX" if self.next_onnx_session is not None else "PyTorch"
|
||||
)
|
||||
@ -266,6 +273,17 @@ class RVC:
|
||||
print("[Voice Changer] Switching model..")
|
||||
# del self.net_g
|
||||
# del self.onnx_session
|
||||
try:
|
||||
self.embedder = EmbedderManager.getEmbedder(
|
||||
self.next_embedder,
|
||||
self.params.hubert_base,
|
||||
True,
|
||||
torch.device("cuda:0"),
|
||||
)
|
||||
except Exception as e:
|
||||
print("[Voice Changer] load hubert error", e)
|
||||
traceback.print_exc()
|
||||
|
||||
self.net_g = self.next_net_g
|
||||
self.onnx_session = self.next_onnx_session
|
||||
self.feature_file = self.next_feature_file
|
||||
@ -397,7 +415,8 @@ class RVC:
|
||||
else:
|
||||
dev = torch.device("cuda", index=self.settings.gpu)
|
||||
|
||||
self.hubert_model = self.hubert_model.to(dev)
|
||||
# self.hubert_model = self.hubert_model.to(dev)
|
||||
self.embedder = self.embedder.to(dev)
|
||||
|
||||
audio = data[0]
|
||||
convertSize = data[1]
|
||||
@ -420,7 +439,8 @@ class RVC:
|
||||
|
||||
embChannels = self.settings.modelSlots[self.currentSlot].embChannels
|
||||
audio_out = vc.pipeline(
|
||||
self.hubert_model,
|
||||
# self.hubert_model,
|
||||
self.embedder,
|
||||
self.onnx_session,
|
||||
sid,
|
||||
audio,
|
||||
@ -454,7 +474,7 @@ class RVC:
|
||||
else:
|
||||
dev = torch.device("cuda", index=self.settings.gpu)
|
||||
|
||||
self.hubert_model = self.hubert_model.to(dev)
|
||||
self.embedder = self.embedder.to(dev)
|
||||
self.net_g = self.net_g.to(dev)
|
||||
|
||||
audio = data[0]
|
||||
@ -478,7 +498,8 @@ class RVC:
|
||||
|
||||
embChannels = self.settings.modelSlots[self.currentSlot].embChannels
|
||||
audio_out = vc.pipeline(
|
||||
self.hubert_model,
|
||||
# self.hubert_model,
|
||||
self.embedder,
|
||||
self.net_g,
|
||||
sid,
|
||||
audio,
|
||||
@ -620,9 +641,7 @@ class RVC:
|
||||
indexFilename=None,
|
||||
clusterTorchModelFilename=None,
|
||||
)
|
||||
params = {
|
||||
"trans":req.defaultTrans
|
||||
}
|
||||
params = {"trans": req.defaultTrans}
|
||||
props: LoadModelParams = LoadModelParams(
|
||||
slot=targetSlot, isHalf=True, files=filePaths, params=json.dumps(params)
|
||||
)
|
||||
|
@ -6,6 +6,8 @@ import torch.nn.functional as F
|
||||
import scipy.signal as signal
|
||||
import pyworld
|
||||
|
||||
from voice_changer.RVC.embedder.Embedder import Embedder
|
||||
|
||||
|
||||
class VC(object):
|
||||
def __init__(self, tgt_sr, device, is_half, x_pad):
|
||||
@ -66,35 +68,11 @@ class VC(object):
|
||||
f0_mel[f0_mel > 255] = 255
|
||||
f0_coarse = np.rint(f0_mel).astype(np.int)
|
||||
|
||||
# Volume Extract
|
||||
# volume = self.extractVolume(audio, 512)
|
||||
# volume = np.pad(
|
||||
# volume.astype("float"), (start_frame, n_frames - len(volume) - start_frame)
|
||||
# )
|
||||
|
||||
# return f0_coarse, f0bak, volume # 1-0
|
||||
return f0_coarse, f0bak
|
||||
|
||||
# def extractVolume(self, audio, hopsize):
|
||||
# n_frames = int(len(audio) // hopsize) + 1
|
||||
# audio2 = audio**2
|
||||
# audio2 = np.pad(
|
||||
# audio2,
|
||||
# (int(hopsize // 2), int((hopsize + 1) // 2)),
|
||||
# mode="reflect",
|
||||
# )
|
||||
# volume = np.array(
|
||||
# [
|
||||
# np.mean(audio2[int(n * hopsize) : int((n + 1) * hopsize)]) # noqa:E203
|
||||
# for n in range(n_frames)
|
||||
# ]
|
||||
# )
|
||||
# volume = np.sqrt(volume)
|
||||
# return volume
|
||||
|
||||
def pipeline(
|
||||
self,
|
||||
embedder,
|
||||
embedder: Embedder,
|
||||
model,
|
||||
sid,
|
||||
audio,
|
||||
@ -141,24 +119,25 @@ class VC(object):
|
||||
|
||||
# embedding
|
||||
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
|
||||
if embChannels == 256:
|
||||
inputs = {
|
||||
"source": feats.to(self.device),
|
||||
"padding_mask": padding_mask,
|
||||
"output_layer": 9, # layer 9
|
||||
}
|
||||
else:
|
||||
inputs = {
|
||||
"source": feats.to(self.device),
|
||||
"padding_mask": padding_mask,
|
||||
}
|
||||
feats = embedder.extractFeatures(feats, embChannels)
|
||||
# if embChannels == 256:
|
||||
# inputs = {
|
||||
# "source": feats.to(self.device),
|
||||
# "padding_mask": padding_mask,
|
||||
# "output_layer": 9, # layer 9
|
||||
# }
|
||||
# else:
|
||||
# inputs = {
|
||||
# "source": feats.to(self.device),
|
||||
# "padding_mask": padding_mask,
|
||||
# }
|
||||
|
||||
with torch.no_grad():
|
||||
logits = embedder.extract_features(**inputs)
|
||||
if embChannels == 256:
|
||||
feats = embedder.final_proj(logits[0])
|
||||
else:
|
||||
feats = logits[0]
|
||||
# with torch.no_grad():
|
||||
# logits = embedder.extract_features(**inputs)
|
||||
# if embChannels == 256:
|
||||
# feats = embedder.final_proj(logits[0])
|
||||
# else:
|
||||
# feats = logits[0]
|
||||
|
||||
# Index - feature抽出
|
||||
if (
|
||||
|
61
server/voice_changer/RVC/embedder/Embedder.py
Normal file
61
server/voice_changer/RVC/embedder/Embedder.py
Normal file
@ -0,0 +1,61 @@
|
||||
from typing import Any, Protocol
|
||||
|
||||
import torch
|
||||
from torch import device
|
||||
|
||||
from const import EnumEmbedderTypes
|
||||
|
||||
|
||||
class Embedder(Protocol):
    """Structural interface for audio feature embedders (hubert / contentvec / ...).

    Concrete implementations load the actual weights in ``loadModel`` and
    implement ``extractFeatures``; this base carries the bookkeeping used by
    the manager to decide whether a cached instance can be reused.
    """

    # Which embedder variant this instance represents.
    embedderType: EnumEmbedderTypes = EnumEmbedderTypes.hubert
    # Path of the weight file the model was loaded from.
    file: str
    # Whether the model weights are kept in fp16.
    isHalf: bool = True
    # Device the model currently lives on.
    dev: device

    # Underlying torch module; None until loadModel has run.
    model: Any | None = None

    def loadModel(self, file: str, dev: device, isHalf: bool = True):
        """Record the load parameters; subclasses load the actual weights."""
        self.embedderType = EnumEmbedderTypes.hubert
        self.file = file
        self.isHalf = isHalf
        self.dev = dev

    def extractFeatures(self, feats: torch.Tensor, embChannels=256) -> torch.Tensor:
        """Extract embedding features from *feats* (implemented by subclasses)."""
        ...

    def setHalf(self, isHalf: bool):
        """Toggle fp16 mode, converting the loaded model when switching on."""
        self.isHalf = isHalf
        if isHalf:
            if self.model is not None:
                self.model = self.model.half()

    def setDevice(self, dev: device):
        """Remember *dev* and move the loaded model (if any) onto it."""
        self.dev = dev
        if self.model is not None:
            self.model = self.model.to(self.dev)

    def matchCondition(self, embedderType: EnumEmbedderTypes, file: str) -> bool:
        """Return True when this instance already matches the requested type and file."""
        # Type must match first.
        if self.embedderType != embedderType:
            print(
                "[Voice Changer] embeder type is not match",
                self.embedderType,
                embedderType,
            )
            return False
        # Then the weight file path.
        if self.file != file:
            print(
                "[Voice Changer] embeder file is not match",
                self.file,
                file,
            )
            return False
        return True

    def to(self, dev: torch.device):
        """torch-style device move; returns self so calls can be chained."""
        if self.model is not None:
            self.model = self.model.to(dev)
        return self
|
42
server/voice_changer/RVC/embedder/EmbedderManager.py
Normal file
42
server/voice_changer/RVC/embedder/EmbedderManager.py
Normal file
@ -0,0 +1,42 @@
|
||||
from torch import device
|
||||
|
||||
from const import EnumEmbedderTypes
|
||||
from voice_changer.RVC.embedder.Embedder import Embedder
|
||||
from voice_changer.RVC.embedder.FairseqContentvec import FairseqContentvec
|
||||
from voice_changer.RVC.embedder.FairseqHubert import FairseqHubert
|
||||
from voice_changer.RVC.embedder.FairseqHubertJp import FairseqHubertJp
|
||||
|
||||
|
||||
class EmbedderManager:
    """Process-wide cache of the single active Embedder instance.

    ``getEmbedder`` reuses the cached embedder when its type and weight file
    match the request, only adjusting device/precision; otherwise it loads a
    fresh one.
    """

    # Cached embedder; None until the first request.
    currentEmbedder: Embedder | None = None

    @classmethod
    def getEmbedder(
        cls, embederType: EnumEmbedderTypes, file: str, isHalf: bool, dev: device
    ) -> Embedder:
        """Return an Embedder for (*embederType*, *file*), (re)loading if needed."""
        if cls.currentEmbedder is None:
            print("[Voice Changer] generate new embedder. (no embedder)")
            # BUG FIX: loadEmbedder was previously called twice here (the
            # first call's result was discarded), loading the heavy
            # checkpoint from disk twice. Load exactly once.
            cls.currentEmbedder = cls.loadEmbedder(embederType, file, isHalf, dev)
        elif cls.currentEmbedder.matchCondition(embederType, file) is False:
            print("[Voice Changer] generate new embedder. (not match)")
            cls.currentEmbedder = cls.loadEmbedder(embederType, file, isHalf, dev)
        else:
            # Same model requested: just move/convert the cached instance.
            cls.currentEmbedder.setDevice(dev)
            cls.currentEmbedder.setHalf(isHalf)
        # NOTE: removed leftover debug line `print("RETURN", cls.currentEmbedder)`.
        return cls.currentEmbedder

    @classmethod
    def loadEmbedder(
        cls, embederType: EnumEmbedderTypes, file: str, isHalf: bool, dev: device
    ) -> Embedder:
        """Instantiate and load the concrete Embedder for *embederType*."""
        if embederType == EnumEmbedderTypes.hubert:
            return FairseqHubert().loadModel(file, dev, isHalf)
        elif embederType == EnumEmbedderTypes.hubert_jp:  # same as hubert
            return FairseqHubertJp().loadModel(file, dev, isHalf)
        elif embederType == EnumEmbedderTypes.contentvec:  # same as hubert
            return FairseqContentvec().loadModel(file, dev, isHalf)
        else:
            # Unknown type: fall back to the standard hubert embedder.
            return FairseqHubert().loadModel(file, dev, isHalf)
|
11
server/voice_changer/RVC/embedder/FairseqContentvec.py
Normal file
11
server/voice_changer/RVC/embedder/FairseqContentvec.py
Normal file
@ -0,0 +1,11 @@
|
||||
from torch import device
|
||||
from const import EnumEmbedderTypes
|
||||
from voice_changer.RVC.embedder.Embedder import Embedder
|
||||
from voice_changer.RVC.embedder.FairseqHubert import FairseqHubert
|
||||
|
||||
|
||||
class FairseqContentvec(FairseqHubert):
    """Contentvec embedder: identical to hubert except for its reported type."""

    def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder:
        """Load the checkpoint via the hubert loader, then retag the type."""
        super().loadModel(file, dev, isHalf)
        self.embedderType = EnumEmbedderTypes.contentvec
        return self
|
47
server/voice_changer/RVC/embedder/FairseqHubert.py
Normal file
47
server/voice_changer/RVC/embedder/FairseqHubert.py
Normal file
@ -0,0 +1,47 @@
|
||||
import torch
|
||||
from torch import device
|
||||
from const import EnumEmbedderTypes
|
||||
from voice_changer.RVC.embedder.Embedder import Embedder
|
||||
from fairseq import checkpoint_utils
|
||||
|
||||
|
||||
class FairseqHubert(Embedder):
    """Hubert feature embedder backed by a fairseq checkpoint."""

    def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder:
        """Load the fairseq hubert checkpoint at *file* onto *dev*."""
        super().loadModel(file, dev, isHalf)

        # load_model_ensemble_and_task returns (models, cfg, task);
        # only the first model is used here.
        ensemble, _saved_cfg, _task = checkpoint_utils.load_model_ensemble_and_task(
            [file],
            suffix="",
        )
        hubert = ensemble[0]
        hubert.eval()

        hubert = hubert.to(dev)
        if isHalf:
            hubert = hubert.half()

        self.model = hubert
        self.embedderType = EnumEmbedderTypes.hubert
        return self

    def extractFeatures(self, feats: torch.Tensor, embChannels=256) -> torch.Tensor:
        """Run the model and return embedding features for *feats*.

        For 256-channel models the layer-9 output is projected through
        ``final_proj``; otherwise the raw extracted features are returned.
        """
        mask = torch.BoolTensor(feats.shape).to(self.dev).fill_(False)
        kwargs = {
            "source": feats.to(self.dev),
            "padding_mask": mask,
        }
        if embChannels == 256:
            kwargs["output_layer"] = 9  # layer 9

        with torch.no_grad():
            logits = self.model.extract_features(**kwargs)
            if embChannels == 256:
                out = self.model.final_proj(logits[0])
            else:
                out = logits[0]
        return out
|
11
server/voice_changer/RVC/embedder/FairseqHubertJp.py
Normal file
11
server/voice_changer/RVC/embedder/FairseqHubertJp.py
Normal file
@ -0,0 +1,11 @@
|
||||
from torch import device
|
||||
from const import EnumEmbedderTypes
|
||||
from voice_changer.RVC.embedder.Embedder import Embedder
|
||||
from voice_changer.RVC.embedder.FairseqHubert import FairseqHubert
|
||||
|
||||
|
||||
class FairseqHubertJp(FairseqHubert):
    """Japanese hubert embedder: identical to hubert except for its reported type."""

    def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder:
        """Load the checkpoint via the hubert loader, then retag the type."""
        super().loadModel(file, dev, isHalf)
        self.embedderType = EnumEmbedderTypes.hubert_jp
        return self
|
@ -2,10 +2,11 @@ from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class VoiceChangerParams():
|
||||
class VoiceChangerParams:
|
||||
content_vec_500: str
|
||||
content_vec_500_onnx: str
|
||||
content_vec_500_onnx_on: bool
|
||||
hubert_base: str
|
||||
hubert_base_jp: str
|
||||
hubert_soft: str
|
||||
nsf_hifigan: str
|
||||
|
Loading…
x
Reference in New Issue
Block a user