WIP: support rvc-webui, refactoring

This commit is contained in:
wataru 2023-04-25 03:03:38 +09:00
parent 86798b3896
commit 777c2d6e1e
6 changed files with 62 additions and 32 deletions

View File

@ -40,6 +40,10 @@
{
"name": "onnxExecutor",
"options": {}
},
{
"name": "modelSamplingRate",
"options": {}
}
],
"modelSetting": [

File diff suppressed because one or more lines are too long

View File

@ -40,6 +40,10 @@
{
"name": "onnxExecutor",
"options": {}
},
{
"name": "modelSamplingRate",
"options": {}
}
],
"modelSetting": [

View File

@ -30,10 +30,13 @@ class ModelWrapper:
self.embChannels = metadata["embChannels"]
print(f"[Voice Changer] Onnx metadata: sr:{self.samplingRate}, f0:{self.f0}")
except:
self.samplingRate = -1
self.samplingRate = 48000
self.f0 = True
print(f"[Voice Changer] Onnx version is old. Please regenerate the onnx file. Falling back to defaults")
self.embChannels = 256
print(f"[Voice Changer] ############## !!!! CAUTION !!!! ####################")
print(f"[Voice Changer] This onnx's version is deprecated. Please regenerate the onnx file. Falling back to defaults")
print(f"[Voice Changer] Onnx metadata: sr:{self.samplingRate}, f0:{self.f0}")
print(f"[Voice Changer] ############## !!!! CAUTION !!!! ####################")
def getSamplingRate(self):
return self.samplingRate

View File

@ -79,7 +79,7 @@ class RVCSettings():
rvcQuality: int = 0
silenceFront: int = 1 # 0:off, 1:on
modelSamplingRate: int = 48000
modelSlotIndex: int = 0
modelSlotIndex: int = -1
speakers: dict[str, int] = field(
default_factory=lambda: {}
@ -118,13 +118,30 @@ class RVC:
params_str = props["params"]
params = json.loads(params_str)
self.settings.modelSlots[self.tmp_slot] = ModelSlot(
pyTorchModelFile=props["files"]["pyTorchModelFilename"],
onnxModelFile=props["files"]["onnxModelFilename"],
featureFile=props["files"]["featureFilename"],
indexFile=props["files"]["indexFilename"],
defaultTrans=params["trans"]
)
# self.settings.modelSlots[self.tmp_slot] = ModelSlot(
# pyTorchModelFile=props["files"]["pyTorchModelFilename"],
# onnxModelFile=props["files"]["onnxModelFilename"],
# featureFile=props["files"]["featureFilename"],
# indexFile=props["files"]["indexFilename"],
# defaultTrans=params["trans"]
# )
newSlot = asdict(self.settings.modelSlots[self.tmp_slot])
newSlot.update({
"pyTorchModelFile": props["files"]["pyTorchModelFilename"],
"onnxModelFile": props["files"]["onnxModelFilename"],
"featureFile": props["files"]["featureFilename"],
"indexFile": props["files"]["indexFilename"],
"defaultTrans": params["trans"]
})
# .update({
# pyTorchModelFile: props["files"]["pyTorchModelFilename"],
# onnxModelFile: props["files"]["onnxModelFilename"],
# featureFile: props["files"]["featureFilename"],
# indexFile: props["files"]["indexFilename"],
# defaultTrans: params["trans"]
# })
self.settings.modelSlots[self.tmp_slot] = ModelSlot(**newSlot)
print("[Voice Changer] RVC loading... slot:", self.tmp_slot)
@ -213,8 +230,8 @@ class RVC:
self.next_onnx_session = ModelWrapper(onnxModelFile)
self.settings.modelSlots[slot].samplingRateOnnx = self.next_onnx_session.getSamplingRate()
self.settings.modelSlots[slot].f0Onnx = self.next_onnx_session.getF0()
if self.settings.modelSlots[slot].samplingRate == -1: # ONNXにsampling rateが入っていない
self.settings.modelSlots[slot].samplingRate = self.settings.modelSamplingRate
# if self.settings.modelSlots[slot].samplingRate == -1: # ONNXにsampling rateが入っていない
# self.settings.modelSlots[slot].samplingRate = self.settings.modelSamplingRate
self.settings.modelSlots[slot].embChannelsOnnx = self.next_onnx_session.getEmbChannels()
# ONNXがある場合は、ONNXの設定を優先
@ -228,6 +245,8 @@ class RVC:
self.next_feature_file = self.settings.modelSlots[slot].featureFile
self.next_index_file = self.settings.modelSlots[slot].indexFile
self.next_trans = self.settings.modelSlots[slot].defaultTrans
self.next_samplingRate = self.settings.modelSlots[slot].samplingRate
self.next_framework = "ONNX" if self.next_onnx_session != None else "PyTorch"
print("[Voice Changer] Prepare done.",)
return self.get_info()
@ -240,6 +259,8 @@ class RVC:
self.feature_file = self.next_feature_file
self.index_file = self.next_index_file
self.settings.tran = self.next_trans
self.settings.framework = self.next_framework
self.settings.modelSamplingRate = self.next_samplingRate
self.next_net_g = None
self.next_onnx_session = None
print("[Voice Changer] Switching model..done",)
@ -351,14 +372,13 @@ class RVC:
file_index = self.index_file if self.index_file != None else ""
file_big_npy = self.feature_file if self.feature_file != None else ""
index_rate = self.settings.indexRatio
if_f0 = 1
if_f0 = 1 if self.settings.modelSlots[self.currentSlot].f0 else 0
f0_file = None
f0 = self.settings.modelSlots[self.currentSlot].f0
embChannels = self.settings.modelSlots[self.currentSlot].embChannels
print("embChannels::1:", embChannels)
audio_out = vc.pipeline(self.hubert_model, self.onnx_session, sid, audio, times, f0_up_key, f0_method,
file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate, f0=f0, embChannels=embChannels)
file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate, embChannels=embChannels)
result = audio_out * np.sqrt(vol)
return result
@ -399,22 +419,21 @@ class RVC:
file_index = self.index_file if self.index_file != None else ""
file_big_npy = self.feature_file if self.feature_file != None else ""
index_rate = self.settings.indexRatio
if_f0 = 1
if_f0 = 1 if self.settings.modelSlots[self.currentSlot].f0 else 0
f0_file = None
f0 = self.settings.modelSlots[self.currentSlot].f0
embChannels = self.settings.modelSlots[self.currentSlot].embChannels
audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method,
file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate, f0=f0, embChannels=embChannels)
file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate, embChannels=embChannels)
result = audio_out * np.sqrt(vol)
return result
def inference(self, data):
# if self.settings.modelSlotIndex < -1:
# print("[Voice Changer] No model uploaded.")
# raise NoModeLoadedException("model_common")
if self.settings.modelSlotIndex < -1:
print("[Voice Changer] No model uploaded.")
raise NoModeLoadedException("model_common")
if self.currentSlot != self.settings.modelSlotIndex:
print(f"Switch model {self.currentSlot} -> {self.settings.modelSlotIndex}")

View File

@ -84,7 +84,7 @@ class VC(object):
f0_coarse = np.rint(f0_mel).astype(np.int)
return f0_coarse, f0bak # 1-0
def vc(self, model, net_g, sid, audio0, pitch, pitchf, times, index, big_npy, index_rate, f0=True, embChannels=256): # ,file_index,file_big_npy
def vc(self, model, net_g, sid, audio0, pitch, pitchf, times, index, big_npy, index_rate, embChannels=256): # ,file_index,file_big_npy
feats = torch.from_numpy(audio0)
if (self.is_half == True):
feats = feats.half()
@ -137,7 +137,7 @@ class VC(object):
p_len = torch.tensor([p_len], device=self.device).long()
with torch.no_grad():
if f0 == True:
if pitch != None:
audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
else:
if hasattr(net_g, "infer_pitchless"):
@ -154,7 +154,7 @@ class VC(object):
times[2] += (t2 - t1)
return audio1
def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=None, silence_front=0, f0=True, embChannels=256):
def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=None, silence_front=0, embChannels=256):
if (file_big_npy != "" and file_index != "" and os.path.exists(file_big_npy) == True and os.path.exists(file_index) == True and index_rate != 0):
try:
index = faiss.read_index(file_index)
@ -185,10 +185,10 @@ class VC(object):
times[1] += (t2 - t1)
if self.t_pad_tgt == 0:
audio_opt.append(self.vc(model, net_g, sid, audio_pad[t:], pitch[:, t // self.window:]if t is not None else pitch,
pitchf[:, t // self.window:]if t is not None else pitchf, times, index, big_npy, index_rate, f0, embChannels))
pitchf[:, t // self.window:]if t is not None else pitchf, times, index, big_npy, index_rate, embChannels))
else:
audio_opt.append(self.vc(model, net_g, sid, audio_pad[t:], pitch[:, t // self.window:]if t is not None else pitch,
pitchf[:, t // self.window:]if t is not None else pitchf, times, index, big_npy, index_rate, f0, embChannels)[self.t_pad_tgt:-self.t_pad_tgt])
pitchf[:, t // self.window:]if t is not None else pitchf, times, index, big_npy, index_rate, embChannels)[self.t_pad_tgt:-self.t_pad_tgt])
audio_opt = np.concatenate(audio_opt)
del pitch, pitchf, sid