support so-vits-svc.alpha onnx

wataru 2023-03-18 06:30:50 +09:00
parent a758dd1595
commit 43eb4d5e83
2 changed files with 60 additions and 13 deletions


@@ -112,14 +112,15 @@ class SoVitsSvc40v2:
         self.net_g.eval()
         utils.load_checkpoint(pyTorch_model_file, self.net_g, None)
-        # # ONNX model generation
-        # if onnx_model_file != None:
-        #     ort_options = onnxruntime.SessionOptions()
-        #     ort_options.intra_op_num_threads = 8
-        #     self.onnx_session = onnxruntime.InferenceSession(
-        #         onnx_model_file,
-        #         providers=providers
-        #     )
+        # ONNX model generation
+        if onnx_model_file is not None:
+            ort_options = onnxruntime.SessionOptions()
+            ort_options.intra_op_num_threads = 8
+            self.onnx_session = onnxruntime.InferenceSession(
+                onnx_model_file,
+                providers=providers
+            )
+            input_info = self.onnx_session.get_inputs()
         return self.get_info()

     def update_setteings(self, key: str, val: any):
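
For reference, the block enabled above boils down to the following standalone sketch. The model path and provider list are placeholders, not the repository's values; note that a SessionOptions object only affects the session when it is passed in via sess_options.

import onnxruntime

# Placeholder model path and provider list (assumptions for illustration).
onnx_model_file = "so-vits-svc-40v2.onnx"
providers = ["CPUExecutionProvider"]

ort_options = onnxruntime.SessionOptions()
ort_options.intra_op_num_threads = 8

# SessionOptions take effect only when passed as sess_options.
onnx_session = onnxruntime.InferenceSession(
    onnx_model_file,
    sess_options=ort_options,
    providers=providers,
)

# Inspect the exported graph's inputs to verify names, shapes, and dtypes.
for inp in onnx_session.get_inputs():
    print(inp.name, inp.shape, inp.type)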
@@ -181,6 +182,17 @@ class SoVitsSvc40v2:
         # wav16k = librosa.resample(audio_buffer, orig_sr=24000, target_sr=16000)
         wav16k = librosa.resample(audio_buffer, orig_sr=self.hps.data.sampling_rate, target_sr=16000)
         wav16k = torch.from_numpy(wav16k)
+
+        if (self.settings.gpu < 0 or self.gpu_num == 0) or self.settings.framework == "ONNX":
+            dev = torch.device("cpu")
+        else:
+            dev = torch.device("cuda", index=self.settings.gpu)
+
+        self.hubert_model = self.hubert_model.to(dev)
+        wav16k = wav16k.to(dev)
+        uv = uv.to(dev)
+        f0 = f0.to(dev)
+
         c = utils.get_hubert_content(self.hubert_model, wav_16k_tensor=wav16k)
         c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
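
The added branch is the usual pattern of picking a torch.device once and moving the model and every participating tensor together. A minimal self-contained sketch, with the model and tensors replaced by stand-ins:

import torch

def select_device(gpu: int, framework: str) -> torch.device:
    # CPU when no GPU is requested or present, or when ONNX handles
    # synthesis (the hubert feature extractor then stays on CPU too).
    if gpu < 0 or torch.cuda.device_count() == 0 or framework == "ONNX":
        return torch.device("cpu")
    return torch.device("cuda", index=gpu)

dev = select_device(gpu=0, framework="ONNX")
wav16k = torch.zeros(16000).to(dev)  # stand-in for the resampled audio
f0 = torch.zeros(1, 100).to(dev)     # stand-in for the extracted pitch curve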
@@ -222,6 +234,37 @@ class SoVitsSvc40v2:
         return (c, f0, uv, convertSize, vol)
+
+    def _onnx_inference(self, data):
+        if not hasattr(self, "onnx_session") or self.onnx_session is None:
+            print("[Voice Changer] No onnx session.")
+            return np.zeros(1).astype(np.int16)
+
+        convertSize = data[3]
+        vol = data[4]
+        data = (data[0], data[1], data[2],)
+
+        if vol < self.settings.silentThreshold:
+            return np.zeros(convertSize).astype(np.int16)
+
+        c, f0, uv = [x.numpy() for x in data]
+        audio1 = self.onnx_session.run(
+            ["audio"],
+            {
+                "c": c,
+                "f0": f0,
+                "g": np.array([self.settings.dstId]),  # target speaker id
+                "uv": uv,
+                "predict_f0": np.array([self.settings.predictF0]),    # settings field name assumed
+                "noice_scale": np.array([self.settings.noiceScale]),  # settings field name assumed
+            })[0][0, 0] * self.hps.data.max_wav_value
+        audio1 = audio1 * vol
+
+        result = audio1
+        return result
+
     def _pyTorch_inference(self, data):
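
A minimal sketch of driving such an export outside the class, with dummy shapes throughout; the input and output names (including the "noice_scale" spelling) mirror the ones fed above, and the model path is a placeholder:

import numpy as np
import onnxruntime

session = onnxruntime.InferenceSession(
    "so-vits-svc-40v2.onnx",  # placeholder path
    providers=["CPUExecutionProvider"],
)
feeds = {
    "c": np.zeros((1, 256, 100), dtype=np.float32),  # hubert content features
    "f0": np.zeros((1, 100), dtype=np.float32),      # pitch curve
    "g": np.array([0], dtype=np.int64),              # target speaker id
    "uv": np.zeros((1, 100), dtype=np.float32),      # voiced/unvoiced flags
    "predict_f0": np.array([0], dtype=np.int64),
    "noice_scale": np.array([0.3], dtype=np.float32),
}
# run() returns one array per requested output; [0, 0] strips the
# batch and channel axes of an assumed (1, 1, T) waveform.
audio = session.run(["audio"], feeds)[0][0, 0]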


@@ -155,11 +155,13 @@ class VoiceChanger():
         # pre-processing
         with Timer("pre-process") as t:
-            if self.settings.inputSampleRate != processing_sampling_rate:
-                newData = resampy.resample(receivedData, self.settings.inputSampleRate, processing_sampling_rate)
-            else:
-                newData = receivedData
+            with Timer("pre-process") as t1:
+                if self.settings.inputSampleRate != processing_sampling_rate:
+                    newData = resampy.resample(receivedData, self.settings.inputSampleRate, processing_sampling_rate)
+                else:
+                    newData = receivedData
+            # print("t1::::", t1.secs)

             inputSize = newData.shape[0]
             crossfadeSize = min(self.settings.crossFadeOverlapSize, inputSize)
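
The branch wrapped by t1 is plain sample-rate conversion. A self-contained sketch with illustrative rates (the real ones come from the settings and the model config):

import numpy as np
import resampy

input_sr, processing_sr = 48000, 24000                   # illustrative rates
received = np.random.randn(input_sr).astype(np.float32)  # one second of audio

# Resample only when the client rate differs from the processing rate.
if input_sr != processing_sr:
    new_data = resampy.resample(received, input_sr, processing_sr)
else:
    new_data = received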
@@ -172,7 +174,9 @@ class VoiceChanger():
             print_convert_processing(f" will be cropped:{-1 * (inputSize + crossfadeSize)}, {-1 * (crossfadeSize)}")
             self._generate_strength(crossfadeSize)
-            data = self.voiceChanger.generate_input(newData, inputSize, crossfadeSize)
+            with Timer("pre-process") as t2:
+                data = self.voiceChanger.generate_input(newData, inputSize, crossfadeSize)
+            # print("t2::::", t2.secs)
         preprocess_time = t.secs

         # conversion processing
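
Timer itself is not shown in this commit; the following is a minimal stand-in consistent with how it is used above (elapsed wall-clock seconds exposed as .secs once the with-block exits), including the nested pattern used for t1/t2:

import time

class Timer:
    # Minimal stand-in: records elapsed wall-clock seconds in .secs on exit.
    def __init__(self, title: str):
        self.title = title
        self.secs = 0.0

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, *exc):
        self.secs = time.perf_counter() - self.start
        return False

with Timer("pre-process") as t:
    with Timer("pre-process") as t1:
        time.sleep(0.01)  # stand-in for resampling
    time.sleep(0.01)      # stand-in for generate_input
print(t1.secs, t.secs)    # nested timers isolate each stage's cost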