WIP: Japanese Hubert

wataru 2023-05-03 13:14:00 +09:00
parent bfb2de9ea1
commit 48846aad7f
18 changed files with 276 additions and 348 deletions

View File

@@ -8,6 +8,11 @@ class NoModeLoadedException(Exception):
         )


+class HalfPrecisionChangingException(Exception):
+    def __str__(self):
+        return repr("HalfPrecision related exception.")


 class ONNXInputArgumentException(Exception):
     def __str__(self):
         return repr("ONNX received invalid argument.")

View File

@@ -48,6 +48,15 @@ def _setInfoByPytorch(slot: ModelSlot, file: str):
         if slot.embedder.endswith("768"):
             slot.embedder = slot.embedder[:-3]
+        if slot.embedder == EnumEmbedderTypes.hubert.value:
+            slot.embedder = EnumEmbedderTypes.hubert
+        elif slot.embedder == EnumEmbedderTypes.contentvec.value:
+            slot.embedder = EnumEmbedderTypes.contentvec
+        elif slot.embedder == EnumEmbedderTypes.hubert_jp.value:
+            slot.embedder = EnumEmbedderTypes.hubert_jp
+        else:
+            raise RuntimeError("[Voice Changer][setInfoByONNX] unknown embedder")
     slot.samplingRate = cpt["config"][-1]
     del cpt
@@ -63,9 +72,18 @@ def _setInfoByONNX(slot: ModelSlot, file: str):
         slot.modelType = metadata["modelType"]
         slot.embChannels = metadata["embChannels"]
-        slot.embedder = (
-            metadata["embedder"] if "embedder" in metadata else EnumEmbedderTypes.hubert
-        )
+        if "embedder" not in metadata:
+            slot.embedder = EnumEmbedderTypes.hubert
+        elif metadata["embedder"] == EnumEmbedderTypes.hubert.value:
+            slot.embedder = EnumEmbedderTypes.hubert
+        elif metadata["embedder"] == EnumEmbedderTypes.contentvec.value:
+            slot.embedder = EnumEmbedderTypes.contentvec
+        elif metadata["embedder"] == EnumEmbedderTypes.hubert_jp.value:
+            slot.embedder = EnumEmbedderTypes.hubert_jp
+        else:
+            raise RuntimeError("[Voice Changer][setInfoByONNX] unknown embedder")
         slot.f0 = metadata["f0"]
         slot.modelType = (
             EnumInferenceTypes.onnxRVC if slot.f0 else EnumInferenceTypes.onnxRVCNono
@@ -73,7 +91,7 @@ def _setInfoByONNX(slot: ModelSlot, file: str):
         slot.samplingRate = metadata["samplingRate"]
         slot.deprecated = False
-    except:
+    except Exception as e:
         slot.modelType = EnumInferenceTypes.onnxRVC
         slot.embChannels = 256
         slot.embedder = EnumEmbedderTypes.hubert
@@ -81,6 +99,7 @@ def _setInfoByONNX(slot: ModelSlot, file: str):
         slot.samplingRate = 48000
         slot.deprecated = True
+        print("[Voice Changer] setInfoByONNX", e)
         print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")
         print("[Voice Changer] This onnxfie is depricated. Please regenerate onnxfile.")
         print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")

View File

@@ -1,5 +1,6 @@
 import sys
 import os
+from voice_changer.RVC.ModelSlot import ModelSlot
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
 from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
@@ -22,7 +23,6 @@ import resampy
 from voice_changer.RVC.MergeModel import merge_model
 from voice_changer.RVC.MergeModelRequest import MergeModelRequest
 from voice_changer.RVC.ModelSlotGenerator import generateModelSlot
-from Exceptions import NoModeLoadedException
 from voice_changer.RVC.RVCSettings import RVCSettings
 from voice_changer.RVC.embedder.Embedder import Embedder
 from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
@@ -42,7 +42,7 @@ import torch
 import traceback
 import faiss

-from const import TMP_DIR, UPLOAD_DIR
+from const import TMP_DIR, UPLOAD_DIR, EnumEmbedderTypes
 from voice_changer.RVC.custom_vc_infer_pipeline import VC
@@ -56,34 +56,29 @@ providers = [
 class RVC:
-    audio_buffer: AudioInOut | None = None
+    initialLoad: bool = True
+    settings: RVCSettings = RVCSettings()
     embedder: Embedder | None = None
     inferencer: Inferencer | None = None
     pitchExtractor: PitchExtractor | None = None
     deviceManager = DeviceManager.get_instance()
+    audio_buffer: AudioInOut | None = None
+    prevVol: float = 0
+    params: VoiceChangerParams
+    currentSlot: int = -1
+    needSwitch: bool = False

     def __init__(self, params: VoiceChangerParams):
-        self.initialLoad = True
-        self.settings = RVCSettings()
         self.pitchExtractor = PitchExtractorManager.getPitchExtractor(
             self.settings.f0Detector
         )
-        self.feature_file = None
-        self.index_file = None
-        self.prevVol = 0
         self.params = params
-        self.currentSlot = -1
         print("RVC initialization: ", params)

     def loadModel(self, props: LoadModelParams):
+        """
+        loadModel only registers the model into a slot (it is not loaded for inference).
+        Exceptionally, it is loaded when nothing has been loaded for inference yet,
+        or when the target is the currently active slot.
+        """
-        self.is_half = props.isHalf
         target_slot_idx = props.slot
         params_str = props.params
         params = json.loads(params_str)
@@ -94,167 +89,175 @@ class RVC:
             f"[Voice Changer] RVC new model is uploaded,{target_slot_idx}",
             asdict(modelSlot),
         )
+        """
+        [Voice Changer] RVC new model is uploaded,0 {'pyTorchModelFile': 'upload_dir/0/kurage.pth', 'onnxModelFile': None, 'featureFile': None, 'indexFile': None, 'defaultTrans': 16, 'isONNX': False, 'modelType': <EnumInferenceTypes.pyTorchWebUI: 'pyTorchWebUI'>, 'samplingRate': 48000, 'f0': True, 'embChannels': 768, 'deprecated': False, 'embedder': 'hubert-base-japanese'}
+        """

         # Load only on the first call
-        if self.initialLoad or target_slot_idx == self.currentSlot:
+        if self.initialLoad:
             self.prepareModel(target_slot_idx)
             self.settings.modelSlotIndex = target_slot_idx
-            # self.currentSlot = self.settings.modelSlotIndex
             self.switchModel()
             self.initialLoad = False
+        elif target_slot_idx == self.currentSlot:
+            self.prepareModel(target_slot_idx)
+            self.needSwitch = True

         return self.get_info()
-    # def _getDevice(self):
-    #     if self.settings.gpu < 0 or (self.gpu_num == 0 and self.mps_enabled is False):
-    #         dev = torch.device("cpu")
-    #     elif self.mps_enabled:
-    #         dev = torch.device("mps")
-    #     else:
-    #         dev = torch.device("cuda", index=self.settings.gpu)
-    #     return dev
+    def createPipeline(self, modelSlot: ModelSlot):
+        dev = self.deviceManager.getDevice(self.settings.gpu)
+        half = self.deviceManager.halfPrecisionAvailable(self.settings.gpu)
+
+        # Determine the file name (Inferencer)
+        inferencerFilename = (
+            modelSlot.onnxModelFile if modelSlot.isONNX else modelSlot.pyTorchModelFile
+        )
+
+        # Determine the file name (embedder)
+        if modelSlot.embedder == EnumEmbedderTypes.hubert:
+            emmbedderFilename = self.params.hubert_base
+        elif modelSlot.embedder == EnumEmbedderTypes.contentvec:
+            emmbedderFilename = self.params.content_vec_500
+        elif modelSlot.embedder == EnumEmbedderTypes.hubert_jp:
+            emmbedderFilename = self.params.hubert_base_jp
+        else:
+            raise RuntimeError(
+                "[Voice Changer] Exception loading embedder failed. unknwon type:",
+                modelSlot.embedder,
+            )
+
+        # Create the Inferencer
+        try:
+            inferencer = InferencerManager.getInferencer(
+                modelSlot.modelType,
+                inferencerFilename,
+                half,
+                dev,
+            )
+        except Exception as e:
+            print("[Voice Changer] exception! loading inferencer", e)
+            traceback.print_exc()
+
+        # Create the Embedder
+        try:
+            print("AFASFDAFDAFDASDFASDFSADFASDFA", half, self.settings.gpu)
+            embedder = EmbedderManager.getEmbedder(
+                modelSlot.embedder,
+                emmbedderFilename,
+                half,
+                dev,
+            )
+        except Exception as e:
+            print("[Voice Changer] exception! loading embedder", e)
+            traceback.print_exc()
+
+        return inferencer, embedder
+    def loadIndex(self, modelSlot: ModelSlot):
+        # Load the index
+        print("[Voice Changer] Loading index...")
+        # Return None when no file is specified
+        if modelSlot.featureFile is None or modelSlot.indexFile is None:
+            return None, None
+
+        # Return None when a file is specified but does not exist
+        if (
+            os.path.exists(modelSlot.featureFile) is not True
+            or os.path.exists(modelSlot.indexFile) is not True
+        ):
+            return None, None
+
+        try:
+            index = faiss.read_index(modelSlot.indexFile)
+            feature = np.load(modelSlot.featureFile)
+        except:
+            print("[Voice Changer] load index failed. Use no index.")
+            traceback.print_exc()
+            return None, None
+
+        return index, feature
     def prepareModel(self, slot: int):
         if slot < 0:
             return self.get_info()
         print("[Voice Changer] Prepare Model of slot:", slot)
         modelSlot = self.settings.modelSlots[slot]
-        filename = (
-            modelSlot.onnxModelFile if modelSlot.isONNX else modelSlot.pyTorchModelFile
-        )
-        dev = self.deviceManager.getDevice(self.settings.gpu)

-        # Load the Inferencer
-        inferencer = InferencerManager.getInferencer(
-            modelSlot.modelType,
-            filename,
-            self.settings.isHalf,
-            dev,
-        )
+        # Load the Inferencer and embedder
+        inferencer, embedder = self.createPipeline(modelSlot)
         self.next_inferencer = inferencer
+        self.next_embedder = embedder

         # Load the index
-        print("[Voice Changer] Loading index...")
-        if modelSlot.featureFile is not None and modelSlot.indexFile is not None:
-            if (
-                os.path.exists(modelSlot.featureFile) is True
-                and os.path.exists(modelSlot.indexFile) is True
-            ):
-                try:
-                    self.next_index = faiss.read_index(modelSlot.indexFile)
-                    self.next_feature = np.load(modelSlot.featureFile)
-                except:
-                    print("[Voice Changer] load index failed. Use no index.")
-                    traceback.print_exc()
-                    self.next_index = self.next_feature = None
-            else:
-                print("[Voice Changer] Index file is not found. Use no index.")
-                self.next_index = self.next_feature = None
-        else:
-            self.next_index = self.next_feature = None
+        index, feature = self.loadIndex(modelSlot)
+        self.next_index = index
+        self.next_feature = feature

+        # Other settings
         self.next_trans = modelSlot.defaultTrans
         self.next_samplingRate = modelSlot.samplingRate
-        self.next_embedder = modelSlot.embedder
         self.next_framework = "ONNX" if modelSlot.isONNX else "PyTorch"
+        self.needSwitch = True
         print("[Voice Changer] Prepare done.")
         return self.get_info()
     def switchModel(self):
         print("[Voice Changer] Switching model..")
-        dev = self.deviceManager.getDevice(self.settings.gpu)
-        # The embedder can most likely be reused regardless of the model, so it is fetched here at switch time
-        try:
-            self.embedder = EmbedderManager.getEmbedder(
-                self.next_embedder,
-                self.params.hubert_base,
-                True,
-                dev,
-            )
-        except Exception as e:
-            print("[Voice Changer] load hubert error", e)
-            traceback.print_exc()
+        self.embedder = self.next_embedder
         self.inferencer = self.next_inferencer
         self.feature = self.next_feature
         self.index = self.next_index
         self.settings.tran = self.next_trans
+        self.settings.framework = self.next_framework
         self.settings.modelSamplingRate = self.next_samplingRate
-        self.settings.framework = self.next_framework
-        self.next_net_g = None
-        self.next_onnx_session = None
         print(
             "[Voice Changer] Switching model..done",
         )
     def update_settings(self, key: str, val: int | float | str):
-        # if key == "onnxExecutionProvider" and self.onnx_session is not None:
-        #     if val == "CUDAExecutionProvider":
-        #         if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
-        #             self.settings.gpu = 0
-        #         provider_options = [{"device_id": self.settings.gpu}]
-        #         self.onnx_session.set_providers(
-        #             providers=[val], provider_options=provider_options
-        #         )
-        #         if hasattr(self, "hubert_onnx"):
-        #             self.hubert_onnx.set_providers(
-        #                 providers=[val], provider_options=provider_options
-        #             )
-        #     else:
-        #         self.onnx_session.set_providers(providers=[val])
-        #         if hasattr(self, "hubert_onnx"):
-        #             self.hubert_onnx.set_providers(providers=[val])
-        # elif key == "onnxExecutionProvider" and self.onnx_session is None:
-        #     print("Onnx is not enabled. Please load model.")
-        #     return False
         if key in self.settings.intData:
+            # Preprocess the setting value
             val = cast(int, val)
-            # if (
-            #     key == "gpu"
-            #     and val >= 0
-            #     and val < self.gpu_num
-            #     and self.onnx_session is not None
-            # ):
-            #     providers = self.onnx_session.get_providers()
-            #     print("Providers:", providers)
-            #     if "CUDAExecutionProvider" in providers:
-            #         provider_options = [{"device_id": self.settings.gpu}]
-            #         self.onnx_session.set_providers(
-            #             providers=["CUDAExecutionProvider"],
-            #             provider_options=provider_options,
-            #         )
             if key == "modelSlotIndex":
-                if int(val) < 0:
+                if val < 0:
                     return True
-                # self.switchModel(int(val))
-                val = int(val) % 1000  # Quick hack for same slot is selected
+                val = val % 1000  # Quick hack for same slot is selected
                 self.prepareModel(val)
-                self.currentSlot = -1
-            setattr(self.settings, key, int(val))
+                self.needSwitch = True

+            # Apply the setting
+            setattr(self.settings, key, val)
+
+            if key == "gpu" and self.embedder is not None:
+                dev = self.deviceManager.getDevice(val)
+                half = self.deviceManager.halfPrecisionAvailable(val)
+                # Rebuild the pipeline when half-precision availability changes
+                if (
+                    self.inferencer is not None
+                    and self.inferencer.isHalf == half
+                    and self.embedder.isHalf == half
+                ):
+                    print(
+                        "NOT NEED CHAGE TO NEW PIPELINE!!!!!!!!!!!!!!!!!!!!!!!!!!!",
+                        half,
+                    )
+                    self.embedder.setDevice(dev)
+                    self.inferencer.setDevice(dev)
+                else:
+                    print("CHAGE TO NEW PIPELINE!!!!!!!!!!!!!!!!!!!!!!!!!!!", half)
+                    self.prepareModel(self.settings.modelSlotIndex)
         elif key in self.settings.floatData:
             setattr(self.settings, key, float(val))
         elif key in self.settings.strData:
             setattr(self.settings, key, str(val))
         else:
             return False
         return True
     def get_info(self):
         data = asdict(self.settings)
-        # data["onnxExecutionProviders"] = (
-        #     self.onnx_session.get_providers() if self.onnx_session is not None else []
-        # )
-        files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
-        for f in files:
-            if data[f] is not None and os.path.exists(data[f]):
-                data[f] = os.path.basename(data[f])
-            else:
-                data[f] = ""
         return data

     def get_processing_sampling_rate(self):
@@ -295,118 +298,6 @@ class RVC:
         return (self.audio_buffer, convertSize, vol)
-    def _onnx_inference(self, data):
-        if hasattr(self, "onnx_session") is False or self.onnx_session is None:
-            print("[Voice Changer] No onnx session.")
-            raise NoModeLoadedException("ONNX")
-
-        if self.settings.gpu < 0 or self.gpu_num == 0:
-            dev = torch.device("cpu")
-        else:
-            dev = torch.device("cuda", index=self.settings.gpu)
-
-        # self.hubert_model = self.hubert_model.to(dev)
-        self.embedder = self.embedder.to(dev)
-
-        audio = data[0]
-        convertSize = data[1]
-        vol = data[2]
-
-        audio = resampy.resample(audio, self.settings.modelSamplingRate, 16000)
-
-        if vol < self.settings.silentThreshold:
-            return np.zeros(convertSize).astype(np.int16)
-
-        with torch.no_grad():
-            repeat = 3 if self.is_half else 1
-            repeat *= self.settings.rvcQuality  # 0 or 3
-            vc = VC(
-                self.settings.modelSamplingRate,
-                torch.device("cuda:0"),
-                self.is_half,
-                repeat,
-            )
-            sid = 0
-            f0_up_key = self.settings.tran
-            f0_method = self.settings.f0Detector
-            index_rate = self.settings.indexRatio
-            if_f0 = 1 if self.settings.modelSlots[self.currentSlot].f0 else 0
-            embChannels = self.settings.modelSlots[self.currentSlot].embChannels
-
-            audio_out = vc.pipeline(
-                # self.hubert_model,
-                self.embedder,
-                self.onnx_session,
-                self.pitchExtractor,
-                sid,
-                audio,
-                f0_up_key,
-                f0_method,
-                self.index,
-                self.feature,
-                index_rate,
-                if_f0,
-                silence_front=self.settings.extraConvertSize
-                / self.settings.modelSamplingRate,
-                embChannels=embChannels,
-            )
-
-            result = audio_out * np.sqrt(vol)
-
-        return result
-
-    def _pyTorch_inference(self, data):
-        # if hasattr(self, "net_g") is False or self.net_g is None:
-        #     print(
-        #         "[Voice Changer] No pyTorch session.",
-        #         hasattr(self, "net_g"),
-        #         self.net_g,
-        #     )
-        #     raise NoModeLoadedException("pytorch")
-
-        dev = self.deviceManager.getDevice(self.settings.gpu)
-
-        self.embedder = self.embedder.to(dev)
-        self.inferencer = self.inferencer.to(dev)
-
-        audio = data[0]
-        convertSize = data[1]
-        vol = data[2]
-
-        audio = resampy.resample(audio, self.settings.modelSamplingRate, 16000)
-
-        if vol < self.settings.silentThreshold:
-            return np.zeros(convertSize).astype(np.int16)
-
-        repeat = 3 if self.is_half else 1
-        repeat *= self.settings.rvcQuality  # 0 or 3
-        vc = VC(self.settings.modelSamplingRate, dev, self.is_half, repeat)
-        sid = 0
-        f0_up_key = self.settings.tran
-        f0_method = self.settings.f0Detector
-        index_rate = self.settings.indexRatio
-        if_f0 = 1 if self.settings.modelSlots[self.currentSlot].f0 else 0
-        embChannels = self.settings.modelSlots[self.currentSlot].embChannels
-
-        audio_out = vc.pipeline(
-            self.embedder,
-            self.inferencer,
-            self.pitchExtractor,
-            sid,
-            audio,
-            f0_up_key,
-            f0_method,
-            self.index,
-            self.feature,
-            index_rate,
-            if_f0,
-            silence_front=self.settings.extraConvertSize
-            / self.settings.modelSamplingRate,
-            embChannels=embChannels,
-        )
-
-        result = audio_out * np.sqrt(vol)
-
-        return result
     def inference(self, data):
         if self.settings.modelSlotIndex < 0:
             print(
@@ -415,15 +306,17 @@ class RVC:
                 self.currentSlot,
             )
             raise NoModeLoadedException("model_common")

-        if self.currentSlot != self.settings.modelSlotIndex:
+        if self.needSwitch:
             print(f"Switch model {self.currentSlot} -> {self.settings.modelSlotIndex}")
             self.currentSlot = self.settings.modelSlotIndex
             self.switchModel()
+            self.needSwitch = False

         dev = self.deviceManager.getDevice(self.settings.gpu)
-        self.embedder = self.embedder.to(dev)
-        self.inferencer = self.inferencer.to(dev)
+        half = self.deviceManager.halfPrecisionAvailable(self.settings.gpu)
+        # self.embedder = self.embedder.setDevice(dev)
+        # self.inferencer = self.inferencer.setDevice(dev)

         audio = data[0]
         convertSize = data[1]
@@ -434,16 +327,16 @@ class RVC:
         if vol < self.settings.silentThreshold:
             return np.zeros(convertSize).astype(np.int16)

-        repeat = 3 if self.is_half else 1
+        repeat = 3 if half else 1
         repeat *= self.settings.rvcQuality  # 0 or 3
-        vc = VC(self.settings.modelSamplingRate, dev, self.is_half, repeat)
+        vc = VC(self.settings.modelSamplingRate, dev, half, repeat)
         sid = 0
         f0_up_key = self.settings.tran
-        f0_method = self.settings.f0Detector
         index_rate = self.settings.indexRatio
         if_f0 = 1 if self.settings.modelSlots[self.currentSlot].f0 else 0
         embChannels = self.settings.modelSlots[self.currentSlot].embChannels

         audio_out = vc.pipeline(
             self.embedder,
             self.inferencer,
@@ -451,7 +344,6 @@ class RVC:
             sid,
             audio,
             f0_up_key,
-            f0_method,
             self.index,
             self.feature,
             index_rate,

View File

@@ -15,9 +15,6 @@ class RVCSettings:
     clusterInferRatio: float = 0.1

     framework: str = "PyTorch"  # PyTorch or ONNX
-    pyTorchModelFile: str = ""
-    onnxModelFile: str = ""
-    configFile: str = ""
     modelSlots: list[ModelSlot] = field(
         default_factory=lambda: [ModelSlot(), ModelSlot(), ModelSlot(), ModelSlot()]
     )

View File

@ -1,13 +0,0 @@
import torch
from transformers import HubertModel
from voice_changer.utils.VoiceChangerModel import AudioInOut
class RinnaHubertBase:
def __init__(self):
model = HubertModel.from_pretrained("rinna/japanese-hubert-base")
model.eval()
self.model = model
def extract(self, audio: AudioInOut):
return self.model(audio)

View File

@@ -3,6 +3,7 @@ import numpy as np

 # import parselmouth
 import torch
 import torch.nn.functional as F
+from Exceptions import HalfPrecisionChangingException

 from voice_changer.RVC.embedder.Embedder import Embedder
 from voice_changer.RVC.inferencer.Inferencer import Inferencer
@@ -26,7 +27,6 @@ class VC(object):
         sid,
         audio,
         f0_up_key,
-        f0_method,
         index,
         big_npy,
         index_rate,
@@ -68,7 +68,13 @@ class VC(object):

         # embedding
         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
-        feats = embedder.extractFeatures(feats, embChannels)
+        try:
+            feats = embedder.extractFeatures(feats, embChannels)
+        except RuntimeError as e:
+            if "HALF" in e.__str__().upper():
+                raise HalfPrecisionChangingException()
+            else:
+                raise e

         # Index - feature extraction
         if (
@@ -103,34 +109,46 @@ class VC(object):

         # Run inference
         with torch.no_grad():
-            if pitch is not None:
-                audio1 = (
-                    (
-                        inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
-                        * 32768
-                    )
-                    .data.cpu()
-                    .float()
-                    .numpy()
-                    .astype(np.int16)
-                )
-            else:
-                if hasattr(inferencer, "infer_pitchless"):
-                    audio1 = (
-                        (inferencer.infer_pitchless(feats, p_len, sid)[0][0, 0] * 32768)
-                        .data.cpu()
-                        .float()
-                        .numpy()
-                        .astype(np.int16)
-                    )
-                else:
-                    audio1 = (
-                        (inferencer.infer(feats, p_len, sid)[0][0, 0] * 32768)
-                        .data.cpu()
-                        .float()
-                        .numpy()
-                        .astype(np.int16)
-                    )
+            audio1 = (
+                (inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768)
+                .data.cpu()
+                .float()
+                .numpy()
+                .astype(np.int16)
+            )
+
+            # if pitch is not None:
+            #     print("INFERENCE 1 ")
+            #     audio1 = (
+            #         (
+            #             inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
+            #             * 32768
+            #         )
+            #         .data.cpu()
+            #         .float()
+            #         .numpy()
+            #         .astype(np.int16)
+            #     )
+            # else:
+            #     if hasattr(inferencer, "infer_pitchless"):
+            #         print("INFERENCE 2 ")
+            #         audio1 = (
+            #             (inferencer.infer_pitchless(feats, p_len, sid)[0][0, 0] * 32768)
+            #             .data.cpu()
+            #             .float()
+            #             .numpy()
+            #             .astype(np.int16)
+            #         )
+            #     else:
+            #         print("INFERENCE 3 ")
+            #         audio1 = (
+            #             (inferencer.infer(feats, p_len, sid)[0][0, 0] * 32768)
+            #             .data.cpu()
+            #             .float()
+            #             .numpy()
+            #             .astype(np.int16)
+            #         )
         del feats, p_len, padding_mask
         torch.cuda.empty_cache()

View File

@@ -29,6 +29,9 @@ class DeviceManager(object):
     def halfPrecisionAvailable(self, id: int):
         if self.gpu_num == 0:
             return False
+        if id < 0:
+            return False

         gpuName = torch.cuda.get_device_name(id).upper()
         # original: https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/config.py
@@ -39,3 +42,5 @@ class DeviceManager(object):
             or "1080" in gpuName
         ):
             return False
+
+        return True
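Note (not part of the diff): together these two hunks make the half-precision gate short-circuit for CPU selection (id < 0) as well as for machines without CUDA. A standalone sketch of the resulting decision, under the assumption that the elided middle of the check is the GPU-name blacklist referenced in the comment above (only the "1080" clause is visible in this hunk):

import torch

def half_precision_available(gpu_id: int) -> bool:
    if torch.cuda.device_count() == 0:  # no CUDA device at all
        return False
    if gpu_id < 0:  # new in this commit: CPU selection disables fp16
        return False
    gpu_name = torch.cuda.get_device_name(gpu_id).upper()
    # name-based blacklist from the referenced RVC WebUI config; only the "1080"
    # case is visible in this hunk, the rest is elided here
    if "1080" in gpu_name:
        return False
    return True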

View File

@@ -36,11 +36,14 @@ class Embedder(Protocol):
         self.isHalf = isHalf
         if self.model is not None and isHalf:
             self.model = self.model.half()
+        elif self.model is not None and isHalf is False:
+            self.model = self.model.float()

     def setDevice(self, dev: device):
         self.dev = dev
         if self.model is not None:
             self.model = self.model.to(self.dev)
+        return self

     def matchCondition(self, embedderType: EnumEmbedderTypes, file: str) -> bool:
         # Check Type
@@ -63,11 +66,3 @@ class Embedder(Protocol):
         else:
             return True
-
-    def to(self, dev: torch.device):
-        if self.model is not None:
-            self.model = self.model.to(dev)
-        return self
-
-    def printDevice(self):
-        print("embedder device:", self.model.device)

View File

@@ -23,6 +23,8 @@ class EmbedderManager:
         else:
             cls.currentEmbedder.setDevice(dev)
             cls.currentEmbedder.setHalf(isHalf)
+            # print("[Voice Changer] generate new embedder. (ANYWAY)", isHalf)
+            # cls.currentEmbedder = cls.loadEmbedder(embederType, file, isHalf, dev)
         return cls.currentEmbedder

     @classmethod

View File

@@ -4,6 +4,7 @@ import torch
 from torch import device

 from const import EnumInferenceTypes
+import onnxruntime


 class Inferencer(Protocol):
@@ -12,7 +13,7 @@ class Inferencer(Protocol):
     isHalf: bool = True
     dev: device
-    model: Any | None = None
+    model: onnxruntime.InferenceSession | Any | None = None

     def loadModel(self, file: str, dev: device, isHalf: bool = True):
         ...
@@ -43,16 +44,11 @@ class Inferencer(Protocol):
         self.isHalf = isHalf
         if self.model is not None and isHalf:
             self.model = self.model.half()
+        elif self.model is not None and isHalf is False:
+            self.model = self.model.float()

     def setDevice(self, dev: device):
         self.dev = dev
         if self.model is not None:
             self.model = self.model.to(self.dev)
+        return self
-
-    def to(self, dev: torch.device):
-        if self.model is not None:
-            self.model = self.model.to(dev)
-        return self
-
-    def printDevice(self):
-        print("inferencer device:", self.model.device)

View File

@@ -2,8 +2,8 @@ from torch import device

 from const import EnumInferenceTypes
 from voice_changer.RVC.inferencer.Inferencer import Inferencer
-from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInference
-from voice_changer.RVC.inferencer.OnnxRVCInferencerNono import OnnxRVCInferenceNono
+from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer
+from voice_changer.RVC.inferencer.OnnxRVCInferencerNono import OnnxRVCInferencerNono
 from voice_changer.RVC.inferencer.RVCInferencer import RVCInferencer
 from voice_changer.RVC.inferencer.RVCInferencerNono import RVCInferencerNono
 from voice_changer.RVC.inferencer.WebUIInferencer import WebUIInferencer
@@ -48,11 +48,11 @@ class InferencerManager:
             inferencerType == EnumInferenceTypes.onnxRVC
             or inferencerType == EnumInferenceTypes.onnxRVC.value
         ):
-            return OnnxRVCInference().loadModel(file, dev, isHalf)
+            return OnnxRVCInferencer().loadModel(file, dev, isHalf)
         elif (
             inferencerType == EnumInferenceTypes.onnxRVCNono
             or inferencerType == EnumInferenceTypes.onnxRVCNono.value
         ):
-            return OnnxRVCInferenceNono().loadModel(file, dev, isHalf)
+            return OnnxRVCInferencerNono().loadModel(file, dev, isHalf)
         else:
             raise RuntimeError("[Voice Changer] Inferencer not found", inferencerType)

View File

@@ -8,18 +8,16 @@ import numpy as np

 providers = ["CPUExecutionProvider"]


-class OnnxRVCInference(Inferencer):
+class OnnxRVCInferencer(Inferencer):
     def loadModel(self, file: str, dev: device, isHalf: bool = True):
         super().setProps(EnumInferenceTypes.onnxRVC, file, dev, isHalf)
         # ort_options = onnxruntime.SessionOptions()
         # ort_options.intra_op_num_threads = 8
-        onnx_session = onnxruntime.InferenceSession(
-            self.onnx_model, providers=providers
-        )
+        onnx_session = onnxruntime.InferenceSession(file, providers=providers)

         # check half-precision
-        first_input_type = self.onnx_session.get_inputs()[0].type
+        first_input_type = onnx_session.get_inputs()[0].type
         if first_input_type == "tensor(float)":
             self.isHalf = False
         else:
@@ -32,13 +30,16 @@ class OnnxRVCInference(Inferencer):
         self,
         feats: torch.Tensor,
         pitch_length: torch.Tensor,
-        pitch: torch.Tensor | None,
-        pitchf: torch.Tensor | None,
+        pitch: torch.Tensor,
+        pitchf: torch.Tensor,
         sid: torch.Tensor,
     ) -> torch.Tensor:
         if pitch is None or pitchf is None:
             raise RuntimeError("[Voice Changer] Pitch or Pitchf is not found.")

+        print("INFER1", self.model.get_providers())
+        print("INFER2", self.model.get_provider_options())
+        print("INFER3", self.model.get_session_options())
         if self.isHalf:
             audio1 = self.model.run(
                 ["audio"],
@@ -65,14 +66,22 @@ class OnnxRVCInference(Inferencer):
         return torch.tensor(np.array(audio1))

     def setHalf(self, isHalf: bool):
-        raise RuntimeError("half-precision is not changable.", self.isHalf)
+        self.isHalf = isHalf
+        pass
+        # raise RuntimeError("half-precision is not changable.", self.isHalf)

     def setDevice(self, dev: device):
-        self.dev = dev
-        if self.model is not None:
-            self.model = self.model.to(self.dev)
-
-    def to(self, dev: torch.device):
-        if self.model is not None:
-            self.model = self.model.to(dev)
+        index = dev.index
+        type = dev.type
+        if type == "cpu":
+            self.model.set_providers(providers=["CPUExecutionProvider"])
+        elif type == "cuda":
+            provider_options = [{"device_id": index}]
+            self.model.set_providers(
+                providers=["CUDAExecutionProvider"],
+                provider_options=provider_options,
+            )
+        else:
+            self.model.set_providers(providers=["CPUExecutionProvider"])
         return self
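Note (not part of the diff): the setDevice rewrite above reflects that an onnxruntime.InferenceSession cannot be moved with a torch-style .to(dev); instead its execution providers are re-set. A standalone sketch of the same idea, where move_session is a hypothetical helper rather than part of the commit:

import onnxruntime

def move_session(session: onnxruntime.InferenceSession, dev_type: str, index: int | None = None):
    # Re-bind an existing session to CPU or to a specific CUDA device by swapping
    # providers, mirroring the provider_options=[{"device_id": index}] pattern above.
    if dev_type == "cuda" and index is not None:
        session.set_providers(
            providers=["CUDAExecutionProvider"],
            provider_options=[{"device_id": index}],
        )
    else:
        session.set_providers(providers=["CPUExecutionProvider"])
    return session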

View File

@@ -2,13 +2,14 @@ import torch
 from torch import device

 import onnxruntime
 from const import EnumInferenceTypes
-from voice_changer.RVC.inferencer.Inferencer import Inferencer
 import numpy as np
+from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer

 providers = ["CPUExecutionProvider"]


-class OnnxRVCInferenceNono(Inferencer):
+class OnnxRVCInferencerNono(OnnxRVCInferencer):
     def loadModel(self, file: str, dev: device, isHalf: bool = True):
         super().setProps(EnumInferenceTypes.onnxRVC, file, dev, isHalf)
         # ort_options = onnxruntime.SessionOptions()
@@ -56,16 +57,3 @@ class OnnxRVCInferenceNono(Inferencer):
         )

         return torch.tensor(np.array(audio1))
-
-    def setHalf(self, isHalf: bool):
-        raise RuntimeError("half-precision is not changable.", self.isHalf)
-
-    def setDevice(self, dev: device):
-        self.dev = dev
-        if self.model is not None:
-            self.model = self.model.to(self.dev)
-
-    def to(self, dev: torch.device):
-        if self.model is not None:
-            self.model = self.model.to(dev)
-        return self

View File

@@ -16,6 +16,8 @@ class RVCInferencer(Inferencer):
         model.eval()
         model.load_state_dict(cpt["weight"], strict=False)

+        model = model.to(dev)
         if isHalf:
             model = model.half()
@@ -26,8 +28,8 @@ class RVCInferencer(Inferencer):
         self,
         feats: torch.Tensor,
         pitch_length: torch.Tensor,
-        pitch: torch.Tensor | None,
-        pitchf: torch.Tensor | None,
+        pitch: torch.Tensor,
+        pitchf: torch.Tensor,
         sid: torch.Tensor,
     ) -> torch.Tensor:
         return self.model.infer(feats, pitch_length, pitch, pitchf, sid)

View File

@@ -16,6 +16,8 @@ class RVCInferencerNono(Inferencer):
         model.eval()
         model.load_state_dict(cpt["weight"], strict=False)

+        model = model.to(dev)
         if isHalf:
             model = model.half()

View File

@@ -14,6 +14,8 @@ class WebUIInferencer(Inferencer):
         model.eval()
         model.load_state_dict(cpt["weight"], strict=False)

+        model = model.to(dev)
         if isHalf:
             model = model.half()
@@ -24,8 +26,8 @@ class WebUIInferencer(Inferencer):
         self,
         feats: torch.Tensor,
         pitch_length: torch.Tensor,
-        pitch: torch.Tensor | None,
-        pitchf: torch.Tensor | None,
+        pitch: torch.Tensor,
+        pitchf: torch.Tensor,
         sid: torch.Tensor,
     ) -> torch.Tensor:
         return self.model.infer(feats, pitch_length, pitch, pitchf, sid)

View File

@@ -14,6 +14,8 @@ class WebUIInferencerNono(Inferencer):
         model.eval()
         model.load_state_dict(cpt["weight"], strict=False)

+        model = model.to(dev)
         if isHalf:
             model = model.half()

View File

@@ -13,7 +13,11 @@ from voice_changer.utils.LoadModelParams import LoadModelParams
 from voice_changer.utils.Timer import Timer
 from voice_changer.utils.VoiceChangerModel import VoiceChangerModel, AudioInOut
-from Exceptions import NoModeLoadedException, ONNXInputArgumentException
+from Exceptions import (
+    HalfPrecisionChangingException,
+    NoModeLoadedException,
+    ONNXInputArgumentException,
+)
 from voice_changer.utils.VoiceChangerParams import VoiceChangerParams

 providers = [
@@ -341,6 +345,9 @@ class VoiceChanger:
         except ONNXInputArgumentException as e:
             print("[Voice Changer] [Exception]", e)
             return np.zeros(1).astype(np.int16), [0, 0, 0]
+        except HalfPrecisionChangingException as e:
+            print("[Voice Changer] Switching model configuration....", e)
+            return np.zeros(1).astype(np.int16), [0, 0, 0]
         except Exception as e:
             print("VC PROCESSING!!!! EXCEPTION!!!", e)
             print(traceback.format_exc())