WIP: Volume Extractor torch

w-okada 2023-07-15 04:45:27 +09:00
parent a1db94c0af
commit a69c89255b
8 changed files with 472 additions and 77 deletions

View File

@@ -1,9 +1,5 @@
-# import sys
-# import os
from dataclasses import asdict
import numpy as np
-import torch
-import torchaudio
from data.ModelSlot import DiffusionSVCModelSlot
from voice_changer.DiffusionSVC.DiffusionSVCSettings import DiffusionSVCSettings
from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
@@ -13,7 +9,7 @@ from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import Pitc
from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
-from voice_changer.RVC.onnxExporter.export2onnx import export2onnx
+# from voice_changer.RVC.onnxExporter.export2onnx import export2onnx
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
from Exceptions import DeviceCannotSupportHalfPrecisionException
@@ -36,13 +32,12 @@ class DiffusionSVC(VoiceChangerModel):
        self.feature_buffer: FeatureInOut | None = None
        self.prevVol = 0.0
        self.slotInfo = slotInfo
-        self.initialize()

    def initialize(self):
        print("[Voice Changer] [DiffusionSVC] Initializing... ")

        # create the pipeline
-        self.pipeline = createPipeline(self.slotInfo, self.settings.gpu, self.settings.f0Detector)
+        self.pipeline = createPipeline(self.slotInfo, self.settings.gpu, self.settings.f0Detector, self.inputSampleRate, self.outputSampleRate)

        # other settings
        self.settings.tran = self.slotInfo.defaultTune
@@ -51,6 +46,11 @@ class DiffusionSVC(VoiceChangerModel):
        print("[Voice Changer] [DiffusionSVC] Initializing... done")

+    def setSamplingRate(self, inputSampleRate, outputSampleRate):
+        self.inputSampleRate = inputSampleRate
+        self.outputSampleRate = outputSampleRate
+        self.initialize()
+
    def update_settings(self, key: str, val: int | float | str):
        print("[Voice Changer][DiffusionSVC]: update_settings", key, val)
        if key in self.settings.intData:
@@ -82,7 +82,6 @@ class DiffusionSVC(VoiceChangerModel):
    def generate_input(
        self,
        newData: AudioInOut,
-        inputSize: int,
        crossfadeSize: int,
        solaSearchFrame: int = 0,
    ):
@@ -99,11 +98,10 @@
            self.pitchf_buffer = np.zeros(new_feature_length)
            self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels])

-        convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
+        convertSize = newData.shape[0] + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
        if convertSize % 128 != 0:  # Round up to compensate for truncation at the model's output hop size.
            convertSize = convertSize + (128 - (convertSize % 128))
-        outSize = convertSize - self.settings.extraConvertSize

        # Pad with zeros when the buffer has not filled up yet.
        if self.audio_buffer.shape[0] < convertSize:
@@ -118,37 +116,39 @@
            self.feature_buffer = self.feature_buffer[featureOffset:]

        # Crop only the output portion and check its volume. (TODO: make the muting gradual)
-        cropOffset = -1 * (inputSize + crossfadeSize)
+        cropOffset = -1 * (newData.shape[0] + crossfadeSize)
        cropEnd = -1 * (crossfadeSize)
        crop = self.audio_buffer[cropOffset:cropEnd]
        vol = np.sqrt(np.square(crop).mean())
-        vol = max(vol, self.prevVol * 0.0)
+        vol = float(max(vol, self.prevVol * 0.0))
        self.prevVol = vol

-        return (self.audio_buffer, self.pitchf_buffer, self.feature_buffer, convertSize, vol, outSize)
+        return (self.audio_buffer, self.pitchf_buffer, self.feature_buffer, convertSize, vol)

-    def inference(self, data):
-        audio = data[0]
-        pitchf = data[1]
-        feature = data[2]
-        convertSize = data[3]
-        vol = data[4]
+    def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int):
+        data = self.generate_input(receivedData, crossfade_frame, sola_search_frame)
+        audio: AudioInOut = data[0]
+        pitchf: PitchfInOut = data[1]
+        feature: FeatureInOut = data[2]
+        convertSize: int = data[3]
+        vol: float = data[4]

        if vol < self.settings.silentThreshold:
            return np.zeros(convertSize).astype(np.int16) * np.sqrt(vol)

-        if self.pipeline is not None:
-            device = self.pipeline.device
-        else:
-            device = torch.device("cpu")  # TODO: it might be fine to just return zeros when no pipeline exists (to be confirmed).
-        audio = torch.from_numpy(audio).to(device=device, dtype=torch.float32)
-        audio = torchaudio.functional.resample(audio, self.slotInfo.samplingRate, 16000, rolloff=0.99)
+        if self.pipeline is None:
+            return np.zeros(convertSize).astype(np.int16) * np.sqrt(vol)
+
+        # device = self.pipeline.device
+        # audio = torch.from_numpy(audio).to(device=device, dtype=torch.float32)
+        # audio = self.resampler16K(audio)
        sid = self.settings.dstId
        f0_up_key = self.settings.tran
        protect = 0

        embOutputLayer = 12
        useFinalProj = False
+        silenceFrontSec = self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0.  # Duration in seconds of extraConvertSize (already resampled to the model's sampling rate); processed at the model's sampling rate (★1).

        try:
            audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
@@ -157,12 +157,11 @@ class DiffusionSVC(VoiceChangerModel):
                pitchf,
                feature,
                f0_up_key,
-                self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0.,  # Duration in seconds of extraConvertSize (already resampled to the model's sampling rate); processed at the model's sampling rate (★1).
+                silenceFrontSec,
                embOutputLayer,
                useFinalProj,
                protect
            )
-            # result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
            result = audio_out.detach().cpu().numpy()
            return result
@@ -173,36 +172,36 @@ class DiffusionSVC(VoiceChangerModel):
            # raise e
            return

    def __del__(self):
        del self.pipeline

-    def export2onnx(self):
-        modelSlot = self.slotInfo
-
-        if modelSlot.isONNX:
-            print("[Voice Changer] export2onnx, No pyTorch filepath.")
-            return {"status": "ng", "path": ""}
-
-        output_file_simple = export2onnx(self.settings.gpu, modelSlot)
-        return {
-            "status": "ok",
-            "path": f"/tmp/{output_file_simple}",
-            "filename": output_file_simple,
-        }
-
-    def get_model_current(self):
-        return [
-            {
-                "key": "defaultTune",
-                "val": self.settings.tran,
-            },
-            {
-                "key": "defaultIndexRatio",
-                "val": self.settings.indexRatio,
-            },
-            {
-                "key": "defaultProtect",
-                "val": self.settings.protect,
-            },
-        ]
+    # def export2onnx(self):
+    #     modelSlot = self.slotInfo
+
+    #     if modelSlot.isONNX:
+    #         print("[Voice Changer] export2onnx, No pyTorch filepath.")
+    #         return {"status": "ng", "path": ""}
+
+    #     output_file_simple = export2onnx(self.settings.gpu, modelSlot)
+    #     return {
+    #         "status": "ok",
+    #         "path": f"/tmp/{output_file_simple}",
+    #         "filename": output_file_simple,
+    #     }

+    # def get_model_current(self):
+    #     return [
+    #         {
+    #             "key": "defaultTune",
+    #             "val": self.settings.tran,
+    #         },
+    #         {
+    #             "key": "defaultIndexRatio",
+    #             "val": self.settings.indexRatio,
+    #         },
+    #         {
+    #             "key": "defaultProtect",
+    #             "val": self.settings.protect,
+    #         },
+    #     ]
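
As an aside on the hop-size rounding kept in generate_input above, here is a minimal sketch of the arithmetic. The 128-sample hop comes from the diff; the block, crossfade, SOLA and extra sizes below are illustrative, not taken from the code:

def round_up_to_hop(convert_size: int, hop: int = 128) -> int:
    # Same rule as generate_input: pad convertSize up to the next multiple of the hop size.
    if convert_size % hop != 0:
        convert_size += hop - (convert_size % hop)
    return convert_size

print(round_up_to_hop(4096 + 4096 + 576 + 8192))  # 16960 -> 17024 (= 133 * 128)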

View File

@@ -15,6 +15,9 @@ from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtrac
from voice_changer.RVC.embedder.Embedder import Embedder
from voice_changer.common.VolumeExtractor import VolumeExtractor
+from torchaudio.transforms import Resample
+
+from voice_changer.utils.Timer import Timer


class Pipeline(object):
@@ -39,16 +42,22 @@
        targetSR,
        device,
        isHalf,
+        resamplerIn: Resample,
+        resamplerOut: Resample
    ):
        self.inferencer = inferencer
        inferencer_block_size, inferencer_sampling_rate = inferencer.getConfig()
-        self.hop_size = inferencer_block_size * 16000 / inferencer_sampling_rate  # 16000 is the audio sample rate; the audio is already 16 kHz at this point.
+        self.hop_size = inferencer_block_size * 16000 / inferencer_sampling_rate  # 16000 is the audio sample rate; processing is done at 16 kHz.
        self.inferencer_block_size = inferencer_block_size
        self.inferencer_sampling_rate = inferencer_sampling_rate

        self.volumeExtractor = VolumeExtractor(self.hop_size)
        self.embedder = embedder
        self.pitchExtractor = pitchExtractor
+        self.resamplerIn = resamplerIn
+        self.resamplerOut = resamplerOut

        # self.f0ex = self.load_f0_extractor(f0_model="harvest", f0_min=50, f0_max=1100)
        print("VOLUME EXTRACTOR", self.volumeExtractor)
@@ -83,10 +92,28 @@
        self.pitchExtractor = pitchExtractor

    @torch.no_grad()
-    def extract_volume_and_mask(self, audio, threhold):
-        volume = self.volumeExtractor.extract(audio)
-        mask = self.volumeExtractor.get_mask_from_volume(volume, self.inferencer_block_size, threhold=threhold, device=self.device)
-        volume = torch.from_numpy(volume).float().to(self.device).unsqueeze(-1).unsqueeze(0)
+    def extract_volume_and_mask(self, audio: torch.Tensor, threshold: float):
+        '''
+        with Timer("[VolumeExt np]") as t:
+            for i in range(100):
+                volume = self.volumeExtractor.extract(audio)
+        time_np = t.secs
+        with Timer("[VolumeExt pt]") as t:
+            for i in range(100):
+                volume_t = self.volumeExtractor.extract_t(audio)
+        time_pt = t.secs
+        print("[Volume np]:", volume)
+        print("[Volume pt]:", volume_t)
+        print("[Perform]:", time_np, time_pt)
+        # -> [Perform]: 0.030178070068359375 0.005780220031738281 (RTX4090)
+        # -> [Perform]: 0.029046058654785156 0.0025115013122558594 (CPU i9 13900KF)
+        # ---> For a workload this small, Torch on the CPU is faster.
+        '''
+        # volume_t = self.volumeExtractor.extract_t(audio)
+        volume_t = self.volumeExtractor.extract_t(audio)
+        mask = self.volumeExtractor.get_mask_from_volume_t(volume_t, self.inferencer_block_size, threshold=threshold)
+        volume = volume_t.unsqueeze(-1).unsqueeze(0)
        return volume, mask

    def exec(
@@ -101,24 +128,20 @@
        useFinalProj,
        protect=0.5
    ):
-        # The audio comes in at a 16000 Hz sampling rate; everything from here on is processed at 16000 Hz.
-        audio = audio.unsqueeze(0)
-        self.t_pad = 0
-        audio_pad = F.pad(audio, (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
+        audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
+        audio16k = self.resamplerIn(audio_t)
+        volume, mask = self.extract_volume_and_mask(audio16k, threshold=-60.0)
        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
-        n_frames = int(audio_pad.size(-1) // self.hop_size + 1)
-        volume, mask = self.extract_volume_and_mask(audio, threhold=-60.0)
+        n_frames = int(audio16k.size(-1) // self.hop_size + 1)

        # pitch detection
        try:
            # print("[SRC AUDIO----]", audio_pad)
            pitch, pitchf = self.pitchExtractor.extract(
-                audio_pad,
+                audio16k.squeeze(),
                pitchf,
                f0_up_key,
                16000,  # audio sampling rate (already 16000)
+                # int(self.hop_size),  # processing window size (512 at 44100)
                int(self.hop_size),  # processing window size (512 at 44100)
                silence_front=silence_front,
            )
@@ -128,15 +151,19 @@
        except IndexError as e:  # NOQA
            # print(e)
            raise NotEnoughDataExtimateF0()
+        print("[EMBEDDER EXTRACT:audio:4:]", audio_t.shape)

        # f0 = self.f0ex.extract_f0(audio_pad, key=4, sr=44100)
        # print("[Pitch_f0]", f0)

        # tensor type adjustment
-        feats = audio_pad
+        feats = audio16k.squeeze()
        if feats.dim() == 2:  # double channels
            feats = feats.mean(-1)
        feats = feats.view(1, -1)
+        print("[EMBEDDER EXTRACT:audio:5:]", audio_t.shape)
+        print("[EMBEDDER EXTRACT:::]", feats.shape)

        # embedding
        with autocast(enabled=self.isHalf):
@@ -190,6 +217,7 @@
        try:
            with torch.no_grad():
                with autocast(enabled=self.isHalf):
+                    print("[EMBEDDER EXTRACT:::]", feats.shape, pitchf.unsqueeze(-1).shape, volume.shape, mask.shape)
                    audio1 = (
                        torch.clip(
                            self.inferencer.infer(
@@ -222,5 +250,5 @@
        del pitch, pitchf, feats, sid
        torch.cuda.empty_cache()

+        audio1 = self.resamplerOut(audio1.float())
        return audio1, pitchf_buffer, feats_buffer
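
To make the new resampling flow concrete, here is a minimal sketch assuming torchaudio is available. The 48000 and 44100 rates are illustrative; only the 16 kHz feature rate is fixed by the code above, and the commit itself builds its resamplers with dtype=torch.int16 in createPipeline:

import torch
from torchaudio.transforms import Resample

input_sr, model_sr, output_sr = 48000, 44100, 48000
resampler_in = Resample(input_sr, 16000, dtype=torch.float32)
resampler_out = Resample(model_sr, output_sr, dtype=torch.float32)

audio = torch.randn(input_sr)                 # one second of client audio at input_sr
audio16k = resampler_in(audio)                # pitch, embedding and volume are computed at 16 kHz
model_out = torch.randn(model_sr)             # stand-in for the inferencer output at model_sr
audio_out = resampler_out(model_out.float())  # back to the client's output rate, as in exec() above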

View File

@@ -7,8 +7,11 @@ from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import Pitc
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
+import torch
+from torchaudio.transforms import Resample


-def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str):
+def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str, inputSampleRate: int, outputSampleRate: int):
    dev = DeviceManager.get_instance().getDevice(gpu)
    # half = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
    half = False
@@ -35,6 +38,9 @@ def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str):
    # pitchExtractor
    pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector, gpu)

+    resamplerIn = Resample(inputSampleRate, 16000, dtype=torch.int16).to(dev)
+    resamplerOut = Resample(modelSlot.samplingRate, outputSampleRate, dtype=torch.int16).to(dev)
+
    pipeline = Pipeline(
        embedder,
        inferencer,
@@ -42,6 +48,8 @@ def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str):
        modelSlot.samplingRate,
        dev,
        half,
+        resamplerIn,
+        resamplerOut
    )
    return pipeline

View File

@@ -45,6 +45,7 @@ class VoiceChangerSettings:
    intData: list[str] = field(
        default_factory=lambda: [
            "inputSampleRate",
+            "outputSampleRate",
            "crossFadeOverlapSize",
            "recordIO",
        ]

View File

@@ -10,6 +10,7 @@ from voice_changer.ModelSlotManager import ModelSlotManager
from voice_changer.RVC.RVCModelMerger import RVCModelMerger
from voice_changer.VoiceChanger import VoiceChanger
from const import STORED_SETTING_FILE, UPLOAD_DIR
+from voice_changer.VoiceChangerV2 import VoiceChangerV2
from voice_changer.utils.LoadModelParams import LoadModelParamFile, LoadModelParams
from voice_changer.utils.ModelMerger import MergeElement, ModelMergerRequest
from voice_changer.utils.VoiceChangerModel import AudioInOut
@@ -242,7 +243,7 @@ class VoiceChangerManager(ServerDeviceCallbacks):
            from voice_changer.DiffusionSVC.DiffusionSVC import DiffusionSVC

            self.voiceChangerModel = DiffusionSVC(self.params, slotInfo)
-            self.voiceChanger = VoiceChanger(self.params)
+            self.voiceChanger = VoiceChangerV2(self.params)
            self.voiceChanger.setModel(self.voiceChangerModel)
        else:
            print(f"[Voice Changer] unknown voice changer model: {slotInfo.voiceChangerType}")

View File

@@ -0,0 +1,331 @@
'''
VoiceChangerV2
- Differences from VoiceChanger
    Resampling is delegated to the VoiceChangerModel to avoid redundant resampling.
    The split between preprocessing and the main process is removed (to avoid unnecessary type conversions in the VoiceChangerModel).
- Applicable VoiceChangerModel
    DiffusionSVC
'''
from typing import Any, Union

from const import TMP_DIR
import torch
import os
import traceback
import numpy as np
from dataclasses import dataclass, asdict, field
import onnxruntime

from voice_changer.IORecorder import IORecorder

from voice_changer.utils.Timer import Timer
from voice_changer.utils.VoiceChangerModel import AudioInOut, VoiceChangerModel
from Exceptions import (
    DeviceCannotSupportHalfPrecisionException,
    DeviceChangingException,
    HalfPrecisionChangingException,
    NoModeLoadedException,
    NotEnoughDataExtimateF0,
    ONNXInputArgumentException,
    VoiceChangerIsNotSelectedException,
)
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams

STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav")
STREAM_OUTPUT_FILE = os.path.join(TMP_DIR, "out.wav")


@dataclass
class VoiceChangerV2Settings:
    inputSampleRate: int = 48000  # 48000 or 24000
    outputSampleRate: int = 48000  # 48000 or 24000

    crossFadeOffsetRate: float = 0.1
    crossFadeEndRate: float = 0.9
    crossFadeOverlapSize: int = 4096

    recordIO: int = 0  # 0:off, 1:on

    performance: list[int] = field(default_factory=lambda: [0, 0, 0, 0])

    # Only mutable fields are listed below.
    intData: list[str] = field(
        default_factory=lambda: [
            "inputSampleRate",
            "outputSampleRate",
            "crossFadeOverlapSize",
            "recordIO",
        ]
    )
    floatData: list[str] = field(
        default_factory=lambda: [
            "crossFadeOffsetRate",
            "crossFadeEndRate",
        ]
    )
    strData: list[str] = field(default_factory=lambda: [])


class VoiceChangerV2:
    ioRecorder: IORecorder
    sola_buffer: AudioInOut

    def __init__(self, params: VoiceChangerParams):
        # initialization
        self.settings = VoiceChangerV2Settings()

        self.currentCrossFadeOffsetRate = 0.0
        self.currentCrossFadeEndRate = 0.0
        self.currentCrossFadeOverlapSize = 0  # setting
        self.crossfadeSize = 0  # calculated

        self.voiceChanger: VoiceChangerModel | None = None
        self.params = params
        self.gpu_num = torch.cuda.device_count()
        self.prev_audio = np.zeros(4096)
        self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
        self.onnx_device = onnxruntime.get_device()

        print(f"VoiceChangerV2 Initialized (GPU_NUM(cuda):{self.gpu_num}, mps_enabled:{self.mps_enabled}, onnx_device:{self.onnx_device})")

    def setModel(self, model: VoiceChangerModel):
        self.voiceChanger = model
        self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)

    def get_info(self):
        data = asdict(self.settings)
        if self.voiceChanger is not None:
            data.update(self.voiceChanger.get_info())
        return data

    def get_performance(self):
        return self.settings.performance

    def update_settings(self, key: str, val: Any):
        if self.voiceChanger is None:
            print("[Voice Changer] Voice Changer is not selected.")
            return self.get_info()

        if key == "serverAudioStated" and val == 0:
            self.settings.inputSampleRate = 48000
            self.settings.outputSampleRate = 48000
            self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)

        if key in self.settings.intData:
            setattr(self.settings, key, int(val))
            if key == "crossFadeOffsetRate" or key == "crossFadeEndRate":
                self.crossfadeSize = 0
            if key == "recordIO" and val == 1:
                if hasattr(self, "ioRecorder"):
                    self.ioRecorder.close()
                self.ioRecorder = IORecorder(STREAM_INPUT_FILE, STREAM_OUTPUT_FILE, self.settings.inputSampleRate, self.settings.outputSampleRate)
            if key == "recordIO" and val == 0:
                if hasattr(self, "ioRecorder"):
                    self.ioRecorder.close()
                pass
            if key == "recordIO" and val == 2:
                if hasattr(self, "ioRecorder"):
                    self.ioRecorder.close()
            if key == "inputSampleRate" or key == "outputSampleRate":
                self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
        elif key in self.settings.floatData:
            setattr(self.settings, key, float(val))
        elif key in self.settings.strData:
            setattr(self.settings, key, str(val))
        else:
            ret = self.voiceChanger.update_settings(key, val)
            if ret is False:
                pass
                # print(f"({key} is not mutable variable or unknown variable)")

        return self.get_info()

    def _generate_strength(self, crossfadeSize: int):
        if self.crossfadeSize != crossfadeSize or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize:
            self.crossfadeSize = crossfadeSize
            self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate
            self.currentCrossFadeEndRate = self.settings.crossFadeEndRate
            self.currentCrossFadeOverlapSize = self.settings.crossFadeOverlapSize

            cf_offset = int(crossfadeSize * self.settings.crossFadeOffsetRate)
            cf_end = int(crossfadeSize * self.settings.crossFadeEndRate)
            cf_range = cf_end - cf_offset
            percent = np.arange(cf_range) / cf_range

            np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2
            np_cur_strength = np.cos((1 - percent) * 0.5 * np.pi) ** 2

            self.np_prev_strength = np.concatenate(
                [
                    np.ones(cf_offset),
                    np_prev_strength,
                    np.zeros(crossfadeSize - cf_offset - len(np_prev_strength)),
                ]
            )
            self.np_cur_strength = np.concatenate(
                [
                    np.zeros(cf_offset),
                    np_cur_strength,
                    np.ones(crossfadeSize - cf_offset - len(np_cur_strength)),
                ]
            )

            print(f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}")

            # The size differs from the previous result, so clear the recorded buffers.
            if hasattr(self, "np_prev_audio1") is True:
                delattr(self, "np_prev_audio1")
            if hasattr(self, "sola_buffer") is True:
                del self.sola_buffer

    def get_processing_sampling_rate(self):
        if self.voiceChanger is None:
            return 0
        else:
            return self.voiceChanger.get_processing_sampling_rate()

    #  receivedData: tuple of short
    def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
        try:
            if self.voiceChanger is None:
                raise VoiceChangerIsNotSelectedException("Voice Changer is not selected.")

            with Timer("main-process") as t:
                processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()

                sola_search_frame = int(0.012 * processing_sampling_rate)
                block_frame = receivedData.shape[0]
                crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
                self._generate_strength(crossfade_frame)

                # data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, sola_search_frame)
                audio = self.voiceChanger.inference(
                    receivedData,
                    crossfade_frame=crossfade_frame,
                    sola_search_frame=sola_search_frame
                )

                if hasattr(self, "sola_buffer") is True:
                    np.set_printoptions(threshold=10000)
                    audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame)
                    audio = audio[audio_offset:]

                    # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI
                    cor_nom = np.convolve(
                        audio[: crossfade_frame + sola_search_frame],
                        np.flip(self.sola_buffer),
                        "valid",
                    )
                    cor_den = np.sqrt(
                        np.convolve(
                            audio[: crossfade_frame + sola_search_frame] ** 2,
                            np.ones(crossfade_frame),
                            "valid",
                        )
                        + 1e-3
                    )
                    sola_offset = int(np.argmax(cor_nom / cor_den))
                    sola_end = sola_offset + block_frame
                    output_wav = audio[sola_offset:sola_end].astype(np.float64)
                    output_wav[:crossfade_frame] *= self.np_cur_strength
                    output_wav[:crossfade_frame] += self.sola_buffer[:]
                    result = output_wav
                else:
                    print("[Voice Changer] warming up... generating sola buffer.")
                    result = np.zeros(4096).astype(np.int16)

                if hasattr(self, "sola_buffer") is True and sola_offset < sola_search_frame:
                    offset = -1 * (sola_search_frame + crossfade_frame - sola_offset)
                    end = -1 * (sola_search_frame - sola_offset)
                    sola_buf_org = audio[offset:end]
                    self.sola_buffer = sola_buf_org * self.np_prev_strength
                else:
                    self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength
                    # self.sola_buffer = audio[- crossfade_frame:]

            mainprocess_time = t.secs

            # post-processing
            with Timer("post-process") as t:
                result = result.astype(np.int16)

                print_convert_processing(f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {result.shape[0]}/{self.settings.outputSampleRate}hz")

                if receivedData.shape[0] != result.shape[0]:
                    outputData = pad_array(result, receivedData.shape[0])
                    pass
                else:
                    outputData = result

                if self.settings.recordIO == 1:
                    self.ioRecorder.writeInput(receivedData)
                    self.ioRecorder.writeOutput(outputData.tobytes())

            postprocess_time = t.secs

            print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
            perf = [0, mainprocess_time, postprocess_time]

            return outputData, perf
        except NoModeLoadedException as e:
            print("[Voice Changer] [Exception]", e)
            return np.zeros(1).astype(np.int16), [0, 0, 0]
        except ONNXInputArgumentException as e:
            print("[Voice Changer] [Exception] onnx are waiting valid input.", e)
            return np.zeros(1).astype(np.int16), [0, 0, 0]
        except HalfPrecisionChangingException:
            print("[Voice Changer] Switching model configuration....")
            return np.zeros(1).astype(np.int16), [0, 0, 0]
        except NotEnoughDataExtimateF0:
            print("[Voice Changer] warming up... waiting more data.")
            return np.zeros(1).astype(np.int16), [0, 0, 0]
        except DeviceChangingException as e:
            print("[Voice Changer] embedder:", e)
            return np.zeros(1).astype(np.int16), [0, 0, 0]
        except VoiceChangerIsNotSelectedException:
            print("[Voice Changer] Voice Changer is not selected. Wait a bit and if there is no improvement, please re-select vc.")
            return np.zeros(1).astype(np.int16), [0, 0, 0]
        except DeviceCannotSupportHalfPrecisionException:
            # RVC.py handles the fallback, so just return dummy data here.
            return np.zeros(1).astype(np.int16), [0, 0, 0]
        except Exception as e:
            print("[Voice Changer] VC PROCESSING EXCEPTION!!!", e)
            print(traceback.format_exc())
            return np.zeros(1).astype(np.int16), [0, 0, 0]

    def export2onnx(self):
        return self.voiceChanger.export2onnx()

    ##############

    def merge_models(self, request: str):
        if self.voiceChanger is None:
            print("[Voice Changer] Voice Changer is not selected.")
            return
        self.voiceChanger.merge_models(request)
        return self.get_info()


PRINT_CONVERT_PROCESSING: bool = False
# PRINT_CONVERT_PROCESSING = True


def print_convert_processing(mess: str):
    if PRINT_CONVERT_PROCESSING is True:
        print(mess)


def pad_array(arr: AudioInOut, target_length: int):
    current_length = arr.shape[0]
    if current_length >= target_length:
        return arr
    else:
        pad_width = target_length - current_length
        pad_left = pad_width // 2
        pad_right = pad_width - pad_left
        # padded_arr = np.pad(
        #     arr, (pad_left, pad_right), "constant", constant_values=(0, 0)
        # )
        padded_arr = np.pad(arr, (pad_left, pad_right), "edge")
        return padded_arr
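
The SOLA crossfade in on_request above searches for the offset where the newly inferred chunk best lines up with the tail of the previous one. A self-contained sketch of just that search (frame sizes are illustrative; the real code then applies the precomputed crossfade strengths):

import numpy as np

crossfade_frame, sola_search_frame, block_frame = 1024, 576, 4096
audio = np.random.randn(sola_search_frame + crossfade_frame + block_frame)
sola_buffer = np.random.randn(crossfade_frame)     # tail of the previous block

head = audio[: crossfade_frame + sola_search_frame]
cor_nom = np.convolve(head, np.flip(sola_buffer), "valid")
cor_den = np.sqrt(np.convolve(head ** 2, np.ones(crossfade_frame), "valid") + 1e-3)
sola_offset = int(np.argmax(cor_nom / cor_den))    # best-aligned start within the search window
output = audio[sola_offset: sola_offset + block_frame]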

View File

@@ -2,6 +2,8 @@ import numpy as np
import torch
import torch.nn as nn

+from voice_changer.utils.VoiceChangerModel import AudioInOut


class VolumeExtractor:
@@ -13,7 +15,7 @@ class VolumeExtractor:
            "hop_size": self.hop_size
        }

-    def extract(self, audio):  # audio: 1d numpy array
+    def extract(self, audio: torch.Tensor):
        audio = audio.squeeze().cpu()
        n_frames = int(len(audio) // self.hop_size) + 1
        audio2 = audio ** 2
@@ -23,14 +25,40 @@
        volume = np.sqrt(volume)
        return volume

-    def get_mask_from_volume(self, volume, block_size: int, threhold=-60.0, device='cpu') -> torch.Tensor:
-        mask = (volume > 10 ** (float(threhold) / 20)).astype('float')
+    def extract_t(self, audio: torch.Tensor):
+        with torch.no_grad():
+            audio = audio.squeeze()
+            n_frames = int(audio.size(0) // self.hop_size) + 1
+            audio2 = audio ** 2
+            audio2_frames = audio2.unfold(0, int(self.hop_size), int(self.hop_size)).contiguous()
+            volume = torch.mean(audio2_frames, dim=-1)
+            volume = torch.sqrt(volume)
+            if volume.size(0) < n_frames:
+                volume = torch.nn.functional.pad(volume, (0, n_frames - volume.size(0)), 'constant', volume[-1])
+            return volume
+
+    def get_mask_from_volume(self, volume, block_size: int, threshold=-60.0, device='cpu') -> torch.Tensor:
+        volume = volume.cpu().numpy()
+        mask = (volume > 10 ** (float(threshold) / 20)).astype('float')
        mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1]))
        mask = np.array([np.max(mask[n: n + 9]) for n in range(len(mask) - 8)])
        mask = torch.from_numpy(mask).float().to(device).unsqueeze(-1).unsqueeze(0)
        mask = upsample(mask, block_size).squeeze(-1)
        return mask

+    def get_mask_from_volume_t(self, volume: torch.Tensor, block_size: int, threshold=-60.0, device='cpu') -> torch.Tensor:
+        volume = volume.squeeze()
+        mask = (volume > 10.0 ** (float(threshold) / 20)).float()
+        mask = nn.functional.pad(mask, (4, 0), 'constant', mask[0])
+        mask = nn.functional.pad(mask, (0, 4), 'constant', mask[-1])
+        mask = torch.max(mask.unfold(-1, 9, 1), -1)[0]
+        mask = mask.to(device).unsqueeze(-1).unsqueeze(0)
+        mask = upsample(mask, block_size).squeeze(-1)
+        print("[get_mask_from_volume_t 3]", mask.shape)
+        return mask


def upsample(signal: torch.Tensor, factor: int) -> torch.Tensor:
    signal = signal.permute(0, 2, 1)
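
A minimal usage sketch of the torch path added above, mirroring the commented-out benchmark in Pipeline.extract_volume_and_mask. The hop size and test signal are illustrative, and the import assumes the repository layout from this diff:

import time
import torch
from voice_changer.common.VolumeExtractor import VolumeExtractor

extractor = VolumeExtractor(512.0)    # e.g. inferencer_block_size * 16000 / inferencer_sampling_rate
audio = torch.randn(16000)            # one second of audio at 16 kHz

t0 = time.perf_counter()
volume_np = extractor.extract(audio)      # existing numpy implementation
t1 = time.perf_counter()
volume_t = extractor.extract_t(audio)     # torch implementation added in this commit
t2 = time.perf_counter()

print(volume_np.shape, volume_t.shape)    # one volume value per hop
print("np:", t1 - t0, "torch:", t2 - t1)  # the benchmark above saw the torch path win on CPU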

View File

@@ -9,7 +9,6 @@ PitchfInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]]
FeatureInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]]


class VoiceChangerModel(Protocol):
-    # loadModel: Callable[..., dict[str, Any]]
    def loadModel(self, params: LoadModelParams):