WIP: Volume Extractor torch
This commit is contained in:
parent a1db94c0af
commit a69c89255b
@@ -1,9 +1,5 @@
-# import sys
-# import os
 from dataclasses import asdict
 import numpy as np
-import torch
-import torchaudio
 from data.ModelSlot import DiffusionSVCModelSlot
 from voice_changer.DiffusionSVC.DiffusionSVCSettings import DiffusionSVCSettings
 from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
@@ -13,7 +9,7 @@ from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import Pitc
 from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel
 from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
 from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
-from voice_changer.RVC.onnxExporter.export2onnx import export2onnx
+# from voice_changer.RVC.onnxExporter.export2onnx import export2onnx
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager

 from Exceptions import DeviceCannotSupportHalfPrecisionException
@@ -36,13 +32,12 @@ class DiffusionSVC(VoiceChangerModel):
         self.feature_buffer: FeatureInOut | None = None
         self.prevVol = 0.0
         self.slotInfo = slotInfo
-        self.initialize()

     def initialize(self):
         print("[Voice Changer] [DiffusionSVC] Initializing... ")

         # create the pipeline
-        self.pipeline = createPipeline(self.slotInfo, self.settings.gpu, self.settings.f0Detector)
+        self.pipeline = createPipeline(self.slotInfo, self.settings.gpu, self.settings.f0Detector, self.inputSampleRate, self.outputSampleRate)

         # other settings
         self.settings.tran = self.slotInfo.defaultTune
@@ -51,6 +46,11 @@ class DiffusionSVC(VoiceChangerModel):

         print("[Voice Changer] [DiffusionSVC] Initializing... done")

+    def setSamplingRate(self, inputSampleRate, outputSampleRate):
+        self.inputSampleRate = inputSampleRate
+        self.outputSampleRate = outputSampleRate
+        self.initialize()
+
     def update_settings(self, key: str, val: int | float | str):
         print("[Voice Changer][DiffusionSVC]: update_settings", key, val)
         if key in self.settings.intData:
@@ -82,7 +82,6 @@ class DiffusionSVC(VoiceChangerModel):
     def generate_input(
         self,
         newData: AudioInOut,
-        inputSize: int,
         crossfadeSize: int,
         solaSearchFrame: int = 0,
     ):
@@ -99,11 +98,10 @@ class DiffusionSVC(VoiceChangerModel):
             self.pitchf_buffer = np.zeros(new_feature_length)
             self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels])

-        convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
+        convertSize = newData.shape[0] + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize

         if convertSize % 128 != 0:  # pad, because the model's output hop size would otherwise truncate
             convertSize = convertSize + (128 - (convertSize % 128))
-        outSize = convertSize - self.settings.extraConvertSize

         # if the buffer has not filled up yet, pad with zeros
         if self.audio_buffer.shape[0] < convertSize:
@@ -118,37 +116,39 @@ class DiffusionSVC(VoiceChangerModel):
             self.feature_buffer = self.feature_buffer[featureOffset:]

         # crop only the output part and check its volume. (TODO: make this a gradual fade-out)
-        cropOffset = -1 * (inputSize + crossfadeSize)
+        cropOffset = -1 * (newData.shape[0] + crossfadeSize)
         cropEnd = -1 * (crossfadeSize)
         crop = self.audio_buffer[cropOffset:cropEnd]
         vol = np.sqrt(np.square(crop).mean())
-        vol = max(vol, self.prevVol * 0.0)
+        vol = float(max(vol, self.prevVol * 0.0))
         self.prevVol = vol

-        return (self.audio_buffer, self.pitchf_buffer, self.feature_buffer, convertSize, vol, outSize)
+        return (self.audio_buffer, self.pitchf_buffer, self.feature_buffer, convertSize, vol)

-    def inference(self, data):
-        audio = data[0]
-        pitchf = data[1]
-        feature = data[2]
-        convertSize = data[3]
-        vol = data[4]
+    def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int):
+        data = self.generate_input(receivedData, crossfade_frame, sola_search_frame)
+        audio: AudioInOut = data[0]
+        pitchf: PitchfInOut = data[1]
+        feature: FeatureInOut = data[2]
+        convertSize: int = data[3]
+        vol: float = data[4]

         if vol < self.settings.silentThreshold:
             return np.zeros(convertSize).astype(np.int16) * np.sqrt(vol)

-        if self.pipeline is not None:
-            device = self.pipeline.device
-        else:
-            device = torch.device("cpu")  # TODO: if the pipeline does not exist, maybe return zeros instead (to be confirmed).
-        audio = torch.from_numpy(audio).to(device=device, dtype=torch.float32)
-        audio = torchaudio.functional.resample(audio, self.slotInfo.samplingRate, 16000, rolloff=0.99)
+        if self.pipeline is None:
+            return np.zeros(convertSize).astype(np.int16) * np.sqrt(vol)
+
+        # device = self.pipeline.device
+        # audio = torch.from_numpy(audio).to(device=device, dtype=torch.float32)
+        # audio = self.resampler16K(audio)
         sid = self.settings.dstId
         f0_up_key = self.settings.tran
         protect = 0

         embOutputLayer = 12
         useFinalProj = False
+        silenceFrontSec = self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0.  # seconds of extraConvertSize (already resampled to the model sampling rate); processed at the model sampling rate (★1)

         try:
             audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
@@ -157,12 +157,11 @@ class DiffusionSVC(VoiceChangerModel):
                 pitchf,
                 feature,
                 f0_up_key,
-                self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0.,  # seconds of extraConvertSize (already resampled to the model sampling rate); processed at the model sampling rate (★1)
+                silenceFrontSec,
                 embOutputLayer,
                 useFinalProj,
                 protect
             )
-            # result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
             result = audio_out.detach().cpu().numpy()

             return result
@@ -173,36 +172,36 @@ class DiffusionSVC(VoiceChangerModel):
             # raise e

             return

     def __del__(self):
         del self.pipeline

-    def export2onnx(self):
-        modelSlot = self.slotInfo
+    # def export2onnx(self):
+    #     modelSlot = self.slotInfo

-        if modelSlot.isONNX:
-            print("[Voice Changer] export2onnx, No pyTorch filepath.")
-            return {"status": "ng", "path": ""}
+    #     if modelSlot.isONNX:
+    #         print("[Voice Changer] export2onnx, No pyTorch filepath.")
+    #         return {"status": "ng", "path": ""}

-        output_file_simple = export2onnx(self.settings.gpu, modelSlot)
-        return {
-            "status": "ok",
-            "path": f"/tmp/{output_file_simple}",
-            "filename": output_file_simple,
-        }
+    #     output_file_simple = export2onnx(self.settings.gpu, modelSlot)
+    #     return {
+    #         "status": "ok",
+    #         "path": f"/tmp/{output_file_simple}",
+    #         "filename": output_file_simple,
+    #     }

-    def get_model_current(self):
-        return [
-            {
-                "key": "defaultTune",
-                "val": self.settings.tran,
-            },
-            {
-                "key": "defaultIndexRatio",
-                "val": self.settings.indexRatio,
-            },
-            {
-                "key": "defaultProtect",
-                "val": self.settings.protect,
-            },
-        ]
+    # def get_model_current(self):
+    #     return [
+    #         {
+    #             "key": "defaultTune",
+    #             "val": self.settings.tran,
+    #         },
+    #         {
+    #             "key": "defaultIndexRatio",
+    #             "val": self.settings.indexRatio,
+    #         },
+    #         {
+    #             "key": "defaultProtect",
+    #             "val": self.settings.protect,
+    #         },
+    #     ]
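The hunks above rework the DiffusionSVC entry points: initialize() no longer runs in the constructor, setSamplingRate() triggers it instead, and inference() now takes the raw audio block plus the crossfade/SOLA sizes and calls generate_input() itself. A minimal, hypothetical usage sketch (the `model` instance, block size, and rates are assumptions, not part of the commit):

import numpy as np

# hypothetical driver code; in this commit, VoiceChangerV2.on_request() plays this role
def run_block(model, block: np.ndarray, crossfade_frame: int, sola_search_frame: int) -> np.ndarray:
    # inference() wraps generate_input() internally, so the caller only passes
    # the raw block and the crossfade / SOLA search frame counts.
    return model.inference(block, crossfade_frame=crossfade_frame, sola_search_frame=sola_search_frame)

# model.setSamplingRate(48000, 48000)  # rebuilds the pipeline via initialize()
# out = run_block(model, np.zeros(4096, dtype=np.int16), crossfade_frame=4096, sola_search_frame=int(0.012 * 44100))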
@@ -15,6 +15,9 @@ from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtrac
 from voice_changer.RVC.embedder.Embedder import Embedder

 from voice_changer.common.VolumeExtractor import VolumeExtractor
+from torchaudio.transforms import Resample
+
+from voice_changer.utils.Timer import Timer


 class Pipeline(object):
@@ -39,16 +42,22 @@ class Pipeline(object):
         targetSR,
         device,
         isHalf,
+        resamplerIn: Resample,
+        resamplerOut: Resample
     ):
         self.inferencer = inferencer
         inferencer_block_size, inferencer_sampling_rate = inferencer.getConfig()
-        self.hop_size = inferencer_block_size * 16000 / inferencer_sampling_rate  # 16000 is the audio sample rate; the audio is already 16K at this point.
+        self.hop_size = inferencer_block_size * 16000 / inferencer_sampling_rate  # 16000 is the audio sample rate; processing is done at 16K
         self.inferencer_block_size = inferencer_block_size
         self.inferencer_sampling_rate = inferencer_sampling_rate

         self.volumeExtractor = VolumeExtractor(self.hop_size)
         self.embedder = embedder
         self.pitchExtractor = pitchExtractor

+        self.resamplerIn = resamplerIn
+        self.resamplerOut = resamplerOut
+
         # self.f0ex = self.load_f0_extractor(f0_model="harvest", f0_min=50, f0_max=1100)

         print("VOLUME EXTRACTOR", self.volumeExtractor)
@@ -83,10 +92,28 @@ class Pipeline(object):
         self.pitchExtractor = pitchExtractor

     @torch.no_grad()
-    def extract_volume_and_mask(self, audio, threhold):
-        volume = self.volumeExtractor.extract(audio)
-        mask = self.volumeExtractor.get_mask_from_volume(volume, self.inferencer_block_size, threhold=threhold, device=self.device)
-        volume = torch.from_numpy(volume).float().to(self.device).unsqueeze(-1).unsqueeze(0)
+    def extract_volume_and_mask(self, audio: torch.Tensor, threshold: float):
+        '''
+        with Timer("[VolumeExt np]") as t:
+            for i in range(100):
+                volume = self.volumeExtractor.extract(audio)
+        time_np = t.secs
+        with Timer("[VolumeExt pt]") as t:
+            for i in range(100):
+                volume_t = self.volumeExtractor.extract_t(audio)
+        time_pt = t.secs
+
+        print("[Volume np]:", volume)
+        print("[Volume pt]:", volume_t)
+        print("[Perform]:", time_np, time_pt)
+        # -> [Perform]: 0.030178070068359375 0.005780220031738281 (RTX4090)
+        # -> [Perform]: 0.029046058654785156 0.0025115013122558594 (CPU i9 13900KF)
+        # ---> for this amount of work, torch on the CPU may be faster?
+        '''
+        # volume_t = self.volumeExtractor.extract_t(audio)
+        volume_t = self.volumeExtractor.extract_t(audio)
+        mask = self.volumeExtractor.get_mask_from_volume_t(volume_t, self.inferencer_block_size, threshold=threshold)
+        volume = volume_t.unsqueeze(-1).unsqueeze(0)
         return volume, mask

     def exec(
@@ -101,24 +128,20 @@
         useFinalProj,
         protect=0.5
     ):
-        # audio comes in at a 16000 sampling rate; everything from here on is processed at 16000.
-        audio = audio.unsqueeze(0)
-        self.t_pad = 0
-        audio_pad = F.pad(audio, (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
+        audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
+        audio16k = self.resamplerIn(audio_t)
+        volume, mask = self.extract_volume_and_mask(audio16k, threshold=-60.0)
         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
-        n_frames = int(audio_pad.size(-1) // self.hop_size + 1)
-        volume, mask = self.extract_volume_and_mask(audio, threhold=-60.0)
+        n_frames = int(audio16k.size(-1) // self.hop_size + 1)

         # pitch detection
         try:
             # print("[SRC AUDIO----]", audio_pad)
             pitch, pitchf = self.pitchExtractor.extract(
-                audio_pad,
+                audio16k.squeeze(),
                 pitchf,
                 f0_up_key,
                 16000,  # audio sampling rate (already 16000)
-                # int(self.hop_size),  # processing window size (512 at 44100)
                 int(self.hop_size),  # processing window size (512 at 44100)
                 silence_front=silence_front,
             )
@@ -128,15 +151,19 @@
         except IndexError as e:  # NOQA
             # print(e)
             raise NotEnoughDataExtimateF0()
+        print("[EMBEDDER EXTRACT:audio:4:]", audio_t.shape)

         # f0 = self.f0ex.extract_f0(audio_pad, key=4, sr=44100)
         # print("[Pitch_f0]", f0)

         # adjust tensor shape / dtype
-        feats = audio_pad
+        feats = audio16k.squeeze()
         if feats.dim() == 2:  # double channels
             feats = feats.mean(-1)
         feats = feats.view(1, -1)
+        print("[EMBEDDER EXTRACT:audio:5:]", audio_t.shape)
+
+        print("[EMBEDDER EXTRACT:::]", feats.shape)

         # embedding
         with autocast(enabled=self.isHalf):
@@ -190,6 +217,7 @@
         try:
             with torch.no_grad():
                 with autocast(enabled=self.isHalf):
+                    print("[EMBEDDER EXTRACT:::]", feats.shape, pitchf.unsqueeze(-1).shape, volume.shape, mask.shape)
                     audio1 = (
                         torch.clip(
                             self.inferencer.infer(
@@ -222,5 +250,5 @@

         del pitch, pitchf, feats, sid
         torch.cuda.empty_cache()
+        audio1 = self.resamplerOut(audio1.float())
         return audio1, pitchf_buffer, feats_buffer
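extract_volume_and_mask() above is called with threshold=-60.0, and the mask comparison in VolumeExtractor turns that dB value into a linear amplitude. A small worked example of that conversion (plain dB arithmetic, not taken from the commit):

# -60 dBFS expressed as a linear amplitude threshold
threshold_db = -60.0
linear_threshold = 10 ** (threshold_db / 20)  # = 0.001
# frames whose RMS volume exceeds 0.001 count as voiced and are kept in the mask
print(linear_threshold)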
@@ -7,8 +7,11 @@ from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import Pitc
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
 from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager

+import torch
+from torchaudio.transforms import Resample
+

-def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str):
+def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str, inputSampleRate: int, outputSampleRate: int):
     dev = DeviceManager.get_instance().getDevice(gpu)
     # half = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
     half = False
@@ -35,6 +38,9 @@ def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str):
     # pitchExtractor
     pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector, gpu)

+    resamplerIn = Resample(inputSampleRate, 16000, dtype=torch.int16).to(dev)
+    resamplerOut = Resample(modelSlot.samplingRate, outputSampleRate, dtype=torch.int16).to(dev)
+
     pipeline = Pipeline(
         embedder,
         inferencer,
@@ -42,6 +48,8 @@ def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str):
         modelSlot.samplingRate,
         dev,
         half,
+        resamplerIn,
+        resamplerOut
     )

     return pipeline
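createPipeline() now builds two torchaudio Resample transforms: input rate to 16 kHz for analysis, and model rate to output rate after inference. A minimal standalone sketch of that kind of transform; the rates are illustrative, and the default float kernel is used here rather than the dtype=torch.int16 argument seen in the diff:

import torch
from torchaudio.transforms import Resample

resampler_in = Resample(48000, 16000)    # roughly the role of resamplerIn: 48 kHz -> 16 kHz analysis rate
resampler_out = Resample(44100, 48000)   # roughly the role of resamplerOut: model rate -> output rate

audio = torch.zeros(1, 48000)            # one second of silence at 48 kHz
audio16k = resampler_in(audio)           # analysis-rate view used for pitch / volume / embedding
print(audio16k.shape)                    # expected: torch.Size([1, 16000])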
@@ -45,6 +45,7 @@ class VoiceChangerSettings:
     intData: list[str] = field(
         default_factory=lambda: [
             "inputSampleRate",
+            "outputSampleRate",
             "crossFadeOverlapSize",
             "recordIO",
         ]
@@ -10,6 +10,7 @@ from voice_changer.ModelSlotManager import ModelSlotManager
 from voice_changer.RVC.RVCModelMerger import RVCModelMerger
 from voice_changer.VoiceChanger import VoiceChanger
 from const import STORED_SETTING_FILE, UPLOAD_DIR
+from voice_changer.VoiceChangerV2 import VoiceChangerV2
 from voice_changer.utils.LoadModelParams import LoadModelParamFile, LoadModelParams
 from voice_changer.utils.ModelMerger import MergeElement, ModelMergerRequest
 from voice_changer.utils.VoiceChangerModel import AudioInOut
@@ -242,7 +243,7 @@ class VoiceChangerManager(ServerDeviceCallbacks):
             from voice_changer.DiffusionSVC.DiffusionSVC import DiffusionSVC

             self.voiceChangerModel = DiffusionSVC(self.params, slotInfo)
-            self.voiceChanger = VoiceChanger(self.params)
+            self.voiceChanger = VoiceChangerV2(self.params)
             self.voiceChanger.setModel(self.voiceChangerModel)
         else:
             print(f"[Voice Changer] unknown voice changer model: {slotInfo.voiceChangerType}")
server/voice_changer/VoiceChangerV2.py (new file, 331 lines)
@@ -0,0 +1,331 @@
+'''
+    ■ VoiceChangerV2
+    - Differences from VoiceChanger
+        - Resampling is delegated to the VoiceChangerModel, to avoid redundant resample steps.
+        - The split between pre-processing and main processing is removed (to avoid needless type conversions in the VoiceChangerModel).
+
+    - Applicable VoiceChangerModel
+        - DiffusionSVC
+
+'''
+
+from typing import Any, Union
+
+from const import TMP_DIR
+import torch
+import os
+import traceback
+import numpy as np
+from dataclasses import dataclass, asdict, field
+import onnxruntime
+
+from voice_changer.IORecorder import IORecorder
+
+from voice_changer.utils.Timer import Timer
+from voice_changer.utils.VoiceChangerModel import AudioInOut, VoiceChangerModel
+from Exceptions import (
+    DeviceCannotSupportHalfPrecisionException,
+    DeviceChangingException,
+    HalfPrecisionChangingException,
+    NoModeLoadedException,
+    NotEnoughDataExtimateF0,
+    ONNXInputArgumentException,
+    VoiceChangerIsNotSelectedException,
+)
+from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
+
+STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav")
+STREAM_OUTPUT_FILE = os.path.join(TMP_DIR, "out.wav")
+
+
+@dataclass
+class VoiceChangerV2Settings:
+    inputSampleRate: int = 48000  # 48000 or 24000
+    outputSampleRate: int = 48000  # 48000 or 24000
+
+    crossFadeOffsetRate: float = 0.1
+    crossFadeEndRate: float = 0.9
+    crossFadeOverlapSize: int = 4096
+
+    recordIO: int = 0  # 0:off, 1:on
+
+    performance: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
+
+    # ↓ only the mutable items are listed
+    intData: list[str] = field(
+        default_factory=lambda: [
+            "inputSampleRate",
+            "outputSampleRate",
+            "crossFadeOverlapSize",
+            "recordIO",
+        ]
+    )
+    floatData: list[str] = field(
+        default_factory=lambda: [
+            "crossFadeOffsetRate",
+            "crossFadeEndRate",
+        ]
+    )
+    strData: list[str] = field(default_factory=lambda: [])
+
+
+class VoiceChangerV2:
+    ioRecorder: IORecorder
+    sola_buffer: AudioInOut
+
+    def __init__(self, params: VoiceChangerParams):
+        # initialization
+        self.settings = VoiceChangerV2Settings()
+        self.currentCrossFadeOffsetRate = 0.0
+        self.currentCrossFadeEndRate = 0.0
+        self.currentCrossFadeOverlapSize = 0  # setting
+        self.crossfadeSize = 0  # calculated
+
+        self.voiceChanger: VoiceChangerModel | None = None
+        self.params = params
+        self.gpu_num = torch.cuda.device_count()
+        self.prev_audio = np.zeros(4096)
+        self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
+        self.onnx_device = onnxruntime.get_device()
+
+        print(f"VoiceChangerV2 Initialized (GPU_NUM(cuda):{self.gpu_num}, mps_enabled:{self.mps_enabled}, onnx_device:{self.onnx_device})")
+
+    def setModel(self, model: VoiceChangerModel):
+        self.voiceChanger = model
+        self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+
+    def get_info(self):
+        data = asdict(self.settings)
+        if self.voiceChanger is not None:
+            data.update(self.voiceChanger.get_info())
+        return data
+
+    def get_performance(self):
+        return self.settings.performance
+
+    def update_settings(self, key: str, val: Any):
+        if self.voiceChanger is None:
+            print("[Voice Changer] Voice Changer is not selected.")
+            return self.get_info()
+
+        if key == "serverAudioStated" and val == 0:
+            self.settings.inputSampleRate = 48000
+            self.settings.outputSampleRate = 48000
+            self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+
+        if key in self.settings.intData:
+            setattr(self.settings, key, int(val))
+            if key == "crossFadeOffsetRate" or key == "crossFadeEndRate":
+                self.crossfadeSize = 0
+            if key == "recordIO" and val == 1:
+                if hasattr(self, "ioRecorder"):
+                    self.ioRecorder.close()
+                self.ioRecorder = IORecorder(STREAM_INPUT_FILE, STREAM_OUTPUT_FILE, self.settings.inputSampleRate, self.settings.outputSampleRate)
+            if key == "recordIO" and val == 0:
+                if hasattr(self, "ioRecorder"):
+                    self.ioRecorder.close()
+                pass
+            if key == "recordIO" and val == 2:
+                if hasattr(self, "ioRecorder"):
+                    self.ioRecorder.close()
+            if key == "inputSampleRate" or key == "outputSampleRate":
+                self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+        elif key in self.settings.floatData:
+            setattr(self.settings, key, float(val))
+        elif key in self.settings.strData:
+            setattr(self.settings, key, str(val))
+        else:
+            ret = self.voiceChanger.update_settings(key, val)
+            if ret is False:
+                pass
+                # print(f"({key} is not mutable variable or unknown variable)")
+        return self.get_info()
+
+    def _generate_strength(self, crossfadeSize: int):
+        if self.crossfadeSize != crossfadeSize or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize:
+            self.crossfadeSize = crossfadeSize
+            self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate
+            self.currentCrossFadeEndRate = self.settings.crossFadeEndRate
+            self.currentCrossFadeOverlapSize = self.settings.crossFadeOverlapSize
+
+            cf_offset = int(crossfadeSize * self.settings.crossFadeOffsetRate)
+            cf_end = int(crossfadeSize * self.settings.crossFadeEndRate)
+            cf_range = cf_end - cf_offset
+            percent = np.arange(cf_range) / cf_range
+
+            np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2
+            np_cur_strength = np.cos((1 - percent) * 0.5 * np.pi) ** 2
+
+            self.np_prev_strength = np.concatenate(
+                [
+                    np.ones(cf_offset),
+                    np_prev_strength,
+                    np.zeros(crossfadeSize - cf_offset - len(np_prev_strength)),
+                ]
+            )
+            self.np_cur_strength = np.concatenate(
+                [
+                    np.zeros(cf_offset),
+                    np_cur_strength,
+                    np.ones(crossfadeSize - cf_offset - len(np_cur_strength)),
+                ]
+            )
+
+            print(f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}")
+
+            # the size changes from the previous result, so clear the stored buffers.
+            if hasattr(self, "np_prev_audio1") is True:
+                delattr(self, "np_prev_audio1")
+            if hasattr(self, "sola_buffer") is True:
+                del self.sola_buffer
+
+    def get_processing_sampling_rate(self):
+        if self.voiceChanger is None:
+            return 0
+        else:
+            return self.voiceChanger.get_processing_sampling_rate()
+
+    # receivedData: tuple of short
+    def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
+        try:
+            if self.voiceChanger is None:
+                raise VoiceChangerIsNotSelectedException("Voice Changer is not selected.")
+
+            with Timer("main-process") as t:
+
+                processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
+                sola_search_frame = int(0.012 * processing_sampling_rate)
+                block_frame = receivedData.shape[0]
+                crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
+                self._generate_strength(crossfade_frame)
+                # data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, sola_search_frame)
+                audio = self.voiceChanger.inference(
+                    receivedData,
+                    crossfade_frame=crossfade_frame,
+                    sola_search_frame=sola_search_frame
+                )
+                if hasattr(self, "sola_buffer") is True:
+                    np.set_printoptions(threshold=10000)
+                    audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame)
+                    audio = audio[audio_offset:]
+
+                    # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI
+                    cor_nom = np.convolve(
+                        audio[: crossfade_frame + sola_search_frame],
+                        np.flip(self.sola_buffer),
+                        "valid",
+                    )
+                    cor_den = np.sqrt(
+                        np.convolve(
+                            audio[: crossfade_frame + sola_search_frame] ** 2,
+                            np.ones(crossfade_frame),
+                            "valid",
+                        )
+                        + 1e-3
+                    )
+                    sola_offset = int(np.argmax(cor_nom / cor_den))
+                    sola_end = sola_offset + block_frame
+                    output_wav = audio[sola_offset:sola_end].astype(np.float64)
+                    output_wav[:crossfade_frame] *= self.np_cur_strength
+                    output_wav[:crossfade_frame] += self.sola_buffer[:]
+
+                    result = output_wav
+                else:
+                    print("[Voice Changer] warming up... generating sola buffer.")
+                    result = np.zeros(4096).astype(np.int16)
+
+                if hasattr(self, "sola_buffer") is True and sola_offset < sola_search_frame:
+                    offset = -1 * (sola_search_frame + crossfade_frame - sola_offset)
+                    end = -1 * (sola_search_frame - sola_offset)
+                    sola_buf_org = audio[offset:end]
+                    self.sola_buffer = sola_buf_org * self.np_prev_strength
+                else:
+                    self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength
+                    # self.sola_buffer = audio[- crossfade_frame:]
+
+            mainprocess_time = t.secs
+
+            # post-processing
+            with Timer("post-process") as t:
+                result = result.astype(np.int16)
+
+                print_convert_processing(f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {result.shape[0]}/{self.settings.outputSampleRate}hz")
+
+                if receivedData.shape[0] != result.shape[0]:
+                    outputData = pad_array(result, receivedData.shape[0])
+                    pass
+                else:
+                    outputData = result
+
+                if self.settings.recordIO == 1:
+                    self.ioRecorder.writeInput(receivedData)
+                    self.ioRecorder.writeOutput(outputData.tobytes())
+
+            postprocess_time = t.secs
+
+            print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
+            perf = [0, mainprocess_time, postprocess_time]
+
+            return outputData, perf
+
+        except NoModeLoadedException as e:
+            print("[Voice Changer] [Exception]", e)
+            return np.zeros(1).astype(np.int16), [0, 0, 0]
+        except ONNXInputArgumentException as e:
+            print("[Voice Changer] [Exception] onnx are waiting valid input.", e)
+            return np.zeros(1).astype(np.int16), [0, 0, 0]
+        except HalfPrecisionChangingException:
+            print("[Voice Changer] Switching model configuration....")
+            return np.zeros(1).astype(np.int16), [0, 0, 0]
+        except NotEnoughDataExtimateF0:
+            print("[Voice Changer] warming up... waiting more data.")
+            return np.zeros(1).astype(np.int16), [0, 0, 0]
+        except DeviceChangingException as e:
+            print("[Voice Changer] embedder:", e)
+            return np.zeros(1).astype(np.int16), [0, 0, 0]
+        except VoiceChangerIsNotSelectedException:
+            print("[Voice Changer] Voice Changer is not selected. Wait a bit and if there is no improvement, please re-select vc.")
+            return np.zeros(1).astype(np.int16), [0, 0, 0]
+        except DeviceCannotSupportHalfPrecisionException:
+            # RVC.py handles the fallback, so only dummy data is returned here.
+            return np.zeros(1).astype(np.int16), [0, 0, 0]
+        except Exception as e:
+            print("[Voice Changer] VC PROCESSING EXCEPTION!!!", e)
+            print(traceback.format_exc())
+            return np.zeros(1).astype(np.int16), [0, 0, 0]
+
+    def export2onnx(self):
+        return self.voiceChanger.export2onnx()
+
+    ##############
+
+    def merge_models(self, request: str):
+        if self.voiceChanger is None:
+            print("[Voice Changer] Voice Changer is not selected.")
+            return
+        self.voiceChanger.merge_models(request)
+        return self.get_info()
+
+
+PRINT_CONVERT_PROCESSING: bool = False
+# PRINT_CONVERT_PROCESSING = True
+
+
+def print_convert_processing(mess: str):
+    if PRINT_CONVERT_PROCESSING is True:
+        print(mess)
+
+
+def pad_array(arr: AudioInOut, target_length: int):
+    current_length = arr.shape[0]
+    if current_length >= target_length:
+        return arr
+    else:
+        pad_width = target_length - current_length
+        pad_left = pad_width // 2
+        pad_right = pad_width - pad_left
+        # padded_arr = np.pad(
+        #     arr, (pad_left, pad_right), "constant", constant_values=(0, 0)
+        # )
+        padded_arr = np.pad(arr, (pad_left, pad_right), "edge")
+        return padded_arr
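on_request() above aligns each new block with the tail of the previous one using the SOLA search (cor_nom / cor_den) before crossfading. A self-contained sketch of that search on synthetic data; the frame sizes and signal are made up for illustration:

import numpy as np

crossfade_frame = 128
sola_search_frame = 32
rng = np.random.default_rng(0)
audio = rng.standard_normal(crossfade_frame + sola_search_frame + 512)
sola_buffer = audio[10:10 + crossfade_frame].copy()  # pretend the previous block ended here

# normalized cross-correlation over the search window, as in on_request()
cor_nom = np.convolve(audio[: crossfade_frame + sola_search_frame], np.flip(sola_buffer), "valid")
cor_den = np.sqrt(np.convolve(audio[: crossfade_frame + sola_search_frame] ** 2, np.ones(crossfade_frame), "valid") + 1e-3)
sola_offset = int(np.argmax(cor_nom / cor_den))
print(sola_offset)  # offset in [0, sola_search_frame] where the new block best lines up with the previous tail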
@@ -2,6 +2,8 @@ import numpy as np
 import torch
 import torch.nn as nn

+from voice_changer.utils.VoiceChangerModel import AudioInOut
+

 class VolumeExtractor:

@@ -13,7 +15,7 @@ class VolumeExtractor:
             "hop_size": self.hop_size
         }

-    def extract(self, audio):  # audio: 1d numpy array
+    def extract(self, audio: torch.Tensor):
         audio = audio.squeeze().cpu()
         n_frames = int(len(audio) // self.hop_size) + 1
         audio2 = audio ** 2
@@ -23,14 +25,40 @@
         volume = np.sqrt(volume)
         return volume

-    def get_mask_from_volume(self, volume, block_size: int, threhold=-60.0, device='cpu') -> torch.Tensor:
-        mask = (volume > 10 ** (float(threhold) / 20)).astype('float')
+    def extract_t(self, audio: torch.Tensor):
+        with torch.no_grad():
+            audio = audio.squeeze()
+            n_frames = int(audio.size(0) // self.hop_size) + 1
+            audio2 = audio ** 2
+
+            audio2_frames = audio2.unfold(0, int(self.hop_size), int(self.hop_size)).contiguous()
+
+            volume = torch.mean(audio2_frames, dim=-1)
+            volume = torch.sqrt(volume)
+            if volume.size(0) < n_frames:
+                volume = torch.nn.functional.pad(volume, (0, n_frames - volume.size(0)), 'constant', volume[-1])
+            return volume
+
+    def get_mask_from_volume(self, volume, block_size: int, threshold=-60.0, device='cpu') -> torch.Tensor:
+        volume = volume.cpu().numpy()
+        mask = (volume > 10 ** (float(threshold) / 20)).astype('float')
         mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1]))
         mask = np.array([np.max(mask[n: n + 9]) for n in range(len(mask) - 8)])
         mask = torch.from_numpy(mask).float().to(device).unsqueeze(-1).unsqueeze(0)
         mask = upsample(mask, block_size).squeeze(-1)
         return mask

+    def get_mask_from_volume_t(self, volume: torch.Tensor, block_size: int, threshold=-60.0, device='cpu') -> torch.Tensor:
+        volume = volume.squeeze()
+        mask = (volume > 10.0 ** (float(threshold) / 20)).float()
+        mask = nn.functional.pad(mask, (4, 0), 'constant', mask[0])
+        mask = nn.functional.pad(mask, (0, 4), 'constant', mask[-1])
+        mask = torch.max(mask.unfold(-1, 9, 1), -1)[0]
+        mask = mask.to(device).unsqueeze(-1).unsqueeze(0)
+        mask = upsample(mask, block_size).squeeze(-1)
+        print("[get_mask_from_volume_t 3]", mask.shape)
+        return mask
+
+
 def upsample(signal: torch.Tensor, factor: int) -> torch.Tensor:
     signal = signal.permute(0, 2, 1)
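extract_t() above reimplements the frame-wise RMS "volume" with torch.unfold, which is exactly what the benchmark left in Pipeline.extract_volume_and_mask compares against the numpy path. A standalone sketch of the same computation; hop_size and the signal are illustrative, not taken from the commit:

import numpy as np
import torch

hop_size = 512
audio = torch.rand(hop_size * 10)

# torch path, as in extract_t(): square, frame with unfold, mean, sqrt
frames = (audio ** 2).unfold(0, hop_size, hop_size)
vol_torch = torch.sqrt(frames.mean(dim=-1))

# plain numpy reference over the same frames
vol_numpy = np.sqrt((audio.numpy().reshape(-1, hop_size) ** 2).mean(axis=-1))

print(np.allclose(vol_torch.numpy(), vol_numpy, atol=1e-6))  # both paths should agree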
@@ -9,7 +9,6 @@ PitchfInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]]
 FeatureInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]]

-

 class VoiceChangerModel(Protocol):
     # loadModel: Callable[..., dict[str, Any]]
     def loadModel(self, params: LoadModelParams):