WIP: first communication confirmation

This commit is contained in:
w-okada 2023-07-14 13:54:08 +09:00
parent fbabfed566
commit a1db94c0af
11 changed files with 542 additions and 16 deletions

View File

@@ -6,15 +6,15 @@ import torch
 import torchaudio
 from data.ModelSlot import DiffusionSVCModelSlot
 from voice_changer.DiffusionSVC.DiffusionSVCSettings import DiffusionSVCSettings
+from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
 from voice_changer.DiffusionSVC.pipeline.PipelineGenerator import createPipeline
+from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
+from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
 from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel
 from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
-from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
 from voice_changer.RVC.onnxExporter.export2onnx import export2onnx
-from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
-from voice_changer.RVC.pipeline.Pipeline import Pipeline
 from Exceptions import DeviceCannotSupportHalfPrecisionException
@@ -165,8 +165,6 @@ class DiffusionSVC(VoiceChangerModel):
             # result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
             result = audio_out.detach().cpu().numpy()
-            print("RESULT", result)
             return result
         except DeviceCannotSupportHalfPrecisionException as e:  # NOQA
             print("[Device Manager] Device cannot support half precision. Fallback to float....")

View File

@@ -0,0 +1,169 @@
from torchaudio.transforms import Resample
import pyworld as pw
import numpy as np
import torchcrepe
import torch
import torch.nn.functional as F
CREPE_RESAMPLE_KERNEL = {}
def median_pool_1d(x, kernel_size):
x = x.unsqueeze(1)
x = F.pad(x, ((kernel_size - 1) // 2, kernel_size // 2), mode="reflect")
x = x.squeeze(1)
x = x.unfold(1, kernel_size, 1)
x, _ = torch.sort(x, dim=-1)
return x[:, :, (kernel_size - 1) // 2]
def masked_avg_pool_1d(x, kernel_size):
x = x.unsqueeze(1)
x = F.pad(x, ((kernel_size - 1) // 2, kernel_size // 2), mode="reflect")
mask = ~torch.isnan(x)
masked_x = torch.where(mask, x, torch.zeros_like(x))
ones_kernel = torch.ones(x.size(1), 1, kernel_size, device=x.device)
# Perform sum pooling
sum_pooled = F.conv1d(
masked_x,
ones_kernel,
stride=1,
padding=0,
groups=x.size(1),
)
# Count the non-masked (valid) elements in each pooling window
valid_count = F.conv1d(
mask.float(),
ones_kernel,
stride=1,
padding=0,
groups=x.size(1),
)
valid_count = valid_count.clamp(min=1) # Avoid division by zero
# Perform masked average pooling
avg_pooled = sum_pooled / valid_count
return avg_pooled.squeeze(1)
class F0_Extractor:
def __init__(self, f0_extractor, sample_rate=44100, hop_size=512, f0_min=65, f0_max=800,
block_size=None, model_sampling_rate=None):
self.block_size = block_size
self.model_sampling_rate = model_sampling_rate
self.f0_extractor = f0_extractor
self.sample_rate = sample_rate
self.hop_size = hop_size
self.f0_min = f0_min
self.f0_max = f0_max
self.transformer_f0 = None
if f0_extractor == 'crepe':
key_str = str(sample_rate)
if key_str not in CREPE_RESAMPLE_KERNEL:
CREPE_RESAMPLE_KERNEL[key_str] = Resample(sample_rate, 16000, lowpass_filter_width=128)
self.resample_kernel = CREPE_RESAMPLE_KERNEL[key_str]
if (self.block_size is not None) or (self.model_sampling_rate is not None):
assert (self.block_size is not None) and (self.model_sampling_rate is not None)
self.hop_size_follow_input = True
else:
self.hop_size_follow_input = False
@torch.no_grad()
def extract_f0(self, audio, key=0, sr=44100, silence_front=0):
f0 = self.extract(audio.cpu().numpy(), uv_interp=True, silence_front=silence_front, sr=sr)
# f0 = torch.from_numpy(f0).float().to(self.device).unsqueeze(-1).unsqueeze(0)
print("[PITCH_F0_ORG1]", f0)
f0 = torch.from_numpy(f0).float().unsqueeze(-1).unsqueeze(0)
f0 = f0 * 2 ** (float(key) / 12)
print("[PITCH_F0_ORG2]", f0)
return f0
def extract(self, audio, uv_interp=False, device=None, silence_front=0, sr=None): # audio: 1d numpy array
if sr is not None:
assert self.hop_size_follow_input
self.hop_size = self.block_size * sr / self.model_sampling_rate
if (self.f0_extractor == 'crepe') and (sr != self.sample_rate):
key_str = str(sr)
if key_str not in CREPE_RESAMPLE_KERNEL:
CREPE_RESAMPLE_KERNEL[key_str] = Resample(sr, 16000, lowpass_filter_width=128)
self.resample_kernel = CREPE_RESAMPLE_KERNEL[key_str]
self.sample_rate = sr
# extractor start time
raw_audio = audio
n_frames = int(len(audio) // self.hop_size) + 1
start_frame = int(silence_front * self.sample_rate / self.hop_size)
real_silence_front = start_frame * self.hop_size / self.sample_rate
audio = audio[int(np.round(real_silence_front * self.sample_rate)):]
if self.f0_extractor == 'dio':
_f0, t = pw.dio(
audio.astype('double'),
self.sample_rate,
f0_floor=self.f0_min,
f0_ceil=self.f0_max,
channels_in_octave=2,
frame_period=(1000 * self.hop_size / self.sample_rate))
f0 = pw.stonemask(audio.astype('double'), _f0, t, self.sample_rate)
f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
# extract f0 using harvest
elif self.f0_extractor == 'harvest':
print("[SRC AUDIO2]", audio[:10])
print("_____hopsize______", (1000 * self.hop_size / self.sample_rate), self.sample_rate)
f0, _ = pw.harvest(
audio.astype('double'),
self.sample_rate,
f0_floor=self.f0_min,
f0_ceil=self.f0_max,
frame_period=(1000 * self.hop_size / self.sample_rate))
print("[HARVEST-----1111]", f0)
f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
print("[HARVEST-----1112]", f0)
# extract f0 using crepe
elif self.f0_extractor == 'crepe':
if device is None:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
resample_kernel = self.resample_kernel.to(device)
wav16k_torch = resample_kernel(torch.FloatTensor(audio).unsqueeze(0).to(device))
f0, pd = torchcrepe.predict(wav16k_torch, 16000, 80, self.f0_min, self.f0_max, pad=True, model='full',
batch_size=512, device=device, return_periodicity=True)
pd = median_pool_1d(pd, 4)
f0 = torchcrepe.threshold.At(0.05)(f0, pd)
f0 = masked_avg_pool_1d(f0, 4)
f0 = f0.squeeze(0).cpu().numpy()
f0 = np.array(
[f0[int(min(int(np.round(n * self.hop_size / self.sample_rate / 0.005)), len(f0) - 1))] for n in
range(n_frames - start_frame)])
f0 = np.pad(f0, (start_frame, 0))
elif self.f0_extractor == "transformer_f0":
if self.transformer_f0 is None:
from transformer_f0.model import TransformerF0Infer
self.transformer_f0 = TransformerF0Infer(model_path='exp/f0_test_genshin/model_540000.pt')
# raw_audio = audio
f0 = self.transformer_f0(audio=raw_audio, sr=self.sample_rate)
# f0 = f0.transpose(1, 2)
# f0 = torch.nn.functional.interpolate(f0, size=int(n_frames), mode='nearest')
# f0 = f0.transpose(1, 2)
f0 = f0.squeeze().cpu().numpy()
# f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
else:
raise ValueError(f" [x] Unknown f0 extractor: {self.f0_extractor}")
# interpolate the unvoiced f0
if uv_interp:
uv = f0 == 0
if len(f0[~uv]) > 0:
f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
f0[f0 < self.f0_min] = self.f0_min
print("[HARVEST-----1113]", f0)
return f0
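
For reference, a minimal sketch of how this F0_Extractor could be exercised on its own, using the same 44100/512 pairing that Pipeline.load_f0_extractor passes below. The `dio` backend and the synthetic sine input are assumptions made purely for illustration; they are not part of this commit.

```python
import numpy as np
import torch

# Hypothetical standalone check (illustrative, not part of the commit).
# block_size/model_sampling_rate are set so hop_size_follow_input is enabled
# and `sr` can be supplied at call time, as the pipeline does.
extractor = F0_Extractor(
    f0_extractor="dio",      # avoids the crepe model download for a quick check
    sample_rate=44100,
    hop_size=512,
    f0_min=65,
    f0_max=800,
    block_size=512,
    model_sampling_rate=44100,
)

# One second of a 220 Hz sine; extract_f0 expects a torch tensor and returns Hz.
t = np.arange(44100) / 44100
audio = torch.from_numpy(0.5 * np.sin(2 * np.pi * 220 * t)).float()
f0 = extractor.extract_f0(audio, key=0, sr=44100)
print(f0.shape)  # (1, n_frames, 1)
```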

View File

@@ -9,10 +9,11 @@ from Exceptions import (
     NotEnoughDataExtimateF0,
 )
 from voice_changer.DiffusionSVC.inferencer.Inferencer import Inferencer
+from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.F0Extractor import F0_Extractor
+from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
 from voice_changer.RVC.embedder.Embedder import Embedder
-from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
 from voice_changer.common.VolumeExtractor import VolumeExtractor
@@ -48,6 +49,7 @@ class Pipeline(object):
         self.volumeExtractor = VolumeExtractor(self.hop_size)
         self.embedder = embedder
         self.pitchExtractor = pitchExtractor
+        # self.f0ex = self.load_f0_extractor(f0_model="harvest", f0_min=50, f0_max=1100)
         print("VOLUME EXTRACTOR", self.volumeExtractor)
         print("GENERATE INFERENCER", self.inferencer)
@@ -58,6 +60,18 @@ class Pipeline(object):
         self.device = device
         self.isHalf = False

+    def load_f0_extractor(self, f0_model, f0_min=None, f0_max=None):
+        f0_extractor = F0_Extractor(
+            f0_extractor=f0_model,
+            sample_rate=44100,
+            hop_size=512,
+            f0_min=f0_min,
+            f0_max=f0_max,
+            block_size=512,
+            model_sampling_rate=44100
+        )
+        return f0_extractor
+
     def getPipelineInfo(self):
         volumeExtractorInfo = self.volumeExtractor.getVolumeExtractorInfo()
         inferencerInfo = self.inferencer.getInferencerInfo() if self.inferencer else {}
@@ -95,8 +109,10 @@ class Pipeline(object):
         n_frames = int(audio_pad.size(-1) // self.hop_size + 1)
         volume, mask = self.extract_volume_and_mask(audio, threhold=-60.0)

         # pitch detection
         try:
+            # print("[SRC AUDIO----]", audio_pad)
             pitch, pitchf = self.pitchExtractor.extract(
                 audio_pad,
                 pitchf,
@@ -106,7 +122,6 @@
                 int(self.hop_size),  # processing window size (512 at 44100)
                 silence_front=silence_front,
             )
-            print("[Pitch]", pitch)
             pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long()  # the buffer assumes a 160-sample window size, so trim here
             pitchf = torch.tensor(pitchf[-n_frames:], device=self.device, dtype=torch.float).unsqueeze(0)  # the buffer assumes a 160-sample window size, so trim here
@@ -114,6 +129,9 @@
             # print(e)
             raise NotEnoughDataExtimateF0()

+        # f0 = self.f0ex.extract_f0(audio_pad, key=4, sr=44100)
+        # print("[Pitch_f0]", f0)
+
         # adjust tensor types
         feats = audio_pad
         if feats.dim() == 2:  # double channels
@@ -155,13 +173,13 @@ class Pipeline(object):
         # when pitch estimation fails (pitchf=0), blend in the features from before retrieval
         # there are open questions about how pitchff is built, but it matches the upstream implementation, so keep it as is
         # https://github.com/w-okada/voice-changer/pull/276#issuecomment-1571336929
-        if protect < 0.5:
-            pitchff = pitchf.clone()
-            pitchff[pitchf > 0] = 1
-            pitchff[pitchf < 1] = protect
-            pitchff = pitchff.unsqueeze(-1)
-            feats = feats * pitchff + feats0 * (1 - pitchff)
-            feats = feats.to(feats0.dtype)
+        # if protect < 0.5:
+        #     pitchff = pitchf.clone()
+        #     pitchff[pitchf > 0] = 1
+        #     pitchff[pitchf < 1] = protect
+        #     pitchff = pitchff.unsqueeze(-1)
+        #     feats = feats * pitchff + feats0 * (1 - pitchff)
+        #     feats = feats.to(feats0.dtype)

         # # apply silent front for inference
         # if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]:
@@ -176,7 +194,7 @@ class Pipeline(object):
                 torch.clip(
                     self.inferencer.infer(
                         feats,
-                        pitch.unsqueeze(-1),
+                        pitchf.unsqueeze(-1),
                         volume,
                         mask,
                         sid,

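The change above swaps the inferencer input from `pitch` (coarse 1-255 mel bins) to `pitchf` (the continuous f0 curve in Hz). A tiny illustration of the two tensors, with made-up values rather than anything taken from the pipeline:

```python
import torch

# Illustrative only: shapes and values are assumptions, not pipeline output.
pitch = torch.tensor([[1, 60, 61, 62, 1]], dtype=torch.long)   # coarse mel bins
pitchf = torch.tensor([[0.0, 221.3, 224.8, 228.1, 0.0]])       # f0 in Hz, 0 = unvoiced
print(pitch.unsqueeze(-1).shape, pitchf.unsqueeze(-1).shape)   # both (1, n_frames, 1)
```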
View File

@@ -2,10 +2,10 @@ import traceback
 from data.ModelSlot import DiffusionSVCModelSlot
 from voice_changer.DiffusionSVC.inferencer.InferencerManager import InferencerManager
 from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
+from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
 from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
-from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager


 def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str):

View File

@@ -0,0 +1,66 @@
import numpy as np
from const import PitchExtractorType
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
import onnxruntime
from voice_changer.RVC.pitchExtractor import onnxcrepe
class CrepeOnnxPitchExtractor(PitchExtractor):
def __init__(self, pitchExtractorType: PitchExtractorType, file: str, gpu: int):
self.pitchExtractorType = pitchExtractorType
super().__init__()
(
onnxProviders,
onnxProviderOptions,
) = DeviceManager.get_instance().getOnnxExecutionProvider(gpu)
self.onnx_session = onnxruntime.InferenceSession(
file, providers=onnxProviders, provider_options=onnxProviderOptions
)
def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
n_frames = int(len(audio) // window) + 1
start_frame = int(silence_front * sr / window)
real_silence_front = start_frame * window / sr
silence_front_offset = int(np.round(real_silence_front * sr))
audio = audio[silence_front_offset:]
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
precision = 10.0
audio_num = audio.cpu()
onnx_f0, onnx_pd = onnxcrepe.predict(
self.onnx_session,
audio_num,
sr,
precision=precision,
fmin=f0_min,
fmax=f0_max,
batch_size=256,
return_periodicity=True,
decoder=onnxcrepe.decode.weighted_argmax,
)
f0 = onnxcrepe.filter.median(onnx_f0, 3)
pd = onnxcrepe.filter.median(onnx_pd, 3)
f0[pd < 0.1] = 0
f0 = f0.squeeze()
f0 *= pow(2, f0_up_key / 12)
pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
f0bak = pitchf.copy()
f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
f0_mel = np.clip(
(f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
)
pitch_coarse = f0_mel.astype(int)
return pitch_coarse, pitchf
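
The last few lines of extract map f0 in Hz onto 1-255 coarse mel bins, with unvoiced frames (0 Hz) collapsing to bin 1. A small standalone check of that mapping, with illustrative f0 values that are not taken from any real recording:

```python
import numpy as np

# Standalone check of the coarse-pitch quantization used above (illustrative only).
f0_min, f0_max = 50, 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)

f0 = np.array([0.0, 50.0, 220.0, 440.0, 1100.0])  # Hz
f0_mel = 1127.0 * np.log(1.0 + f0 / 700.0)
coarse = np.clip(
    (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
).astype(int)
print(coarse)  # 0 Hz and f0_min both land on bin 1; f0_max lands on bin 255
```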

View File

@@ -0,0 +1,59 @@
import torchcrepe
import torch
import numpy as np
from const import PitchExtractorType
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class CrepePitchExtractor(PitchExtractor):
def __init__(self):
super().__init__()
self.pitchExtractorType: PitchExtractorType = "crepe"
if torch.cuda.is_available():
self.device = torch.device("cuda:" + str(torch.cuda.current_device()))
else:
self.device = torch.device("cpu")
def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
n_frames = int(len(audio) // window) + 1
start_frame = int(silence_front * sr / window)
real_silence_front = start_frame * window / sr
silence_front_offset = int(np.round(real_silence_front * sr))
audio = audio[silence_front_offset:]
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
f0, pd = torchcrepe.predict(
audio.unsqueeze(0),
sr,
hop_length=window,
fmin=f0_min,
fmax=f0_max,
# model="tiny",
model="full",
batch_size=256,
decoder=torchcrepe.decode.weighted_argmax,
device=self.device,
return_periodicity=True,
)
        f0 = torchcrepe.filter.median(f0, 3)  # upstream uses a mean filter; use median here to match harvest
pd = torchcrepe.filter.median(pd, 3)
f0[pd < 0.1] = 0
f0 = f0.squeeze()
f0 *= pow(2, f0_up_key / 12)
pitchf[-f0.shape[0]:] = f0.detach().cpu().numpy()[:pitchf.shape[0]]
f0bak = pitchf.copy()
f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
f0_mel = np.clip(
(f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
)
pitch_coarse = f0_mel.astype(int)
return pitch_coarse, pitchf

View File

@@ -0,0 +1,49 @@
import pyworld
import numpy as np
from const import PitchExtractorType
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class DioPitchExtractor(PitchExtractor):
def __init__(self):
super().__init__()
self.pitchExtractorType: PitchExtractorType = "dio"
def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
audio = audio.detach().cpu().numpy()
n_frames = int(len(audio) // window) + 1 # NOQA
start_frame = int(silence_front * sr / window)
real_silence_front = start_frame * window / sr
silence_front_offset = max(min(int(np.round(real_silence_front * sr)), len(audio) - 3000), 0)
audio = audio[silence_front_offset:]
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
_f0, t = pyworld.dio(
audio.astype(np.double),
sr,
f0_floor=f0_min,
f0_ceil=f0_max,
channels_in_octave=2,
frame_period=10,
)
f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, sr)
f0 *= pow(2, f0_up_key / 12)
pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
f0bak = pitchf.copy()
f0_mel = 1127 * np.log(1 + f0bak / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
f0_mel_max - f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
pitch_coarse = np.rint(f0_mel).astype(int)
return pitch_coarse, pitchf

View File

@@ -0,0 +1,112 @@
import pyworld
import numpy as np
import scipy.signal as signal
from const import PitchExtractorType
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class HarvestPitchExtractor(PitchExtractor):
def __init__(self):
super().__init__()
self.pitchExtractorType: PitchExtractorType = "harvest"
def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
audio = audio.detach().cpu().numpy()
n_frames = int(len(audio) // window) + 1 # NOQA
start_frame = int(silence_front * sr / window)
real_silence_front = start_frame * window / sr
# silence_front_offset = int(np.round(real_silence_front * sr))
# audio = audio[silence_front_offset:]
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
f0 = self.extract2(audio, uv_interp=True, hop_size=window, silence_front=silence_front)
f0 = f0 * 2 ** (float(f0_up_key) / 12)
pitchf = f0
# f0, t = pyworld.harvest(
# audio.astype(np.double),
# fs=sr,
# f0_ceil=f0_max,
# frame_period=10,
# )
# f0 = pyworld.stonemask(audio.astype(np.double), f0, t, sr)
# f0 = signal.medfilt(f0, 3)
# f0 *= pow(2, f0_up_key / 12)
# pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
f0bak = pitchf.copy()
f0_mel = 1127 * np.log(1 + f0bak / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
f0_mel_max - f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
pitch_coarse = np.rint(f0_mel).astype(int)
return pitch_coarse, pitchf
def extract2(self, audio, uv_interp, hop_size: int, silence_front=0): # audio: 1d numpy array
n_frames = int(len(audio) // hop_size) + 1
start_frame = int(silence_front * 16000 / hop_size)
real_silence_front = start_frame * hop_size / 16000
audio = audio[int(np.round(real_silence_front * 16000)):]
f0, _ = pyworld.harvest(
audio.astype('double'),
16000,
f0_floor=50,
f0_ceil=1100,
frame_period=(1000 * hop_size / 16000))
f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
if uv_interp:
uv = f0 == 0
if len(f0[~uv]) > 0:
f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
f0[f0 < 50] = 50
return f0
def extract_old(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
audio = audio.detach().cpu().numpy()
n_frames = int(len(audio) // window) + 1 # NOQA
start_frame = int(silence_front * sr / window)
real_silence_front = start_frame * window / sr
silence_front_offset = int(np.round(real_silence_front * sr))
audio = audio[silence_front_offset:]
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
f0, t = pyworld.harvest(
audio.astype(np.double),
fs=sr,
f0_ceil=f0_max,
frame_period=10,
)
f0 = pyworld.stonemask(audio.astype(np.double), f0, t, sr)
f0 = signal.medfilt(f0, 3)
f0 *= pow(2, f0_up_key / 12)
pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
f0bak = pitchf.copy()
f0_mel = 1127 * np.log(1 + f0bak / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
f0_mel_max - f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
pitch_coarse = np.rint(f0_mel).astype(int)
return pitch_coarse, pitchf
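
A minimal sketch of calling this extractor in isolation. Because extract2 hard-codes 16000 Hz, the dummy input below is generated at 16 kHz; the buffer size and sine input are assumptions for illustration, not values from the pipeline. Note that in this WIP version pitchf is replaced wholesale by the harvest output rather than written into the tail of the passed-in buffer.

```python
import numpy as np
import torch

# Hypothetical standalone call (illustrative, not part of the commit).
sr, window = 16000, 160                     # extract2 assumes 16 kHz internally
audio = torch.from_numpy(
    0.5 * np.sin(2 * np.pi * 220 * np.arange(sr) / sr)
).float()                                    # one second of a 220 Hz tone

n_frames = int(len(audio) // window) + 1
pitchf = np.zeros(n_frames)                  # running f0 buffer, as the pipeline keeps one

extractor = HarvestPitchExtractor()
pitch_coarse, pitchf = extractor.extract(
    audio, pitchf, f0_up_key=0, sr=sr, window=window, silence_front=0
)
print(pitch_coarse[:5], pitchf[:5])          # coarse bins and f0 in Hz
```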

View File

@@ -0,0 +1,12 @@
from typing import Protocol
class PitchExtractor(Protocol):
    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
...
def getPitchExtractorInfo(self):
return {
"pitchExtractorType": self.pitchExtractorType,
}

View File

@@ -0,0 +1,41 @@
from typing import Protocol
from const import PitchExtractorType
from voice_changer.DiffusionSVC.pitchExtractor.HarvestPitchExtractor import HarvestPitchExtractor
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
class PitchExtractorManager(Protocol):
currentPitchExtractor: PitchExtractor | None = None
params: VoiceChangerParams
@classmethod
def initialize(cls, params: VoiceChangerParams):
cls.params = params
@classmethod
def getPitchExtractor(
cls, pitchExtractorType: PitchExtractorType, gpu: int
) -> PitchExtractor:
cls.currentPitchExtractor = cls.loadPitchExtractor(pitchExtractorType, gpu)
return cls.currentPitchExtractor
@classmethod
def loadPitchExtractor(
cls, pitchExtractorType: PitchExtractorType, gpu: int
) -> PitchExtractor:
if pitchExtractorType == "harvest":
return HarvestPitchExtractor()
# elif pitchExtractorType == "dio":
# return DioPitchExtractor()
# elif pitchExtractorType == "crepe":
# return CrepePitchExtractor()
# elif pitchExtractorType == "crepe_tiny":
# return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_tiny, gpu)
# elif pitchExtractorType == "crepe_full":
# return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_full, gpu)
else:
            # no default fallback for unknown pitch extractor types
            raise RuntimeError(
                "[Voice Changer] PitchExtractor not found", pitchExtractorType
)
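
A quick sketch of how the manager might be exercised. Since the "harvest" branch never consults cls.params or gpu, initialize() can be skipped for a standalone check; a real integration would presumably call PitchExtractorManager.initialize(params) first.

```python
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager

# Illustrative only: the harvest branch ignores gpu and cls.params.
pitchExtractor = PitchExtractorManager.getPitchExtractor("harvest", gpu=0)
print(pitchExtractor.getPitchExtractorInfo())  # {'pitchExtractorType': 'harvest'}
```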

View File

@@ -255,6 +255,7 @@ class VoiceChanger:
            # post-processing
            with Timer("post-process") as t:
                result = result.astype(np.int16)
                if self.settings.outputSampleRate != processing_sampling_rate:
                    # print(
                    #     "output samplingrate",
@@ -291,6 +292,7 @@
                print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
                perf = [preprocess_time, mainprocess_time, postprocess_time]
                return outputData, perf
        except NoModeLoadedException as e: