diff --git a/server/voice_changer/DiffusionSVC/DiffusionSVC.py b/server/voice_changer/DiffusionSVC/DiffusionSVC.py
index a030f354..df74a556 100644
--- a/server/voice_changer/DiffusionSVC/DiffusionSVC.py
+++ b/server/voice_changer/DiffusionSVC/DiffusionSVC.py
@@ -6,15 +6,15 @@ import torch
 import torchaudio
 from data.ModelSlot import DiffusionSVCModelSlot
 from voice_changer.DiffusionSVC.DiffusionSVCSettings import DiffusionSVCSettings
+from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
 from voice_changer.DiffusionSVC.pipeline.PipelineGenerator import createPipeline
+from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
-from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
 from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel
 from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
+from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
 from voice_changer.RVC.onnxExporter.export2onnx import export2onnx
-from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
-from voice_changer.RVC.pipeline.Pipeline import Pipeline
 from Exceptions import DeviceCannotSupportHalfPrecisionException
@@ -165,8 +165,6 @@ class DiffusionSVC(VoiceChangerModel):
             # result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
             result = audio_out.detach().cpu().numpy()
 
-            print("RESULT", result)
-
             return result
         except DeviceCannotSupportHalfPrecisionException as e:  # NOQA
             print("[Device Manager] Device cannot support half precision. Fallback to float....")
diff --git a/server/voice_changer/DiffusionSVC/inferencer/diffusion_svc_model/F0Extractor.py b/server/voice_changer/DiffusionSVC/inferencer/diffusion_svc_model/F0Extractor.py
new file mode 100644
index 00000000..7cd84a98
--- /dev/null
+++ b/server/voice_changer/DiffusionSVC/inferencer/diffusion_svc_model/F0Extractor.py
@@ -0,0 +1,169 @@
+
+from torchaudio.transforms import Resample
+import pyworld as pw
+import numpy as np
+import torchcrepe
+import torch
+import torch.nn.functional as F
+CREPE_RESAMPLE_KERNEL = {}
+
+
+def median_pool_1d(x, kernel_size):
+    x = x.unsqueeze(1)
+    x = F.pad(x, ((kernel_size - 1) // 2, kernel_size // 2), mode="reflect")
+    x = x.squeeze(1)
+    x = x.unfold(1, kernel_size, 1)
+    x, _ = torch.sort(x, dim=-1)
+    return x[:, :, (kernel_size - 1) // 2]
+
+
+def masked_avg_pool_1d(x, kernel_size):
+    x = x.unsqueeze(1)
+    x = F.pad(x, ((kernel_size - 1) // 2, kernel_size // 2), mode="reflect")
+    mask = ~torch.isnan(x)
+    masked_x = torch.where(mask, x, torch.zeros_like(x))
+    ones_kernel = torch.ones(x.size(1), 1, kernel_size, device=x.device)
+
+    # Perform sum pooling
+    sum_pooled = F.conv1d(
+        masked_x,
+        ones_kernel,
+        stride=1,
+        padding=0,
+        groups=x.size(1),
+    )
+
+    # Count the non-masked (valid) elements in each pooling window
+    valid_count = F.conv1d(
+        mask.float(),
+        ones_kernel,
+        stride=1,
+        padding=0,
+        groups=x.size(1),
+    )
+    valid_count = valid_count.clamp(min=1)  # Avoid division by zero
+
+    # Perform masked average pooling
+    avg_pooled = sum_pooled / valid_count
+
+    return avg_pooled.squeeze(1)
+
+
+class F0_Extractor:
+    def __init__(self, f0_extractor, sample_rate=44100, hop_size=512, f0_min=65, f0_max=800,
+                 block_size=None, model_sampling_rate=None):
+        self.block_size = block_size
+        self.model_sampling_rate = model_sampling_rate
+        self.f0_extractor = f0_extractor
+        self.sample_rate = sample_rate
+        self.hop_size = hop_size
+        self.f0_min = f0_min
+        self.f0_max = f0_max
+        self.transformer_f0 = None
+        if f0_extractor == 'crepe':
+            key_str = str(sample_rate)
+            if key_str not in CREPE_RESAMPLE_KERNEL:
+                CREPE_RESAMPLE_KERNEL[key_str] = Resample(sample_rate, 16000, lowpass_filter_width=128)
+            self.resample_kernel = CREPE_RESAMPLE_KERNEL[key_str]
+        if (self.block_size is not None) or (self.model_sampling_rate is not None):
+            assert (self.block_size is not None) and (self.model_sampling_rate is not None)
+            self.hop_size_follow_input = True
+        else:
+            self.hop_size_follow_input = False
+
+    @torch.no_grad()
+    def extract_f0(self, audio, key=0, sr=44100, silence_front=0):
+        f0 = self.extract(audio.cpu().numpy(), uv_interp=True, silence_front=silence_front, sr=sr)
+        # f0 = torch.from_numpy(f0).float().to(self.device).unsqueeze(-1).unsqueeze(0)
+        print("[PITCH_F0_ORG1]", f0)
+        f0 = torch.from_numpy(f0).float().unsqueeze(-1).unsqueeze(0)
+        f0 = f0 * 2 ** (float(key) / 12)
+        print("[PITCH_F0_ORG2]", f0)
+        return f0
+
+    def extract(self, audio, uv_interp=False, device=None, silence_front=0, sr=None):  # audio: 1d numpy array
+        if sr is not None:
+            assert self.hop_size_follow_input
+            self.hop_size = self.block_size * sr / self.model_sampling_rate
+            if (self.f0_extractor == 'crepe') and (sr != self.sample_rate):
+                key_str = str(sr)
+                if key_str not in CREPE_RESAMPLE_KERNEL:
+                    CREPE_RESAMPLE_KERNEL[key_str] = Resample(sr, 16000, lowpass_filter_width=128)
+                self.resample_kernel = CREPE_RESAMPLE_KERNEL[key_str]
+            self.sample_rate = sr
+
+        # extractor start time
+        raw_audio = audio
+        n_frames = int(len(audio) // self.hop_size) + 1
+
+        start_frame = int(silence_front * self.sample_rate / self.hop_size)
+        real_silence_front = start_frame * self.hop_size / self.sample_rate
+        audio = audio[int(np.round(real_silence_front * self.sample_rate)):]
+
+        if self.f0_extractor == 'dio':
+            _f0, t = pw.dio(
+                audio.astype('double'),
+                self.sample_rate,
+                f0_floor=self.f0_min,
+                f0_ceil=self.f0_max,
+                channels_in_octave=2,
+                frame_period=(1000 * self.hop_size / self.sample_rate))
+            f0 = pw.stonemask(audio.astype('double'), _f0, t, self.sample_rate)
+            f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
+
+        # extract f0 using harvest
+        elif self.f0_extractor == 'harvest':
+            print("[SRC AUDIO2]", audio[:10])
+            print("_____hopsize______", (1000 * self.hop_size / self.sample_rate), self.sample_rate)
+            f0, _ = pw.harvest(
+                audio.astype('double'),
+                self.sample_rate,
+                f0_floor=self.f0_min,
+                f0_ceil=self.f0_max,
+                frame_period=(1000 * self.hop_size / self.sample_rate))
+            print("[HARVEST-----1111]", f0)
+            f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
+            print("[HARVEST-----1112]", f0)
+
+        # extract f0 using crepe
+        elif self.f0_extractor == 'crepe':
+            if device is None:
+                device = 'cuda' if torch.cuda.is_available() else 'cpu'
+            resample_kernel = self.resample_kernel.to(device)
+            wav16k_torch = resample_kernel(torch.FloatTensor(audio).unsqueeze(0).to(device))
+
+            f0, pd = torchcrepe.predict(wav16k_torch, 16000, 80, self.f0_min, self.f0_max, pad=True, model='full',
+                                        batch_size=512, device=device, return_periodicity=True)
+            pd = median_pool_1d(pd, 4)
+            f0 = torchcrepe.threshold.At(0.05)(f0, pd)
+            f0 = masked_avg_pool_1d(f0, 4)
+
+            f0 = f0.squeeze(0).cpu().numpy()
+            f0 = np.array(
+                [f0[int(min(int(np.round(n * self.hop_size / self.sample_rate / 0.005)), len(f0) - 1))] for n in
+                 range(n_frames - start_frame)])
+            f0 = np.pad(f0, (start_frame, 0))
+
+        elif self.f0_extractor == "transformer_f0":
+            if self.transformer_f0 is None:
+                from transformer_f0.model import TransformerF0Infer
+                self.transformer_f0 = TransformerF0Infer(model_path='exp/f0_test_genshin/model_540000.pt')
+            # raw_audio = audio
+            f0 = self.transformer_f0(audio=raw_audio, sr=self.sample_rate)
+            # f0 = f0.transpose(1, 2)
+            # f0 = torch.nn.functional.interpolate(f0, size=int(n_frames), mode='nearest')
+            # f0 = f0.transpose(1, 2)
+            f0 = f0.squeeze().cpu().numpy()
+            # f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
+        else:
+            raise ValueError(f" [x] Unknown f0 extractor: {self.f0_extractor}")
+
+        # interpolate the unvoiced f0
+        if uv_interp:
+            uv = f0 == 0
+            if len(f0[~uv]) > 0:
+                f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
+            f0[f0 < self.f0_min] = self.f0_min
+
+        print("[HARVEST-----1113]", f0)
+        return f0
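The two pooling helpers in F0Extractor.py smooth a CREPE pitch track: median_pool_1d takes a sliding median (useful for suppressing periodicity outliers), while masked_avg_pool_1d averages each window and simply ignores NaN frames, which is what the thresholded f0 contains. A minimal standalone sketch of their behaviour (not part of the patch; the toy values are made up, and it assumes it is run from the server directory so the module path resolves):

    import torch
    from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.F0Extractor import (
        masked_avg_pool_1d,
        median_pool_1d,
    )

    # Toy pitch track: 1 batch, 8 frames; NaN marks frames rejected by thresholding.
    f0 = torch.tensor([[220.0, 221.0, float("nan"), 223.0, 880.0, 224.0, float("nan"), 225.0]])
    smoothed = masked_avg_pool_1d(f0, kernel_size=4)   # NaNs are excluded from each window's mean
    print(smoothed.shape)                              # torch.Size([1, 8])

    # Toy periodicity track, smoothed with a 4-frame sliding median.
    pd = torch.tensor([[0.9, 0.8, 0.1, 0.85, 0.2, 0.9, 0.05, 0.88]])
    pd_med = median_pool_1d(pd, kernel_size=4)
    print(pd_med.shape)                                # torch.Size([1, 8])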
diff --git a/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py b/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
index e14b57a0..c126f575 100644
--- a/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
+++ b/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
@@ -9,10 +9,11 @@ from Exceptions import (
     NotEnoughDataExtimateF0,
 )
 from voice_changer.DiffusionSVC.inferencer.Inferencer import Inferencer
+from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.F0Extractor import F0_Extractor
+from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
 from voice_changer.RVC.embedder.Embedder import Embedder
-from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
 from voice_changer.common.VolumeExtractor import VolumeExtractor
 
@@ -48,6 +49,7 @@ class Pipeline(object):
         self.volumeExtractor = VolumeExtractor(self.hop_size)
         self.embedder = embedder
         self.pitchExtractor = pitchExtractor
+        # self.f0ex = self.load_f0_extractor(f0_model="harvest", f0_min=50, f0_max=1100)
         print("VOLUME EXTRACTOR", self.volumeExtractor)
         print("GENERATE INFERENCER", self.inferencer)
@@ -58,6 +60,18 @@ class Pipeline(object):
         self.device = device
         self.isHalf = False
 
+    def load_f0_extractor(self, f0_model, f0_min=None, f0_max=None):
+        f0_extractor = F0_Extractor(
+            f0_extractor=f0_model,
+            sample_rate=44100,
+            hop_size=512,
+            f0_min=f0_min,
+            f0_max=f0_max,
+            block_size=512,
+            model_sampling_rate=44100
+        )
+        return f0_extractor
+
     def getPipelineInfo(self):
         volumeExtractorInfo = self.volumeExtractor.getVolumeExtractorInfo()
         inferencerInfo = self.inferencer.getInferencerInfo() if self.inferencer else {}
@@ -95,8 +109,10 @@ class Pipeline(object):
         n_frames = int(audio_pad.size(-1) // self.hop_size + 1)
 
         volume, mask = self.extract_volume_and_mask(audio, threhold=-60.0)
 
+        # pitch detection
         try:
+            # print("[SRC AUDIO----]", audio_pad)
             pitch, pitchf = self.pitchExtractor.extract(
                 audio_pad,
                 pitchf,
@@ -106,7 +122,6 @@
                 int(self.hop_size),  # window size used for processing (512 at 44100)
                 silence_front=silence_front,
             )
-            print("[Pitch]", pitch)
             pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long()  # trimmed because the buffer is built assuming a 160-sample window
             pitchf = torch.tensor(pitchf[-n_frames:], device=self.device, dtype=torch.float).unsqueeze(0)  # trimmed because the buffer is built assuming a 160-sample window
@@ -114,6 +129,9 @@
             # print(e)
             raise NotEnoughDataExtimateF0()
 
+        # f0 = self.f0ex.extract_f0(audio_pad, key=4, sr=44100)
+        # print("[Pitch_f0]", f0)
+
         # adjust tensor shape / dtype
         feats = audio_pad
         if feats.dim() == 2:  # double channels
@@ -155,13 +173,13 @@
         # if pitch estimation fails (pitchf=0), blend in the features from before retrieval
         # there are open questions about how pitchff is built, but it follows the original implementation, so it is kept as is
         # https://github.com/w-okada/voice-changer/pull/276#issuecomment-1571336929
-        if protect < 0.5:
-            pitchff = pitchf.clone()
-            pitchff[pitchf > 0] = 1
-            pitchff[pitchf < 1] = protect
-            pitchff = pitchff.unsqueeze(-1)
-            feats = feats * pitchff + feats0 * (1 - pitchff)
-            feats = feats.to(feats0.dtype)
+        # if protect < 0.5:
+        #     pitchff = pitchf.clone()
+        #     pitchff[pitchf > 0] = 1
+        #     pitchff[pitchf < 1] = protect
+        #     pitchff = pitchff.unsqueeze(-1)
+        #     feats = feats * pitchff + feats0 * (1 - pitchff)
+        #     feats = feats.to(feats0.dtype)
 
         # # apply silent front for inference
         # if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]:
@@ -176,7 +194,7 @@
                 torch.clip(
                     self.inferencer.infer(
                         feats,
-                        pitch.unsqueeze(-1),
+                        pitchf.unsqueeze(-1),
                         volume,
                         mask,
                         sid,
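For reference, the feature-protection step that this change comments out blends the post-retrieval features with the pre-retrieval ones (feats0) wherever no pitch was detected. A standalone sketch of that blend with toy shapes (not part of the patch; in the real pipeline feats/feats0 come from the embedder and index retrieval):

    import torch

    protect = 0.33
    feats = torch.randn(1, 10, 768)    # features after retrieval/index mixing
    feats0 = torch.randn(1, 10, 768)   # features before retrieval
    # per-frame f0 in Hz; 0 means "no pitch detected" for that frame
    pitchf = torch.tensor([[0., 0., 0., 220., 222., 223., 225., 0., 230., 231.]])

    if protect < 0.5:
        pitchff = pitchf.clone()
        pitchff[pitchf > 0] = 1          # voiced frames keep the retrieved features
        pitchff[pitchf < 1] = protect    # unvoiced frames fall back toward feats0 by `protect`
        pitchff = pitchff.unsqueeze(-1)  # (1, 10, 1) broadcasts over the feature dimension
        feats = feats * pitchff + feats0 * (1 - pitchff)
        feats = feats.to(feats0.dtype)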
diff --git a/server/voice_changer/DiffusionSVC/pipeline/PipelineGenerator.py b/server/voice_changer/DiffusionSVC/pipeline/PipelineGenerator.py
index 6545b6f1..59712004 100644
--- a/server/voice_changer/DiffusionSVC/pipeline/PipelineGenerator.py
+++ b/server/voice_changer/DiffusionSVC/pipeline/PipelineGenerator.py
@@ -2,10 +2,10 @@ import traceback
 from data.ModelSlot import DiffusionSVCModelSlot
 from voice_changer.DiffusionSVC.inferencer.InferencerManager import InferencerManager
 from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
+from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
 from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
-from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
 
 
 def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str):
diff --git a/server/voice_changer/DiffusionSVC/pitchExtractor/CrepeOnnxPitchExtractor.py b/server/voice_changer/DiffusionSVC/pitchExtractor/CrepeOnnxPitchExtractor.py
new file mode 100644
index 00000000..64b1ed42
--- /dev/null
+++ b/server/voice_changer/DiffusionSVC/pitchExtractor/CrepeOnnxPitchExtractor.py
@@ -0,0 +1,66 @@
+import numpy as np
+from const import PitchExtractorType
+from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
+from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
+import onnxruntime
+from voice_changer.RVC.pitchExtractor import onnxcrepe
+
+
+class CrepeOnnxPitchExtractor(PitchExtractor):
+
+    def __init__(self, pitchExtractorType: PitchExtractorType, file: str, gpu: int):
+        self.pitchExtractorType = pitchExtractorType
+        super().__init__()
+        (
+            onnxProviders,
+            onnxProviderOptions,
+        ) = DeviceManager.get_instance().getOnnxExecutionProvider(gpu)
+
+        self.onnx_session = onnxruntime.InferenceSession(
+            file, providers=onnxProviders, provider_options=onnxProviderOptions
+        )
+
+    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
+        n_frames = int(len(audio) // window) + 1
+        start_frame = int(silence_front * sr / window)
+        real_silence_front = start_frame * window / sr
+
+        silence_front_offset = int(np.round(real_silence_front * sr))
+        audio = audio[silence_front_offset:]
+
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+        precision = 10.0
+
+        audio_num = audio.cpu()
+        onnx_f0, onnx_pd = onnxcrepe.predict(
+            self.onnx_session,
+            audio_num,
+            sr,
+            precision=precision,
+            fmin=f0_min,
+            fmax=f0_max,
+            batch_size=256,
+            return_periodicity=True,
+            decoder=onnxcrepe.decode.weighted_argmax,
+        )
+
+        f0 = onnxcrepe.filter.median(onnx_f0, 3)
+        pd = onnxcrepe.filter.median(onnx_pd, 3)
+
+        f0[pd < 0.1] = 0
+        f0 = f0.squeeze()
+
+        f0 *= pow(2, f0_up_key / 12)
+        pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
+        f0bak = pitchf.copy()
+        f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
+        f0_mel = np.clip(
+            (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
+        )
+        pitch_coarse = f0_mel.astype(int)
+
+        return pitch_coarse, pitchf
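The tail of each extractor maps f0 in Hz onto the 1-255 coarse-pitch scale through a mel-style curve, with unvoiced frames (f0 = 0) clipping to 1. A small numpy illustration of that mapping in isolation (standalone sketch, the input values are arbitrary):

    import numpy as np

    f0_min, f0_max = 50, 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = np.array([0.0, 55.0, 110.0, 220.0, 440.0, 880.0])   # Hz; 0 = unvoiced
    f0_mel = 1127.0 * np.log(1.0 + f0 / 700.0)
    coarse = np.clip(
        (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
    ).astype(int)
    print(coarse)   # unvoiced frames clip to 1; 1100 Hz would map to 255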
diff --git a/server/voice_changer/DiffusionSVC/pitchExtractor/CrepePitchExtractor.py b/server/voice_changer/DiffusionSVC/pitchExtractor/CrepePitchExtractor.py
new file mode 100644
index 00000000..55528a0d
--- /dev/null
+++ b/server/voice_changer/DiffusionSVC/pitchExtractor/CrepePitchExtractor.py
@@ -0,0 +1,59 @@
+import torchcrepe
+import torch
+import numpy as np
+from const import PitchExtractorType
+
+from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
+
+
+class CrepePitchExtractor(PitchExtractor):
+
+    def __init__(self):
+        super().__init__()
+        self.pitchExtractorType: PitchExtractorType = "crepe"
+        if torch.cuda.is_available():
+            self.device = torch.device("cuda:" + str(torch.cuda.current_device()))
+        else:
+            self.device = torch.device("cpu")
+
+    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
+        n_frames = int(len(audio) // window) + 1
+        start_frame = int(silence_front * sr / window)
+        real_silence_front = start_frame * window / sr
+
+        silence_front_offset = int(np.round(real_silence_front * sr))
+        audio = audio[silence_front_offset:]
+
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+        f0, pd = torchcrepe.predict(
+            audio.unsqueeze(0),
+            sr,
+            hop_length=window,
+            fmin=f0_min,
+            fmax=f0_max,
+            # model="tiny",
+            model="full",
+            batch_size=256,
+            decoder=torchcrepe.decode.weighted_argmax,
+            device=self.device,
+            return_periodicity=True,
+        )
+        f0 = torchcrepe.filter.median(f0, 3)  # upstream uses mean, but a median filter is used here to match harvest
+        pd = torchcrepe.filter.median(pd, 3)
+        f0[pd < 0.1] = 0
+        f0 = f0.squeeze()
+
+        f0 *= pow(2, f0_up_key / 12)
+        pitchf[-f0.shape[0]:] = f0.detach().cpu().numpy()[:pitchf.shape[0]]
+        f0bak = pitchf.copy()
+        f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
+        f0_mel = np.clip(
+            (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
+        )
+        pitch_coarse = f0_mel.astype(int)
+
+        return pitch_coarse, pitchf
diff --git a/server/voice_changer/DiffusionSVC/pitchExtractor/DioPitchExtractor.py b/server/voice_changer/DiffusionSVC/pitchExtractor/DioPitchExtractor.py
new file mode 100644
index 00000000..a7d4c95b
--- /dev/null
+++ b/server/voice_changer/DiffusionSVC/pitchExtractor/DioPitchExtractor.py
@@ -0,0 +1,49 @@
+import pyworld
+import numpy as np
+from const import PitchExtractorType
+
+from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
+
+
+class DioPitchExtractor(PitchExtractor):
+
+    def __init__(self):
+        super().__init__()
+        self.pitchExtractorType: PitchExtractorType = "dio"
+
+    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
+        audio = audio.detach().cpu().numpy()
+        n_frames = int(len(audio) // window) + 1  # NOQA
+        start_frame = int(silence_front * sr / window)
+        real_silence_front = start_frame * window / sr
+
+        silence_front_offset = max(min(int(np.round(real_silence_front * sr)), len(audio) - 3000), 0)
+        audio = audio[silence_front_offset:]
+
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+        _f0, t = pyworld.dio(
+            audio.astype(np.double),
+            sr,
+            f0_floor=f0_min,
+            f0_ceil=f0_max,
+            channels_in_octave=2,
+            frame_period=10,
+        )
+        f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, sr)
+
+        f0 *= pow(2, f0_up_key / 12)
+        pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
+        f0bak = pitchf.copy()
+        f0_mel = 1127 * np.log(1 + f0bak / 700)
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+            f0_mel_max - f0_mel_min
+        ) + 1
+        f0_mel[f0_mel <= 1] = 1
+        f0_mel[f0_mel > 255] = 255
+        pitch_coarse = np.rint(f0_mel).astype(int)
+
+        return pitch_coarse, pitchf
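A quick way to sanity-check the DIO + StoneMask settings used above, outside the server, is to run them on a synthetic tone (standalone sketch, not part of the patch; pyworld expects float64 input):

    import numpy as np
    import pyworld

    sr = 44100
    t = np.arange(sr) / sr
    audio = 0.5 * np.sin(2 * np.pi * 220.0 * t)          # 1 s test tone at 220 Hz

    _f0, times = pyworld.dio(audio.astype(np.double), sr,
                             f0_floor=50, f0_ceil=1100,
                             channels_in_octave=2, frame_period=10)
    f0 = pyworld.stonemask(audio.astype(np.double), _f0, times, sr)
    print(f0[5:10])   # voiced frames should sit close to 220 Hz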
diff --git a/server/voice_changer/DiffusionSVC/pitchExtractor/HarvestPitchExtractor.py b/server/voice_changer/DiffusionSVC/pitchExtractor/HarvestPitchExtractor.py
new file mode 100644
index 00000000..f475eb02
--- /dev/null
+++ b/server/voice_changer/DiffusionSVC/pitchExtractor/HarvestPitchExtractor.py
@@ -0,0 +1,112 @@
+import pyworld
+import numpy as np
+import scipy.signal as signal
+from const import PitchExtractorType
+
+from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
+
+
+class HarvestPitchExtractor(PitchExtractor):
+
+    def __init__(self):
+        super().__init__()
+        self.pitchExtractorType: PitchExtractorType = "harvest"
+
+    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
+        audio = audio.detach().cpu().numpy()
+        n_frames = int(len(audio) // window) + 1  # NOQA
+        start_frame = int(silence_front * sr / window)
+        real_silence_front = start_frame * window / sr
+
+        # silence_front_offset = int(np.round(real_silence_front * sr))
+        # audio = audio[silence_front_offset:]
+
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+        f0 = self.extract2(audio, uv_interp=True, hop_size=window, silence_front=silence_front)
+        f0 = f0 * 2 ** (float(f0_up_key) / 12)
+        pitchf = f0
+
+        # f0, t = pyworld.harvest(
+        #     audio.astype(np.double),
+        #     fs=sr,
+        #     f0_ceil=f0_max,
+        #     frame_period=10,
+        # )
+        # f0 = pyworld.stonemask(audio.astype(np.double), f0, t, sr)
+        # f0 = signal.medfilt(f0, 3)
+
+        # f0 *= pow(2, f0_up_key / 12)
+        # pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
+        f0bak = pitchf.copy()
+        f0_mel = 1127 * np.log(1 + f0bak / 700)
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+            f0_mel_max - f0_mel_min
+        ) + 1
+        f0_mel[f0_mel <= 1] = 1
+        f0_mel[f0_mel > 255] = 255
+        pitch_coarse = np.rint(f0_mel).astype(int)
+
+        return pitch_coarse, pitchf
+
+    def extract2(self, audio, uv_interp, hop_size: int, silence_front=0):  # audio: 1d numpy array
+        n_frames = int(len(audio) // hop_size) + 1
+
+        start_frame = int(silence_front * 16000 / hop_size)
+        real_silence_front = start_frame * hop_size / 16000
+        audio = audio[int(np.round(real_silence_front * 16000)):]
+
+        f0, _ = pyworld.harvest(
+            audio.astype('double'),
+            16000,
+            f0_floor=50,
+            f0_ceil=1100,
+            frame_period=(1000 * hop_size / 16000))
+        f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
+
+        if uv_interp:
+            uv = f0 == 0
+            if len(f0[~uv]) > 0:
+                f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
+            f0[f0 < 50] = 50
+
+        return f0
+
+    def extract_old(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
+        audio = audio.detach().cpu().numpy()
+        n_frames = int(len(audio) // window) + 1  # NOQA
+        start_frame = int(silence_front * sr / window)
+        real_silence_front = start_frame * window / sr
+
+        silence_front_offset = int(np.round(real_silence_front * sr))
+        audio = audio[silence_front_offset:]
+
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+        f0, t = pyworld.harvest(
+            audio.astype(np.double),
+            fs=sr,
+            f0_ceil=f0_max,
+            frame_period=10,
+        )
+        f0 = pyworld.stonemask(audio.astype(np.double), f0, t, sr)
+        f0 = signal.medfilt(f0, 3)
+
+        f0 *= pow(2, f0_up_key / 12)
+        pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
+        f0bak = pitchf.copy()
+        f0_mel = 1127 * np.log(1 + f0bak / 700)
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+            f0_mel_max - f0_mel_min
+        ) + 1
+        f0_mel[f0_mel <= 1] = 1
+        f0_mel[f0_mel > 255] = 255
+        pitch_coarse = np.rint(f0_mel).astype(int)
+
+        return pitch_coarse, pitchf
\ No newline at end of file
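extract2 above fills unvoiced gaps (f0 == 0) by linear interpolation between the surrounding voiced frames before clamping to the 50 Hz floor. The core of that step in isolation (standalone sketch with toy values, not part of the patch):

    import numpy as np

    f0 = np.array([0.0, 0.0, 200.0, 210.0, 0.0, 0.0, 240.0, 0.0])   # 0 = unvoiced frame
    uv = f0 == 0
    if len(f0[~uv]) > 0:
        # interpolate unvoiced positions from the indices/values of voiced frames
        f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
    f0[f0 < 50] = 50
    print(f0)   # -> [200. 200. 200. 210. 220. 230. 240. 240.]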
diff --git a/server/voice_changer/DiffusionSVC/pitchExtractor/PitchExtractor.py b/server/voice_changer/DiffusionSVC/pitchExtractor/PitchExtractor.py
new file mode 100644
index 00000000..b75b8849
--- /dev/null
+++ b/server/voice_changer/DiffusionSVC/pitchExtractor/PitchExtractor.py
@@ -0,0 +1,12 @@
+from typing import Protocol
+
+
+class PitchExtractor(Protocol):
+
+    def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+        ...
+
+    def getPitchExtractorInfo(self):
+        return {
+            "pitchExtractorType": self.pitchExtractorType,
+        }
diff --git a/server/voice_changer/DiffusionSVC/pitchExtractor/PitchExtractorManager.py b/server/voice_changer/DiffusionSVC/pitchExtractor/PitchExtractorManager.py
new file mode 100644
index 00000000..56bee645
--- /dev/null
+++ b/server/voice_changer/DiffusionSVC/pitchExtractor/PitchExtractorManager.py
@@ -0,0 +1,41 @@
+from typing import Protocol
+from const import PitchExtractorType
+from voice_changer.DiffusionSVC.pitchExtractor.HarvestPitchExtractor import HarvestPitchExtractor
+from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
+from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
+
+
+class PitchExtractorManager(Protocol):
+    currentPitchExtractor: PitchExtractor | None = None
+    params: VoiceChangerParams
+
+    @classmethod
+    def initialize(cls, params: VoiceChangerParams):
+        cls.params = params
+
+    @classmethod
+    def getPitchExtractor(
+        cls, pitchExtractorType: PitchExtractorType, gpu: int
+    ) -> PitchExtractor:
+        cls.currentPitchExtractor = cls.loadPitchExtractor(pitchExtractorType, gpu)
+        return cls.currentPitchExtractor
+
+    @classmethod
+    def loadPitchExtractor(
+        cls, pitchExtractorType: PitchExtractorType, gpu: int
+    ) -> PitchExtractor:
+        if pitchExtractorType == "harvest":
+            return HarvestPitchExtractor()
+        # elif pitchExtractorType == "dio":
+        #     return DioPitchExtractor()
+        # elif pitchExtractorType == "crepe":
+        #     return CrepePitchExtractor()
+        # elif pitchExtractorType == "crepe_tiny":
+        #     return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_tiny, gpu)
+        # elif pitchExtractorType == "crepe_full":
+        #     return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_full, gpu)
+        else:
+            # return hubert as default
+            raise RuntimeError(
+                "[Voice Changer] PitchExctractor not found", pitchExtractorType
+            )
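How the DiffusionSVC pipeline is expected to obtain and call one of these extractors, as a standalone sketch (not part of the patch; buffer sizes are illustrative, only "harvest" is currently wired up in loadPitchExtractor, and the gpu argument is unused for it):

    import numpy as np
    import torch
    from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager

    window = 512                                   # hop size at 44.1 kHz
    audio = torch.zeros(44100)                     # 1 s of audio as a torch tensor
    pitchf = np.zeros(len(audio) // window + 1)    # rolling f0 buffer, one slot per frame

    extractor = PitchExtractorManager.getPitchExtractor("harvest", gpu=-1)
    pitch_coarse, pitchf = extractor.extract(audio, pitchf, 0, 44100, window, silence_front=0)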
diff --git a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py
index d0a01934..3b0bc7b0 100755
--- a/server/voice_changer/VoiceChanger.py
+++ b/server/voice_changer/VoiceChanger.py
@@ -255,6 +255,7 @@ class VoiceChanger:
         # post-processing
         with Timer("post-process") as t:
             result = result.astype(np.int16)
+
             if self.settings.outputSampleRate != processing_sampling_rate:
                 # print(
                 #     "output samplingrate",
@@ -291,6 +292,7 @@ class VoiceChanger:
             print_convert_processing(f"  [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
 
             perf = [preprocess_time, mainprocess_time, postprocess_time]
+
            return outputData, perf
 
        except NoModeLoadedException as e:
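The branch guarded by outputSampleRate != processing_sampling_rate is not shown in full in this hunk; conceptually it has to bring the processed int16 block back to the client's rate. A hedged, generic sketch of such a conversion with torchaudio (not the code in VoiceChanger.py; function name and scaling are placeholders):

    import numpy as np
    import torch
    import torchaudio

    def to_output_rate(result: np.ndarray, processing_sr: int, output_sr: int) -> np.ndarray:
        """Resample an int16 mono block; no-op when the rates already match."""
        if output_sr == processing_sr:
            return result
        wav = torch.from_numpy(result.astype(np.float32) / 32768.0).unsqueeze(0)
        wav = torchaudio.functional.resample(wav, processing_sr, output_sr)
        return (wav.squeeze(0).numpy() * 32768.0).clip(-32768, 32767).astype(np.int16)

    out = to_output_rate(np.zeros(4410, dtype=np.int16), 44100, 48000)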