update information when uploading model

wataru 2023-01-28 15:56:56 +09:00
parent eb846bedcc
commit 9a473a07be
8 changed files with 6728 additions and 3172 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large


@@ -38,7 +38,7 @@
     "postcss-loader": "^7.0.2",
     "postcss-nested": "^6.0.0",
     "prettier": "^2.8.3",
-    "rimraf": "^4.1.1",
+    "rimraf": "^4.1.2",
     "style-loader": "^3.3.1",
     "ts-loader": "^9.4.2",
     "tsconfig-paths": "^4.1.2",
@@ -48,7 +48,7 @@
     "webpack-dev-server": "^4.11.1"
   },
   "dependencies": {
-    "@dannadori/voice-changer-client-js": "^1.0.20",
+    "@dannadori/voice-changer-client-js": "^1.0.28",
     "react": "^18.2.0",
     "react-dom": "^18.2.0"
   }

File diff suppressed because it is too large


@@ -1,6 +1,6 @@
 {
   "name": "@dannadori/voice-changer-client-js",
-  "version": "1.0.20",
+  "version": "1.0.28",
   "description": "",
   "main": "dist/index.js",
   "directories": {
@@ -26,17 +26,17 @@
   "devDependencies": {
     "@types/audioworklet": "^0.0.36",
     "@types/node": "^18.11.18",
-    "@types/react": "18.0.26",
+    "@types/react": "18.0.27",
     "@types/react-dom": "18.0.10",
     "eslint": "^8.32.0",
     "eslint-config-prettier": "^8.6.0",
     "eslint-plugin-prettier": "^4.2.1",
-    "eslint-plugin-react": "^7.32.0",
+    "eslint-plugin-react": "^7.32.1",
     "eslint-webpack-plugin": "^3.2.0",
     "npm-run-all": "^4.1.5",
     "prettier": "^2.8.3",
     "raw-loader": "^4.0.2",
-    "rimraf": "^4.0.7",
+    "rimraf": "^4.1.2",
     "ts-loader": "^9.4.2",
     "typescript": "^4.9.4",
     "webpack": "^5.75.0",


@@ -135,7 +135,7 @@ export const DefaultVoiceChangerServerSetting: VoiceChangerServerSetting = {
     crossFadeOffsetRate: 0.1,
     crossFadeEndRate: 0.9,
     crossFadeOverlapRate: 0.5,
-    framework: "PyTorch",
+    framework: "ONNX",
     onnxExecutionProvider: "CPUExecutionProvider"
 }


@@ -179,10 +179,10 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta
                 console.log(progress, end)
             })
-            const serverInfo = await props.voiceChangerClient.loadModel(fileUploadSetting.configFile, fileUploadSetting.pyTorchModel, fileUploadSetting.onnxModel)
-            console.log(serverInfo)
+            await props.voiceChangerClient.loadModel(fileUploadSetting.configFile, fileUploadSetting.pyTorchModel, fileUploadSetting.onnxModel)
             setUploadProgress(0)
             setIsUploading(false)
+            reloadServerInfo()
         }
     }, [fileUploadSetting, props.voiceChangerClient])
@@ -203,7 +203,6 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta
             framework: res.framework,
             onnxExecutionProvider: (!!res.onnxExecutionProvider && res.onnxExecutionProvider.length > 0) ? res.onnxExecutionProvider[0] as OnnxExecutionProvider : DefaultVoiceChangerServerSetting.onnxExecutionProvider
         })
     }
 }, [props.voiceChangerClient])


@@ -1,6 +1,7 @@
 from const import ERROR_NO_ONNX_SESSION
 import torch
-import os, traceback
+import os
+import traceback
 import numpy as np
 from dataclasses import dataclass, asdict
@@ -11,39 +12,41 @@ from models import SynthesizerTrn
 from voice_changer.TrainerFunctions import TextAudioSpeakerCollate, spectrogram_torch, load_checkpoint, get_hparams_from_file

-providers = ['OpenVINOExecutionProvider',"CUDAExecutionProvider","DmlExecutionProvider","CPUExecutionProvider"]
+providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]

 @dataclass
 class VocieChangerSettings():
-    gpu:int = 0
-    srcId:int = 107
-    dstId:int = 100
-    crossFadeOffsetRate:float = 0.1
-    crossFadeEndRate:float = 0.9
-    crossFadeOverlapRate:float = 0.9
-    convertChunkNum:int = 32
-    minConvertSize:int = 0
-    framework:str = "PyTorch"  # PyTorch or ONNX
-    pyTorchModelFile:str = ""
-    onnxModelFile:str = ""
-    configFile:str = ""
+    gpu: int = 0
+    srcId: int = 107
+    dstId: int = 100
+    crossFadeOffsetRate: float = 0.1
+    crossFadeEndRate: float = 0.9
+    crossFadeOverlapRate: float = 0.9
+    convertChunkNum: int = 32
+    minConvertSize: int = 0
+    framework: str = "ONNX"  # PyTorch or ONNX
+    pyTorchModelFile: str = ""
+    onnxModelFile: str = ""
+    configFile: str = ""
     # ↓ list only the mutable fields
-    intData = ["gpu","srcId", "dstId", "convertChunkNum", "minConvertSize"]
-    floatData = [ "crossFadeOffsetRate", "crossFadeEndRate", "crossFadeOverlapRate"]
+    intData = ["gpu", "srcId", "dstId", "convertChunkNum", "minConvertSize"]
+    floatData = ["crossFadeOffsetRate", "crossFadeEndRate", "crossFadeOverlapRate"]
     strData = ["framework"]

 class VoiceChanger():
-    def __init__(self, config:str):
+    def __init__(self, config: str):
         # initialization
         self.settings = VocieChangerSettings(configFile=config)
-        self.unpackedData_length=0
+        self.unpackedData_length = 0
         self.net_g = None
         self.onnx_session = None
-        self.currentCrossFadeOffsetRate=0
-        self.currentCrossFadeEndRate=0
-        self.currentCrossFadeOverlapRate=0
+        self.currentCrossFadeOffsetRate = 0
+        self.currentCrossFadeEndRate = 0
+        self.currentCrossFadeOverlapRate = 0

         # gather information used in common
         # self.hps = utils.get_hparams_from_file(config)
         self.hps = get_hparams_from_file(config)
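Note: the hunk further down only shows the onnxExecutionProvider branch of update_setteings, so this diff does not show how the intData / floatData / strData lists are consumed; presumably they let setting values that arrive as strings be coerced back to the right Python type. A minimal, self-contained sketch under that assumption (the Settings and update_setting names here are hypothetical, not from this repository):

```python
from dataclasses import dataclass


@dataclass
class Settings:
    gpu: int = 0
    crossFadeOverlapRate: float = 0.9
    framework: str = "ONNX"
    # Unannotated class attributes: which fields may be updated, grouped by type.
    intData = ["gpu"]
    floatData = ["crossFadeOverlapRate"]
    strData = ["framework"]


def update_setting(settings: Settings, key: str, val) -> Settings:
    # Coerce the incoming value according to the list the key belongs to,
    # so a value that arrived as a string still lands with the right type.
    if key in settings.intData:
        setattr(settings, key, int(val))
    elif key in settings.floatData:
        setattr(settings, key, float(val))
    elif key in settings.strData:
        setattr(settings, key, str(val))
    return settings


print(update_setting(Settings(), "gpu", "1").gpu)  # 1
```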
@@ -55,20 +58,20 @@ class VoiceChanger():
         # print("text_norm2: ",text_norm)
         # self.text_norm = torch.LongTensor(text_norm)
         self.text_norm = torch.LongTensor([0, 6, 0])
         self.audio_buffer = torch.zeros(1, 0)
         self.prev_audio = np.zeros(1)
         self.mps_enabled = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()

         print(f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})")

-    def loadModel(self, config:str, pyTorch_model_file:str=None, onnx_model_file:str=None):
+    def loadModel(self, config: str, pyTorch_model_file: str = None, onnx_model_file: str = None):
         self.settings.configFile = config
         if pyTorch_model_file != None:
             self.settings.pyTorchModelFile = pyTorch_model_file
         if onnx_model_file:
             self.settings.onnxModelFile = onnx_model_file

         # build the PyTorch model
         if pyTorch_model_file != None:
             self.net_g = SynthesizerTrn(
@@ -93,7 +96,7 @@ class VoiceChanger():
     def destroy(self):
         del self.net_g
         del self.onnx_session

     def get_info(self):
         data = asdict(self.settings)
@@ -101,19 +104,19 @@ class VoiceChanger():
         data["onnxExecutionProvider"] = self.onnx_session.get_providers() if self.onnx_session != None else []
         files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
         for f in files:
-            if data[f]!=None and os.path.exists(data[f]):
+            if data[f] != None and os.path.exists(data[f]):
                 data[f] = os.path.basename(data[f])
             else:
                 data[f] = ""

         return data

-    def update_setteings(self, key:str, val:any):
+    def update_setteings(self, key: str, val: any):
         if key == "onnxExecutionProvider" and self.onnx_session != None:
             if val == "CUDAExecutionProvider":
                 if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
                     self.settings.gpu = 0
-                provider_options=[{'device_id': self.settings.gpu}]
+                provider_options = [{'device_id': self.settings.gpu}]
                 self.onnx_session.set_providers(providers=[val], provider_options=provider_options)
             else:
                 self.onnx_session.set_providers(providers=[val])
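For reference, the provider switch performed above can be reproduced against a plain onnxruntime session. A minimal sketch, assuming a CUDA-enabled onnxruntime build; "model.onnx" is a placeholder path, not a file from this commit:

```python
import onnxruntime

# Placeholder model path; start on CPU, as the server does by default.
sess = onnxruntime.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])

# Re-bind the session to one CUDA device, mirroring the device_id handling
# in update_setteings above.
gpu = 0
sess.set_providers(providers=["CUDAExecutionProvider"],
                   provider_options=[{"device_id": gpu}])
print(sess.get_providers())  # e.g. ['CUDAExecutionProvider', 'CPUExecutionProvider']
```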
@@ -123,7 +126,7 @@ class VoiceChanger():
             providers = self.onnx_session.get_providers()
             print("Providers:", providers)
             if "CUDAExecutionProvider" in providers:
-                provider_options=[{'device_id': self.settings.gpu}]
+                provider_options = [{'device_id': self.settings.gpu}]
                 self.onnx_session.set_providers(providers=["CUDAExecutionProvider"], provider_options=provider_options)
         if key == "crossFadeOffsetRate" or key == "crossFadeEndRate":
             self.unpackedData_length = 0
@@ -136,7 +139,6 @@ class VoiceChanger():
         return self.get_info()

     def _generate_strength(self, unpackedData):
         if self.unpackedData_length != unpackedData.shape[0] or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate or self.currentCrossFadeOverlapRate != self.settings.crossFadeOverlapRate:
@@ -148,12 +150,12 @@ class VoiceChanger():
             overlapSize = int(unpackedData.shape[0] * self.settings.crossFadeOverlapRate)
             cf_offset = int(overlapSize * self.settings.crossFadeOffsetRate)
             cf_end = int(overlapSize * self.settings.crossFadeEndRate)
             cf_range = cf_end - cf_offset
             percent = np.arange(cf_range) / cf_range
             np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2
-            np_cur_strength = np.cos((1-percent) * 0.5 * np.pi) ** 2
+            np_cur_strength = np.cos((1 - percent) * 0.5 * np.pi) ** 2

             self.np_prev_strength = np.concatenate([np.ones(cf_offset), np_prev_strength, np.zeros(overlapSize - cf_offset - len(np_prev_strength))])
             self.np_cur_strength = np.concatenate([np.zeros(cf_offset), np_cur_strength, np.ones(overlapSize - cf_offset - len(np_cur_strength))])
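The two window arrays built here form an equal-gain crossfade: cos²(x) + cos²(π/2 − x) = 1, so the faded-out tail of the previous chunk and the faded-in head of the current chunk always sum back to unit gain. A standalone sketch with illustrative sizes (4096-sample block, overlap rate 0.5, offset 0.1, end 0.9):

```python
import numpy as np

block, overlap_rate, offset_rate, end_rate = 4096, 0.5, 0.1, 0.9  # example values only
overlapSize = int(block * overlap_rate)      # 2048
cf_offset = int(overlapSize * offset_rate)   # 204
cf_end = int(overlapSize * end_rate)         # 1843
cf_range = cf_end - cf_offset

percent = np.arange(cf_range) / cf_range
np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2        # fades 1 -> 0
np_cur_strength = np.cos((1 - percent) * 0.5 * np.pi) ** 2   # fades 0 -> 1

# Padded to overlapSize exactly as in _generate_strength above.
prev = np.concatenate([np.ones(cf_offset), np_prev_strength,
                       np.zeros(overlapSize - cf_offset - len(np_prev_strength))])
cur = np.concatenate([np.zeros(cf_offset), np_cur_strength,
                      np.ones(overlapSize - cf_offset - len(np_cur_strength))])
print(np.allclose(prev + cur, 1.0))  # True: the pair always sums to unit gain
```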
@@ -167,23 +169,23 @@ class VoiceChanger():
             # print(f"target_len:{unpackedData.shape[0]}, prev_len:{len(self.prev_strength)} cur_len:{len(self.cur_strength)}")
             # print("Prev", self.prev_strength)
             # print("Cur", self.cur_strength)

             # The size differs from the previous result, so clear the stored record.
             if hasattr(self, 'prev_audio1') == True:
-                delattr(self,"prev_audio1")
+                delattr(self, "prev_audio1")

-    def _generate_input(self, unpackedData:any, convertSize:int):
+    def _generate_input(self, unpackedData: any, convertSize: int):
         # shape the data to convert this time into a tensor
         audio = torch.FloatTensor(unpackedData.astype(np.float32))  # create a float32 tensor
         audio_norm = audio / self.hps.data.max_wav_value  # normalize
         audio_norm = audio_norm.unsqueeze(0)  # unsqueeze
         self.audio_buffer = torch.cat([self.audio_buffer, audio_norm], axis=1)  # concatenate with past data
         audio_norm = self.audio_buffer[:, -convertSize:]  # extract only the part to convert
         self.audio_buffer = audio_norm

         spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
                                  self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
                                  center=False)
         spec = torch.squeeze(spec, 0)
         sid = torch.LongTensor([int(self.settings.srcId)])
@@ -191,7 +193,6 @@ class VoiceChanger():
         data = TextAudioSpeakerCollate()([data])
         return data

     def _onnx_inference(self, data, inputSize):
         if hasattr(self, "onnx_session") == False or self.onnx_session == None:
             print("[Voice Changer] No ONNX session.")
@@ -207,26 +208,26 @@ class VoiceChanger():
                 "lengths": spec_lengths.numpy(),
                 "sid_src": sid_src.numpy(),
                 "sid_tgt": sid_tgt1.numpy()
-            })[0][0,0] * self.hps.data.max_wav_value
+            })[0][0, 0] * self.hps.data.max_wav_value
         if hasattr(self, 'np_prev_audio1') == True:
             overlapSize = int(inputSize * self.settings.crossFadeOverlapRate)
-            prev_overlap = self.np_prev_audio1[-1*overlapSize:]
-            cur_overlap = audio1[-1*(inputSize + overlapSize) :-1*inputSize]
+            prev_overlap = self.np_prev_audio1[-1 * overlapSize:]
+            cur_overlap = audio1[-1 * (inputSize + overlapSize):-1 * inputSize]
             # print(prev_overlap.shape, self.np_prev_strength.shape, cur_overlap.shape, self.np_cur_strength.shape)
             # print(">>>>>>>>>>>", -1*(inputSize + overlapSize) , -1*inputSize)
             powered_prev = prev_overlap * self.np_prev_strength
             powered_cur = cur_overlap * self.np_cur_strength
             powered_result = powered_prev + powered_cur

-            cur = audio1[-1*inputSize:-1*overlapSize]
-            result = np.concatenate([powered_result, cur],axis=0)
+            cur = audio1[-1 * inputSize:-1 * overlapSize]
+            result = np.concatenate([powered_result, cur], axis=0)
         else:
             result = np.zeros(1).astype(np.int16)
         self.np_prev_audio1 = audio1
         return result

     def _pyTorch_inference(self, data, inputSize):
-        if hasattr(self, "net_g") == False or self.net_g ==None:
+        if hasattr(self, "net_g") == False or self.net_g == None:
             print("[Voice Changer] No pyTorch session.")
             return np.zeros(1).astype(np.int16)
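The ONNX branch splices consecutive chunks by overlap-add: the last overlapSize samples of the previous output are faded out, the matching region of the new chunk is faded in, and the untouched middle of the new chunk is appended, yielding exactly inputSize samples. A toy rework of that indexing with made-up data (8 new samples per request, 50 % overlap, offset 0 and end 1 for the windows):

```python
import numpy as np

inputSize, overlapSize = 8, 4                                   # toy sizes
audio_prev = np.arange(100, 124, dtype=np.float32)              # previous converted chunk
audio_cur = np.arange(200, 224, dtype=np.float32)               # current converted chunk

# Equal-gain windows over the overlap region (offset 0, end 1 for simplicity).
p = np.arange(overlapSize) / overlapSize
prev_strength = np.cos(p * 0.5 * np.pi) ** 2
cur_strength = np.cos((1 - p) * 0.5 * np.pi) ** 2

prev_overlap = audio_prev[-overlapSize:]                        # tail of the last output
cur_overlap = audio_cur[-(inputSize + overlapSize):-inputSize]  # matching head region
blended = prev_overlap * prev_strength + cur_overlap * cur_strength

result = np.concatenate([blended, audio_cur[-inputSize:-overlapSize]])
print(result.shape)  # (8,) == inputSize: crossfaded head + untouched remainder
```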
@@ -234,7 +235,8 @@ class VoiceChanger():
         with torch.no_grad():
             x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data]
             sid_tgt1 = torch.LongTensor([self.settings.dstId]).cpu()
-            audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0, 0].data * self.hps.data.max_wav_value)
+            audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src,
+                                                        sid_tgt=sid_tgt1)[0, 0].data * self.hps.data.max_wav_value)

             if self.prev_strength.device != torch.device('cpu'):
                 print(f"prev_strength move from {self.prev_strength.device} to cpu")
@@ -243,19 +245,19 @@ class VoiceChanger():
                 print(f"cur_strength move from {self.cur_strength.device} to cpu")
                 self.cur_strength = self.cur_strength.cpu()

             if hasattr(self, 'prev_audio1') == True and self.prev_audio1.device == torch.device('cpu'):  # skip one round if prev_audio1 is not on the expected device
                 overlapSize = int(inputSize * self.settings.crossFadeOverlapRate)
-                prev_overlap = self.prev_audio1[-1*overlapSize:]
-                cur_overlap = audio1[-1*(inputSize + overlapSize) :-1*inputSize]
+                prev_overlap = self.prev_audio1[-1 * overlapSize:]
+                cur_overlap = audio1[-1 * (inputSize + overlapSize):-1 * inputSize]
                 powered_prev = prev_overlap * self.prev_strength
                 powered_cur = cur_overlap * self.cur_strength
                 powered_result = powered_prev + powered_cur

-                cur = audio1[-1*inputSize:-1*overlapSize]  # the raw part of this input (input minus the next crossfade section)
-                result = torch.cat([powered_result, cur],axis=0)  # join the crossfade with the raw part of this input
+                cur = audio1[-1 * inputSize:-1 * overlapSize]  # the raw part of this input (input minus the next crossfade section)
+                result = torch.cat([powered_result, cur], axis=0)  # join the crossfade with the raw part of this input
             else:
-                cur = audio1[-2*inputSize:-1*inputSize]
+                cur = audio1[-2 * inputSize:-1 * inputSize]
                 result = cur
             self.prev_audio1 = audio1
@@ -265,7 +267,8 @@ class VoiceChanger():
         with torch.no_grad():
             x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(self.settings.gpu) for x in data]
             sid_tgt1 = torch.LongTensor([self.settings.dstId]).cuda(self.settings.gpu)
-            audio1 = self.net_g.cuda(self.settings.gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0, 0].data * self.hps.data.max_wav_value
+            audio1 = self.net_g.cuda(self.settings.gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src,
+                                                                         sid_tgt=sid_tgt1)[0, 0].data * self.hps.data.max_wav_value

             if self.prev_strength.device != torch.device('cuda', self.settings.gpu):
                 print(f"prev_strength move from {self.prev_strength.device} to gpu{self.settings.gpu}")
@@ -274,33 +277,30 @@ class VoiceChanger():
                 print(f"cur_strength move from {self.cur_strength.device} to gpu{self.settings.gpu}")
                 self.cur_strength = self.cur_strength.cuda(self.settings.gpu)

             if hasattr(self, 'prev_audio1') == True and self.prev_audio1.device == torch.device('cuda', self.settings.gpu):
                 overlapSize = int(inputSize * self.settings.crossFadeOverlapRate)
-                prev_overlap = self.prev_audio1[-1*overlapSize:]
-                cur_overlap = audio1[-1*(inputSize + overlapSize) :-1*inputSize]
+                prev_overlap = self.prev_audio1[-1 * overlapSize:]
+                cur_overlap = audio1[-1 * (inputSize + overlapSize):-1 * inputSize]
                 powered_prev = prev_overlap * self.prev_strength
                 powered_cur = cur_overlap * self.cur_strength
                 powered_result = powered_prev + powered_cur

-                cur = audio1[-1*inputSize:-1*overlapSize]  # the raw part of this input (input minus the next crossfade section)
-                result = torch.cat([powered_result, cur],axis=0)  # join the crossfade with the raw part of this input
+                cur = audio1[-1 * inputSize:-1 * overlapSize]  # the raw part of this input (input minus the next crossfade section)
+                result = torch.cat([powered_result, cur], axis=0)  # join the crossfade with the raw part of this input
             else:
-                cur = audio1[-2*inputSize:-1*inputSize]
+                cur = audio1[-2 * inputSize:-1 * inputSize]
                 result = cur
             self.prev_audio1 = audio1

             result = result.cpu().float().numpy()
         return result

-    def on_request(self, unpackedData:any):
+    def on_request(self, unpackedData: any):
         convertSize = self.settings.convertChunkNum * 128  # 128sample/1chunk
-        if unpackedData.shape[0]*(1 + self.settings.crossFadeOverlapRate) + 1024 > convertSize:
-            convertSize = int(unpackedData.shape[0]*(1 + self.settings.crossFadeOverlapRate)) + 1024
+        if unpackedData.shape[0] * (1 + self.settings.crossFadeOverlapRate) + 1024 > convertSize:
+            convertSize = int(unpackedData.shape[0] * (1 + self.settings.crossFadeOverlapRate)) + 1024
         if convertSize < self.settings.minConvertSize:
             convertSize = self.settings.minConvertSize
         # print("convert Size", unpackedData.shape[0], unpackedData.shape[0]*(1 + self.settings.crossFadeOverlapRate), convertSize, self.settings.minConvertSize)
@@ -308,16 +308,14 @@ class VoiceChanger():
         self._generate_strength(unpackedData)
         data = self._generate_input(unpackedData, convertSize)

         try:
             if self.settings.framework == "ONNX":
                 result = self._onnx_inference(data, unpackedData.shape[0])
             else:
                 result = self._pyTorch_inference(data, unpackedData.shape[0])
         except Exception as e:
             print("VC PROCESSING!!!! EXCEPTION!!!", e)
             print(traceback.format_exc())
             if hasattr(self, "np_prev_audio1"):
                 del self.np_prev_audio1
@@ -328,4 +326,3 @@ class VoiceChanger():
         result = result.astype(np.int16)
         # print("on_request result size:",result.shape)
         return result