Skip to content

Commit ab24833

Browse files
authored
Merge pull request #1236 from IAHispano/codex/optimize-realtime-algorithm-for-quality-and-speed
Optimize realtime conversion hot-path allocations and SOLA processing
2 parents cf22a18 + 8588266 commit ab24833

2 files changed

Lines changed: 21 additions & 17 deletions

File tree

rvc/realtime/core.py

Lines changed: 7 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -451,6 +451,9 @@ def generate_strength(self):
451451
)
452452

453453
self.fade_out_window: torch.Tensor = 1 - self.fade_in_window
454+
self.sola_denominator_kernel = torch.ones(
455+
1, 1, self.crossfade_frame, device=self.device, dtype=torch.float32
456+
)
454457
# The size will change from the previous result, so the record will be deleted.
455458
self.sola_buffer = torch.zeros(
456459
self.crossfade_frame, device=self.device, dtype=torch.float32
@@ -513,10 +516,7 @@ def process_audio(
513516
].float()
514517
cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :])
515518
cor_den = torch.sqrt(
516-
F.conv1d(
517-
conv_input**2,
518-
torch.ones(1, 1, self.crossfade_frame, device=self.device),
519-
)
519+
F.conv1d(conv_input**2, self.sola_denominator_kernel)
520520
+ 1e-8
521521
)
522522
sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
@@ -543,12 +543,9 @@ def process_audio(
543543
# Apply sin² fade-in over crossfade_frame duration from onset.
544544
fade_len = min(block_size - onset_sample, self.crossfade_frame)
545545
if fade_len > 0:
546-
t = torch.linspace(
547-
0.0, 1.0, steps=fade_len, device=self.device, dtype=torch.float32
548-
)
549-
audio[onset_sample : onset_sample + fade_len] *= (
550-
torch.sin(0.5 * np.pi * t) ** 2
551-
)
546+
audio[onset_sample : onset_sample + fade_len] *= self.fade_in_window[
547+
:fade_len
548+
]
552549
else:
553550
audio[: self.crossfade_frame] *= self.fade_in_window
554551
audio[: self.crossfade_frame] += self.sola_buffer * self.fade_out_window

rvc/realtime/pipeline.py

Lines changed: 14 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -122,6 +122,9 @@ def __init__(
122122
self.resamplers = {}
123123
self.f0_model = self.setup_f0(self.f0_method)
124124
self.dtype = vc.dtype
125+
# Reuse scalar tensors to avoid per-block allocations.
126+
self._rate_tensor = torch.zeros(1, device=self.device, dtype=torch.float32)
127+
self._p_len_tensor = torch.zeros(1, device=self.device, dtype=torch.int64)
125128

126129
def setup_f0(self, f0_method: str = "fcpe"):
127130
if f0_method == "rmvpe":
@@ -279,8 +282,9 @@ def voice_conversion(
279282
f0_new = f0_new.squeeze(0)
280283

281284
# Shift pitch cache left by one block and append new frames (trimmed [3:-1]).
282-
pitch[:-shift] = pitch[shift:].clone()
283-
pitchf[:-shift] = pitchf[shift:].clone()
285+
if shift > 0:
286+
pitch[:-shift] = pitch[shift:].clone()
287+
pitchf[:-shift] = pitchf[shift:].clone()
284288
interior_coarse = (
285289
f0_coarse_new[3:-1] if f0_coarse_new.shape[0] > 4 else f0_coarse_new
286290
)
@@ -341,12 +345,15 @@ def voice_conversion(
341345

342346
pitchf_p = pitchf_p.to(self.dtype) if self.use_f0 else None
343347
# Trim oldest context so model output covers only the current block.
344-
rate = torch.tensor(
345-
[return_length / p_len], device=self.device, dtype=torch.float32
346-
)
347-
p_len = torch.tensor([p_len], device=self.device, dtype=torch.int64)
348+
self._rate_tensor.fill_(return_length / p_len)
349+
self._p_len_tensor.fill_(p_len)
348350
out_audio = self.vc.inference(
349-
feats, p_len, self.torch_sid, pitch_p, pitchf_p, rate
351+
feats,
352+
self._p_len_tensor,
353+
self.torch_sid,
354+
pitch_p,
355+
pitchf_p,
356+
self._rate_tensor,
350357
).float()
351358
# Match output RMS to the current block's input RMS.
352359
if volume_envelope < 1:

0 commit comments

Comments
 (0)