Merge pull request #1236 from IAHispano/codex/optimize-realtime-algorithm-for-quality-and-speed

Vidalnt · web-flow · commit ab248338074d · 2026-04-19T12:11:55.000-05:00
Optimize realtime conversion hot-path allocations and SOLA processing
diff --git a/rvc/realtime/core.py b/rvc/realtime/core.py
@@ -451,6 +451,9 @@ def generate_strength(self):
         )
 
         self.fade_out_window: torch.Tensor = 1 - self.fade_in_window
+        self.sola_denominator_kernel = torch.ones(
+            1, 1, self.crossfade_frame, device=self.device, dtype=torch.float32
+        )
         # The size will change from the previous result, so the record will be deleted.
         self.sola_buffer = torch.zeros(
             self.crossfade_frame, device=self.device, dtype=torch.float32
@@ -513,10 +516,7 @@ def process_audio(
         ].float()
         cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :])
         cor_den = torch.sqrt(
-            F.conv1d(
-                conv_input**2,
-                torch.ones(1, 1, self.crossfade_frame, device=self.device),
-            )
+            F.conv1d(conv_input**2, self.sola_denominator_kernel)
             + 1e-8
         )
         sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
@@ -543,12 +543,9 @@ def process_audio(
             # Apply sin² fade-in over crossfade_frame duration from onset.
             fade_len = min(block_size - onset_sample, self.crossfade_frame)
             if fade_len > 0:
-                t = torch.linspace(
-                    0.0, 1.0, steps=fade_len, device=self.device, dtype=torch.float32
-                )
-                audio[onset_sample : onset_sample + fade_len] *= (
-                    torch.sin(0.5 * np.pi * t) ** 2
-                )
+                audio[onset_sample : onset_sample + fade_len] *= self.fade_in_window[
+                    :fade_len
+                ]
         else:
             audio[: self.crossfade_frame] *= self.fade_in_window
             audio[: self.crossfade_frame] += self.sola_buffer * self.fade_out_window
diff --git a/rvc/realtime/pipeline.py b/rvc/realtime/pipeline.py
@@ -122,6 +122,9 @@ def __init__(
         self.resamplers = {}
         self.f0_model = self.setup_f0(self.f0_method)
         self.dtype = vc.dtype
+        # Preallocated 1-element tensors, refilled in place each block to avoid allocations.
+        self._rate_tensor = torch.zeros(1, device=self.device, dtype=torch.float32)
+        self._p_len_tensor = torch.zeros(1, device=self.device, dtype=torch.int64)
 
     def setup_f0(self, f0_method: str = "fcpe"):
         if f0_method == "rmvpe":
@@ -279,8 +282,9 @@ def voice_conversion(
                 f0_new = f0_new.squeeze(0)
 
                 # Shift pitch cache left by one block and append new frames (trimmed [3:-1]).
-                pitch[:-shift] = pitch[shift:].clone()
-                pitchf[:-shift] = pitchf[shift:].clone()
+                if shift > 0:
+                    pitch[:-shift] = pitch[shift:].clone()
+                    pitchf[:-shift] = pitchf[shift:].clone()
                 interior_coarse = (
                     f0_coarse_new[3:-1] if f0_coarse_new.shape[0] > 4 else f0_coarse_new
                 )
@@ -341,12 +345,15 @@ def voice_conversion(
 
             pitchf_p = pitchf_p.to(self.dtype) if self.use_f0 else None
             # Trim oldest context so model output covers only the current block.
-            rate = torch.tensor(
-                [return_length / p_len], device=self.device, dtype=torch.float32
-            )
-            p_len = torch.tensor([p_len], device=self.device, dtype=torch.int64)
+            self._rate_tensor.fill_(return_length / p_len)
+            self._p_len_tensor.fill_(p_len)
             out_audio = self.vc.inference(
-                feats, p_len, self.torch_sid, pitch_p, pitchf_p, rate
+                feats,
+                self._p_len_tensor,
+                self.torch_sid,
+                pitch_p,
+                pitchf_p,
+                self._rate_tensor,
             ).float()
             # Match output RMS to the current block's input RMS.
             if volume_envelope < 1: