ktvoice commited on
Commit
a6d6c8b
·
verified ·
1 Parent(s): 84377d0

Upload 7 files

Browse files
Files changed (7) hide show
  1. .gitattributes +37 -35
  2. README.md +11 -12
  3. app.py +196 -0
  4. config.yaml +65 -0
  5. packages.txt +3 -0
  6. requirements.txt +9 -0
  7. vieneu_tts.py +347 -0
.gitattributes CHANGED
@@ -1,35 +1,37 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
37
+ sample/thientam.mp3 filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,11 @@
1
- ---
2
- title: Kt001
3
- emoji: 🏃
4
- colorFrom: red
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 6.2.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: MyVoice
3
+ emoji: 🦜
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 6.1.0
8
+ app_file: app.py
9
+ pinned: true
10
+ license: mit
11
+ ---
 
app.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import spaces
import os
os.environ['SPACES_ZERO_GPU'] = '1'  # must be set before the GPU-aware libraries below are imported

import gradio as gr
import soundfile as sf
import tempfile
import torch
import librosa
from vieneu_tts import VieNeuTTS
import time

# --- 1. SETUP MODEL (uses your personal Hugging Face repos) ---
device = "cuda" if torch.cuda.is_available() else "cpu"

# Replace 'YOUR_USERNAME' with your Hugging Face username
MY_BACKBONE_REPO = "YOUR_USERNAME/my-vieneu-tts"
MY_CODEC_REPO = "YOUR_USERNAME/my-neucodec"

try:
    tts = VieNeuTTS(
        backbone_repo=MY_BACKBONE_REPO,
        backbone_device=device,
        codec_repo=MY_CODEC_REPO,
        codec_device=device
    )
except Exception as e:
    # Model failed to load (e.g. placeholder repo names above) — fall back to a
    # stub with the same interface so the Gradio UI still starts.
    print(f"⚠️ Lỗi khởi tạo: {e}")
    class MockTTS:
        # Mimics VieNeuTTS.encode_reference / .infer; returns 2 s of quiet noise.
        def encode_reference(self, path): return None
        def infer(self, text, ref, ref_text):
            time.sleep(1.2)  # simulate inference latency
            import numpy as np
            return np.random.uniform(-0.1, 0.1, 24000*2)
    tts = MockTTS()

# --- 2. DATA (bundled preset voice samples) ---
# Maps display name -> paths to a reference audio clip and its transcript.
VOICE_SAMPLES = {
    "Tuyên (nam miền Bắc)": {"audio": "./sample/Tuyên (nam miền Bắc).wav", "text": "./sample/Tuyên (nam miền Bắc).txt"},
    "Thiện Tâm": {"audio": "./sample/thientam.mp3", "text": "./sample/thientam.txt"},
    "Vĩnh (nam miền Nam)": {"audio": "./sample/Vĩnh (nam miền Nam).wav", "text": "./sample/Vĩnh (nam miền Nam).txt"},
    "Bình (nam miền Bắc)": {"audio": "./sample/Bình (nam miền Bắc).wav", "text": "./sample/Bình (nam miền Bắc).txt"},
    "Nguyên (nam miền Nam)": {"audio": "./sample/Nguyên (nam miền Nam).wav", "text": "./sample/Nguyên (nam miền Nam).txt"},
    "Sơn (nam miền Nam)": {"audio": "./sample/Sơn (nam miền Nam).wav", "text": "./sample/Sơn (nam miền Nam).txt"},
    "Đoan (nữ miền Nam)": {"audio": "./sample/Đoan (nữ miền Nam).wav", "text": "./sample/Đoan (nữ miền Nam).txt"},
    "Ngọc (nữ miền Bắc)": {"audio": "./sample/Ngọc (nữ miền Bắc).wav", "text": "./sample/Ngọc (nữ miền Bắc).txt"},
    "Ly (nữ miền Bắc)": {"audio": "./sample/Ly (nữ miền Bắc).wav", "text": "./sample/Ly (nữ miền Bắc).txt"},
    "Dung (nữ miền Nam)": {"audio": "./sample/Dung (nữ miền Nam).wav", "text": "./sample/Dung (nữ miền Nam).txt"}
}
50
+
51
+ # --- 3. HELPER FUNCTIONS ---
52
def load_reference_info(voice_choice):
    """Look up a preset voice and return its (audio_path, reference_text).

    Returns (None, "") when the voice name is unknown or its transcript
    file is missing on disk.
    """
    sample = VOICE_SAMPLES.get(voice_choice)
    if sample is None:
        return None, ""
    transcript_path = sample["text"]
    if not os.path.exists(transcript_path):
        return None, ""
    with open(transcript_path, "r", encoding="utf-8") as fh:
        transcript = fh.read()
    return sample["audio"], transcript
61
+
62
@spaces.GPU(duration=120)
def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, pause_level, speed_value):
    """Synthesize speech for `text` and return (output_wav_path, status_markdown).

    mode_tab selects between a bundled preset voice ("preset_mode") and a
    user-supplied clone sample ("custom_mode"). Returns (None, message) on
    any validation failure or runtime error.
    """
    try:
        if not text or text.strip() == "":
            return None, "⚠️ Vui lòng nhập nội dung!"

        # Lengthen pauses by duplicating punctuation before synthesis.
        punctuation_expansions = {
            "Trung bình": ((",", ", , "), (".", ". . ")),
            "Dài": ((",", ", , , "), (".", ". . . . ")),
        }
        spoken_text = text
        for old, new in punctuation_expansions.get(pause_level, ()):
            spoken_text = spoken_text.replace(old, new)

        # Hard cap on synthesized length.
        # NOTE(review): the UI counter advertises 250 chars but this cap is 400 — confirm intended.
        spoken_text = spoken_text[:400]

        # Resolve the reference voice: user-uploaded clone or a bundled preset.
        if mode_tab == "custom_mode":
            if custom_audio is None or not custom_text:
                return None, "⚠️ Thiếu Audio mẫu hoặc Text mẫu."
            ref_audio_path, ref_transcript = custom_audio, custom_text
        else:
            preset = VOICE_SAMPLES[voice_choice]
            ref_audio_path = preset["audio"]
            with open(preset["text"], "r", encoding="utf-8") as fh:
                ref_transcript = fh.read()

        # Inference
        started = time.time()
        ref_codes = tts.encode_reference(ref_audio_path)
        wav = tts.infer(spoken_text, ref_codes, ref_transcript)

        # Playback-speed adjustment via time-stretching (pitch-preserving).
        if speed_value != 1.0:
            wav = librosa.effects.time_stretch(wav, rate=float(speed_value))

        elapsed = time.time() - started

        # Persist to a temp .wav the Audio component can serve.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            sf.write(tmp_file.name, wav, 24000)
            output_path = tmp_file.name

        return output_path, f"⚡ Thành công: {elapsed:.2f}s | Tốc độ: {speed_value}x"
    except Exception as e:
        return None, f"❌ Lỗi: {str(e)}"
107
+
108
+ # --- 4. THEME & CSS (Deep Night Pro) ---
109
+ theme = gr.themes.Default(
110
+ primary_hue="indigo",
111
+ secondary_hue="blue",
112
+ neutral_hue="slate",
113
+ font=[gr.themes.GoogleFont('Inter'), 'sans-serif'],
114
+ ).set(
115
+ body_background_fill="#020617",
116
+ block_background_fill="#0f172a",
117
+ block_border_width="1px",
118
+ input_background_fill="#1e293b",
119
+ input_border_color="#334155",
120
+ button_primary_background_fill="linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%)",
121
+ )
122
+
123
+ css = """
124
+ .main-wrap { max-width: 1200px !important; margin: auto !important; padding: 20px !important; }
125
+ .st-card {
126
+ border-radius: 16px !important;
127
+ border: 1px solid rgba(255,255,255,0.1) !important;
128
+ box-shadow: 0 4px 20px rgba(0,0,0,0.5) !important;
129
+ padding: 15px;
130
+ }
131
+ .result-card {
132
+ background: linear-gradient(180deg, rgba(15, 23, 42, 0.8) 0%, rgba(30, 41, 59, 0.8) 100%) !important;
133
+ border: 1px solid rgba(99, 102, 241, 0.2) !important;
134
+ margin-top: 15px;
135
+ }
136
+ audio { filter: invert(90%) hue-rotate(180deg) brightness(1.5); width: 100%; border-radius: 8px; }
137
+ .footer { text-align: center; margin-top: 40px; color: #475569; font-size: 0.8rem; font-weight: 500; }
138
+ """
139
+
140
+ # --- 5. UI CONSTRUCTION ---
141
+ with gr.Blocks(title="AI Voice Studio") as demo:
142
+
143
+ with gr.Column(elem_classes="main-wrap"):
144
+ with gr.Row(equal_height=True):
145
+ # TRÁI: Nhập văn bản
146
+ with gr.Column(scale=1):
147
+ with gr.Group(elem_classes="st-card"):
148
+ text_input = gr.Textbox(
149
+ label="VĂN BẢN CẦN CHUYỂN ĐỔI",
150
+ placeholder="Chào mừng bạn. Hãy nhập nội dung vào đây...",
151
+ lines=20,
152
+ show_label=True,
153
+ )
154
+ char_count = gr.HTML("<div style='text-align: right; color: #6366f1; font-size: 0.85rem; font-weight: bold; padding: 5px;'>0 / 250</div>")
155
+
156
+ # PHẢI: Cấu hình
157
+ with gr.Column(scale=1):
158
+ with gr.Tabs() as tabs:
159
+ with gr.TabItem("👤 Nghệ sĩ đọc", id="preset_mode"):
160
+ voice_select = gr.Dropdown(
161
+ choices=list(VOICE_SAMPLES.keys()),
162
+ value="Tuyên (nam miền Bắc)",
163
+ label="Chọn giọng đọc",
164
+ )
165
+ with gr.Accordion("Nghe thử giọng mẫu", open=False):
166
+ ref_audio_preview = gr.Audio(interactive=False, show_label=False)
167
+ ref_text_preview = gr.Markdown("...")
168
+
169
+ with gr.TabItem("🎙️ Nhân bản (Clone)", id="custom_mode"):
170
+ custom_audio = gr.Audio(label="Audio gốc", type="filepath")
171
+ custom_text = gr.Textbox(label="NỘI DUNG AUDIO MẪU", lines=4)
172
+
173
+ # Cấu hình âm thanh chuyên nghiệp
174
+ with gr.Row():
175
+ pause_level = gr.Radio(choices=["Mặc định", "Trung bình", "Dài"], value="Mặc định", label="Độ ngắt nghỉ", scale=1)
176
+ speed_select = gr.Dropdown(choices=[0.8, 0.9, 1.0, 1.1, 1.2, 1.5], value=1.0, label="Tốc độ đọc", scale=1)
177
+
178
+ current_mode = gr.State(value="preset_mode")
179
+ gr.Markdown("<br>")
180
+ btn_generate = gr.Button("BẮT ĐẦU TỔNG HỢP", variant="primary", size="lg")
181
+
182
+ with gr.Group(elem_classes="st-card result-card"):
183
+ audio_output = gr.Audio(label="KẾT QUẢ", interactive=False, autoplay=True)
184
+ status_output = gr.Markdown("<p style='text-align: center; color: #818cf8;'>✨ Sẵn sàng</p>")
185
+
186
+ gr.HTML("<div class='footer'>ENGINE BY VIENEU-TTS • PROFESSIONAL AI SOLUTIONS 2025</div>")
187
+
188
+ # LOGIC
189
+ text_input.change(lambda t: f"<div style='text-align: right; color: {'#6366f1' if len(t)<=250 else '#f43f5e'}; font-size: 0.85rem; font-weight: bold; padding: 5px;'>{len(t)} / 250</div>", text_input, char_count)
190
+ voice_select.change(update_ref_preview, voice_select, [ref_audio_preview, ref_text_preview])
191
+ tabs.children[0].select(fn=lambda: "preset_mode", outputs=current_mode)
192
+ tabs.children[1].select(fn=lambda: "custom_mode", outputs=current_mode)
193
+ btn_generate.click(fn=synthesize_speech, inputs=[text_input, voice_select, custom_audio, custom_text, current_mode, pause_level, speed_select], outputs=[audio_output, status_output])
194
+
195
+ if __name__ == "__main__":
196
+ demo.queue().launch(theme=theme, css=css, server_name="0.0.0.0", server_port=7860)
config.yaml ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ text_settings:
2
+ max_chars_per_chunk: 256
3
+ max_total_chars_streaming: 3000
4
+
5
+ backbone_configs:
6
+ "VieNeu-TTS (GPU)":
7
+ repo: pnnbao-ump/VieNeu-TTS
8
+ supports_streaming: false
9
+ description: Chất lượng cao nhất, yêu cầu GPU
10
+ "VieNeu-TTS-q8-gguf":
11
+ repo: pnnbao-ump/VieNeu-TTS-q8-gguf
12
+ supports_streaming: true
13
+ description: Cân bằng giữa chất lượng và tốc độ
14
+ "VieNeu-TTS-q4-gguf":
15
+ repo: pnnbao-ump/VieNeu-TTS-q4-gguf
16
+ supports_streaming: true
17
+ description: Nhẹ nhất, phù hợp CPU
18
+
19
+ codec_configs:
20
+ "NeuCodec (Standard)":
21
+ repo: neuphonic/neucodec
22
+ description: Codec chuẩn, tốc độ trung bình
23
+ use_preencoded: false
24
+ "NeuCodec ONNX (Fast CPU)":
25
+ repo: neuphonic/neucodec-onnx-decoder
26
+ description: Tối ưu cho CPU, cần pre-encoded codes
27
+ use_preencoded: true
28
+
29
+ voice_samples:
30
+ "Tuyên (nam miền Bắc)":
31
+ audio: ./sample/Tuyên (nam miền Bắc).wav
32
+ text: ./sample/Tuyên (nam miền Bắc).txt
33
+ codes: ./sample/Tuyên (nam miền Bắc).pt
34
+ "Vĩnh (nam miền Nam)":
35
+ audio: ./sample/Vĩnh (nam miền Nam).wav
36
+ text: ./sample/Vĩnh (nam miền Nam).txt
37
+ codes: ./sample/Vĩnh (nam miền Nam).pt
38
+ "Bình (nam miền Bắc)":
39
+ audio: ./sample/Bình (nam miền Bắc).wav
40
+ text: ./sample/Bình (nam miền Bắc).txt
41
+ codes: ./sample/Bình (nam miền Bắc).pt
42
+ "Nguyên (nam miền Nam)":
43
+ audio: ./sample/Nguyên (nam miền Nam).wav
44
+ text: ./sample/Nguyên (nam miền Nam).txt
45
+ codes: ./sample/Nguyên (nam miền Nam).pt
46
+ "Sơn (nam miền Nam)":
47
+ audio: ./sample/Sơn (nam miền Nam).wav
48
+ text: ./sample/Sơn (nam miền Nam).txt
49
+ codes: ./sample/Sơn (nam miền Nam).pt
50
+ "Đoan (nữ miền Nam)":
51
+ audio: ./sample/Đoan (nữ miền Nam).wav
52
+ text: ./sample/Đoan (nữ miền Nam).txt
53
+ codes: ./sample/Đoan (nữ miền Nam).pt
54
+ "Ngọc (nữ miền Bắc)":
55
+ audio: ./sample/Ngọc (nữ miền Bắc).wav
56
+ text: ./sample/Ngọc (nữ miền Bắc).txt
57
+ codes: ./sample/Ngọc (nữ miền Bắc).pt
58
+ "Ly (nữ miền Bắc)":
59
+ audio: ./sample/Ly (nữ miền Bắc).wav
60
+ text: ./sample/Ly (nữ miền Bắc).txt
61
+ codes: ./sample/Ly (nữ miền Bắc).pt
62
+ "Dung (nữ miền Nam)":
63
+ audio: ./sample/Dung (nữ miền Nam).wav
64
+ text: ./sample/Dung (nữ miền Nam).txt
65
+ codes: ./sample/Dung (nữ miền Nam).pt
packages.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ espeak-ng
2
+ libespeak-ng1
3
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ spaces
3
+ torchaudio
4
+ transformers
5
+ librosa
6
+ soundfile
7
+ numpy
8
+ phonemizer
9
+ neucodec
vieneu_tts.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Generator
3
+ import librosa
4
+ import numpy as np
5
+ import torch
6
+ from neucodec import NeuCodec, DistillNeuCodec
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM
8
+ from utils.phonemize_text import phonemize_text, phonemize_with_dict
9
+ import re
10
+
11
+ def _linear_overlap_add(frames: list[np.ndarray], stride: int) -> np.ndarray:
12
+ # original impl --> https://github.com/facebookresearch/encodec/blob/main/encodec/utils.py
13
+ assert len(frames)
14
+ dtype = frames[0].dtype
15
+ shape = frames[0].shape[:-1]
16
+
17
+ total_size = 0
18
+ for i, frame in enumerate(frames):
19
+ frame_end = stride * i + frame.shape[-1]
20
+ total_size = max(total_size, frame_end)
21
+
22
+ sum_weight = np.zeros(total_size, dtype=dtype)
23
+ out = np.zeros(*shape, total_size, dtype=dtype)
24
+
25
+ offset: int = 0
26
+ for frame in frames:
27
+ frame_length = frame.shape[-1]
28
+ t = np.linspace(0, 1, frame_length + 2, dtype=dtype)[1:-1]
29
+ weight = np.abs(0.5 - (t - 0.5))
30
+
31
+ out[..., offset : offset + frame_length] += weight * frame
32
+ sum_weight[offset : offset + frame_length] += weight
33
+ offset += stride
34
+ assert sum_weight.min() > 0
35
+ return out / sum_weight
36
+
37
class VieNeuTTS:
    """Vietnamese TTS pipeline: an LLM backbone generates `<|speech_N|>` tokens
    which a NeuCodec decoder converts to a 24 kHz waveform.

    Two backbone flavours are supported: a torch `AutoModelForCausalLM`
    (non-streaming) and a GGUF-quantized model run through llama.cpp
    (streaming-capable). Three codec flavours: torch NeuCodec,
    DistillNeuCodec, or a CPU-only ONNX decoder.
    """

    def __init__(
        self,
        backbone_repo="pnnbao-ump/VieNeu-TTS",
        backbone_device="cpu",
        codec_repo="neuphonic/neucodec",
        codec_device="cpu",
    ):

        # Constants
        self.sample_rate = 24_000  # output waveform sample rate (Hz)
        self.max_context = 2048    # backbone context window, in tokens
        self.hop_length = 480      # audio samples produced per speech token
        self.streaming_overlap_frames = 1       # extra frames kept each side for cross-fading
        self.streaming_frames_per_chunk = 25    # new frames decoded per streamed chunk
        self.streaming_lookforward = 5          # future frames needed before decoding a chunk
        self.streaming_lookback = 50            # past frames re-decoded for context
        self.streaming_stride_samples = self.streaming_frames_per_chunk * self.hop_length

        # ggml & onnx flags
        self._is_quantized_model = False
        self._is_onnx_codec = False

        # HF tokenizer (only populated for the torch backbone path)
        self.tokenizer = None

        # Load models
        self._load_backbone(backbone_repo, backbone_device)
        self._load_codec(codec_repo, codec_device)

    def _load_backbone(self, backbone_repo, backbone_device):
        """Load either a llama.cpp GGUF backbone or a torch causal-LM backbone."""
        print(f"Loading backbone from: {backbone_repo} on {backbone_device} ...")

        # Repos whose name contains "gguf" are treated as quantized llama.cpp models.
        if backbone_repo.lower().endswith("gguf") or "gguf" in backbone_repo.lower():
            try:
                from llama_cpp import Llama
            except ImportError as e:
                raise ImportError(
                    "Failed to import `llama_cpp`. "
                    "Please install it with:\n"
                    " pip install llama-cpp-python"
                ) from e
            self.backbone = Llama.from_pretrained(
                repo_id=backbone_repo,
                filename="*.gguf",
                verbose=False,
                # -1 offloads all layers to GPU; 0 keeps everything on CPU.
                n_gpu_layers=-1 if backbone_device == "gpu" else 0,
                n_ctx=self.max_context,
                mlock=True,
                flash_attn=True if backbone_device == "gpu" else False,
            )
            self._is_quantized_model = True

        else:
            # Torch path: tokenizer is required for prompt construction/decoding.
            self.tokenizer = AutoTokenizer.from_pretrained(backbone_repo)
            self.backbone = AutoModelForCausalLM.from_pretrained(backbone_repo).to(
                torch.device(backbone_device)
            )

    def _load_codec(self, codec_repo, codec_device):
        """Load the audio codec matching `codec_repo`; raises ValueError on unknown repos."""
        print(f"Loading codec from: {codec_repo} on {codec_device} ...")
        match codec_repo:
            case "neuphonic/neucodec":
                self.codec = NeuCodec.from_pretrained(codec_repo)
                self.codec.eval().to(codec_device)
            case "neuphonic/distill-neucodec":
                self.codec = DistillNeuCodec.from_pretrained(codec_repo)
                self.codec.eval().to(codec_device)
            case "neuphonic/neucodec-onnx-decoder":
                if codec_device != "cpu":
                    raise ValueError("Onnx decoder only currently runs on CPU.")
                try:
                    from neucodec import NeuCodecOnnxDecoder
                except ImportError as e:
                    raise ImportError(
                        "Failed to import the onnx decoder."
                        " Ensure you have onnxruntime installed as well as neucodec >= 0.0.4."
                    ) from e
                self.codec = NeuCodecOnnxDecoder.from_pretrained(codec_repo)
                self._is_onnx_codec = True
            case _:
                raise ValueError(f"Unsupported codec repository: {codec_repo}")

    def infer(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> np.ndarray:
        """
        Perform inference to generate speech from text using the TTS model and reference audio.

        Args:
            text (str): Input text to be converted to speech.
            ref_codes (np.ndarray | torch.Tensor): Encoded reference (speech-token ids).
            ref_text (str): Transcript of the reference audio.
        Returns:
            np.ndarray: Generated speech waveform.
        """

        # Generate tokens
        if self._is_quantized_model:
            output_str = self._infer_ggml(ref_codes, ref_text, text)
        else:
            prompt_ids = self._apply_chat_template(ref_codes, ref_text, text)
            output_str = self._infer_torch(prompt_ids)

        # Decode
        wav = self._decode(output_str)

        return wav

    def infer_stream(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> Generator[np.ndarray, None, None]:
        """
        Perform streaming inference to generate speech from text using the TTS model and reference audio.

        Only supported for the GGUF (llama.cpp) backbone.

        Args:
            text (str): Input text to be converted to speech.
            ref_codes (np.ndarray | torch.Tensor): Encoded reference (speech-token ids).
            ref_text (str): Transcript of the reference audio.
        Yields:
            np.ndarray: Generated speech waveform chunks.
        Raises:
            NotImplementedError: When the backbone is the torch model.
        """

        if self._is_quantized_model:
            return self._infer_stream_ggml(ref_codes, ref_text, text)
        else:
            raise NotImplementedError("Streaming is not implemented for the torch backend!")

    def encode_reference(self, ref_audio_path: str | Path):
        """Load a reference clip (mono, resampled to 16 kHz) and encode it to speech-token ids."""
        wav, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
        wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).unsqueeze(0)  # [1, 1, T]
        with torch.no_grad():
            ref_codes = self.codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0)
        return ref_codes

    def _decode(self, codes: str):
        """Decode speech tokens to audio waveform."""
        # Extract speech token IDs using regex
        speech_ids = [int(num) for num in re.findall(r"<\|speech_(\d+)\|>", codes)]

        if len(speech_ids) == 0:
            raise ValueError(
                "No valid speech tokens found in the output. "
                "The model may not have generated proper speech tokens."
            )

        # Onnx decode
        if self._is_onnx_codec:
            codes = np.array(speech_ids, dtype=np.int32)[np.newaxis, np.newaxis, :]
            recon = self.codec.decode_code(codes)
        # Torch decode
        else:
            with torch.no_grad():
                codes = torch.tensor(speech_ids, dtype=torch.long)[None, None, :].to(
                    self.codec.device
                )
                recon = self.codec.decode_code(codes).cpu().numpy()

        # Both paths produce a [1, 1, T] array; return the mono waveform.
        return recon[0, 0, :]

    def _apply_chat_template(self, ref_codes: list[int], ref_text: str, input_text: str) -> list[int]:
        """Build the torch-backbone prompt: phonemized ref+input text spliced into the
        chat template, followed by the reference speech tokens as generation prefix."""
        # The reference transcript is prepended so the model continues in the same voice.
        input_text = phonemize_with_dict(ref_text) + " " + phonemize_with_dict(input_text)

        speech_replace = self.tokenizer.convert_tokens_to_ids("<|SPEECH_REPLACE|>")
        speech_gen_start = self.tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_START|>")
        text_replace = self.tokenizer.convert_tokens_to_ids("<|TEXT_REPLACE|>")
        text_prompt_start = self.tokenizer.convert_tokens_to_ids("<|TEXT_PROMPT_START|>")
        text_prompt_end = self.tokenizer.convert_tokens_to_ids("<|TEXT_PROMPT_END|>")

        input_ids = self.tokenizer.encode(input_text, add_special_tokens=False)
        chat = """user: Convert the text to speech:<|TEXT_REPLACE|>\nassistant:<|SPEECH_REPLACE|>"""
        ids = self.tokenizer.encode(chat)

        # Replace the <|TEXT_REPLACE|> placeholder with the wrapped text prompt.
        text_replace_idx = ids.index(text_replace)
        ids = (
            ids[:text_replace_idx]
            + [text_prompt_start]
            + input_ids
            + [text_prompt_end]
            + ids[text_replace_idx + 1 :]  # noqa
        )

        # Replace <|SPEECH_REPLACE|> (and drop anything after it) with the
        # generation-start marker plus the reference codes as a prefix.
        speech_replace_idx = ids.index(speech_replace)
        codes_str = "".join([f"<|speech_{i}|>" for i in ref_codes])
        codes = self.tokenizer.encode(codes_str, add_special_tokens=False)
        ids = ids[:speech_replace_idx] + [speech_gen_start] + list(codes)

        return ids

    def _infer_torch(self, prompt_ids: list[int]) -> str:
        """Run sampled generation with the torch backbone; return only the newly
        generated text (speech tokens) after the prompt."""
        prompt_tensor = torch.tensor(prompt_ids).unsqueeze(0).to(self.backbone.device)
        speech_end_id = self.tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_END|>")
        with torch.no_grad():
            output_tokens = self.backbone.generate(
                prompt_tensor,
                max_length=self.max_context,
                eos_token_id=speech_end_id,
                do_sample=True,
                temperature=1,
                top_k=50,
                use_cache=True,
                min_new_tokens=50,
            )
        # Strip the prompt; decode only the continuation.
        input_length = prompt_tensor.shape[-1]
        output_str = self.tokenizer.decode(
            output_tokens[0, input_length:].cpu().numpy().tolist(), add_special_tokens=False
        )
        return output_str

    def _infer_ggml(self, ref_codes: list[int], ref_text: str, input_text: str) -> str:
        """Run one-shot generation with the llama.cpp backbone; return the raw
        generated speech-token string."""
        ref_text = phonemize_with_dict(ref_text)
        input_text = phonemize_with_dict(input_text)

        # Prompt mirrors _apply_chat_template but is built as a plain string.
        codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
        prompt = (
            f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text} {input_text}"
            f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
        )
        output = self.backbone(
            prompt,
            max_tokens=self.max_context,
            temperature=1.0,
            top_k=50,
            stop=["<|SPEECH_GENERATION_END|>"],
        )
        output_str = output["choices"][0]["text"]
        return output_str

    def _infer_stream_ggml(self, ref_codes: torch.Tensor, ref_text: str, input_text: str) -> Generator[np.ndarray, None, None]:
        """Stream generation with the llama.cpp backbone, decoding audio in chunks.

        Each chunk is decoded with lookback/lookforward context frames, trimmed
        to the new region, and blended with previous chunks via
        _linear_overlap_add so chunk boundaries cross-fade smoothly.
        """
        ref_text = phonemize_with_dict(ref_text)
        input_text = phonemize_with_dict(input_text)

        codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
        prompt = (
            f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text} {input_text}"
            f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
        )

        audio_cache: list[np.ndarray] = []   # decoded chunks awaiting overlap-add
        token_cache: list[str] = [f"<|speech_{idx}|>" for idx in ref_codes]
        n_decoded_samples: int = 0           # samples already yielded
        n_decoded_tokens: int = len(ref_codes)  # tokens already turned into audio

        # NOTE: temperature here (0.2) deliberately differs from the one-shot
        # path (1.0); lower temperature stabilizes incremental generation.
        for item in self.backbone(
            prompt,
            max_tokens=self.max_context,
            temperature=0.2,
            top_k=50,
            stop=["<|SPEECH_GENERATION_END|>"],
            stream=True
        ):
            output_str = item["choices"][0]["text"]
            token_cache.append(output_str)

            # Decode once enough new tokens (chunk + lookforward) have arrived.
            if len(token_cache[n_decoded_tokens:]) >= self.streaming_frames_per_chunk + self.streaming_lookforward:

                # decode chunk: include lookback/overlap context before and
                # lookforward/overlap context after the new frames.
                tokens_start = max(
                    n_decoded_tokens
                    - self.streaming_lookback
                    - self.streaming_overlap_frames,
                    0
                )
                tokens_end = (
                    n_decoded_tokens
                    + self.streaming_frames_per_chunk
                    + self.streaming_lookforward
                    + self.streaming_overlap_frames
                )
                # Sample offsets of the kept region within the decoded context.
                sample_start = (
                    n_decoded_tokens - tokens_start
                ) * self.hop_length
                sample_end = (
                    sample_start
                    + (self.streaming_frames_per_chunk + 2 * self.streaming_overlap_frames) * self.hop_length
                )
                curr_codes = token_cache[tokens_start:tokens_end]
                recon = self._decode("".join(curr_codes))
                recon = recon[sample_start:sample_end]
                audio_cache.append(recon)

                # postprocess: overlap-add all chunks, then emit only the
                # samples not yielded yet.
                processed_recon = _linear_overlap_add(
                    audio_cache, stride=self.streaming_stride_samples
                )
                new_samples_end = len(audio_cache) * self.streaming_stride_samples
                processed_recon = processed_recon[
                    n_decoded_samples:new_samples_end
                ]
                n_decoded_samples = new_samples_end
                n_decoded_tokens += self.streaming_frames_per_chunk
                yield processed_recon

        # final decoding handled separately as non-constant chunk size
        remaining_tokens = len(token_cache) - n_decoded_tokens
        if len(token_cache) > n_decoded_tokens:
            tokens_start = max(
                len(token_cache)
                - (self.streaming_lookback + self.streaming_overlap_frames + remaining_tokens),
                0
            )
            sample_start = (
                len(token_cache)
                - tokens_start
                - remaining_tokens
                - self.streaming_overlap_frames
            ) * self.hop_length
            curr_codes = token_cache[tokens_start:]
            recon = self._decode("".join(curr_codes))
            recon = recon[sample_start:]
            audio_cache.append(recon)

        # Flush whatever audio remains after the last full chunk.
        processed_recon = _linear_overlap_add(audio_cache, stride=self.streaming_stride_samples)
        processed_recon = processed_recon[n_decoded_samples:]
        yield processed_recon