ktvoice commited on
Commit
a6d6c8b
·
verified ·
1 Parent(s): 84377d0

Upload 7 files

Browse files
Files changed (7) hide show
  1. .gitattributes +37 -35
  2. README.md +11 -12
  3. app.py +196 -0
  4. config.yaml +65 -0
  5. packages.txt +3 -0
  6. requirements.txt +9 -0
  7. vieneu_tts.py +347 -0
.gitattributes CHANGED
@@ -1,35 +1,37 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
37
+ sample/thientam.mp3 filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,11 @@
1
- ---
2
- title: Kt001
3
- emoji: 🏃
4
- colorFrom: red
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 6.2.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: MyVoice
3
+ emoji: 🦜
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 6.1.0
8
+ app_file: app.py
9
+ pinned: true
10
+ license: mit
11
+ ---
 
app.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import spaces
import os
os.environ['SPACES_ZERO_GPU'] = '1'  # must be set before the GPU-aware libraries below are imported

import gradio as gr
import soundfile as sf
import tempfile
import torch
import librosa
from vieneu_tts import VieNeuTTS
import time

# --- 1. SETUP MODEL (uses your personal Hugging Face repos) ---
device = "cuda" if torch.cuda.is_available() else "cpu"

# Replace 'YOUR_USERNAME' with your Hugging Face username
MY_BACKBONE_REPO = "YOUR_USERNAME/my-vieneu-tts"
MY_CODEC_REPO = "YOUR_USERNAME/my-neucodec"

try:
    tts = VieNeuTTS(
        backbone_repo=MY_BACKBONE_REPO,
        backbone_device=device,
        codec_repo=MY_CODEC_REPO,
        codec_device=device
    )
except Exception as e:
    # Model failed to load (e.g. placeholder repo names above) — fall back to a
    # stub with the same interface so the Gradio UI still starts.
    print(f"⚠️ Lỗi khởi tạo: {e}")
    class MockTTS:
        # Mimics VieNeuTTS.encode_reference / .infer; returns 2 s of quiet noise.
        def encode_reference(self, path): return None
        def infer(self, text, ref, ref_text):
            time.sleep(1.2)  # simulate inference latency
            import numpy as np
            return np.random.uniform(-0.1, 0.1, 24000*2)
    tts = MockTTS()

# --- 2. DATA (bundled preset voice samples) ---
# Maps display name -> paths to a reference audio clip and its transcript.
VOICE_SAMPLES = {
    "Tuyên (nam miền Bắc)": {"audio": "./sample/Tuyên (nam miền Bắc).wav", "text": "./sample/Tuyên (nam miền Bắc).txt"},
    "Thiện Tâm": {"audio": "./sample/thientam.mp3", "text": "./sample/thientam.txt"},
    "Vĩnh (nam miền Nam)": {"audio": "./sample/Vĩnh (nam miền Nam).wav", "text": "./sample/Vĩnh (nam miền Nam).txt"},
    "Bình (nam miền Bắc)": {"audio": "./sample/Bình (nam miền Bắc).wav", "text": "./sample/Bình (nam miền Bắc).txt"},
    "Nguyên (nam miền Nam)": {"audio": "./sample/Nguyên (nam miền Nam).wav", "text": "./sample/Nguyên (nam miền Nam).txt"},
    "Sơn (nam miền Nam)": {"audio": "./sample/Sơn (nam miền Nam).wav", "text": "./sample/Sơn (nam miền Nam).txt"},
    "Đoan (nữ miền Nam)": {"audio": "./sample/Đoan (nữ miền Nam).wav", "text": "./sample/Đoan (nữ miền Nam).txt"},
    "Ngọc (nữ miền Bắc)": {"audio": "./sample/Ngọc (nữ miền Bắc).wav", "text": "./sample/Ngọc (nữ miền Bắc).txt"},
    "Ly (nữ miền Bắc)": {"audio": "./sample/Ly (nữ miền Bắc).wav", "text": "./sample/Ly (nữ miền Bắc).txt"},
    "Dung (nữ miền Nam)": {"audio": "./sample/Dung (nữ miền Nam).wav", "text": "./sample/Dung (nữ miền Nam).txt"}
}
50
+
51
+ # --- 3. HELPER FUNCTIONS ---
52
def load_reference_info(voice_choice):
    """Look up a preset voice and return its (audio_path, reference_text).

    Returns (None, "") when the voice name is unknown or its transcript
    file is missing on disk.
    """
    sample = VOICE_SAMPLES.get(voice_choice)
    if sample is None:
        return None, ""
    transcript_path = sample["text"]
    if not os.path.exists(transcript_path):
        return None, ""
    with open(transcript_path, "r", encoding="utf-8") as fh:
        transcript = fh.read()
    return sample["audio"], transcript
61
+
62
@spaces.GPU(duration=120)
def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, pause_level, speed_value):
    """Synthesize speech for `text` and return (output_wav_path, status_markdown).

    mode_tab selects between a bundled preset voice ("preset_mode") and a
    user-supplied clone sample ("custom_mode"). Returns (None, message) on
    any validation failure or runtime error.
    """
    try:
        if not text or text.strip() == "":
            return None, "⚠️ Vui lòng nhập nội dung!"

        # Lengthen pauses by duplicating punctuation before synthesis.
        punctuation_expansions = {
            "Trung bình": ((",", ", , "), (".", ". . ")),
            "Dài": ((",", ", , , "), (".", ". . . . ")),
        }
        spoken_text = text
        for old, new in punctuation_expansions.get(pause_level, ()):
            spoken_text = spoken_text.replace(old, new)

        # Hard cap on synthesized length.
        # NOTE(review): the UI counter advertises 250 chars but this cap is 400 — confirm intended.
        spoken_text = spoken_text[:400]

        # Resolve the reference voice: user-uploaded clone or a bundled preset.
        if mode_tab == "custom_mode":
            if custom_audio is None or not custom_text:
                return None, "⚠️ Thiếu Audio mẫu hoặc Text mẫu."
            ref_audio_path, ref_transcript = custom_audio, custom_text
        else:
            preset = VOICE_SAMPLES[voice_choice]
            ref_audio_path = preset["audio"]
            with open(preset["text"], "r", encoding="utf-8") as fh:
                ref_transcript = fh.read()

        # Inference
        started = time.time()
        ref_codes = tts.encode_reference(ref_audio_path)
        wav = tts.infer(spoken_text, ref_codes, ref_transcript)

        # Playback-speed adjustment via time-stretching (pitch-preserving).
        if speed_value != 1.0:
            wav = librosa.effects.time_stretch(wav, rate=float(speed_value))

        elapsed = time.time() - started

        # Persist to a temp .wav the Audio component can serve.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            sf.write(tmp_file.name, wav, 24000)
            output_path = tmp_file.name

        return output_path, f"⚡ Thành công: {elapsed:.2f}s | Tốc độ: {speed_value}x"
    except Exception as e:
        return None, f"❌ Lỗi: {str(e)}"
107
+
108
+ # --- 4. THEME & CSS (Deep Night Pro) ---
109
+ theme = gr.themes.Default(
110
+ primary_hue="indigo",
111
+ secondary_hue="blue",
112
+ neutral_hue="slate",
113
+ font=[gr.themes.GoogleFont('Inter'), 'sans-serif'],
114
+ ).set(
115
+ body_background_fill="#020617",
116
+ block_background_fill="#0f172a",
117
+ block_border_width="1px",
118
+ input_background_fill="#1e293b",
119
+ input_border_color="#334155",
120
+ button_primary_background_fill="linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%)",
121
+ )
122
+
123
+ css = """
124
+ .main-wrap { max-width: 1200px !important; margin: auto !important; padding: 20px !important; }
125
+ .st-card {
126
+ border-radius: 16px !important;
127
+ border: 1px solid rgba(255,255,255,0.1) !important;
128
+ box-shadow: 0 4px 20px rgba(0,0,0,0.5) !important;
129
+ padding: 15px;
130
+ }
131
+ .result-card {
132
+ background: linear-gradient(180deg, rgba(15, 23, 42, 0.8) 0%, rgba(30, 41, 59, 0.8) 100%) !important;
133
+ border: 1px solid rgba(99, 102, 241, 0.2) !important;
134
+ margin-top: 15px;
135
+ }
136
+ audio { filter: invert(90%) hue-rotate(180deg) brightness(1.5); width: 100%; border-radius: 8px; }
137
+ .footer { text-align: center; margin-top: 40px; color: #475569; font-size: 0.8rem; font-weight: 500; }
138
+ """
139
+
140
+ # --- 5. UI CONSTRUCTION ---
141
+ with gr.Blocks(title="AI Voice Studio") as demo:
142
+
143
+ with gr.Column(elem_classes="main-wrap"):
144
+ with gr.Row(equal_height=True):
145
+ # TRÁI: Nhập văn bản
146
+ with gr.Column(scale=1):
147
+ with gr.Group(elem_classes="st-card"):
148
+ text_input = gr.Textbox(
149
+ label="VĂN BẢN CẦN CHUYỂN ĐỔI",
150
+ placeholder="Chào mừng bạn. Hãy nhập nội dung vào đây...",
151
+ lines=20,
152
+ show_label=True,
153
+ )
154
+ char_count = gr.HTML("<div style='text-align: right; color: #6366f1; font-size: 0.85rem; font-weight: bold; padding: 5px;'>0 / 250</div>")
155
+
156
+ # PHẢI: Cấu hình
157
+ with gr.Column(scale=1):
158
+ with gr.Tabs() as tabs:
159
+ with gr.TabItem("👤 Nghệ sĩ đọc", id="preset_mode"):
160
+ voice_select = gr.Dropdown(
161
+ choices=list(VOICE_SAMPLES.keys()),
162
+ value="Tuyên (nam miền Bắc)",
163
+ label="Chọn giọng đọc",
164
+ )
165
+ with gr.Accordion("Nghe thử giọng mẫu", open=False):
166
+ ref_audio_preview = gr.Audio(interactive=False, show_label=False)
167
+ ref_text_preview = gr.Markdown("...")
168
+
169
+ with gr.TabItem("🎙️ Nhân bản (Clone)", id="custom_mode"):
170
+ custom_audio = gr.Audio(label="Audio gốc", type="filepath")
171
+ custom_text = gr.Textbox(label="NỘI DUNG AUDIO MẪU", lines=4)
172
+
173
+ # Cấu hình âm thanh chuyên nghiệp
174
+ with gr.Row():
175
+ pause_level = gr.Radio(choices=["Mặc định", "Trung bình", "Dài"], value="Mặc định", label="Độ ngắt nghỉ", scale=1)
176
+ speed_select = gr.Dropdown(choices=[0.8, 0.9, 1.0, 1.1, 1.2, 1.5], value=1.0, label="Tốc độ đọc", scale=1)
177
+
178
+ current_mode = gr.State(value="preset_mode")
179
+ gr.Markdown("<br>")
180
+ btn_generate = gr.Button("BẮT ĐẦU TỔNG HỢP", variant="primary", size="lg")
181
+
182
+ with gr.Group(elem_classes="st-card result-card"):
183
+ audio_output = gr.Audio(label="KẾT QUẢ", interactive=False, autoplay=True)
184
+ status_output = gr.Markdown("<p style='text-align: center; color: #818cf8;'>✨ Sẵn sàng</p>")
185
+
186
+ gr.HTML("<div class='footer'>ENGINE BY VIENEU-TTS • PROFESSIONAL AI SOLUTIONS 2025</div>")
187
+
188
+ # LOGIC
189
+ text_input.change(lambda t: f"<div style='text-align: right; color: {'#6366f1' if len(t)<=250 else '#f43f5e'}; font-size: 0.85rem; font-weight: bold; padding: 5px;'>{len(t)} / 250</div>", text_input, char_count)
190
+ voice_select.change(update_ref_preview, voice_select, [ref_audio_preview, ref_text_preview])
191
+ tabs.children[0].select(fn=lambda: "preset_mode", outputs=current_mode)
192
+ tabs.children[1].select(fn=lambda: "custom_mode", outputs=current_mode)
193
+ btn_generate.click(fn=synthesize_speech, inputs=[text_input, voice_select, custom_audio, custom_text, current_mode, pause_level, speed_select], outputs=[audio_output, status_output])
194
+
195
+ if __name__ == "__main__":
196
+ demo.queue().launch(theme=theme, css=css, server_name="0.0.0.0", server_port=7860)
config.yaml ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ text_settings:
2
+ max_chars_per_chunk: 256
3
+ max_total_chars_streaming: 3000
4
+
5
+ backbone_configs:
6
+ "VieNeu-TTS (GPU)":
7
+ repo: pnnbao-ump/VieNeu-TTS
8
+ supports_streaming: false
9
+ description: Chất lượng cao nhất, yêu cầu GPU
10
+ "VieNeu-TTS-q8-gguf":
11
+ repo: pnnbao-ump/VieNeu-TTS-q8-gguf
12
+ supports_streaming: true
13
+ description: Cân bằng giữa chất lượng và tốc độ
14
+ "VieNeu-TTS-q4-gguf":
15
+ repo: pnnbao-ump/VieNeu-TTS-q4-gguf
16
+ supports_streaming: true
17
+ description: Nhẹ nhất, phù hợp CPU
18
+
19
+ codec_configs:
20
+ "NeuCodec (Standard)":
21
+ repo: neuphonic/neucodec
22
+ description: Codec chuẩn, tốc độ trung bình
23
+ use_preencoded: false
24
+ "NeuCodec ONNX (Fast CPU)":
25
+ repo: neuphonic/neucodec-onnx-decoder
26
+ description: Tối ưu cho CPU, cần pre-encoded codes
27
+ use_preencoded: true
28
+
29
+ voice_samples:
30
+ "Tuyên (nam miền Bắc)":
31
+ audio: ./sample/Tuyên (nam miền Bắc).wav
32
+ text: ./sample/Tuyên (nam miền Bắc).txt
33
+ codes: ./sample/Tuyên (nam miền Bắc).pt
34
+ "Vĩnh (nam miền Nam)":
35
+ audio: ./sample/Vĩnh (nam miền Nam).wav
36
+ text: ./sample/Vĩnh (nam miền Nam).txt
37
+ codes: ./sample/Vĩnh (nam miền Nam).pt
38
+ "Bình (nam miền Bắc)":
39
+ audio: ./sample/Bình (nam miền Bắc).wav
40
+ text: ./sample/Bình (nam miền Bắc).txt
41
+ codes: ./sample/Bình (nam miền Bắc).pt
42
+ "Nguyên (nam miền Nam)":
43
+ audio: ./sample/Nguyên (nam miền Nam).wav
44
+ text: ./sample/Nguyên (nam miền Nam).txt
45
+ codes: ./sample/Nguyên (nam miền Nam).pt
46
+ "Sơn (nam miền Nam)":
47
+ audio: ./sample/Sơn (nam miền Nam).wav
48
+ text: ./sample/Sơn (nam miền Nam).txt
49
+ codes: ./sample/Sơn (nam miền Nam).pt
50
+ "Đoan (nữ miền Nam)":
51
+ audio: ./sample/Đoan (nữ miền Nam).wav
52
+ text: ./sample/Đoan (nữ miền Nam).txt
53
+ codes: ./sample/Đoan (nữ miền Nam).pt
54
+ "Ngọc (nữ miền Bắc)":
55
+ audio: ./sample/Ngọc (nữ miền Bắc).wav
56
+ text: ./sample/Ngọc (nữ miền Bắc).txt
57
+ codes: ./sample/Ngọc (nữ miền Bắc).pt
58
+ "Ly (nữ miền Bắc)":
59
+ audio: ./sample/Ly (nữ miền Bắc).wav
60
+ text: ./sample/Ly (nữ miền Bắc).txt
61
+ codes: ./sample/Ly (nữ miền Bắc).pt
62
+ "Dung (nữ miền Nam)":
63
+ audio: ./sample/Dung (nữ miền Nam).wav
64
+ text: ./sample/Dung (nữ miền Nam).txt
65
+ codes: ./sample/Dung (nữ miền Nam).pt
packages.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ espeak-ng
2
+ libespeak-ng1
3
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ spaces
3
+ torchaudio
4
+ transformers
5
+ librosa
6
+ soundfile
7
+ numpy
8
+ phonemizer
9
+ neucodec
vieneu_tts.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Generator
3
+ import librosa
4
+ import numpy as np
5
+ import torch
6
+ from neucodec import NeuCodec, DistillNeuCodec
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM
8
+ from utils.phonemize_text import phonemize_text, phonemize_with_dict
9
+ import re
10
+
11
+ def _linear_overlap_add(frames: list[np.ndarray], stride: int) -> np.ndarray:
12
+ # original impl --> https://github.com/facebookresearch/encodec/blob/main/encodec/utils.py
13
+ assert len(frames)
14
+ dtype = frames[0].dtype
15
+ shape = frames[0].shape[:-1]
16
+
17
+ total_size = 0
18
+ for i, frame in enumerate(frames):
19
+ frame_end = stride * i + frame.shape[-1]
20
+ total_size = max(total_size, frame_end)
21
+
22
+ sum_weight = np.zeros(total_size, dtype=dtype)
23
+ out = np.zeros(*shape, total_size, dtype=dtype)
24
+
25
+ offset: int = 0
26
+ for frame in frames:
27
+ frame_length = frame.shape[-1]
28
+ t = np.linspace(0, 1, frame_length + 2, dtype=dtype)[1:-1]
29
+ weight = np.abs(0.5 - (t - 0.5))
30
+
31
+ out[..., offset : offset + frame_length] += weight * frame
32
+ sum_weight[offset : offset + frame_length] += weight
33
+ offset += stride
34
+ assert sum_weight.min() > 0
35
+ return out / sum_weight
36
+
37
class VieNeuTTS:
    """Vietnamese TTS pipeline: an LLM backbone generates `<|speech_N|>` tokens
    which a NeuCodec decoder converts to a 24 kHz waveform.

    Two backbone flavours are supported: a torch `AutoModelForCausalLM`
    (non-streaming) and a GGUF-quantized model run through llama.cpp
    (streaming-capable). Three codec flavours: torch NeuCodec,
    DistillNeuCodec, or a CPU-only ONNX decoder.
    """

    def __init__(
        self,
        backbone_repo="pnnbao-ump/VieNeu-TTS",
        backbone_device="cpu",
        codec_repo="neuphonic/neucodec",
        codec_device="cpu",
    ):

        # Constants
        self.sample_rate = 24_000  # output waveform sample rate (Hz)
        self.max_context = 2048    # backbone context window, in tokens
        self.hop_length = 480      # audio samples produced per speech token
        self.streaming_overlap_frames = 1       # extra frames kept each side for cross-fading
        self.streaming_frames_per_chunk = 25    # new frames decoded per streamed chunk
        self.streaming_lookforward = 5          # future frames needed before decoding a chunk
        self.streaming_lookback = 50            # past frames re-decoded for context
        self.streaming_stride_samples = self.streaming_frames_per_chunk * self.hop_length

        # ggml & onnx flags
        self._is_quantized_model = False
        self._is_onnx_codec = False

        # HF tokenizer (only populated for the torch backbone path)
        self.tokenizer = None

        # Load models
        self._load_backbone(backbone_repo, backbone_device)
        self._load_codec(codec_repo, codec_device)

    def _load_backbone(self, backbone_repo, backbone_device):
        """Load either a llama.cpp GGUF backbone or a torch causal-LM backbone."""
        print(f"Loading backbone from: {backbone_repo} on {backbone_device} ...")

        # Repos whose name contains "gguf" are treated as quantized llama.cpp models.
        if backbone_repo.lower().endswith("gguf") or "gguf" in backbone_repo.lower():
            try:
                from llama_cpp import Llama
            except ImportError as e:
                raise ImportError(
                    "Failed to import `llama_cpp`. "
                    "Please install it with:\n"
                    " pip install llama-cpp-python"
                ) from e
            self.backbone = Llama.from_pretrained(
                repo_id=backbone_repo,
                filename="*.gguf",
                verbose=False,
                # -1 offloads all layers to GPU; 0 keeps everything on CPU.
                n_gpu_layers=-1 if backbone_device == "gpu" else 0,
                n_ctx=self.max_context,
                mlock=True,
                flash_attn=True if backbone_device == "gpu" else False,
            )
            self._is_quantized_model = True

        else:
            # Torch path: tokenizer is required for prompt construction/decoding.
            self.tokenizer = AutoTokenizer.from_pretrained(backbone_repo)
            self.backbone = AutoModelForCausalLM.from_pretrained(backbone_repo).to(
                torch.device(backbone_device)
            )

    def _load_codec(self, codec_repo, codec_device):
        """Load the audio codec matching `codec_repo`; raises ValueError on unknown repos."""
        print(f"Loading codec from: {codec_repo} on {codec_device} ...")
        match codec_repo:
            case "neuphonic/neucodec":
                self.codec = NeuCodec.from_pretrained(codec_repo)
                self.codec.eval().to(codec_device)
            case "neuphonic/distill-neucodec":
                self.codec = DistillNeuCodec.from_pretrained(codec_repo)
                self.codec.eval().to(codec_device)
            case "neuphonic/neucodec-onnx-decoder":
                if codec_device != "cpu":
                    raise ValueError("Onnx decoder only currently runs on CPU.")
                try:
                    from neucodec import NeuCodecOnnxDecoder
                except ImportError as e:
                    raise ImportError(
                        "Failed to import the onnx decoder."
                        " Ensure you have onnxruntime installed as well as neucodec >= 0.0.4."
                    ) from e
                self.codec = NeuCodecOnnxDecoder.from_pretrained(codec_repo)
                self._is_onnx_codec = True
            case _:
                raise ValueError(f"Unsupported codec repository: {codec_repo}")

    def infer(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> np.ndarray:
        """
        Perform inference to generate speech from text using the TTS model and reference audio.

        Args:
            text (str): Input text to be converted to speech.
            ref_codes (np.ndarray | torch.Tensor): Encoded reference (speech-token ids).
            ref_text (str): Transcript of the reference audio.
        Returns:
            np.ndarray: Generated speech waveform.
        """

        # Generate tokens
        if self._is_quantized_model:
            output_str = self._infer_ggml(ref_codes, ref_text, text)
        else:
            prompt_ids = self._apply_chat_template(ref_codes, ref_text, text)
            output_str = self._infer_torch(prompt_ids)

        # Decode
        wav = self._decode(output_str)

        return wav

    def infer_stream(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> Generator[np.ndarray, None, None]:
        """
        Perform streaming inference to generate speech from text using the TTS model and reference audio.

        Only supported for the GGUF (llama.cpp) backbone.

        Args:
            text (str): Input text to be converted to speech.
            ref_codes (np.ndarray | torch.Tensor): Encoded reference (speech-token ids).
            ref_text (str): Transcript of the reference audio.
        Yields:
            np.ndarray: Generated speech waveform chunks.
        Raises:
            NotImplementedError: When the backbone is the torch model.
        """

        if self._is_quantized_model:
            return self._infer_stream_ggml(ref_codes, ref_text, text)
        else:
            raise NotImplementedError("Streaming is not implemented for the torch backend!")

    def encode_reference(self, ref_audio_path: str | Path):
        """Load a reference clip (mono, resampled to 16 kHz) and encode it to speech-token ids."""
        wav, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
        wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).unsqueeze(0)  # [1, 1, T]
        with torch.no_grad():
            ref_codes = self.codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0)
        return ref_codes

    def _decode(self, codes: str):
        """Decode speech tokens to audio waveform."""
        # Extract speech token IDs using regex
        speech_ids = [int(num) for num in re.findall(r"<\|speech_(\d+)\|>", codes)]

        if len(speech_ids) == 0:
            raise ValueError(
                "No valid speech tokens found in the output. "
                "The model may not have generated proper speech tokens."
            )

        # Onnx decode
        if self._is_onnx_codec:
            codes = np.array(speech_ids, dtype=np.int32)[np.newaxis, np.newaxis, :]
            recon = self.codec.decode_code(codes)
        # Torch decode
        else:
            with torch.no_grad():
                codes = torch.tensor(speech_ids, dtype=torch.long)[None, None, :].to(
                    self.codec.device
                )
                recon = self.codec.decode_code(codes).cpu().numpy()

        # Both paths produce a [1, 1, T] array; return the mono waveform.
        return recon[0, 0, :]

    def _apply_chat_template(self, ref_codes: list[int], ref_text: str, input_text: str) -> list[int]:
        """Build the torch-backbone prompt: phonemized ref+input text spliced into the
        chat template, followed by the reference speech tokens as generation prefix."""
        # The reference transcript is prepended so the model continues in the same voice.
        input_text = phonemize_with_dict(ref_text) + " " + phonemize_with_dict(input_text)

        speech_replace = self.tokenizer.convert_tokens_to_ids("<|SPEECH_REPLACE|>")
        speech_gen_start = self.tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_START|>")
        text_replace = self.tokenizer.convert_tokens_to_ids("<|TEXT_REPLACE|>")
        text_prompt_start = self.tokenizer.convert_tokens_to_ids("<|TEXT_PROMPT_START|>")
        text_prompt_end = self.tokenizer.convert_tokens_to_ids("<|TEXT_PROMPT_END|>")

        input_ids = self.tokenizer.encode(input_text, add_special_tokens=False)
        chat = """user: Convert the text to speech:<|TEXT_REPLACE|>\nassistant:<|SPEECH_REPLACE|>"""
        ids = self.tokenizer.encode(chat)

        # Replace the <|TEXT_REPLACE|> placeholder with the wrapped text prompt.
        text_replace_idx = ids.index(text_replace)
        ids = (
            ids[:text_replace_idx]
            + [text_prompt_start]
            + input_ids
            + [text_prompt_end]
            + ids[text_replace_idx + 1 :]  # noqa
        )

        # Replace <|SPEECH_REPLACE|> (and drop anything after it) with the
        # generation-start marker plus the reference codes as a prefix.
        speech_replace_idx = ids.index(speech_replace)
        codes_str = "".join([f"<|speech_{i}|>" for i in ref_codes])
        codes = self.tokenizer.encode(codes_str, add_special_tokens=False)
        ids = ids[:speech_replace_idx] + [speech_gen_start] + list(codes)

        return ids

    def _infer_torch(self, prompt_ids: list[int]) -> str:
        """Run sampled generation with the torch backbone; return only the newly
        generated text (speech tokens) after the prompt."""
        prompt_tensor = torch.tensor(prompt_ids).unsqueeze(0).to(self.backbone.device)
        speech_end_id = self.tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_END|>")
        with torch.no_grad():
            output_tokens = self.backbone.generate(
                prompt_tensor,
                max_length=self.max_context,
                eos_token_id=speech_end_id,
                do_sample=True,
                temperature=1,
                top_k=50,
                use_cache=True,
                min_new_tokens=50,
            )
        # Strip the prompt; decode only the continuation.
        input_length = prompt_tensor.shape[-1]
        output_str = self.tokenizer.decode(
            output_tokens[0, input_length:].cpu().numpy().tolist(), add_special_tokens=False
        )
        return output_str

    def _infer_ggml(self, ref_codes: list[int], ref_text: str, input_text: str) -> str:
        """Run one-shot generation with the llama.cpp backbone; return the raw
        generated speech-token string."""
        ref_text = phonemize_with_dict(ref_text)
        input_text = phonemize_with_dict(input_text)

        # Prompt mirrors _apply_chat_template but is built as a plain string.
        codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
        prompt = (
            f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text} {input_text}"
            f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
        )
        output = self.backbone(
            prompt,
            max_tokens=self.max_context,
            temperature=1.0,
            top_k=50,
            stop=["<|SPEECH_GENERATION_END|>"],
        )
        output_str = output["choices"][0]["text"]
        return output_str

    def _infer_stream_ggml(self, ref_codes: torch.Tensor, ref_text: str, input_text: str) -> Generator[np.ndarray, None, None]:
        """Stream generation with the llama.cpp backbone, decoding audio in chunks.

        Each chunk is decoded with lookback/lookforward context frames, trimmed
        to the new region, and blended with previous chunks via
        _linear_overlap_add so chunk boundaries cross-fade smoothly.
        """
        ref_text = phonemize_with_dict(ref_text)
        input_text = phonemize_with_dict(input_text)

        codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
        prompt = (
            f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text} {input_text}"
            f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
        )

        audio_cache: list[np.ndarray] = []   # decoded chunks awaiting overlap-add
        token_cache: list[str] = [f"<|speech_{idx}|>" for idx in ref_codes]
        n_decoded_samples: int = 0           # samples already yielded
        n_decoded_tokens: int = len(ref_codes)  # tokens already turned into audio

        # NOTE: temperature here (0.2) deliberately differs from the one-shot
        # path (1.0); lower temperature stabilizes incremental generation.
        for item in self.backbone(
            prompt,
            max_tokens=self.max_context,
            temperature=0.2,
            top_k=50,
            stop=["<|SPEECH_GENERATION_END|>"],
            stream=True
        ):
            output_str = item["choices"][0]["text"]
            token_cache.append(output_str)

            # Decode once enough new tokens (chunk + lookforward) have arrived.
            if len(token_cache[n_decoded_tokens:]) >= self.streaming_frames_per_chunk + self.streaming_lookforward:

                # decode chunk: include lookback/overlap context before and
                # lookforward/overlap context after the new frames.
                tokens_start = max(
                    n_decoded_tokens
                    - self.streaming_lookback
                    - self.streaming_overlap_frames,
                    0
                )
                tokens_end = (
                    n_decoded_tokens
                    + self.streaming_frames_per_chunk
                    + self.streaming_lookforward
                    + self.streaming_overlap_frames
                )
                # Sample offsets of the kept region within the decoded context.
                sample_start = (
                    n_decoded_tokens - tokens_start
                ) * self.hop_length
                sample_end = (
                    sample_start
                    + (self.streaming_frames_per_chunk + 2 * self.streaming_overlap_frames) * self.hop_length
                )
                curr_codes = token_cache[tokens_start:tokens_end]
                recon = self._decode("".join(curr_codes))
                recon = recon[sample_start:sample_end]
                audio_cache.append(recon)

                # postprocess: overlap-add all chunks, then emit only the
                # samples not yielded yet.
                processed_recon = _linear_overlap_add(
                    audio_cache, stride=self.streaming_stride_samples
                )
                new_samples_end = len(audio_cache) * self.streaming_stride_samples
                processed_recon = processed_recon[
                    n_decoded_samples:new_samples_end
                ]
                n_decoded_samples = new_samples_end
                n_decoded_tokens += self.streaming_frames_per_chunk
                yield processed_recon

        # final decoding handled separately as non-constant chunk size
        remaining_tokens = len(token_cache) - n_decoded_tokens
        if len(token_cache) > n_decoded_tokens:
            tokens_start = max(
                len(token_cache)
                - (self.streaming_lookback + self.streaming_overlap_frames + remaining_tokens),
                0
            )
            sample_start = (
                len(token_cache)
                - tokens_start
                - remaining_tokens
                - self.streaming_overlap_frames
            ) * self.hop_length
            curr_codes = token_cache[tokens_start:]
            recon = self._decode("".join(curr_codes))
            recon = recon[sample_start:]
            audio_cache.append(recon)

        # Flush whatever audio remains after the last full chunk.
        processed_recon = _linear_overlap_add(audio_cache, stride=self.streaming_stride_samples)
        processed_recon = processed_recon[n_decoded_samples:]
        yield processed_recon