Instructions to use cshbli/Moxin-TTS with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers.js
How to use cshbli/Moxin-TTS with Transformers.js:
// Install the dependency first:
// npm i @huggingface/transformers
import { pipeline } from '@huggingface/transformers';

// Allocate pipeline
// (top-level `await` requires this file to be loaded as an ES module)
const pipe = await pipeline('text-to-speech', 'cshbli/Moxin-TTS');
import json
import os

import numpy as np
import scipy.io.wavfile as wavfile
from onnxruntime import InferenceSession
from phonemizer import phonemize

# End-to-end text-to-speech example: text -> phonemes -> token IDs -> ONNX
# model -> 24 kHz WAV file. Expects the following files relative to the
# working directory: ./config_kokoro.json, ./voices/af.bin, ./onnx/model.onnx.

# === Step 1: Load phoneme-to-ID vocabulary ===
CONFIG_PATH = "./config_kokoro.json"  # Download this from Hugging Face: Kokoro-82M/config.json
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = json.load(f)
phoneme_to_id = config["vocab"]

# === Step 2: Convert text to phonemes using espeak-ng ===
text = "Hi how are you, what is your name. tell me something"
phonemes = phonemize(
    text,
    language="en-us",
    backend="espeak",
    strip=True,
    preserve_punctuation=True,
    with_stress=True,
)

# === Step 3: Filter out unsupported phonemes and convert to token IDs ===
# Characters missing from the vocabulary are dropped silently so that the
# lookup below cannot raise KeyError.
phonemes = "".join(p for p in phonemes if p in phoneme_to_id)
print("Phonemes:", phonemes)
tokens = [phoneme_to_id[p] for p in phonemes]
print("Token IDs:", tokens)

# === Step 4: Prepare style embedding and input IDs ===
# Raise explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would let an over-long sequence through.
# With the two padding tokens added below, the padded input is at most 512
# IDs long (presumably the model's context length -- confirm).
if len(tokens) > 510:
    raise ValueError("Token sequence too long (max 510 phonemes)")

# The voice file is a flat float32 buffer; reshaped to (N, 1, 256) so that
# each index along the first axis yields one (1, 256) style vector.
voices = np.fromfile('./voices/af.bin', dtype=np.float32).reshape(-1, 1, 256)
ref_s = voices[len(tokens)]  # Select style vector based on token length (before padding)
tokens = [[0, *tokens, 0]]  # Add padding tokens at the beginning and end; batch of one

# === Step 5: Run ONNX model inference ===
model_name = 'model.onnx'
sess = InferenceSession(os.path.join('onnx', model_name))
# `None` requests all model outputs; only the first one is used here.
audio = sess.run(None, {
    'input_ids': tokens,
    'style': ref_s,
    'speed': np.ones(1, dtype=np.float32),
})[0]

# === Step 6: Save output audio as a 24kHz WAV file ===
wavfile.write('audio.wav', 24000, audio[0])
print("✅ Audio saved to audio.wav")