cshbli
/

Moxin-TTS

Transformers.js

style_text_to_speech_2

Model card Files Files and versions

Moxin-TTS / inference.py

Hongbing Li

Add model weights and configuration files

db9a17a 11 months ago

history blame contribute delete

1.66 kB

	import os
	import json
	import numpy as np
	import scipy.io.wavfile as wavfile
	from onnxruntime import InferenceSession
	from phonemizer import phonemize

	# === Step 1: Load phoneme-to-ID vocabulary ===
	# The JSON config is expected next to this script; the visible script reads
	# only its "vocab" entry.
	CONFIG_PATH = "./config_kokoro.json" # Download this from Hugging Face: Kokoro-82M/config.json
	with open(CONFIG_PATH, "r", encoding="utf-8") as f:
	    # The body must be nested under `with` so the file is parsed while open.
	    config = json.load(f)
	# Lookup table used later to filter phoneme characters and map them to IDs.
	phoneme_to_id = config["vocab"]

	# === Step 2: Convert text to phonemes using espeak-ng ===
	text = "Hi how are you, what is your name. tell me something"

	# Keyword options handed unchanged to phonemize(); collected in one place so
	# the call itself stays short.
	phonemizer_options = {
	    "language": "en-us",
	    "backend": "espeak",
	    "strip": True,
	    "preserve_punctuation": True,
	    "with_stress": True,
	}
	phonemes = phonemize(text, **phonemizer_options)

	# === Step 3: Filter out unsupported phonemes and convert to token IDs ===
	# Keep only the characters that have an entry in the vocabulary; anything
	# else is silently dropped.
	supported = [symbol for symbol in phonemes if symbol in phoneme_to_id]
	phonemes = "".join(supported)
	print("Phonemes:", phonemes)

	# One vocabulary ID per surviving character, in the original order.
	tokens = [phoneme_to_id[symbol] for symbol in supported]
	print("Token IDs:", tokens)

	# === Step 4: Prepare style embedding and input IDs ===
	# Raise explicitly instead of using `assert`: assertions are removed when
	# Python runs with -O, which would let an over-long sequence through.
	if len(tokens) > 510:
	    raise ValueError("Token sequence too long (max 510 phonemes)")

	# The voice file is read as raw float32 values and viewed as a stack of
	# (1, 256) style vectors.
	voices = np.fromfile('./voices/af.bin', dtype=np.float32).reshape(-1, 1, 256)
	ref_s = voices[len(tokens)] # Select style vector based on token length

	tokens = [[0, *tokens, 0]] # Add padding tokens at the beginning and end

	# === Step 5: Run ONNX model inference ===
	model_name = 'model.onnx'
	sess = InferenceSession(os.path.join('onnx', model_name))

	# NOTE(review): `tokens` is a plain nested list here; presumably onnxruntime
	# converts it using NumPy's default integer dtype -- confirm this matches the
	# model's declared `input_ids` type on every target platform.
	model_inputs = {
	    'input_ids': tokens,
	    'style': ref_s,
	    'speed': np.ones(1, dtype=np.float32),
	}
	# No output names are requested explicitly; only the first returned output
	# is kept.
	outputs = sess.run(None, model_inputs)
	audio = outputs[0]

	# === Step 6: Save output audio as a 24kHz WAV file ===
	# Only the first entry along the leading axis of the model output is written.
	waveform = audio[0]
	wavfile.write('audio.wav', 24000, waveform)
	print("✅ Audio saved to audio.wav")