Update app.py
app.py CHANGED
@@ -8,19 +8,16 @@ from gtts import gTTS
 import soundfile as sf
 from transformers import VitsTokenizer, VitsModel, set_seed
 
-# Clone and Install IndicTransToolkit repository
 if not os.path.exists('IndicTransToolkit'):
     os.system('git clone https://github.com/VarunGumma/IndicTransToolkit')
     os.system('cd IndicTransToolkit && python3 -m pip install --editable ./')
 
-# Ensure that IndicTransToolkit is installed and used properly
 from IndicTransToolkit import IndicProcessor
 
-# Initialize BLIP for image captioning
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to("cuda" if torch.cuda.is_available() else "cpu")
 
-
+@st.cache_resource
 def generate_caption(image_path):
     image = Image.open(image_path).convert("RGB")
     inputs = blip_processor(image, "image of", return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
@@ -29,7 +26,7 @@ def generate_caption(image_path):
     caption = blip_processor.decode(generated_ids[0], skip_special_tokens=True)
     return caption
 
-
+@st.cache_resource
 def translate_caption(caption, target_languages):
     # Load model and tokenizer
     model_name = "ai4bharat/indictrans2-en-indic-1B"
@@ -44,20 +41,16 @@ def translate_caption(caption, target_languages):
     # Source language (English)
     src_lang = "eng_Latn"
     DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-    model_IT2.to(DEVICE)
-
-    # Integrating with workflow now
+    model_IT2.to(DEVICE)
+
     input_sentences = [caption]
     translations = {}
 
     for tgt_lang in target_languages:
-        # Preprocess input sentences
         batch = ip.preprocess_batch(input_sentences, src_lang=src_lang, tgt_lang=tgt_lang)
 
-        # Tokenize the sentences and generate input encodings
         inputs = tokenizer_IT2(batch, truncation=True, padding="longest", return_tensors="pt").to(DEVICE)
 
-        # Generate translations using the model
         with torch.no_grad():
             generated_tokens = model_IT2.generate(
                 **inputs,
@@ -68,23 +61,21 @@ def translate_caption(caption, target_languages):
                 num_return_sequences=1,
             )
 
-        # Decode the generated tokens into text
         with tokenizer_IT2.as_target_tokenizer():
             generated_tokens = tokenizer_IT2.batch_decode(generated_tokens.detach().cpu().tolist(), skip_special_tokens=True, clean_up_tokenization_spaces=True)
 
-        # Postprocess the translations
         translated_texts = ip.postprocess_batch(generated_tokens, lang=tgt_lang)
         translations[tgt_lang] = translated_texts[0]
 
     return translations
 
-
+@st.cache_resource
 def generate_audio_gtts(text, lang_code, output_file):
     tts = gTTS(text=text, lang=lang_code)
     tts.save(output_file)
     return output_file
 
-
+@st.cache_resource
 def generate_audio_fbmms(text, model_name, output_file):
     tokenizer = VitsTokenizer.from_pretrained(model_name)
     model = VitsModel.from_pretrained(model_name)
@@ -114,11 +105,10 @@ if uploaded_image is not None:
     # Select target languages for translation
     target_languages = st.multiselect(
         "Select target languages for translation",
-        ["hin_Deva", "mar_Deva", "guj_Gujr", "urd_Arab"],
+        ["hin_Deva", "mar_Deva", "guj_Gujr", "urd_Arab"],
        ["hin_Deva", "mar_Deva"]
     )
 
-    # Generate Translations
     if target_languages:
         st.write("Translating Caption...")
         translations = translate_caption(caption, target_languages)
@@ -126,7 +116,6 @@ if uploaded_image is not None:
         for lang, translation in translations.items():
            st.write(f"{lang}: {translation}")
 
-        # Default to gTTS for TTS
         for lang in target_languages:
            st.write(f"Using gTTS for {lang}...")
            lang_code = {
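A note on the @st.cache_resource decorators this commit adds: Streamlit memoizes a decorated function per distinct argument tuple, so generate_caption and translate_caption are re-run only for unseen inputs. Streamlit's own guidance reserves st.cache_resource for unhashable global objects (models, tokenizers) and st.cache_data for serializable return values such as the caption and translation strings here. A minimal sketch of that stricter split, assuming the same checkpoint as above (the load_indictrans2 helper is ours, not part of app.py):

import streamlit as st
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

@st.cache_resource  # heavyweight, unhashable objects: loaded once per process
def load_indictrans2(model_name="ai4bharat/indictrans2-en-indic-1B"):
    # The IndicTrans2 checkpoints ship custom modeling code, hence trust_remote_code=True
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
    return tokenizer, model.to("cuda" if torch.cuda.is_available() else "cpu")

With the expensive load cached this way, translate_caption itself could be decorated with st.cache_data instead, since its dict-of-strings result is serializable.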
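The last hunk is cut off mid-statement: the lang_code dictionary that maps these language tags to gTTS codes is not visible in the diff. Purely for illustration, a hypothetical mapping for the four tags offered in the multiselect could look like the following (GTTS_LANG is our name; the actual dict in app.py may differ):

# Hypothetical stand-in for the truncated lang_code dict in app.py.
# gTTS expects short ISO-639 codes such as "hi", not tags like "hin_Deva".
GTTS_LANG = {
    "hin_Deva": "hi",  # Hindi
    "mar_Deva": "mr",  # Marathi
    "guj_Gujr": "gu",  # Gujarati
    "urd_Arab": "ur",  # Urdu
}
audio_file = generate_audio_gtts(translation, GTTS_LANG[lang], f"{lang}.mp3")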
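generate_audio_fbmms is likewise truncated by a hunk boundary right after the model load. Below is a sketch of how an MMS/VITS synthesis is typically finished using the imports already present in app.py (soundfile, VitsTokenizer, VitsModel, set_seed); facebook/mms-tts-hin is an assumed example checkpoint, not taken from this diff:

import soundfile as sf
import torch
from transformers import VitsTokenizer, VitsModel, set_seed

def generate_audio_fbmms(text, model_name="facebook/mms-tts-hin", output_file="output.wav"):
    tokenizer = VitsTokenizer.from_pretrained(model_name)
    model = VitsModel.from_pretrained(model_name)
    inputs = tokenizer(text=text, return_tensors="pt")
    set_seed(555)  # VITS duration prediction is stochastic; fix a seed for repeatable audio
    with torch.no_grad():
        outputs = model(**inputs)
    # outputs.waveform has shape (batch, samples); write mono audio at the model's native rate
    sf.write(output_file, outputs.waveform[0].cpu().numpy(), model.config.sampling_rate)
    return output_file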