Update app.py
app.py CHANGED
@@ -2,8 +2,8 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModelForMaskedLM
 import torch
 import numpy as np
-from tqdm.auto import tqdm
-import os
+from tqdm.auto import tqdm
+import os

 # --- Model Loading ---
 tokenizer_splade = None
@@ -34,11 +34,11 @@ except Exception as e:
     print(f"Error loading SPLADE-v3-Lexical model: {e}")
     print(f"Please ensure '{splade_lexical_model_name}' is accessible (check Hugging Face Hub for potential agreements).")

-# Load SPLADE v3 Doc model
+# Load SPLADE v3 Doc model - Model loading is still necessary even if its logits aren't used for BoW
 try:
     splade_doc_model_name = "naver/splade-v3-doc"
     tokenizer_splade_doc = AutoTokenizer.from_pretrained(splade_doc_model_name)
-    model_splade_doc = AutoModelForMaskedLM.from_pretrained(splade_doc_model_name)
+    model_splade_doc = AutoModelForMaskedLM.from_pretrained(splade_doc_model_name) # Still load the model
     model_splade_doc.eval()
     print(f"SPLADE-v3-Doc model '{splade_doc_model_name}' loaded successfully!")
 except Exception as e:
@@ -183,25 +183,19 @@ def get_splade_lexical_representation(text):


 def get_splade_doc_representation(text):
-    if tokenizer_splade_doc is None
-        return "SPLADE-v3-Doc
+    if tokenizer_splade_doc is None: # No longer need model_splade_doc to be loaded for 'logits'
+        return "SPLADE-v3-Doc tokenizer is not loaded. Please check the console for loading errors."

     inputs = tokenizer_splade_doc(text, return_tensors="pt", padding=True, truncation=True)
-    inputs = {k: v.to(
-
-    with torch.no_grad():
-        output = model_splade_doc(**inputs)
-
-    if not hasattr(output, "logits"):
-        return "Model output structure not as expected. 'logits' not found."
+    inputs = {k: v.to(torch.device("cpu")) for k, v in inputs.items()} # Ensure on CPU for direct mask creation

     vocab_size = tokenizer_splade_doc.vocab_size
-    #
-
+    # Directly create the binary Bag-of-Words vector using the input_ids
+    binary_bow_vector = create_lexical_bow_mask(
         inputs['input_ids'], vocab_size, tokenizer_splade_doc
     ).squeeze() # Squeeze back for single output

-    indices = torch.nonzero(
+    indices = torch.nonzero(binary_bow_vector).squeeze().cpu().tolist()
     if not isinstance(indices, list):
         indices = [indices] if indices else []

@@ -216,7 +210,7 @@ def get_splade_doc_representation(text):

     sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for clarity

-    formatted_output = "
+    formatted_output = "Binary Bag-of-Words Representation:\n" # Changed title
     if not sorted_representation:
         formatted_output += "No significant terms found for this input.\n"
     else:
@@ -226,7 +220,7 @@ def get_splade_doc_representation(text):
                 break
             formatted_output += f"- **{term}**\n"

-    formatted_output += "\n--- Raw Binary
+    formatted_output += "\n--- Raw Binary Bag-of-Words Vector Info ---\n" # Changed title
     formatted_output += f"Total activated terms: {len(indices)}\n"
     formatted_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade_doc.vocab_size):.2%}\n"

@@ -235,11 +229,11 @@ def get_splade_doc_representation(text):

 # --- Unified Prediction Function for the Explorer Tab ---
 def predict_representation_explorer(model_choice, text):
-    if model_choice == "SPLADE-cocondenser-distil
+    if model_choice == "MLM encoder (SPLADE-cocondenser-distil)":
         return get_splade_cocondenser_representation(text)
     elif model_choice == "MLP encoder (SPLADE-v3-lexical)":
         return get_splade_lexical_representation(text)
-    elif model_choice == "Binary
+    elif model_choice == "Binary Bag-of-Words": # Changed name
         return get_splade_doc_representation(text)
     else:
         return "Please select a model."
@@ -290,22 +284,18 @@ def get_splade_lexical_vector(text):
         return None

 def get_splade_doc_vector(text):
-    if tokenizer_splade_doc is None
+    if tokenizer_splade_doc is None: # No longer need model_splade_doc to be loaded for 'logits'
         return None

     inputs = tokenizer_splade_doc(text, return_tensors="pt", padding=True, truncation=True)
-    inputs = {k: v.to(
-
-    with torch.no_grad():
-        output = model_splade_doc(**inputs)
+    inputs = {k: v.to(torch.device("cpu")) for k, v in inputs.items()} # Ensure on CPU for direct mask creation

-
-
-
-
-
-
-        return None
+    vocab_size = tokenizer_splade_doc.vocab_size
+    # Directly create the binary Bag-of-Words vector using the input_ids
+    binary_bow_vector = create_lexical_bow_mask(
+        inputs['input_ids'], vocab_size, tokenizer_splade_doc
+    ).squeeze()
+    return binary_bow_vector


 # --- Function to get formatted representation from a raw vector and tokenizer ---
@@ -322,7 +312,7 @@ def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
         values = [1.0] * len(indices)
     else:
         values = splade_vector[indices].cpu().tolist()
-
+
     token_weights = dict(zip(indices, values))

     meaningful_tokens = {}
@@ -361,8 +351,8 @@ def get_model_assets(model_choice_str):
         return get_splade_cocondenser_vector, tokenizer_splade, False, "MLM encoder (SPLADE-cocondenser-distil)"
     elif model_choice_str == "MLP encoder (SPLADE-v3-lexical)":
         return get_splade_lexical_vector, tokenizer_splade_lexical, False, "MLP encoder (SPLADE-v3-lexical)"
-    elif model_choice_str == "Binary
-        return get_splade_doc_vector, tokenizer_splade_doc, True, "Binary
+    elif model_choice_str == "Binary Bag-of-Words": # Changed name
+        return get_splade_doc_vector, tokenizer_splade_doc, True, "Binary Bag-of-Words" # Changed name
     else:
         return None, None, False, "Unknown Model"

@@ -415,7 +405,7 @@ with gr.Blocks(title="SPLADE Demos") as demo:
                 [
                     "MLM encoder (SPLADE-cocondenser-distil)",
                     "MLP encoder (SPLADE-v3-lexical)",
-                    "Binary
+                    "Binary Bag-of-Words" # Changed name here
                 ],
                 label="Choose Sparse Encoder",
                 value="MLM encoder (SPLADE-cocondenser-distil)"
@@ -439,7 +429,7 @@ with gr.Blocks(title="SPLADE Demos") as demo:
         model_choices = [
             "MLM encoder (SPLADE-cocondenser-distil)",
             "MLP encoder (SPLADE-v3-lexical)",
-            "Binary
+            "Binary Bag-of-Words" # Changed name here
         ]

         gr.Interface(
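
Note: the added lines call a helper, create_lexical_bow_mask(input_ids, vocab_size, tokenizer), which is defined elsewhere in app.py and does not appear in these hunks. For context only, here is a minimal sketch of what such a helper plausibly does, assuming it marks each non-special input id once in a vocab-sized binary vector; the body below is an assumption, not the Space's actual implementation.

# Hypothetical sketch - assumed behaviour of the create_lexical_bow_mask helper called above.
import torch

def create_lexical_bow_mask(input_ids, vocab_size, tokenizer):
    # input_ids: LongTensor of shape (batch, seq_len) produced by the tokenizer
    bow_mask = torch.zeros(input_ids.size(0), vocab_size, device=input_ids.device)
    special_ids = set(tokenizer.all_special_ids)  # skip [CLS], [SEP], [PAD], ...
    for row, ids in enumerate(input_ids.tolist()):
        for token_id in ids:
            if token_id not in special_ids:
                bow_mask[row, token_id] = 1.0  # binary presence, not a term count
    return bow_mask  # shape (batch, vocab_size); callers squeeze for a single text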
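
As a quick sanity check of the new binary Bag-of-Words path, the snippet below mirrors the updated get_splade_doc_vector logic end to end (tokenize, build the mask, report activated terms and sparsity). It assumes the create_lexical_bow_mask sketch above; the example text and printed values are illustrative only.

import torch
from transformers import AutoTokenizer

# Tokenizer named in the diff; only the tokenizer is needed for the BoW path.
tokenizer_splade_doc = AutoTokenizer.from_pretrained("naver/splade-v3-doc")

text = "Sparse retrieval with SPLADE builds interpretable term vectors."
inputs = tokenizer_splade_doc(text, return_tensors="pt", padding=True, truncation=True)

vocab_size = tokenizer_splade_doc.vocab_size
binary_bow_vector = create_lexical_bow_mask(
    inputs["input_ids"], vocab_size, tokenizer_splade_doc
).squeeze()

indices = torch.nonzero(binary_bow_vector).squeeze().cpu().tolist()
if not isinstance(indices, list):
    indices = [indices] if indices else []

terms = tokenizer_splade_doc.convert_ids_to_tokens(indices)
print(f"Activated terms: {terms}")
print(f"Total activated terms: {len(indices)}")
print(f"Sparsity: {1 - (len(indices) / vocab_size):.2%}")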