Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -113,7 +113,7 @@ def get_splade_cocondenser_representation(text):
|
|
| 113 |
|
| 114 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
| 115 |
|
| 116 |
-
formatted_output = "MLM encoder (SPLADE-cocondenser-distil):\n\n"
|
| 117 |
if not sorted_representation:
|
| 118 |
formatted_output += "No significant terms found for this input.\n"
|
| 119 |
else:
|
|
@@ -121,10 +121,10 @@ def get_splade_cocondenser_representation(text):
|
|
| 121 |
terms_list = []
|
| 122 |
for term, weight in sorted_representation:
|
| 123 |
terms_list.append(f"**{term}**: {weight:.4f}")
|
| 124 |
-
formatted_output += ", ".join(terms_list) + "."
|
| 125 |
|
| 126 |
-
info_output = f"--- Sparse Vector Info ---\n"
|
| 127 |
-
info_output += f"Total non-zero terms in vector: {len(indices)}\n"
|
| 128 |
info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade.vocab_size):.2%}\n"
|
| 129 |
|
| 130 |
return formatted_output, info_output
|
|
@@ -171,7 +171,7 @@ def get_splade_lexical_representation(text):
|
|
| 171 |
|
| 172 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
| 173 |
|
| 174 |
-
formatted_output = "SPLADE-v3-Lexical Representation (Weighting):\n\n"
|
| 175 |
if not sorted_representation:
|
| 176 |
formatted_output += "No significant terms found for this input.\n"
|
| 177 |
else:
|
|
@@ -179,10 +179,10 @@ def get_splade_lexical_representation(text):
|
|
| 179 |
terms_list = []
|
| 180 |
for term, weight in sorted_representation:
|
| 181 |
terms_list.append(f"**{term}**: {weight:.4f}")
|
| 182 |
-
formatted_output += ", ".join(terms_list) + "."
|
| 183 |
|
| 184 |
-
info_output = f"--- Raw Sparse Vector Info ---\n"
|
| 185 |
-
info_output += f"Total non-zero terms in vector: {len(indices)}\n"
|
| 186 |
info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade_lexical.vocab_size):.2%}\n"
|
| 187 |
|
| 188 |
return formatted_output, info_output
|
|
@@ -216,7 +216,7 @@ def get_splade_doc_representation(text):
|
|
| 216 |
|
| 217 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for clarity
|
| 218 |
|
| 219 |
-
formatted_output = "Binary Bag-of-Words Representation:\n\n"
|
| 220 |
if not sorted_representation:
|
| 221 |
formatted_output += "No significant terms found for this input.\n"
|
| 222 |
else:
|
|
@@ -224,11 +224,10 @@ def get_splade_doc_representation(text):
|
|
| 224 |
terms_list = []
|
| 225 |
for term, _ in sorted_representation: # For binary, weight is always 1, so no need to display
|
| 226 |
terms_list.append(f"**{term}**")
|
| 227 |
-
formatted_output += ", ".join(terms_list) + "."
|
| 228 |
|
| 229 |
-
info_output = f"--- Raw Binary Bag-of-Words Vector Info ---\n" #
|
| 230 |
-
info_output += f"Total activated terms: {len(indices)}\n"
|
| 231 |
-
info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade_doc.vocab_size):.2%}\n"
|
| 232 |
|
| 233 |
return formatted_output, info_output
|
| 234 |
|
|
@@ -332,13 +331,12 @@ def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
|
|
| 332 |
else:
|
| 333 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
| 334 |
|
| 335 |
-
formatted_output = ""
|
| 336 |
if not sorted_representation:
|
| 337 |
formatted_output += "No significant terms found.\n"
|
| 338 |
else:
|
| 339 |
terms_list = []
|
| 340 |
for i, (term, weight) in enumerate(sorted_representation):
|
| 341 |
-
# Limit display for very long lists, but ensure it's still a paragraph if cut
|
| 342 |
if i >= 50:
|
| 343 |
terms_list.append(f"...and {len(sorted_representation) - 50} more terms.")
|
| 344 |
break
|
|
@@ -346,12 +344,13 @@ def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
|
|
| 346 |
terms_list.append(f"**{term}**")
|
| 347 |
else:
|
| 348 |
terms_list.append(f"**{term}**: {weight:.4f}")
|
| 349 |
-
formatted_output += ", ".join(terms_list) + "."
|
| 350 |
|
| 351 |
-
|
| 352 |
-
info_output
|
|
|
|
| 353 |
|
| 354 |
-
return formatted_output, info_output
|
| 355 |
|
| 356 |
|
| 357 |
# --- NEW/MODIFIED: Helper to get the correct vector function, tokenizer, binary flag, and model name string for a model choice ---
|
|
@@ -360,8 +359,8 @@ def get_model_assets(model_choice_str):
|
|
| 360 |
return get_splade_cocondenser_vector, tokenizer_splade, False, "MLM encoder (SPLADE-cocondenser-distil)"
|
| 361 |
elif model_choice_str == "MLP encoder (SPLADE-v3-lexical)":
|
| 362 |
return get_splade_lexical_vector, tokenizer_splade_lexical, False, "MLP encoder (SPLADE-v3-lexical)"
|
| 363 |
-
elif model_choice_str == "Binary Bag-of-Words":
|
| 364 |
-
return get_splade_doc_vector, tokenizer_splade_doc, True, "Binary Bag-of-Words"
|
| 365 |
else:
|
| 366 |
return None, None, False, "Unknown Model"
|
| 367 |
|
|
|
|
| 113 |
|
| 114 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
| 115 |
|
| 116 |
+
formatted_output = "MLM encoder (SPLADE-cocondenser-distil):\n\n"
|
| 117 |
if not sorted_representation:
|
| 118 |
formatted_output += "No significant terms found for this input.\n"
|
| 119 |
else:
|
|
|
|
| 121 |
terms_list = []
|
| 122 |
for term, weight in sorted_representation:
|
| 123 |
terms_list.append(f"**{term}**: {weight:.4f}")
|
| 124 |
+
formatted_output += ", ".join(terms_list) + "."
|
| 125 |
|
| 126 |
+
info_output = f"--- Sparse Vector Info ---\n" # Line 1
|
| 127 |
+
info_output += f"Total non-zero terms in vector: {len(indices)}\n" # Line 2 (and onwards for sparsity)
|
| 128 |
info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade.vocab_size):.2%}\n"
|
| 129 |
|
| 130 |
return formatted_output, info_output
|
|
|
|
| 171 |
|
| 172 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
| 173 |
|
| 174 |
+
formatted_output = "SPLADE-v3-Lexical Representation (Weighting):\n\n"
|
| 175 |
if not sorted_representation:
|
| 176 |
formatted_output += "No significant terms found for this input.\n"
|
| 177 |
else:
|
|
|
|
| 179 |
terms_list = []
|
| 180 |
for term, weight in sorted_representation:
|
| 181 |
terms_list.append(f"**{term}**: {weight:.4f}")
|
| 182 |
+
formatted_output += ", ".join(terms_list) + "."
|
| 183 |
|
| 184 |
+
info_output = f"--- Raw Sparse Vector Info ---\n" # Line 1
|
| 185 |
+
info_output += f"Total non-zero terms in vector: {len(indices)}\n" # Line 2 (and onwards for sparsity)
|
| 186 |
info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade_lexical.vocab_size):.2%}\n"
|
| 187 |
|
| 188 |
return formatted_output, info_output
|
|
|
|
| 216 |
|
| 217 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for clarity
|
| 218 |
|
| 219 |
+
formatted_output = "Binary Bag-of-Words Representation:\n\n"
|
| 220 |
if not sorted_representation:
|
| 221 |
formatted_output += "No significant terms found for this input.\n"
|
| 222 |
else:
|
|
|
|
| 224 |
terms_list = []
|
| 225 |
for term, _ in sorted_representation: # For binary, weight is always 1, so no need to display
|
| 226 |
terms_list.append(f"**{term}**")
|
| 227 |
+
formatted_output += ", ".join(terms_list) + "."
|
| 228 |
|
| 229 |
+
info_output = f"--- Raw Binary Bag-of-Words Vector Info ---\n" # Line 1
|
| 230 |
+
info_output += f"Total activated terms: {len(indices)} Sparsity: {1 - (len(indices) / tokenizer_splade_doc.vocab_size):.2%}\n" # Line 2
|
|
|
|
| 231 |
|
| 232 |
return formatted_output, info_output
|
| 233 |
|
|
|
|
| 331 |
else:
|
| 332 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
| 333 |
|
| 334 |
+
formatted_output = ""
|
| 335 |
if not sorted_representation:
|
| 336 |
formatted_output += "No significant terms found.\n"
|
| 337 |
else:
|
| 338 |
terms_list = []
|
| 339 |
for i, (term, weight) in enumerate(sorted_representation):
|
|
|
|
| 340 |
if i >= 50:
|
| 341 |
terms_list.append(f"...and {len(sorted_representation) - 50} more terms.")
|
| 342 |
break
|
|
|
|
| 344 |
terms_list.append(f"**{term}**")
|
| 345 |
else:
|
| 346 |
terms_list.append(f"**{term}**: {weight:.4f}")
|
| 347 |
+
formatted_output += ", ".join(terms_list) + "."
|
| 348 |
|
| 349 |
+
# Build the info text as two separate lines: the non-zero term count, then the sparsity
|
| 350 |
+
info_output = f"Total non-zero terms: {len(indices)}\n" # Line 1
|
| 351 |
+
info_output += f"Sparsity: {1 - (len(indices) / tokenizer.vocab_size):.2%}\n" # Line 2
|
| 352 |
|
| 353 |
+
return formatted_output, info_output
|
| 354 |
|
| 355 |
|
| 356 |
# --- NEW/MODIFIED: Helper to get the correct vector function, tokenizer, binary flag, and model name string for a model choice ---
|
|
|
|
| 359 |
return get_splade_cocondenser_vector, tokenizer_splade, False, "MLM encoder (SPLADE-cocondenser-distil)"
|
| 360 |
elif model_choice_str == "MLP encoder (SPLADE-v3-lexical)":
|
| 361 |
return get_splade_lexical_vector, tokenizer_splade_lexical, False, "MLP encoder (SPLADE-v3-lexical)"
|
| 362 |
+
elif model_choice_str == "Binary Bag-of-Words":
|
| 363 |
+
return get_splade_doc_vector, tokenizer_splade_doc, True, "Binary Bag-of-Words"
|
| 364 |
else:
|
| 365 |
return None, None, False, "Unknown Model"
|
| 366 |
|