Spaces:

SiddharthAK
/

TextLSRDemo

Sleeping

App Files Community

SiddharthAK committed on May 22

Commit

ea39258

verified ·

1 Parent(s): da0c779

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -21

app.py CHANGED Viewed

@@ -113,7 +113,7 @@ def get_splade_cocondenser_representation(text):
     sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
-    formatted_output = "MLM encoder (SPLADE-cocondenser-distil):\n\n" # Added newline
     if not sorted_representation:
         formatted_output += "No significant terms found for this input.\n"
     else:
@@ -121,10 +121,10 @@ def get_splade_cocondenser_representation(text):
         terms_list = []
         for term, weight in sorted_representation:
             terms_list.append(f"**{term}**: {weight:.4f}")
-        formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
-    info_output = f"--- Sparse Vector Info ---\n"
-    info_output += f"Total non-zero terms in vector: {len(indices)}\n"
     info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade.vocab_size):.2%}\n"
     return formatted_output, info_output
@@ -171,7 +171,7 @@ def get_splade_lexical_representation(text):
     sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
-    formatted_output = "SPLADE-v3-Lexical Representation (Weighting):\n\n" # Added newline
     if not sorted_representation:
         formatted_output += "No significant terms found for this input.\n"
     else:
@@ -179,10 +179,10 @@ def get_splade_lexical_representation(text):
         terms_list = []
         for term, weight in sorted_representation:
             terms_list.append(f"**{term}**: {weight:.4f}")
-        formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
-    info_output = f"--- Raw Sparse Vector Info ---\n"
-    info_output += f"Total non-zero terms in vector: {len(indices)}\n"
     info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade_lexical.vocab_size):.2%}\n"
     return formatted_output, info_output
@@ -216,7 +216,7 @@ def get_splade_doc_representation(text):
     sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for clarity
-    formatted_output = "Binary Bag-of-Words Representation:\n\n" # Changed title, added newline
     if not sorted_representation:
         formatted_output += "No significant terms found for this input.\n"
     else:
@@ -224,11 +224,10 @@ def get_splade_doc_representation(text):
         terms_list = []
         for term, _ in sorted_representation: # For binary, weight is always 1, so no need to display
             terms_list.append(f"**{term}**")
-        formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
-    info_output = f"--- Raw Binary Bag-of-Words Vector Info ---\n" # Changed title
-    info_output += f"Total activated terms: {len(indices)}\n"
-    info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade_doc.vocab_size):.2%}\n"
     return formatted_output, info_output
@@ -332,13 +331,12 @@ def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
     else:
         sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
-    formatted_output = "" # Removed initial newline to allow control outside
     if not sorted_representation:
         formatted_output += "No significant terms found.\n"
     else:
         terms_list = []
         for i, (term, weight) in enumerate(sorted_representation):
-            # Limit display for very long lists, but ensure it's still a paragraph if cut
             if i >= 50:
                 terms_list.append(f"...and {len(sorted_representation) - 50} more terms.")
                 break
@@ -346,12 +344,13 @@ def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
                 terms_list.append(f"**{term}**")
             else:
                 terms_list.append(f"**{term}**: {weight:.4f}")
-        formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
-    info_output = f"Total non-zero terms: {len(indices)}\n"
-    info_output += f"Sparsity: {1 - (len(indices) / tokenizer.vocab_size):.2%}\n"
-    return formatted_output, info_output # Now returns two strings
 # --- NEW/MODIFIED: Helper to get the correct vector function, tokenizer, and binary flag ---
@@ -360,8 +359,8 @@ def get_model_assets(model_choice_str):
         return get_splade_cocondenser_vector, tokenizer_splade, False, "MLM encoder (SPLADE-cocondenser-distil)"
     elif model_choice_str == "MLP encoder (SPLADE-v3-lexical)":
         return get_splade_lexical_vector, tokenizer_splade_lexical, False, "MLP encoder (SPLADE-v3-lexical)"
-    elif model_choice_str == "Binary Bag-of-Words": # Changed name
-        return get_splade_doc_vector, tokenizer_splade_doc, True, "Binary Bag-of-Words" # Changed name
     else:
         return None, None, False, "Unknown Model"

     sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
+    formatted_output = "MLM encoder (SPLADE-cocondenser-distil):\n\n"
     if not sorted_representation:
         formatted_output += "No significant terms found for this input.\n"
     else:
         terms_list = []
         for term, weight in sorted_representation:
             terms_list.append(f"**{term}**: {weight:.4f}")
+        formatted_output += ", ".join(terms_list) + "."
+    info_output = f"--- Sparse Vector Info ---\n" # Line 1
+    info_output += f"Total non-zero terms in vector: {len(indices)}\n" # Line 2 (and onwards for sparsity)
     info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade.vocab_size):.2%}\n"
     return formatted_output, info_output
     sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
+    formatted_output = "SPLADE-v3-Lexical Representation (Weighting):\n\n"
     if not sorted_representation:
         formatted_output += "No significant terms found for this input.\n"
     else:
         terms_list = []
         for term, weight in sorted_representation:
             terms_list.append(f"**{term}**: {weight:.4f}")
+        formatted_output += ", ".join(terms_list) + "."
+    info_output = f"--- Raw Sparse Vector Info ---\n" # Line 1
+    info_output += f"Total non-zero terms in vector: {len(indices)}\n" # Line 2 (and onwards for sparsity)
     info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade_lexical.vocab_size):.2%}\n"
     return formatted_output, info_output
     sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for clarity
+    formatted_output = "Binary Bag-of-Words Representation:\n\n"
     if not sorted_representation:
         formatted_output += "No significant terms found for this input.\n"
     else:
         terms_list = []
         for term, _ in sorted_representation: # For binary, weight is always 1, so no need to display
             terms_list.append(f"**{term}**")
+        formatted_output += ", ".join(terms_list) + "."
+    info_output = f"--- Raw Binary Bag-of-Words Vector Info ---\n" # Line 1
+    info_output += f"Total activated terms: {len(indices)} Sparsity: {1 - (len(indices) / tokenizer_splade_doc.vocab_size):.2%}\n" # Line 2
     return formatted_output, info_output
     else:
         sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
+    formatted_output = ""
     if not sorted_representation:
         formatted_output += "No significant terms found.\n"
     else:
         terms_list = []
         for i, (term, weight) in enumerate(sorted_representation):
             if i >= 50:
                 terms_list.append(f"...and {len(sorted_representation) - 50} more terms.")
                 break
                 terms_list.append(f"**{term}**")
             else:
                 terms_list.append(f"**{term}**: {weight:.4f}")
+        formatted_output += ", ".join(terms_list) + "."
+    # This is the line that will now always be split into two
+    info_output = f"Total non-zero terms: {len(indices)}\n" # Line 1
+    info_output += f"Sparsity: {1 - (len(indices) / tokenizer.vocab_size):.2%}\n" # Line 2
+    return formatted_output, info_output
 # --- NEW/MODIFIED: Helper to get the correct vector function, tokenizer, and binary flag ---
         return get_splade_cocondenser_vector, tokenizer_splade, False, "MLM encoder (SPLADE-cocondenser-distil)"
     elif model_choice_str == "MLP encoder (SPLADE-v3-lexical)":
         return get_splade_lexical_vector, tokenizer_splade_lexical, False, "MLP encoder (SPLADE-v3-lexical)"
+    elif model_choice_str == "Binary Bag-of-Words":
+        return get_splade_doc_vector, tokenizer_splade_doc, True, "Binary Bag-of-Words"
     else:
         return None, None, False, "Unknown Model"