Hugging Face Space (status: Sleeping)
Commit: Update app.py — Browse files
File changed: app.py
@@ -430,18 +430,73 @@ def calculate_dot_product_and_representations_independent(query_model_choice, do
|
|
| 430 |
if query_vector is None or doc_vector is None:
|
| 431 |
return "Failed to generate one or both vectors. Please check model loading and input text.", ""
|
| 432 |
|
| 433 |
-
# Calculate dot product
|
| 434 |
dot_product = float(torch.dot(query_vector.cpu(), doc_vector.cpu()).item())
|
| 435 |
|
| 436 |
-
# Format representations
|
| 437 |
query_main_rep_str, query_info_str = format_sparse_vector_output(query_vector, query_tokenizer, query_is_binary)
|
| 438 |
doc_main_rep_str, doc_info_str = format_sparse_vector_output(doc_vector, doc_tokenizer, doc_is_binary)
|
| 439 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 440 |
# Combine output into a single string for the Markdown component
|
| 441 |
-
|
| 442 |
-
full_output = f"### Dot Product Score: {dot_product:.6f}\n\n"
|
| 443 |
full_output += "---\n\n"
|
| 444 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
# Query Representation
|
| 446 |
full_output += f"#### Query Representation ({query_model_name_display}):\n" # Smaller heading for sub-section
|
| 447 |
full_output += f"> {query_main_rep_str}\n" # Using blockquote for the sparse list
|
|
|
|
| 430 |
if query_vector is None or doc_vector is None:
|
| 431 |
return "Failed to generate one or both vectors. Please check model loading and input text.", ""
|
| 432 |
|
| 433 |
+
# Calculate overall dot product
|
| 434 |
dot_product = float(torch.dot(query_vector.cpu(), doc_vector.cpu()).item())
|
| 435 |
|
| 436 |
+
# Format representations for display
|
| 437 |
query_main_rep_str, query_info_str = format_sparse_vector_output(query_vector, query_tokenizer, query_is_binary)
|
| 438 |
doc_main_rep_str, doc_info_str = format_sparse_vector_output(doc_vector, doc_tokenizer, doc_is_binary)
|
| 439 |
|
| 440 |
+
# --- NEW FEATURE: Calculate dot product of overlapping terms ---
|
| 441 |
+
overlapping_terms_dot_products = {}
|
| 442 |
+
query_indices = torch.nonzero(query_vector).squeeze().cpu()
|
| 443 |
+
doc_indices = torch.nonzero(doc_vector).squeeze().cpu()
|
| 444 |
+
|
| 445 |
+
# Handle cases where vectors are empty or single element
|
| 446 |
+
if query_indices.dim() == 0 and query_indices.numel() == 1:
|
| 447 |
+
query_indices = query_indices.unsqueeze(0)
|
| 448 |
+
if doc_indices.dim() == 0 and doc_indices.numel() == 1:
|
| 449 |
+
doc_indices = doc_indices.unsqueeze(0)
|
| 450 |
+
|
| 451 |
+
# Convert indices to sets for efficient intersection
|
| 452 |
+
query_index_set = set(query_indices.tolist())
|
| 453 |
+
doc_index_set = set(doc_indices.tolist())
|
| 454 |
+
|
| 455 |
+
common_indices = sorted(list(query_index_set.intersection(doc_index_set)))
|
| 456 |
+
|
| 457 |
+
if common_indices:
|
| 458 |
+
for idx in common_indices:
|
| 459 |
+
query_weight = query_vector[idx].item()
|
| 460 |
+
doc_weight = doc_vector[idx].item()
|
| 461 |
+
term = query_tokenizer.decode([idx]) # Tokenizers should be the same for this purpose
|
| 462 |
+
if term not in ["[CLS]", "[SEP]", "[PAD]", "[UNK]"] and len(term.strip()) > 0:
|
| 463 |
+
overlapping_terms_dot_products[term] = query_weight * doc_weight
|
| 464 |
+
|
| 465 |
+
sorted_overlapping_dot_products = sorted(
|
| 466 |
+
overlapping_terms_dot_products.items(),
|
| 467 |
+
key=lambda item: item[1],
|
| 468 |
+
reverse=True
|
| 469 |
+
)
|
| 470 |
+
# --- End NEW FEATURE ---
|
| 471 |
+
|
| 472 |
# Combine output into a single string for the Markdown component
|
| 473 |
+
full_output = f"### Overall Dot Product Score: {dot_product:.6f}\n\n"
|
|
|
|
| 474 |
full_output += "---\n\n"
|
| 475 |
|
| 476 |
+
# Overlapping Terms Dot Products
|
| 477 |
+
if sorted_overlapping_dot_products:
|
| 478 |
+
full_output += "### Dot Products of Overlapping Terms:\n"
|
| 479 |
+
full_output += "*(Term: Query_Weight x Document_Weight = Product)*\n\n"
|
| 480 |
+
overlap_list = []
|
| 481 |
+
for term, product_val in sorted_overlapping_dot_products:
|
| 482 |
+
# Get individual weights for display
|
| 483 |
+
query_weight = query_vector[query_tokenizer.encode(term, add_special_tokens=False)[0]].item()
|
| 484 |
+
doc_weight = doc_vector[doc_tokenizer.encode(term, add_special_tokens=False)[0]].item()
|
| 485 |
+
|
| 486 |
+
if query_is_binary and doc_is_binary:
|
| 487 |
+
overlap_list.append(f"**{term}**: 1.0000 x 1.0000 = {product_val:.4f}")
|
| 488 |
+
elif query_is_binary:
|
| 489 |
+
overlap_list.append(f"**{term}**: 1.0000 x {doc_weight:.4f} = {product_val:.4f}")
|
| 490 |
+
elif doc_is_binary:
|
| 491 |
+
overlap_list.append(f"**{term}**: {query_weight:.4f} x 1.0000 = {product_val:.4f}")
|
| 492 |
+
else:
|
| 493 |
+
overlap_list.append(f"**{term}**: {query_weight:.4f} x {doc_weight:.4f} = {product_val:.4f}")
|
| 494 |
+
full_output += ", ".join(overlap_list) + ".\n\n"
|
| 495 |
+
full_output += "---\n\n"
|
| 496 |
+
else:
|
| 497 |
+
full_output += "### No Overlapping Terms Found.\n\n"
|
| 498 |
+
full_output += "---\n\n"
|
| 499 |
+
|
| 500 |
# Query Representation
|
| 501 |
full_output += f"#### Query Representation ({query_model_name_display}):\n" # Smaller heading for sub-section
|
| 502 |
full_output += f"> {query_main_rep_str}\n" # Using blockquote for the sparse list
|