added retrieval feature
app.py
CHANGED
@@ -1,8 +1,12 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForMaskedLM
 import torch

-# --- Model Loading ---
 tokenizer_splade = None
 model_splade = None
 tokenizer_splade_lexical = None
@@ -14,7 +18,7 @@ model_splade_doc = None
 try:
     tokenizer_splade = AutoTokenizer.from_pretrained("naver/splade-cocondenser-selfdistil")
     model_splade = AutoModelForMaskedLM.from_pretrained("naver/splade-cocondenser-selfdistil")
-    model_splade.eval()
     print("SPLADE-cocondenser-distil model loaded successfully!")
 except Exception as e:
     print(f"Error loading SPLADE-cocondenser-distil model: {e}")
@@ -25,7 +29,7 @@ try:
     splade_lexical_model_name = "naver/splade-v3-lexical"
     tokenizer_splade_lexical = AutoTokenizer.from_pretrained(splade_lexical_model_name)
     model_splade_lexical = AutoModelForMaskedLM.from_pretrained(splade_lexical_model_name)
-    model_splade_lexical.eval()
     print(f"SPLADE-v3-Lexical model '{splade_lexical_model_name}' loaded successfully!")
 except Exception as e:
     print(f"Error loading SPLADE-v3-Lexical model: {e}")
@@ -36,19 +40,35 @@ try:
     splade_doc_model_name = "naver/splade-v3-doc"
     tokenizer_splade_doc = AutoTokenizer.from_pretrained(splade_doc_model_name)
     model_splade_doc = AutoModelForMaskedLM.from_pretrained(splade_doc_model_name)
-    model_splade_doc.eval()
     print(f"SPLADE-v3-Doc model '{splade_doc_model_name}' loaded successfully!")
 except Exception as e:
     print(f"Error loading SPLADE-v3-Doc model: {e}")
     print(f"Please ensure '{splade_doc_model_name}' is accessible (check Hugging Face Hub for potential agreements).")


-# ---
 def create_lexical_bow_mask(input_ids, vocab_size, tokenizer):
-    """
-    Creates a binary bag-of-words mask from input_ids,
-    zeroing out special tokens and padding.
-    """
     bow_mask = torch.zeros(vocab_size, device=input_ids.device)
     meaningful_token_ids = []
     for token_id in input_ids.squeeze().tolist():
@@ -60,14 +80,15 @@ def create_lexical_bow_mask(input_ids, vocab_size, tokenizer):
            tokenizer.unk_token_id
        ]:
            meaningful_token_ids.append(token_id)
-
    if meaningful_token_ids:
        bow_mask[list(set(meaningful_token_ids))] = 1

    return bow_mask.unsqueeze(0)


-# --- Core Representation Functions ---

 def get_splade_cocondenser_representation(text):
     if tokenizer_splade is None or model_splade is None:
@@ -80,7 +101,6 @@ def get_splade_cocondenser_representation(text):
         output = model_splade(**inputs)

     if hasattr(output, 'logits'):
-        # Standard SPLADE calculation for learned weighting and expansion
         splade_vector = torch.max(
             torch.log(1 + torch.relu(output.logits)) * inputs['attention_mask'].unsqueeze(-1),
             dim=1
@@ -90,7 +110,7 @@

     indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
     if not isinstance(indices, list):
-        indices = [indices]

     values = splade_vector[indices].cpu().tolist()
     token_weights = dict(zip(indices, values))
@@ -139,12 +159,12 @@ def get_splade_lexical_representation(text):
         vocab_size = tokenizer_splade_lexical.vocab_size
         bow_mask = create_lexical_bow_mask(
             inputs['input_ids'], vocab_size, tokenizer_splade_lexical
-        ).squeeze()
         splade_vector = splade_vector * bow_mask

     indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
     if not isinstance(indices, list):
-        indices = [indices]

     values = splade_vector[indices].cpu().tolist()
     token_weights = dict(zip(indices, values))
@@ -171,7 +191,6 @@ def get_splade_lexical_representation(text):
     return formatted_output


-# Function for SPLADE-v3-Doc representation (Binary Sparse - Lexical Only)
 def get_splade_doc_representation(text):
     if tokenizer_splade_doc is None or model_splade_doc is None:
         return "SPLADE-v3-Doc model is not loaded. Please check the console for loading errors."
@@ -185,19 +204,15 @@
     if not hasattr(output, "logits"):
         return "SPLADE-v3-Doc model output structure not as expected. 'logits' not found."

-    # For SPLADE-v3-Doc, assuming output is designed to be binary and lexical-only.
-    # We will derive the output directly from the input tokens themselves,
-    # as the model's primary role in this context is as a pre-trained LM feature extractor
-    # for a document-side, lexical-only binary sparse representation.
     vocab_size = tokenizer_splade_doc.vocab_size
-    binary_splade_vector = create_lexical_bow_mask(
         inputs['input_ids'], vocab_size, tokenizer_splade_doc
     ).squeeze()

     indices = torch.nonzero(binary_splade_vector).squeeze().cpu().tolist()
     if not isinstance(indices, list):
         indices = [indices] if indices else []
-
     values = [1.0] * len(indices)  # All values are 1 for binary representation
     token_weights = dict(zip(indices, values))

@@ -226,41 +241,243 @@
     return formatted_output


-# --- Unified Prediction Function for
-def
     if model_choice == "SPLADE-cocondenser-distil (weighting and expansion)":
         return get_splade_cocondenser_representation(text)
     elif model_choice == "SPLADE-v3-Lexical (weighting)":
         return get_splade_lexical_representation(text)
     elif model_choice == "SPLADE-v3-Doc (binary)":
-        return get_splade_doc_representation(text)
     else:
         return "Please select a model."

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 demo.launch()
 import gradio as gr
+from transformers import AutoTokenizer, AutoModelForMaskedLM
 import torch
+import numpy as np
+from tqdm.auto import tqdm
+import os
+import ir_datasets

+# --- Model Loading (Keep as is) ---
 tokenizer_splade = None
 model_splade = None
 tokenizer_splade_lexical = None
 try:
     tokenizer_splade = AutoTokenizer.from_pretrained("naver/splade-cocondenser-selfdistil")
     model_splade = AutoModelForMaskedLM.from_pretrained("naver/splade-cocondenser-selfdistil")
+    model_splade.eval()
     print("SPLADE-cocondenser-distil model loaded successfully!")
 except Exception as e:
     print(f"Error loading SPLADE-cocondenser-distil model: {e}")
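Each loader now calls `model.eval()` straight after `from_pretrained`. For readers unfamiliar with the pattern, here is a minimal, illustrative sketch (not part of app.py; the variable names are made up) of why that pairs with the `torch.no_grad()` blocks used further down:

```python
# Illustrative only: eval() freezes dropout behaviour,
# torch.no_grad() skips building the autograd graph during scoring.
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

tok = AutoTokenizer.from_pretrained("naver/splade-cocondenser-selfdistil")
mlm = AutoModelForMaskedLM.from_pretrained("naver/splade-cocondenser-selfdistil").eval()

with torch.no_grad():
    out = mlm(**tok("sparse retrieval with SPLADE", return_tensors="pt"))
print(out.logits.shape)  # (batch, seq_len, vocab_size)
```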
     splade_lexical_model_name = "naver/splade-v3-lexical"
     tokenizer_splade_lexical = AutoTokenizer.from_pretrained(splade_lexical_model_name)
     model_splade_lexical = AutoModelForMaskedLM.from_pretrained(splade_lexical_model_name)
+    model_splade_lexical.eval()
     print(f"SPLADE-v3-Lexical model '{splade_lexical_model_name}' loaded successfully!")
 except Exception as e:
     print(f"Error loading SPLADE-v3-Lexical model: {e}")
     splade_doc_model_name = "naver/splade-v3-doc"
     tokenizer_splade_doc = AutoTokenizer.from_pretrained(splade_doc_model_name)
     model_splade_doc = AutoModelForMaskedLM.from_pretrained(splade_doc_model_name)
+    model_splade_doc.eval()
     print(f"SPLADE-v3-Doc model '{splade_doc_model_name}' loaded successfully!")
 except Exception as e:
     print(f"Error loading SPLADE-v3-Doc model: {e}")
     print(f"Please ensure '{splade_doc_model_name}' is accessible (check Hugging Face Hub for potential agreements).")


+# --- Global Variables for Document Index ---
+document_representations = {}  # Stores {doc_id: sparse_vector}
+document_texts = {}            # Stores {doc_id: doc_text}
+initial_doc_model_for_indexing = "SPLADE-cocondenser-distil"  # Fixed for initial demo index
+
+
+# --- Load SciFact Corpus using ir_datasets ---
+def load_scifact_corpus_ir_datasets():
+    global document_texts
+    print("Loading SciFact corpus using ir_datasets...")
+    try:
+        dataset = ir_datasets.load("scifact")
+        for doc in tqdm(dataset.docs_iter(), desc="Loading SciFact documents"):
+            document_texts[doc.doc_id] = doc.text.strip()
+        print(f"Loaded {len(document_texts)} documents from SciFact corpus.")
+    except Exception as e:
+        print(f"Error loading SciFact corpus with ir_datasets: {e}")
+        print("Please ensure 'ir_datasets' is installed and your internet connection is stable.")
+
+
+# --- Helper function for lexical mask (Keep as is) ---
 def create_lexical_bow_mask(input_ids, vocab_size, tokenizer):
     bow_mask = torch.zeros(vocab_size, device=input_ids.device)
     meaningful_token_ids = []
     for token_id in input_ids.squeeze().tolist():
            tokenizer.unk_token_id
        ]:
            meaningful_token_ids.append(token_id)
+
    if meaningful_token_ids:
        bow_mask[list(set(meaningful_token_ids))] = 1

    return bow_mask.unsqueeze(0)


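For reference, `create_lexical_bow_mask` returns a `(1, vocab_size)` tensor with 1.0 at the vocabulary ids of the input's ordinary tokens and 0.0 everywhere else. A small illustrative check (not part of the app), assuming the cocondenser tokenizer loaded above is available:

```python
# Illustrative only: inspect which vocabulary entries the mask switches on.
enc = tokenizer_splade("Sparse retrieval with SPLADE", return_tensors="pt")
mask = create_lexical_bow_mask(enc["input_ids"], tokenizer_splade.vocab_size, tokenizer_splade)
active_ids = torch.nonzero(mask.squeeze()).squeeze(-1).tolist()
print(tokenizer_splade.convert_ids_to_tokens(active_ids))
# -> the de-duplicated WordPiece tokens of the input; CLS/SEP/PAD/UNK are zeroed out
```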
+# --- Core Representation Functions (Return Formatted Strings - for Explorer Tab) ---
+# These are your original functions, re-added.

 def get_splade_cocondenser_representation(text):
     if tokenizer_splade is None or model_splade is None:
         output = model_splade(**inputs)

     if hasattr(output, 'logits'):
         splade_vector = torch.max(
             torch.log(1 + torch.relu(output.logits)) * inputs['attention_mask'].unsqueeze(-1),
             dim=1

     indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
     if not isinstance(indices, list):
+        indices = [indices] if indices else []

     values = splade_vector[indices].cpu().tolist()
     token_weights = dict(zip(indices, values))
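The `torch.max(torch.log(1 + torch.relu(...)), dim=1)` expression above is the standard SPLADE pooling: every vocabulary term j is scored as w_j = max over token positions i of log(1 + ReLU(logit_ij)), with padded positions removed by the attention mask. A toy sketch of the same computation on random logits (shapes only, not tied to the models above):

```python
# Illustrative shapes: batch=1, seq_len=4, BERT-sized vocab.
logits = torch.randn(1, 4, 30522)              # stand-in for output.logits
attention_mask = torch.tensor([[1, 1, 1, 0]])  # last position is padding

term_scores = torch.log(1 + torch.relu(logits))            # log-saturated activations
term_scores = term_scores * attention_mask.unsqueeze(-1)   # zero out padded positions
splade_vec = torch.max(term_scores, dim=1)[0].squeeze()    # max over positions -> (vocab_size,)
print(splade_vec.shape, int((splade_vec > 0).sum()), "non-zero terms")
```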
         vocab_size = tokenizer_splade_lexical.vocab_size
         bow_mask = create_lexical_bow_mask(
             inputs['input_ids'], vocab_size, tokenizer_splade_lexical
+        ).squeeze()
         splade_vector = splade_vector * bow_mask

     indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
     if not isinstance(indices, list):
+        indices = [indices] if indices else []

     values = splade_vector[indices].cpu().tolist()
     token_weights = dict(zip(indices, values))
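The multiplication by `bow_mask` above is what makes this variant weighting-only: any vocabulary term that does not occur in the input is zeroed, so no expansion terms survive. A tiny illustrative comparison (not part of the app), reusing the `_internal` helpers added later in this diff and assuming both models loaded:

```python
# Illustrative only: expansion vs. weighting-only term counts for one query.
q = "antibiotic resistance mechanisms"
expanded = get_splade_cocondenser_representation_internal(q, tokenizer_splade, model_splade)
lexical_only = get_splade_lexical_representation_internal(q, tokenizer_splade_lexical, model_splade_lexical)
if expanded is not None and lexical_only is not None:
    print("cocondenser non-zero terms:", int((expanded > 0).sum()))       # query terms + expansion terms
    print("lexical-only non-zero terms:", int((lexical_only > 0).sum()))  # only terms present in the query
```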
     return formatted_output


 def get_splade_doc_representation(text):
     if tokenizer_splade_doc is None or model_splade_doc is None:
         return "SPLADE-v3-Doc model is not loaded. Please check the console for loading errors."
     if not hasattr(output, "logits"):
         return "SPLADE-v3-Doc model output structure not as expected. 'logits' not found."

     vocab_size = tokenizer_splade_doc.vocab_size
+    binary_splade_vector = create_lexical_bow_mask(
         inputs['input_ids'], vocab_size, tokenizer_splade_doc
     ).squeeze()

     indices = torch.nonzero(binary_splade_vector).squeeze().cpu().tolist()
     if not isinstance(indices, list):
         indices = [indices] if indices else []
+
     values = [1.0] * len(indices)  # All values are 1 for binary representation
     token_weights = dict(zip(indices, values))

     return formatted_output


+# --- Unified Prediction Function for the Explorer Tab ---
+def predict_representation_explorer(model_choice, text):
     if model_choice == "SPLADE-cocondenser-distil (weighting and expansion)":
         return get_splade_cocondenser_representation(text)
     elif model_choice == "SPLADE-v3-Lexical (weighting)":
         return get_splade_lexical_representation(text)
     elif model_choice == "SPLADE-v3-Doc (binary)":
+        return get_splade_doc_representation(text)
     else:
         return "Please select a model."

+
+# --- Internal Core Representation Functions (Return Raw Vectors - for Retrieval Tab) ---
+# These are the ones ending with _internal, as previously defined.
+
+def get_splade_cocondenser_representation_internal(text, tokenizer, model):
+    if tokenizer is None or model is None: return None
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
+    with torch.no_grad(): output = model(**inputs)
+    if hasattr(output, 'logits'):
+        splade_vector = torch.max(torch.log(1 + torch.relu(output.logits)) * inputs['attention_mask'].unsqueeze(-1), dim=1)[0].squeeze()
+        return splade_vector
+    else:
+        print("Model output structure not as expected for SPLADE-cocondenser-distil. 'logits' not found.")
+        return None
+
+def get_splade_lexical_representation_internal(text, tokenizer, model):
+    if tokenizer is None or model is None: return None
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
+    with torch.no_grad(): output = model(**inputs)
+    if hasattr(output, 'logits'):
+        splade_vector = torch.max(torch.log(1 + torch.relu(output.logits)) * inputs['attention_mask'].unsqueeze(-1), dim=1)[0].squeeze()
+        vocab_size = tokenizer.vocab_size
+        bow_mask = create_lexical_bow_mask(inputs['input_ids'], vocab_size, tokenizer).squeeze()
+        splade_vector = splade_vector * bow_mask
+        return splade_vector
+    else:
+        print("Model output structure not as expected for SPLADE-v3-Lexical. 'logits' not found.")
+        return None
+
+def get_splade_doc_representation_internal(text, tokenizer, model):
+    if tokenizer is None or model is None: return None
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
+    vocab_size = tokenizer.vocab_size
+    binary_splade_vector = create_lexical_bow_mask(inputs['input_ids'], vocab_size, tokenizer).squeeze()
+    return binary_splade_vector
+
+
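Unlike the explorer functions above, the `_internal` variants return a dense `(vocab_size,)` tensor rather than a formatted string. A small illustrative sketch (not part of the app) of inspecting the top-weighted terms of such a vector and converting it to a genuinely sparse tensor, assuming the cocondenser model and tokenizer loaded earlier:

```python
# Illustrative only: top terms of a query vector, plus a sparse copy.
vec = get_splade_cocondenser_representation_internal(
    "effects of vitamin c on the immune system", tokenizer_splade, model_splade
)
if vec is not None:
    top = torch.topk(vec, k=10)
    for tid, w in zip(top.indices.tolist(), top.values.tolist()):
        print(f"{tokenizer_splade.convert_ids_to_tokens([tid])[0]:>15}  {w:.3f}")
    sparse_vec = vec.to_sparse()  # COO format: stores only the non-zero entries
    print(sparse_vec.values().shape[0], "non-zero entries out of", vec.numel())
```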
+# --- Document Indexing Function (for Retrieval Tab) ---
+def index_documents(doc_model_choice):
+    global document_representations
+    if document_representations:
+        print("Documents already indexed. Skipping re-indexing.")
+        return True
+
+    tokenizer_to_use = None
+    model_to_use = None
+    representation_func_to_use = None
+
+    if doc_model_choice == "SPLADE-cocondenser-distil":
+        if tokenizer_splade is None or model_splade is None:
+            print("SPLADE-cocondenser-distil model not loaded for indexing.")
+            return False
+        tokenizer_to_use = tokenizer_splade
+        model_to_use = model_splade
+        representation_func_to_use = get_splade_cocondenser_representation_internal
+    elif doc_model_choice == "SPLADE-v3-Lexical":
+        if tokenizer_splade_lexical is None or model_splade_lexical is None:
+            print("SPLADE-v3-Lexical model not loaded for indexing.")
+            return False
+        tokenizer_to_use = tokenizer_splade_lexical
+        model_to_use = model_splade_lexical
+        representation_func_to_use = get_splade_lexical_representation_internal
+    elif doc_model_choice == "SPLADE-v3-Doc":
+        if tokenizer_splade_doc is None or model_splade_doc is None:
+            print("SPLADE-v3-Doc model not loaded for indexing.")
+            return False
+        tokenizer_to_use = tokenizer_splade_doc
+        model_to_use = model_splade_doc
+        representation_func_to_use = get_splade_doc_representation_internal
+    else:
+        print(f"Invalid model choice for document indexing: {doc_model_choice}")
+        return False
+
+    print(f"Indexing documents using {doc_model_choice}...")
+
+    doc_items = list(document_texts.items())
+
+    for doc_id, doc_text in tqdm(doc_items, desc="Indexing Documents"):
+        sparse_vector = representation_func_to_use(doc_text, tokenizer_to_use, model_to_use)
+        if sparse_vector is not None:
+            document_representations[doc_id] = sparse_vector.cpu()
+        else:
+            print(f"Warning: Failed to get representation for doc_id {doc_id}")
+
+    print(f"Finished indexing {len(document_representations)} documents.")
+    return True
+
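index_documents re-encodes the whole corpus on every Space restart. One possible mitigation (a sketch, not in the diff; the cache file name is made up) is to stack the indexed vectors into a single tensor and cache it with `torch.save`:

```python
# Illustrative only: cache the document index to disk after the first build.
INDEX_PATH = "splade_index.pt"  # hypothetical cache file

def save_index(path=INDEX_PATH):
    doc_ids = list(document_representations.keys())
    matrix = torch.stack([document_representations[d] for d in doc_ids])  # (num_docs, vocab_size)
    torch.save({"doc_ids": doc_ids, "matrix": matrix}, path)

def load_cached_index(path=INDEX_PATH):
    if not os.path.exists(path):
        return False
    data = torch.load(path)
    for doc_id, row in zip(data["doc_ids"], data["matrix"]):
        document_representations[doc_id] = row
    return True
```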
+# --- Retrieval Function (for Retrieval Tab) ---
+def retrieve_documents(query_text, query_model_choice, indexed_doc_model_name, top_k=5):
+    if not document_representations:
+        return "Document index is not loaded or empty. Please ensure documents are indexed.", []
+
+    query_vector = None
+    query_tokenizer = None
+    query_model = None
+
+    if query_model_choice == "SPLADE-cocondenser-distil (weighting and expansion)":
+        query_tokenizer = tokenizer_splade
+        query_model = model_splade
+        query_vector = get_splade_cocondenser_representation_internal(query_text, query_tokenizer, query_model)
+    elif query_model_choice == "SPLADE-v3-Lexical (weighting)":
+        query_tokenizer = tokenizer_splade_lexical
+        query_model = model_splade_lexical
+        query_vector = get_splade_lexical_representation_internal(query_text, query_tokenizer, query_model)
+    elif query_model_choice == "SPLADE-v3-Doc (binary)":
+        query_tokenizer = tokenizer_splade_doc
+        query_model = model_splade_doc
+        query_vector = get_splade_doc_representation_internal(query_text, query_tokenizer, query_model)
+    else:
+        return "Invalid query model choice.", []
+
+    if query_vector is None:
+        return "Failed to get query representation. Check console for model loading errors.", []
+
+    query_vector = query_vector.cpu()
+
+    scores = {}
+    for doc_id, doc_vec in document_representations.items():
+        score = torch.dot(query_vector, doc_vec).item()
+        scores[doc_id] = score
+
+    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)
+    top_results = sorted_scores[:top_k]
+
+    formatted_output = f"Retrieval Results for Query: '{query_text}'\n"
+    formatted_output += f"Using Query Model: **{query_model_choice}**\n"
+    formatted_output += f"Documents Indexed with: **{indexed_doc_model_name}**\n\n"
+
+    if not top_results:
+        formatted_output += "No documents found or scored.\n"
+    else:
+        for i, (doc_id, score) in enumerate(top_results):
+            doc_text = document_texts.get(doc_id, "Document text not available.")
+            formatted_output += f"**{i+1}. Document ID: {doc_id}** (Score: {score:.4f})\n"
+            formatted_output += f"> {doc_text[:300]}...\n\n"
+
+    return formatted_output, top_results
+
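retrieve_documents scores every document with a Python loop over `torch.dot`. Because all indexed vectors share the vocabulary dimension, the same ranking can be computed with one matrix-vector product; a sketch (not in the diff) of an equivalent batched scorer:

```python
# Illustrative only: batched dot-product scoring over the whole index.
def score_all(query_vector):
    doc_ids = list(document_representations.keys())
    doc_matrix = torch.stack([document_representations[d] for d in doc_ids])  # (num_docs, vocab_size)
    all_scores = doc_matrix @ query_vector                                     # (num_docs,)
    order = torch.argsort(all_scores, descending=True)
    return [(doc_ids[i], all_scores[i].item()) for i in order.tolist()]

# top_results = score_all(query_vector)[:top_k]
```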
+# --- Unified Prediction Function for Gradio (for Retrieval Tab) ---
+def predict_retrieval_gradio(query_text, query_model_choice, selected_doc_model_display_only):
+    formatted_output, _ = retrieve_documents(query_text, query_model_choice, initial_doc_model_for_indexing, top_k=5)
+    return formatted_output
+
+# --- Initial Load and Indexing Calls ---
+# This part runs once when the app starts.
+load_scifact_corpus_ir_datasets()  # Or load_cranfield_corpus_ir_datasets() if you switch back
+
+if initial_doc_model_for_indexing == "SPLADE-cocondenser-distil" and model_splade is not None:
+    index_documents(initial_doc_model_for_indexing)
+elif initial_doc_model_for_indexing == "SPLADE-v3-Lexical" and model_splade_lexical is not None:
+    index_documents(initial_doc_model_for_indexing)
+elif initial_doc_model_for_indexing == "SPLADE-v3-Doc" and model_splade_doc is not None:
+    index_documents(initial_doc_model_for_indexing)
+else:
+    print(f"Skipping document indexing: Model '{initial_doc_model_for_indexing}' failed to load or is not a valid choice for indexing.")
+
+
+# --- Gradio Interface Setup with Tabs ---
+with gr.Blocks(title="SPLADE Demos") as demo:
+    gr.Markdown("# 🌌 SPLADE Demos: Sparse Representation Explorer & Document Retrieval")
+    gr.Markdown("Explore different SPLADE models and their sparse representation types, or perform document retrieval on a test collection.")
+
+    with gr.Tabs():
+        with gr.TabItem("Sparse Representation Explorer"):
+            gr.Markdown("### Explore Raw SPLADE Representations for Any Text")
+            gr.Interface(
+                fn=predict_representation_explorer,
+                inputs=[
+                    gr.Radio(
+                        [
+                            "SPLADE-cocondenser-distil (weighting and expansion)",
+                            "SPLADE-v3-Lexical (weighting)",
+                            "SPLADE-v3-Doc (binary)"
+                        ],
+                        label="Choose Representation Model",
+                        value="SPLADE-cocondenser-distil (weighting and expansion)"
+                    ),
+                    gr.Textbox(
+                        lines=5,
+                        label="Enter your query or document text here:",
+                        placeholder="e.g., Why is Padua the nicest city in Italy?"
+                    )
+                ],
+                outputs=gr.Markdown(),
+                allow_flagging="never",
+                # Don't show redundant title/description within the tab, as it's above
+                # Setting live=True might be slow for complex models on every keystroke
+                # live=True
+            )
+
+        with gr.TabItem("Document Retrieval Demo"):
+            gr.Markdown("### Retrieve Documents from SciFact Collection")
+            gr.Interface(
+                fn=predict_retrieval_gradio,
+                inputs=[
+                    gr.Textbox(
+                        lines=3,
+                        label="Enter your query text here:",
+                        placeholder="e.g., Does high-dose vitamin C cure cancer?"
+                    ),
+                    gr.Radio(
+                        [
+                            "SPLADE-cocondenser-distil (weighting and expansion)",
+                            "SPLADE-v3-Lexical (weighting)",
+                            "SPLADE-v3-Doc (binary)"
+                        ],
+                        label="Choose Query Representation Model",
+                        value="SPLADE-cocondenser-distil (weighting and expansion)"
+                    ),
+                    gr.Radio(
+                        [
+                            "SPLADE-cocondenser-distil",
+                            "SPLADE-v3-Lexical",
+                            "SPLADE-v3-Doc"
+                        ],
+                        label=f"Document Index Model (Pre-indexed with: {initial_doc_model_for_indexing})",
+                        value=initial_doc_model_for_indexing,
+                        interactive=False  # This radio is fixed for simplicity
+                    )
+                ],
+                outputs=gr.Markdown(),
+                allow_flagging="never",
+                # live=True # retrieval is too heavy for live
+            )
+
 demo.launch()
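Since the retrieval tab only works if corpus loading and indexing succeeded at startup, a quick way to sanity-check the pipeline is to call the handler directly (illustrative snippet, not in the diff). Note also that the new imports mean `ir_datasets` and `tqdm` (and `numpy`, though it is unused in the hunks shown) must be declared in the Space's requirements.txt.

```python
# Illustrative smoke test: exercise the retrieval path without the UI.
print(predict_retrieval_gradio(
    "Does high-dose vitamin C cure cancer?",
    "SPLADE-cocondenser-distil (weighting and expansion)",
    initial_doc_model_for_indexing,
))
```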