Spaces:

Shankarm08
/

pdfcsvdatarag

Sleeping

App Files Files Community

Shankarm08 commited on Oct 6, 2024

Commit

a52a9bb

verified ·

1 Parent(s): 7108a73

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -77

app.py CHANGED Viewed

@@ -1,110 +1,83 @@
 import streamlit as st
-import pandas as pd
 import torch
-import faiss
-import numpy as np
-from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration
 import pdfplumber
-import pytesseract
 from sklearn.metrics.pairwise import cosine_similarity
-# Load the RAG tokenizer and model
-tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
-retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
-model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq")
-# Function to get embeddings for FAISS index
-def get_faiss_index(data_chunks):
-    embeddings = [retriever.question_encoder_tokenizer(chunk, return_tensors="pt").input_ids for chunk in data_chunks]
-    embeddings = torch.cat(embeddings).detach().numpy()
-    # Build FAISS index
-    index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 distance
-    index.add(embeddings)
-    return index, embeddings
-# Extract text and tables from PDF (with OCR fallback)
 def extract_text_from_pdf(pdf_file):
-    text = ""
     with pdfplumber.open(pdf_file) as pdf:
-        for page_num, page in enumerate(pdf.pages, 1):
             page_text = page.extract_text()
-            if page_text:
                 text += page_text + "\n"
-            else:
-                st.warning(f"No extractable text found on page {page_num}. Using OCR...")
-                page_image = page.to_image().original
-                ocr_text = pytesseract.image_to_string(page_image)
-                if ocr_text.strip():
-                    text += ocr_text + "\n"
-                else:
-                    st.error(f"Even OCR couldn't extract text from page {page_num}.")
     return text
-# Function to process input for RAG model
-def generate_rag_response(user_input, data_chunks):
-    inputs = tokenizer([user_input], return_tensors="pt")
-    retrieved_docs = retriever(input_ids=inputs['input_ids'], n_docs=5)
-    outputs = model.generate(input_ids=inputs['input_ids'], context_input_ids=retrieved_docs['context_input_ids'])
-    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
-# Streamlit app
-st.title("CSV and PDF Chatbot with RAG")
 # CSV file upload
 csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
-csv_data = None
 if csv_file:
     csv_data = pd.read_csv(csv_file)
-    st.success("CSV loaded successfully!")
-    st.write("### CSV Data:")
     st.write(csv_data)
 # PDF file upload
 pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
-pdf_text = ""
-data_chunks = []
 if pdf_file:
     pdf_text = extract_text_from_pdf(pdf_file)
-    if not pdf_text.strip():
-        st.error("The extracted PDF text is empty. Please upload a PDF with extractable text.")
-    else:
         st.success("PDF loaded successfully!")
-        st.write("### Extracted Text:")
-        st.write(pdf_text)
-        # Split the extracted text into chunks for FAISS
-        data_chunks = pdf_text.split('\n')
-        st.write("### Extracted Chunks:")
-        for chunk in data_chunks[:5]:  # Display first 5 chunks
-            st.write(chunk)
 # User input for chatbot
-user_input = st.text_input("Ask a question about the CSV or PDF:")
 if st.button("Get Response"):
-    if csv_data is None and not data_chunks:
-        st.warning("Please upload both a CSV and PDF file first.")
-    elif not user_input.strip():
-        st.warning("Please enter a question.")
     else:
         try:
-            if csv_data is not None:
-                # Check if the query is related to CSV content
-                csv_response = csv_data[csv_data.apply(lambda row: row.astype(str).str.contains(user_input, case=False).any(), axis=1)]
-                if not csv_response.empty:
-                    st.write("### CSV Response:")
-                    st.write(csv_response)
-                else:
-                    st.write("No relevant data found in the CSV.")
-            if data_chunks:
-                # Generate response using RAG for PDF content
-                response = generate_rag_response(user_input, data_chunks)
-                st.write("### PDF Response:")
-                st.write(response)
         except Exception as e:
-            st.error(f"Error while processing user input: {e}")

 import streamlit as st
 import torch
+from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
+from datasets import load_dataset
+import pandas as pd
 import pdfplumber
+import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
+# Load RAG model, tokenizer, and retriever
+tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
+retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)
+model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)
+# Function to get RAG embeddings
+def get_rag_embeddings(question, context):
+    inputs = tokenizer(question, context, return_tensors="pt", truncation=True)
+    with torch.no_grad():
+        output = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
+    return tokenizer.batch_decode(output, skip_special_tokens=True)[0]
+# Extract text from PDF
 def extract_text_from_pdf(pdf_file):
     with pdfplumber.open(pdf_file) as pdf:
+        text = ""
+        for page in pdf.pages:
             page_text = page.extract_text()
+            if page_text:  # Check if the page has extractable text
                 text += page_text + "\n"
     return text
+# Load dataset (wiki_dpr) and set trust_remote_code=True
+def load_wiki_dpr():
+    return load_dataset('wiki_dpr', trust_remote_code=True)
+# Store the PDF text and embeddings
+pdf_text = ""
+pdf_embeddings = None
+csv_data = None
+# Streamlit app UI
+st.title("RAG-Powered PDF & CSV Chatbot")
 # CSV file upload
 csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
 if csv_file:
     csv_data = pd.read_csv(csv_file)
+    st.write("CSV file loaded successfully!")
     st.write(csv_data)
 # PDF file upload
 pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
 if pdf_file:
     pdf_text = extract_text_from_pdf(pdf_file)
+    if pdf_text.strip():
         st.success("PDF loaded successfully!")
+        st.text_area("Extracted Text from PDF", pdf_text, height=200)
+    else:
+        st.warning("No extractable text found in the PDF.")
 # User input for chatbot
+user_input = st.text_input("Ask a question related to the PDF or CSV:")
+# Get response on button click
 if st.button("Get Response"):
+    if not pdf_text and csv_data is None:
+        st.warning("Please upload a PDF or CSV file first.")
     else:
+        # Combine PDF text and CSV content for context in RAG
+        combined_context = ""
+        if pdf_text:
+            combined_context += pdf_text
+        if csv_data is not None:
+            combined_context += "\n" + csv_data.to_string()
+        # Get RAG-generated response
         try:
+            response = get_rag_embeddings(user_input, combined_context)
+            st.write("### Response:")
+            st.write(response)
         except Exception as e:
+            st.error(f"Error while processing the question: {e}")