Update vector_store.py
Browse files- vector_store.py +46 -30
vector_store.py
CHANGED
|
@@ -9,6 +9,7 @@ from langchain_pinecone import PineconeVectorStore
|
|
| 9 |
from langchain.embeddings.base import Embeddings
|
| 10 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 11 |
|
|
|
|
| 12 |
class InLegalBERTEmbeddings(Embeddings):
|
| 13 |
"""Custom LangChain embeddings wrapper for InLegalBERT"""
|
| 14 |
|
|
@@ -23,6 +24,7 @@ class InLegalBERTEmbeddings(Embeddings):
|
|
| 23 |
"""Embed a single query"""
|
| 24 |
return self.model.encode([text])[0].tolist()
|
| 25 |
|
|
|
|
| 26 |
class LegalDocumentVectorStore:
|
| 27 |
"""Manages vector storage for legal documents"""
|
| 28 |
|
|
@@ -59,27 +61,44 @@ class LegalDocumentVectorStore:
|
|
| 59 |
print(f"✅ Created Pinecone index: {self.index_name}")
|
| 60 |
|
| 61 |
self._initialized = True
|
| 62 |
-
|
| 63 |
-
def
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
"""Save embeddings using pre-computed vectors with proper text storage"""
|
| 66 |
try:
|
| 67 |
self._initialize_pinecone()
|
| 68 |
-
|
| 69 |
-
#
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
if not embeddings:
|
| 74 |
print("⚠️ No embeddings found in chunk_data")
|
| 75 |
return False
|
| 76 |
-
|
| 77 |
-
# Prepare vectors with BOTH metadata AND text content
|
| 78 |
vectors = []
|
| 79 |
-
for i, chunk_info in enumerate(
|
| 80 |
-
|
|
|
|
| 81 |
continue
|
| 82 |
-
|
| 83 |
metadata = {
|
| 84 |
'document_id': document_id,
|
| 85 |
'chunk_index': i,
|
|
@@ -91,20 +110,20 @@ class LegalDocumentVectorStore:
|
|
| 91 |
'timestamp': str(np.datetime64('now')),
|
| 92 |
'text': chunk_info["text"] # Store text in metadata for retrieval
|
| 93 |
}
|
| 94 |
-
|
| 95 |
vectors.append({
|
| 96 |
"id": f"{document_id}_chunk_{i}",
|
| 97 |
-
"values":
|
| 98 |
"metadata": metadata
|
| 99 |
})
|
| 100 |
-
|
| 101 |
-
# Add to Pinecone
|
| 102 |
index = self.pc.Index(self.index_name)
|
| 103 |
index.upsert(vectors=vectors)
|
| 104 |
-
|
| 105 |
print(f"✅ Saved {len(vectors)} pre-computed embeddings with text to Pinecone")
|
| 106 |
return True
|
| 107 |
-
|
| 108 |
except Exception as e:
|
| 109 |
print(f"❌ Error saving pre-computed embeddings: {e}")
|
| 110 |
return False
|
|
@@ -113,33 +132,30 @@ class LegalDocumentVectorStore:
|
|
| 113 |
"""Get retriever for chat functionality with improved settings"""
|
| 114 |
try:
|
| 115 |
self._initialize_pinecone()
|
| 116 |
-
|
| 117 |
legal_embeddings = InLegalBERTEmbeddings(clause_tagger.embedding_model)
|
| 118 |
index = self.pc.Index(self.index_name)
|
| 119 |
-
|
| 120 |
vectorstore = PineconeVectorStore(
|
| 121 |
index=index,
|
| 122 |
embedding=legal_embeddings,
|
| 123 |
text_key="text" # Use text stored in metadata
|
| 124 |
)
|
| 125 |
-
|
| 126 |
# Configure search parameters
|
| 127 |
-
search_kwargs = {
|
| 128 |
-
'k': 10 # Increased from default 5 for better context
|
| 129 |
-
}
|
| 130 |
-
|
| 131 |
if document_id:
|
| 132 |
search_kwargs['filter'] = {'document_id': document_id}
|
| 133 |
-
|
| 134 |
-
# Use similarity search without threshold initially
|
| 135 |
return vectorstore.as_retriever(
|
| 136 |
search_type="similarity",
|
| 137 |
search_kwargs=search_kwargs
|
| 138 |
)
|
| 139 |
-
|
| 140 |
except Exception as e:
|
| 141 |
print(f"❌ Error creating retriever: {e}")
|
| 142 |
return None
|
| 143 |
|
|
|
|
| 144 |
# Global instance
|
| 145 |
vector_store = LegalDocumentVectorStore()
|
|
|
|
| 9 |
from langchain.embeddings.base import Embeddings
|
| 10 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 11 |
|
| 12 |
+
|
| 13 |
class InLegalBERTEmbeddings(Embeddings):
|
| 14 |
"""Custom LangChain embeddings wrapper for InLegalBERT"""
|
| 15 |
|
|
|
|
| 24 |
"""Embed a single query"""
|
| 25 |
return self.model.encode([text])[0].tolist()
|
| 26 |
|
| 27 |
+
|
| 28 |
class LegalDocumentVectorStore:
|
| 29 |
"""Manages vector storage for legal documents"""
|
| 30 |
|
|
|
|
| 61 |
print(f"✅ Created Pinecone index: {self.index_name}")
|
| 62 |
|
| 63 |
self._initialized = True
|
| 64 |
+
|
| 65 |
+
def _normalize_embedding(self, embedding):
|
| 66 |
+
"""Ensure embedding is always a list of floats"""
|
| 67 |
+
if embedding is None:
|
| 68 |
+
return None
|
| 69 |
+
if isinstance(embedding, np.ndarray):
|
| 70 |
+
return embedding.tolist()
|
| 71 |
+
if isinstance(embedding, list):
|
| 72 |
+
# Already a Python list
|
| 73 |
+
return embedding
|
| 74 |
+
# Fallback: try converting if it's a torch tensor or similar
|
| 75 |
+
try:
|
| 76 |
+
return embedding.tolist()
|
| 77 |
+
except Exception:
|
| 78 |
+
return list(embedding)
|
| 79 |
+
|
| 80 |
+
def save_document_embeddings_optimized(
|
| 81 |
+
self,
|
| 82 |
+
chunk_data: List[Dict],
|
| 83 |
+
document_id: str,
|
| 84 |
+
analysis_results: Dict[str, Any]
|
| 85 |
+
) -> bool:
|
| 86 |
"""Save embeddings using pre-computed vectors with proper text storage"""
|
| 87 |
try:
|
| 88 |
self._initialize_pinecone()
|
| 89 |
+
|
| 90 |
+
# Normalize embeddings safely
|
| 91 |
+
valid_chunks = [c for c in chunk_data if c.get("embedding") is not None]
|
| 92 |
+
if not valid_chunks:
|
|
|
|
|
|
|
| 93 |
print("⚠️ No embeddings found in chunk_data")
|
| 94 |
return False
|
| 95 |
+
|
|
|
|
| 96 |
vectors = []
|
| 97 |
+
for i, chunk_info in enumerate(valid_chunks):
|
| 98 |
+
normalized_embedding = self._normalize_embedding(chunk_info["embedding"])
|
| 99 |
+
if normalized_embedding is None:
|
| 100 |
continue
|
| 101 |
+
|
| 102 |
metadata = {
|
| 103 |
'document_id': document_id,
|
| 104 |
'chunk_index': i,
|
|
|
|
| 110 |
'timestamp': str(np.datetime64('now')),
|
| 111 |
'text': chunk_info["text"] # Store text in metadata for retrieval
|
| 112 |
}
|
| 113 |
+
|
| 114 |
vectors.append({
|
| 115 |
"id": f"{document_id}_chunk_{i}",
|
| 116 |
+
"values": normalized_embedding,
|
| 117 |
"metadata": metadata
|
| 118 |
})
|
| 119 |
+
|
| 120 |
+
# Add to Pinecone
|
| 121 |
index = self.pc.Index(self.index_name)
|
| 122 |
index.upsert(vectors=vectors)
|
| 123 |
+
|
| 124 |
print(f"✅ Saved {len(vectors)} pre-computed embeddings with text to Pinecone")
|
| 125 |
return True
|
| 126 |
+
|
| 127 |
except Exception as e:
|
| 128 |
print(f"❌ Error saving pre-computed embeddings: {e}")
|
| 129 |
return False
|
|
|
|
| 132 |
"""Get retriever for chat functionality with improved settings"""
|
| 133 |
try:
|
| 134 |
self._initialize_pinecone()
|
| 135 |
+
|
| 136 |
legal_embeddings = InLegalBERTEmbeddings(clause_tagger.embedding_model)
|
| 137 |
index = self.pc.Index(self.index_name)
|
| 138 |
+
|
| 139 |
vectorstore = PineconeVectorStore(
|
| 140 |
index=index,
|
| 141 |
embedding=legal_embeddings,
|
| 142 |
text_key="text" # Use text stored in metadata
|
| 143 |
)
|
| 144 |
+
|
| 145 |
# Configure search parameters
|
| 146 |
+
search_kwargs = {'k': 10}
|
|
|
|
|
|
|
|
|
|
| 147 |
if document_id:
|
| 148 |
search_kwargs['filter'] = {'document_id': document_id}
|
| 149 |
+
|
|
|
|
| 150 |
return vectorstore.as_retriever(
|
| 151 |
search_type="similarity",
|
| 152 |
search_kwargs=search_kwargs
|
| 153 |
)
|
| 154 |
+
|
| 155 |
except Exception as e:
|
| 156 |
print(f"❌ Error creating retriever: {e}")
|
| 157 |
return None
|
| 158 |
|
| 159 |
+
|
| 160 |
# Global instance
|
| 161 |
vector_store = LegalDocumentVectorStore()
|