sagar008 committed on
Commit
3a757d8
·
verified ·
1 Parent(s): 2bdc136

Update vector_store.py

Browse files
Files changed (1) hide show
  1. vector_store.py +46 -30
vector_store.py CHANGED
@@ -9,6 +9,7 @@ from langchain_pinecone import PineconeVectorStore
9
  from langchain.embeddings.base import Embeddings
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
 
 
12
  class InLegalBERTEmbeddings(Embeddings):
13
  """Custom LangChain embeddings wrapper for InLegalBERT"""
14
 
@@ -23,6 +24,7 @@ class InLegalBERTEmbeddings(Embeddings):
23
  """Embed a single query"""
24
  return self.model.encode([text])[0].tolist()
25
 
 
26
  class LegalDocumentVectorStore:
27
  """Manages vector storage for legal documents"""
28
 
@@ -59,27 +61,44 @@ class LegalDocumentVectorStore:
59
  print(f"✅ Created Pinecone index: {self.index_name}")
60
 
61
  self._initialized = True
62
-
63
- def save_document_embeddings_optimized(self, chunk_data: List[Dict], document_id: str,
64
- analysis_results: Dict[str, Any]) -> bool:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  """Save embeddings using pre-computed vectors with proper text storage"""
66
  try:
67
  self._initialize_pinecone()
68
-
69
- # Use pre-computed embeddings instead of re-generating
70
- texts = [chunk["text"] for chunk in chunk_data]
71
- embeddings = [chunk["embedding"].tolist() for chunk in chunk_data if chunk["embedding"] is not None]
72
-
73
- if not embeddings:
74
  print("⚠️ No embeddings found in chunk_data")
75
  return False
76
-
77
- # Prepare vectors with BOTH metadata AND text content
78
  vectors = []
79
- for i, chunk_info in enumerate(chunk_data):
80
- if chunk_info["embedding"] is None:
 
81
  continue
82
-
83
  metadata = {
84
  'document_id': document_id,
85
  'chunk_index': i,
@@ -91,20 +110,20 @@ class LegalDocumentVectorStore:
91
  'timestamp': str(np.datetime64('now')),
92
  'text': chunk_info["text"] # Store text in metadata for retrieval
93
  }
94
-
95
  vectors.append({
96
  "id": f"{document_id}_chunk_{i}",
97
- "values": chunk_info["embedding"].tolist(),
98
  "metadata": metadata
99
  })
100
-
101
- # Add to Pinecone using pre-computed embeddings
102
  index = self.pc.Index(self.index_name)
103
  index.upsert(vectors=vectors)
104
-
105
  print(f"✅ Saved {len(vectors)} pre-computed embeddings with text to Pinecone")
106
  return True
107
-
108
  except Exception as e:
109
  print(f"❌ Error saving pre-computed embeddings: {e}")
110
  return False
@@ -113,33 +132,30 @@ class LegalDocumentVectorStore:
113
  """Get retriever for chat functionality with improved settings"""
114
  try:
115
  self._initialize_pinecone()
116
-
117
  legal_embeddings = InLegalBERTEmbeddings(clause_tagger.embedding_model)
118
  index = self.pc.Index(self.index_name)
119
-
120
  vectorstore = PineconeVectorStore(
121
  index=index,
122
  embedding=legal_embeddings,
123
  text_key="text" # Use text stored in metadata
124
  )
125
-
126
  # Configure search parameters
127
- search_kwargs = {
128
- 'k': 10 # Increased from default 5 for better context
129
- }
130
-
131
  if document_id:
132
  search_kwargs['filter'] = {'document_id': document_id}
133
-
134
- # Use similarity search without threshold initially
135
  return vectorstore.as_retriever(
136
  search_type="similarity",
137
  search_kwargs=search_kwargs
138
  )
139
-
140
  except Exception as e:
141
  print(f"❌ Error creating retriever: {e}")
142
  return None
143
 
 
144
  # Global instance
145
  vector_store = LegalDocumentVectorStore()
 
9
  from langchain.embeddings.base import Embeddings
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
 
12
+
13
  class InLegalBERTEmbeddings(Embeddings):
14
  """Custom LangChain embeddings wrapper for InLegalBERT"""
15
 
 
24
  """Embed a single query"""
25
  return self.model.encode([text])[0].tolist()
26
 
27
+
28
  class LegalDocumentVectorStore:
29
  """Manages vector storage for legal documents"""
30
 
 
61
  print(f"✅ Created Pinecone index: {self.index_name}")
62
 
63
  self._initialized = True
64
+
65
+ def _normalize_embedding(self, embedding):
66
+ """Ensure embedding is always a list of floats"""
67
+ if embedding is None:
68
+ return None
69
+ if isinstance(embedding, np.ndarray):
70
+ return embedding.tolist()
71
+ if isinstance(embedding, list):
72
+ # Already a Python list
73
+ return embedding
74
+ # Fallback: try converting if it's a torch tensor or similar
75
+ try:
76
+ return embedding.tolist()
77
+ except Exception:
78
+ return list(embedding)
79
+
80
+ def save_document_embeddings_optimized(
81
+ self,
82
+ chunk_data: List[Dict],
83
+ document_id: str,
84
+ analysis_results: Dict[str, Any]
85
+ ) -> bool:
86
  """Save embeddings using pre-computed vectors with proper text storage"""
87
  try:
88
  self._initialize_pinecone()
89
+
90
+ # Normalize embeddings safely
91
+ valid_chunks = [c for c in chunk_data if c.get("embedding") is not None]
92
+ if not valid_chunks:
 
 
93
  print("⚠️ No embeddings found in chunk_data")
94
  return False
95
+
 
96
  vectors = []
97
+ for i, chunk_info in enumerate(valid_chunks):
98
+ normalized_embedding = self._normalize_embedding(chunk_info["embedding"])
99
+ if normalized_embedding is None:
100
  continue
101
+
102
  metadata = {
103
  'document_id': document_id,
104
  'chunk_index': i,
 
110
  'timestamp': str(np.datetime64('now')),
111
  'text': chunk_info["text"] # Store text in metadata for retrieval
112
  }
113
+
114
  vectors.append({
115
  "id": f"{document_id}_chunk_{i}",
116
+ "values": normalized_embedding,
117
  "metadata": metadata
118
  })
119
+
120
+ # Add to Pinecone
121
  index = self.pc.Index(self.index_name)
122
  index.upsert(vectors=vectors)
123
+
124
  print(f"✅ Saved {len(vectors)} pre-computed embeddings with text to Pinecone")
125
  return True
126
+
127
  except Exception as e:
128
  print(f"❌ Error saving pre-computed embeddings: {e}")
129
  return False
 
132
  """Get retriever for chat functionality with improved settings"""
133
  try:
134
  self._initialize_pinecone()
135
+
136
  legal_embeddings = InLegalBERTEmbeddings(clause_tagger.embedding_model)
137
  index = self.pc.Index(self.index_name)
138
+
139
  vectorstore = PineconeVectorStore(
140
  index=index,
141
  embedding=legal_embeddings,
142
  text_key="text" # Use text stored in metadata
143
  )
144
+
145
  # Configure search parameters
146
+ search_kwargs = {'k': 10}
 
 
 
147
  if document_id:
148
  search_kwargs['filter'] = {'document_id': document_id}
149
+
 
150
  return vectorstore.as_retriever(
151
  search_type="similarity",
152
  search_kwargs=search_kwargs
153
  )
154
+
155
  except Exception as e:
156
  print(f"❌ Error creating retriever: {e}")
157
  return None
158
 
159
+
160
  # Global instance
161
  vector_store = LegalDocumentVectorStore()