Arthur Passuello committed on
Commit
0242f02
·
1 Parent(s): ad4bd4e

Trying to fix imports

Browse files
app.py CHANGED
@@ -29,6 +29,23 @@ os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
29
  os.environ["STREAMLIT_SERVER_ENABLE_CORS"] = "false"
30
  os.environ["STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION"] = "false"
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  # HuggingFace Spaces environment detection
33
  IS_HF_SPACES = os.getenv("SPACE_ID") is not None
34
  IS_LOCAL_DEV = not IS_HF_SPACES
@@ -114,6 +131,42 @@ def setup_environment_display(capabilities):
114
  return capabilities
115
 
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  def main():
118
  """Main application entry point with Epic 2 enhanced capabilities."""
119
 
@@ -125,6 +178,9 @@ def main():
125
  initial_sidebar_state="expanded"
126
  )
127
 
 
 
 
128
  # Check environment capabilities
129
  capabilities = check_environment_capabilities()
130
  setup_environment_display(capabilities)
 
29
  os.environ["STREAMLIT_SERVER_ENABLE_CORS"] = "false"
30
  os.environ["STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION"] = "false"
31
 
32
# HuggingFace Spaces environment detection.
# These flags must be defined BEFORE the cache setup below: cache_base
# branches on IS_HF_SPACES, and reading it ahead of its assignment raises
# NameError while the module is being imported.
IS_HF_SPACES = os.getenv("SPACE_ID") is not None
IS_LOCAL_DEV = not IS_HF_SPACES

# Setup cache directories for model downloads (HuggingFace Spaces compatible).
# On Spaces the cache lives under /tmp; locally it lives under the user's home.
cache_base = "/tmp/.cache" if IS_HF_SPACES else str(Path.home() / ".cache")

# setdefault keeps any value the deployment has already exported.
os.environ.setdefault("HF_HOME", f"{cache_base}/huggingface")
os.environ.setdefault("TRANSFORMERS_CACHE", f"{cache_base}/huggingface/transformers")
os.environ.setdefault("SENTENCE_TRANSFORMERS_HOME", f"{cache_base}/sentence-transformers")

# Create cache directories (best effort: a failure is logged, never raised).
# NOTE(review): assumes `logger` and `Path` are already defined/imported above
# this point in app.py - confirm, since this code runs at import time.
for cache_dir in [
    os.environ["HF_HOME"],
    os.environ["TRANSFORMERS_CACHE"],
    os.environ["SENTENCE_TRANSFORMERS_HOME"],
]:
    try:
        os.makedirs(cache_dir, exist_ok=True)
    except Exception as e:
        logger.warning(f"Could not create cache directory {cache_dir}: {e}")
 
131
  return capabilities
132
 
133
 
134
def setup_models_if_needed():
    """Check critical dependencies and make sure the spaCy model is present.

    First imports ``rank_bm25`` and ``pdfplumber``; if either import fails the
    error is logged, shown through ``st.error``/``st.info``, and ``st.stop()``
    is called. Then tries to load the ``en_core_web_sm`` spaCy model. When
    loading raises ``OSError`` a download is attempted in a subprocess with a
    300 second timeout; download problems are logged as warnings only.
    """
    try:
        # Imported purely to prove the packages are installed.
        import rank_bm25
        import pdfplumber
    except ImportError as missing:
        logger.error(f"❌ Critical dependency missing: {missing}")
        st.error(f"Critical dependency missing: {missing}")
        st.info("Please install missing packages with: pip install -r requirements.txt")
        st.stop()
        return

    logger.info("✅ Critical dependencies available")

    # Check if we need to setup spaCy model
    try:
        import spacy
        spacy.load("en_core_web_sm")
    except OSError:
        # Loading failed with OSError: fall through to the download attempt.
        pass
    except ImportError:
        logger.warning("⚠️ spaCy not available")
        return
    else:
        logger.info("✅ spaCy model available")
        return

    logger.info("📥 Setting up spaCy model...")
    try:
        import subprocess
        download_cmd = [sys.executable, "-m", "spacy", "download", "en_core_web_sm"]
        outcome = subprocess.run(download_cmd, capture_output=True, text=True, timeout=300)
        if outcome.returncode != 0:
            logger.warning("⚠️ spaCy model setup failed - entity extraction may be limited")
        else:
            logger.info("✅ spaCy model setup complete")
    except Exception as e:
        logger.warning(f"⚠️ spaCy model auto-setup failed: {e}")
168
+
169
+
170
  def main():
171
  """Main application entry point with Epic 2 enhanced capabilities."""
172
 
 
178
  initial_sidebar_state="expanded"
179
  )
180
 
181
+ # Setup models if needed
182
+ setup_models_if_needed()
183
+
184
  # Check environment capabilities
185
  capabilities = check_environment_capabilities()
186
  setup_environment_display(capabilities)
requirements.txt CHANGED
@@ -15,6 +15,7 @@ tokenizers>=0.13.0
15
 
16
  # Vector Search & Indexing
17
  faiss-cpu>=1.7.4
 
18
 
19
  # Document Processing (Essential only)
20
  PyMuPDF>=1.23.0
@@ -37,9 +38,12 @@ huggingface-hub>=0.16.0
37
  # Visualization (Lightweight)
38
  plotly>=5.17.0,<6.0.0
39
 
 
 
 
 
 
40
  # Optional dependencies for local development only
41
- # accelerate>=0.20.0 # Uncomment for local GPU acceleration
42
- pdfplumber>=0.10.0 # Uncomment for advanced PDF parsing
43
  # fastapi>=0.104.0 # Uncomment for API server
44
  # uvicorn>=0.24.0 # Uncomment for API server
45
  # ollama>=0.5.0 # Uncomment for local LLM
 
15
 
16
  # Vector Search & Indexing
17
  faiss-cpu>=1.7.4
18
+ weaviate-client>=3.15.0 # Alternative vector backend (optional but configured)
19
 
20
  # Document Processing (Essential only)
21
  PyMuPDF>=1.23.0
 
38
  # Visualization (Lightweight)
39
  plotly>=5.17.0,<6.0.0
40
 
41
+ # Core Dependencies (previously optional, now required)
42
+ accelerate>=0.20.0 # HuggingFace model optimization (required for cloud deployment)
43
+ pdfplumber>=0.10.0 # Advanced PDF parsing (required for document processing)
44
+ rank-bm25>=0.2.2 # BM25 sparse retrieval (required for Epic 2 hybrid search)
45
+
46
  # Optional dependencies for local development only
 
 
47
  # fastapi>=0.104.0 # Uncomment for API server
48
  # uvicorn>=0.24.0 # Uncomment for API server
49
  # ollama>=0.5.0 # Uncomment for local LLM
scripts/setup_models.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Model Setup Script for Enhanced RAG Demo
4
+
5
+ This script handles automatic downloading and setup of required models
6
+ for deployment environments like HuggingFace Spaces where models may not
7
+ be pre-installed.
8
+
9
+ Usage:
10
+ python scripts/setup_models.py
11
+
12
+ Environment Variables:
13
+ SKIP_MODEL_DOWNLOAD: Set to '1', 'true' or 'yes' to skip the whole setup run (package checks and model downloads)
14
+ SPACY_MODEL: Override default spaCy model (default: en_core_web_sm)
15
+ """
16
+
17
+ import os
18
+ import sys
19
+ import logging
20
+ import subprocess
21
+ import time
22
+ from pathlib import Path
23
+ from typing import List, Dict, Any, Optional
24
+
25
+ # Configure logging
26
+ logging.basicConfig(
27
+ level=logging.INFO,
28
+ format='%(asctime)s - %(levelname)s - %(message)s'
29
+ )
30
+ logger = logging.getLogger(__name__)
31
+
32
def check_spacy_model(model_name: str = "en_core_web_sm") -> bool:
    """
    Report whether a spaCy model can be loaded in this interpreter.

    Args:
        model_name: Name of the spaCy model to try loading

    Returns:
        True when ``spacy.load(model_name)`` succeeds; False when spaCy is
        not importable, the load raises OSError, or any other error occurs
        (each case is logged as a warning).
    """
    available = False
    try:
        import spacy
        spacy.load(model_name)
    except OSError:
        logger.warning(f"❌ spaCy model '{model_name}' not found")
    except ImportError:
        logger.warning("❌ spaCy not installed")
    except Exception as e:
        logger.warning(f"❌ Error checking spaCy model: {e}")
    else:
        logger.info(f"✅ spaCy model '{model_name}' is available")
        available = True
    return available
56
+
57
def download_spacy_model(model_name: str = "en_core_web_sm", timeout: int = 300) -> bool:
    """
    Download a spaCy model by running ``python -m spacy download``.

    The command runs with the current interpreter (``sys.executable``) and
    its output is captured rather than echoed.

    Args:
        model_name: Name of the spaCy model to download
        timeout: Maximum time to wait for the download, in seconds

    Returns:
        True when the command exits with code 0; False on a non-zero exit,
        a timeout, or any other error (all of which are logged).
    """
    command = [sys.executable, "-m", "spacy", "download", model_name]
    try:
        logger.info(f"📥 Downloading spaCy model '{model_name}'...")
        completed = subprocess.run(command, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        logger.error(f"❌ spaCy model download timed out after {timeout} seconds")
        return False
    except Exception as e:
        logger.error(f"❌ Error downloading spaCy model: {e}")
        return False

    if completed.returncode != 0:
        logger.error(f"❌ Failed to download spaCy model: {completed.stderr}")
        return False

    logger.info(f"✅ Successfully downloaded spaCy model '{model_name}'")
    return True
88
+
89
def setup_cache_directories() -> None:
    """
    Create the model cache directories, tolerating failures.

    Each location comes from an environment variable, falling back to a
    path under /tmp/.cache when the variable is unset. A directory that
    cannot be created is logged as a warning and skipped.
    """
    # (environment variable, fallback path) in the order they are created.
    locations = (
        ('TRANSFORMERS_CACHE', '/tmp/.cache/huggingface/transformers'),
        ('HF_HOME', '/tmp/.cache/huggingface'),
        ('SENTENCE_TRANSFORMERS_HOME', '/tmp/.cache/sentence-transformers'),
    )
    resolved = [os.environ.get(variable, fallback) for variable, fallback in locations]

    for cache_dir in resolved:
        try:
            os.makedirs(cache_dir, exist_ok=True)
        except Exception as e:
            logger.warning(f"⚠️ Could not create cache directory {cache_dir}: {e}")
        else:
            logger.info(f"📁 Created cache directory: {cache_dir}")
105
+
106
def validate_python_packages() -> Dict[str, bool]:
    """
    Try importing each package the demo relies on and record the result.

    Returns:
        Dictionary mapping each package name to True when its import
        succeeded and False when it raised ImportError.
    """
    # (name used in logs and in the result, module name passed to __import__)
    packages = (
        ('rank_bm25', 'rank_bm25'),
        ('pdfplumber', 'pdfplumber'),
        ('sentence_transformers', 'sentence_transformers'),
        ('transformers', 'transformers'),
        ('spacy', 'spacy'),
        ('huggingface_hub', 'huggingface_hub'),
        ('faiss', 'faiss'),
        ('accelerate', 'accelerate'),  # Optional but recommended
    )

    availability = {}
    for label, module_name in packages:
        try:
            __import__(module_name)
        except ImportError:
            availability[label] = False
            logger.error(f"❌ {label} is not installed")
        else:
            availability[label] = True
            logger.info(f"✅ {label} is available")

    return availability
136
+
137
def main() -> int:
    """
    Run the model setup sequence and report the outcome as an exit code.

    Steps, in order: validate Python packages, create cache directories,
    make sure the spaCy model is present (downloading it if needed), and
    smoke-test loading a sentence-transformers model. When the
    SKIP_MODEL_DOWNLOAD environment variable is '1', 'true' or 'yes'
    (case-insensitive) the function returns 0 before any step runs.

    Returns:
        Exit code: 0 for success, 1 when a critical package is missing.
        spaCy model and sentence-transformers problems are logged only and
        do not change the exit code.
    """
    logger.info("🚀 Starting Enhanced RAG Demo model setup...")

    # Check if model download should be skipped
    skip_download = os.environ.get('SKIP_MODEL_DOWNLOAD', '').lower() in ('1', 'true', 'yes')
    if skip_download:
        logger.info("⏭️ Skipping model downloads (SKIP_MODEL_DOWNLOAD set)")
        return 0

    success = True

    # 1. Validate Python packages
    logger.info("📦 Validating Python packages...")
    package_status = validate_python_packages()

    critical_packages = ['rank_bm25', 'pdfplumber', 'sentence_transformers', 'transformers', 'spacy']
    missing_critical = [pkg for pkg in critical_packages if not package_status.get(pkg, False)]

    if missing_critical:
        logger.error(f"❌ Critical packages missing: {', '.join(missing_critical)}")
        logger.error("Please install missing packages with: pip install -r requirements.txt")
        success = False

    # 2. Setup cache directories
    logger.info("📁 Setting up cache directories...")
    setup_cache_directories()

    # 3. Handle spaCy model
    spacy_model = os.environ.get('SPACY_MODEL', 'en_core_web_sm')
    logger.info(f"🔤 Checking spaCy model: {spacy_model}")

    if package_status.get('spacy', False):
        if not check_spacy_model(spacy_model):
            logger.info(f"📥 Attempting to download spaCy model '{spacy_model}'...")
            if not download_spacy_model(spacy_model):
                logger.error(f"❌ Failed to download spaCy model '{spacy_model}'")
                logger.warning("⚠️ Entity extraction features may be limited")
                # Don't fail completely - this is non-critical for basic functionality
    else:
        logger.warning("⚠️ spaCy not available - entity extraction will be disabled")

    # 4. Test model loading (basic validation)
    if package_status.get('sentence_transformers', False):
        try:
            logger.info("🧪 Testing sentence-transformers model loading...")
            from sentence_transformers import SentenceTransformer

            # Use the same cache location setup_cache_directories() prepared,
            # so a SENTENCE_TRANSFORMERS_HOME override is honoured instead of
            # always writing to the hard-coded /tmp path.
            st_cache = os.environ.get('SENTENCE_TRANSFORMERS_HOME', '/tmp/.cache/sentence-transformers')

            # Try to load a small model for validation
            model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', cache_folder=st_cache)
            logger.info("✅ sentence-transformers model loading successful")
            del model  # Free memory
        except Exception as e:
            logger.warning(f"⚠️ sentence-transformers model loading failed: {e}")

    if success:
        logger.info("🎉 Model setup completed successfully!")
        return 0
    else:
        logger.error("💥 Model setup encountered errors")
        return 1
203
+
204
+ if __name__ == "__main__":
205
+ exit_code = main()
206
+ sys.exit(exit_code)
src/components/embedders/models/sentence_transformer_model.py CHANGED
@@ -153,21 +153,51 @@ class SentenceTransformerModel(EmbeddingModel, ConfigurableEmbedderComponent):
153
  trust_remote_code=self.trust_remote_code
154
  )
155
  else:
156
- # Use default cache behavior with fallback
157
- try:
158
- model = SentenceTransformer(
159
- self.model_name,
160
- trust_remote_code=self.trust_remote_code
161
- )
162
- except Exception as e:
163
- # Fallback to explicit cache directory
164
- cache_dir = os.environ.get('SENTENCE_TRANSFORMERS_HOME', '/tmp/.cache/sentence-transformers')
165
- os.makedirs(cache_dir, exist_ok=True)
166
- model = SentenceTransformer(
167
- self.model_name,
168
- cache_folder=cache_dir,
169
- trust_remote_code=self.trust_remote_code
170
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
  # Move to device and set to eval mode
173
  model = model.to(self.device)
 
153
  trust_remote_code=self.trust_remote_code
154
  )
155
  else:
156
+ # Use default cache behavior with multiple fallbacks for cloud environments
157
+ cache_attempts = [
158
+ # Try default cache first
159
+ None,
160
+ # HuggingFace Spaces compatible paths
161
+ os.environ.get('SENTENCE_TRANSFORMERS_HOME', '/tmp/.cache/sentence-transformers'),
162
+ '/tmp/.cache/sentence-transformers',
163
+ '/app/.cache/sentence-transformers', # Common in containerized environments
164
+ './models/cache', # Local fallback
165
+ '/tmp/models' # Final fallback
166
+ ]
167
+
168
+ model = None
169
+ last_error = None
170
+
171
+ for cache_dir in cache_attempts:
172
+ try:
173
+ if cache_dir:
174
+ # Ensure cache directory exists and is writable
175
+ os.makedirs(cache_dir, exist_ok=True)
176
+ # Test if directory is writable
177
+ test_file = os.path.join(cache_dir, '.write_test')
178
+ with open(test_file, 'w') as f:
179
+ f.write('test')
180
+ os.remove(test_file)
181
+
182
+ model = SentenceTransformer(
183
+ self.model_name,
184
+ cache_folder=cache_dir,
185
+ trust_remote_code=self.trust_remote_code
186
+ )
187
+ else:
188
+ model = SentenceTransformer(
189
+ self.model_name,
190
+ trust_remote_code=self.trust_remote_code
191
+ )
192
+ break # Success - exit loop
193
+
194
+ except (OSError, PermissionError, Exception) as e:
195
+ last_error = e
196
+ logger.warning(f"Cache attempt failed for {cache_dir}: {e}")
197
+ continue
198
+
199
+ if model is None:
200
+ raise RuntimeError(f"Failed to load model with any cache configuration. Last error: {last_error}")
201
 
202
  # Move to device and set to eval mode
203
  model = model.to(self.device)
src/components/retrievers/graph/entity_extraction.py CHANGED
@@ -115,10 +115,35 @@ class EntityExtractor:
115
 
116
  except OSError as e:
117
  if "Can't find model" in str(e):
118
- raise EntityExtractionError(
119
- f"spaCy model '{self.config.model}' not found. "
120
- f"Install with: python -m spacy download {self.config.model}"
121
- ) from e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  else:
123
  raise EntityExtractionError(f"Failed to load spaCy model: {str(e)}") from e
124
  except Exception as e:
 
115
 
116
  except OSError as e:
117
  if "Can't find model" in str(e):
118
+ logger.warning(f"spaCy model '{self.config.model}' not found. Attempting to download...")
119
+ try:
120
+ # Attempt to download the model
121
+ import subprocess
122
+ import sys
123
+
124
+ result = subprocess.run([
125
+ sys.executable, "-m", "spacy", "download", self.config.model
126
+ ], capture_output=True, text=True, timeout=300) # 5 minute timeout
127
+
128
+ if result.returncode == 0:
129
+ logger.info(f"Successfully downloaded spaCy model '{self.config.model}'")
130
+ # Try loading again
131
+ self.nlp = spacy.load(self.config.model)
132
+ logger.info(f"Loaded spaCy model: {self.config.model}")
133
+ else:
134
+ logger.error(f"Failed to download spaCy model: {result.stderr}")
135
+ raise EntityExtractionError(
136
+ f"spaCy model '{self.config.model}' not found and auto-download failed. "
137
+ f"Manual install: python -m spacy download {self.config.model}"
138
+ ) from e
139
+
140
+ except (subprocess.TimeoutExpired, subprocess.CalledProcessError, Exception) as download_error:
141
+ logger.error(f"Model download failed: {download_error}")
142
+ raise EntityExtractionError(
143
+ f"spaCy model '{self.config.model}' not found and auto-download failed. "
144
+ f"Manual install: python -m spacy download {self.config.model}. "
145
+ f"Download error: {download_error}"
146
+ ) from e
147
  else:
148
  raise EntityExtractionError(f"Failed to load spaCy model: {str(e)}") from e
149
  except Exception as e:
src/core/component_factory.py CHANGED
@@ -147,7 +147,36 @@ class ComponentFactory:
147
  return component_class
148
 
149
  except (ImportError, AttributeError) as e:
150
- raise ImportError(f"Failed to import {module_path}: {e}") from e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
  @classmethod
153
  def get_performance_metrics(cls) -> Dict[str, Dict[str, Any]]:
 
147
  return component_class
148
 
149
  except (ImportError, AttributeError) as e:
150
+ # Enhanced error handling with dependency suggestions
151
+ error_msg = str(e)
152
+ suggestions = []
153
+
154
+ # Check for specific missing dependencies and provide actionable suggestions
155
+ if "rank_bm25" in error_msg or "BM25" in error_msg:
156
+ suggestions.append("Install rank-bm25: pip install rank-bm25>=0.2.2")
157
+ elif "pdfplumber" in error_msg:
158
+ suggestions.append("Install pdfplumber: pip install pdfplumber>=0.10.0")
159
+ elif "sentence_transformers" in error_msg:
160
+ suggestions.append("Install sentence-transformers: pip install sentence-transformers>=2.2.0")
161
+ elif "transformers" in error_msg:
162
+ suggestions.append("Install transformers: pip install transformers>=4.30.0")
163
+ elif "spacy" in error_msg:
164
+ suggestions.append("Install spacy: pip install spacy>=3.7.0")
165
+ suggestions.append("Download spacy model: python -m spacy download en_core_web_sm")
166
+ elif "weaviate" in error_msg:
167
+ suggestions.append("Install weaviate-client: pip install weaviate-client>=3.15.0")
168
+ elif "huggingface_hub" in error_msg:
169
+ suggestions.append("Install huggingface-hub: pip install huggingface-hub>=0.16.0")
170
+ elif "accelerate" in error_msg:
171
+ suggestions.append("Install accelerate: pip install accelerate>=0.20.0")
172
+
173
+ enhanced_msg = f"Failed to import {module_path}: {e}"
174
+ if suggestions:
175
+ enhanced_msg += f"\n\nSuggested fixes:\n" + "\n".join(f" - {s}" for s in suggestions)
176
+ enhanced_msg += f"\n\nAlternatively, install all requirements: pip install -r requirements.txt"
177
+
178
+ logger.error(enhanced_msg)
179
+ raise ImportError(enhanced_msg) from e
180
 
181
  @classmethod
182
  def get_performance_metrics(cls) -> Dict[str, Dict[str, Any]]: