nivakaran committed on
Commit eb6b502 · verified · 1 Parent(s): 752f5cc

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -34,3 +34,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  ModelX[[:space:]]Final[[:space:]]Problem.pdf filter=lfs diff=lfs merge=lfs -text
 
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  ModelX[[:space:]]Final[[:space:]]Problem.pdf filter=lfs diff=lfs merge=lfs -text
37
+ trending_detection_visualization.png filter=lfs diff=lfs merge=lfs -text
38
+ vectorizer_anomaly_visualization.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -345,7 +345,7 @@ DELETE /api/intel/config/remove?target_type=profile&value=CompetitorX&platform=t
345
  ---
346
 
347
  ### 8. Vectorization Agent Graph (`vectorizationAgentGraph.py`) 🆕
348
- **Multilingual Text-to-Vector Conversion + Anomaly Detection**
349
 
350
  ```
351
  ┌─────────────────────────────────────────────────┐
@@ -366,28 +366,42 @@ DELETE /api/intel/config/remove?target_type=profile&value=CompetitorX&platform=t
366
 
367
 
368
  ┌─────────────────────────────────────────────────┐
369
- │ Step 3: Anomaly Detection (Isolation Forest) 🆕
370
- │ - Runs inference on every graph cycle
 
371
  │ - Outputs anomaly_score (0-1) │
372
- │ - Graceful fallback if model not trained │
373
  └─────────────────┬───────────────────────────────┘
374
 
375
 
376
  ┌─────────────────────────────────────────────────┐
377
- │ Step 4: Expert Summary (GroqLLM)
378
- │ - Opportunity identification
379
- │ - Threat detection
380
  │ - Sentiment analysis │
381
  └─────────────────┬───────────────────────────────┘
382
 
383
 
384
  ┌─────────────────────────────────────────────────┐
385
- │ Step 5: Format Output │
386
- │ - Includes anomaly insights in domain_insights
387
- │ - Passes results to parent graph │
388
  └─────────────────────────────────────────────────┘
389
  ```
390
 
391
  ---
392
 
393
  ### 10. Weather Prediction Pipeline (`models/weather-prediction/`) 🆕
 
345
  ---
346
 
347
  ### 8. Vectorization Agent Graph (`vectorizationAgentGraph.py`) 🆕
348
+ **6-Step Multilingual NLP Pipeline with Anomaly + Trending Detection**
349
 
350
  ```
351
  ┌─────────────────────────────────────────────────┐
 
366
 
367
 
368
  ┌─────────────────────────────────────────────────┐
369
+ │ Step 3: Anomaly Detection (Isolation Forest)
370
+ │ - English: ML model inference
371
+ │ - Sinhala/Tamil: Skipped (incompatible vectors) │
372
  │ - Outputs anomaly_score (0-1) │
 
373
  └─────────────────┬───────────────────────────────┘
374
 
375
 
376
  ┌─────────────────────────────────────────────────┐
377
+ │ Step 4: Trending Detection 🆕
378
+ │ - Entity extraction (hashtags, proper nouns)
379
+ │ - Momentum: current_hour / avg_last_6_hours
380
+ │ - Spike alerts when momentum > 3x │
381
+ └─────────────────┬───────────────────────────────┘
382
+
383
+
384
+ ┌─────────────────────────────────────────────────┐
385
+ │ Step 5: Expert Summary (GroqLLM) │
386
+ │ - Opportunity & threat identification │
387
  │ - Sentiment analysis │
388
  └─────────────────┬───────────────────────────────┘
389
 
390
 
391
  ┌─────────────────────────────────────────────────┐
392
+ │ Step 6: Format Output │
393
+ │ - Includes anomaly + trending in domain_insights│
 
394
  └─────────────────────────────────────────────────┘
395
  ```
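Step 4's momentum rule is the core of the new trending logic: the current hour's mention count is divided by the average over the previous six hours, and a spike is flagged when that ratio exceeds 3x. A minimal sketch of that calculation (illustrative only; the actual implementation lives in `src/utils/trending_detector.py`, which is not shown in this diff):

```python
# Minimal sketch of the Step 4 momentum/spike rule described above (assumption:
# hourly counts are already tracked; not the project's actual code).
from collections import deque
from statistics import mean

def momentum(hourly_counts, current_hour_count):
    """current_hour / avg_last_6_hours; baseline floored at 1 to avoid division by zero."""
    baseline = mean(hourly_counts) if hourly_counts else 1.0
    return current_hour_count / max(baseline, 1.0)

def is_spike(m, threshold=3.0):
    return m > threshold

history = deque([2, 3, 1, 2, 2, 2], maxlen=6)   # mentions in each of the last 6 hours
m = momentum(history, current_hour_count=9)      # 9 / 2.0 = 4.5
print(round(m, 2), is_spike(m))                  # 4.5 True
```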
396
 
397
+ **Trending Detection API Endpoints:**
398
+
399
+ | Endpoint | Method | Description |
400
+ |----------|--------|-------------|
401
+ | `/api/trending` | GET | Get trending topics & spike alerts |
402
+ | `/api/trending/topic/{topic}` | GET | Get hourly history for a topic |
403
+ | `/api/trending/record` | POST | Record a topic mention (testing) |
404
+
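A quick way to exercise these endpoints locally (the base URL and port are assumptions; the parameters match the handlers added in `main.py` below):

```python
import requests

BASE = "http://localhost:8000"   # assumed local FastAPI host/port

# Record a few mentions via the testing endpoint, then read trending data back.
for _ in range(4):
    requests.post(f"{BASE}/api/trending/record",
                  params={"topic": "earthquake", "source": "manual", "domain": "general"})

trending = requests.get(f"{BASE}/api/trending", params={"limit": 10}).json()
print(trending["total_trending"], trending["spike_alerts"])

history = requests.get(f"{BASE}/api/trending/topic/earthquake", params={"hours": 24}).json()
print(history["momentum"], history["is_spike"])
```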
405
  ---
406
 
407
  ### 10. Weather Prediction Pipeline (`models/weather-prediction/`) 🆕
create_visualization.py ADDED
@@ -0,0 +1,120 @@
1
+ """
2
+ create_visualization.py
3
+ Creates visualization of multilingual embeddings and anomaly detection results
4
+ using actual training data.
5
+ """
6
+ import sys
7
+ import numpy as np
8
+ import matplotlib
9
+ matplotlib.use('Agg')
10
+ import matplotlib.pyplot as plt
11
+ from pathlib import Path
12
+ from sklearn.decomposition import PCA
13
+ import joblib
14
+
15
+ if sys.platform == 'win32':
16
+ sys.stdout.reconfigure(encoding='utf-8')
17
+
18
+ print("=" * 60)
19
+ print(" CREATING VECTORIZER VISUALIZATION")
20
+ print("=" * 60)
21
+
22
+ # Load saved embeddings from the training pipeline
23
+ embeddings_path = Path('models/anomaly-detection/artifacts/data_transformation')
24
+
25
+ # Find the latest embeddings file
26
+ emb_files = list(embeddings_path.glob('embeddings_*.npy'))
27
+ if emb_files:
28
+ latest_emb = sorted(emb_files)[-1]
29
+ embeddings = np.load(latest_emb)
30
+ print(f'Loaded embeddings: {embeddings.shape}')
31
+ else:
32
+ print('No embeddings found')
33
+ sys.exit(1)
34
+
35
+ # Load transformed data to get language info
36
+ import pandas as pd
37
+ data_files = list(embeddings_path.glob('transformed_*.parquet'))
38
+ if data_files:
39
+ latest_data = sorted(data_files)[-1]
40
+ df = pd.read_parquet(latest_data)
41
+ languages = df['language'].values
42
+ lang_counts = df['language'].value_counts().to_dict()
43
+ print(f'Languages: {lang_counts}')
44
+ else:
45
+ languages = ['english'] * len(embeddings)
46
+ lang_counts = {'english': len(embeddings)}
47
+
48
+ # Load anomaly model and predict
49
+ model_path = Path('models/anomaly-detection/artifacts/model_trainer/isolation_forest_embeddings_only.joblib')
50
+ model = joblib.load(model_path)
51
+ predictions = model.predict(embeddings)
52
+ anomaly_mask = predictions == -1
53
+
54
+ print(f'Total samples: {len(embeddings)}')
55
+ print(f'Anomalies detected: {anomaly_mask.sum()}')
56
+ print(f'Normal samples: {(~anomaly_mask).sum()}')
57
+
58
+ # PCA for visualization
59
+ print('\nRunning PCA...')
60
+ pca = PCA(n_components=2)
61
+ X_2d = pca.fit_transform(embeddings)
62
+ print(f'Explained variance: {pca.explained_variance_ratio_.sum():.2%}')
63
+
64
+ # Create figure with 2 subplots
65
+ fig, axes = plt.subplots(1, 2, figsize=(14, 6))
66
+
67
+ # Plot 1: By Language
68
+ ax1 = axes[0]
69
+ colors = {'english': '#3498db', 'sinhala': '#2ecc71', 'tamil': '#e74c3c', 'unknown': '#95a5a6'}
70
+
71
+ for lang in colors:
72
+ mask = np.array(languages) == lang
73
+ if mask.any():
74
+ ax1.scatter(X_2d[mask, 0], X_2d[mask, 1],
75
+ c=colors[lang], label=f'{lang.capitalize()} ({mask.sum()})',
76
+ alpha=0.7, s=60, edgecolors='white', linewidth=0.5)
77
+
78
+ ax1.set_title('Text Embeddings by Language (PCA Projection)', fontsize=14, fontweight='bold')
79
+ ax1.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
80
+ ax1.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
81
+ ax1.legend(loc='best', framealpha=0.9)
82
+ ax1.grid(True, alpha=0.3)
83
+
84
+ # Plot 2: Anomalies
85
+ ax2 = axes[1]
86
+ normal_mask = ~anomaly_mask
87
+
88
+ # Plot normal points first (so anomalies are on top)
89
+ ax2.scatter(X_2d[normal_mask, 0], X_2d[normal_mask, 1],
90
+ c='#3498db', label=f'Normal ({normal_mask.sum()})', alpha=0.6, s=60,
91
+ edgecolors='white', linewidth=0.5)
92
+
93
+ # Plot anomalies with X markers
94
+ ax2.scatter(X_2d[anomaly_mask, 0], X_2d[anomaly_mask, 1],
95
+ c='#e74c3c', marker='X', label=f'Anomaly ({anomaly_mask.sum()})',
96
+ alpha=0.9, s=120, edgecolors='black', linewidth=0.5)
97
+
98
+ ax2.set_title('Anomaly Detection Results (Isolation Forest)', fontsize=14, fontweight='bold')
99
+ ax2.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
100
+ ax2.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
101
+ ax2.legend(loc='best', framealpha=0.9)
102
+ ax2.grid(True, alpha=0.3)
103
+
104
+ plt.tight_layout()
105
+ output_path = 'vectorizer_anomaly_visualization.png'
106
+ plt.savefig(output_path, dpi=150, bbox_inches='tight', facecolor='white')
107
+ print(f'\nVisualization saved: {output_path}')
108
+
109
+ # Also create the visualization in artifacts dir
110
+ artifacts_dir = Path(r'C:\Users\LENOVO\.gemini\antigravity\brain\b892f63f-afbc-4c4a-bbf1-37195faf04a5')
111
+ if artifacts_dir.exists():
112
+ artifacts_output = artifacts_dir / 'vectorizer_visualization.png'
113
+ plt.savefig(str(artifacts_output), dpi=150, bbox_inches='tight', facecolor='white')
114
+ print(f'Also saved to: {artifacts_output}')
115
+
116
+ plt.close()
117
+
118
+ print("\n" + "=" * 60)
119
+ print(" VISUALIZATION COMPLETE")
120
+ print("=" * 60)
frontend/app/components/dashboard/TrendingTopics.tsx ADDED
@@ -0,0 +1,192 @@
1
+ /**
2
+ * TrendingTopics.tsx
3
+ * Dashboard component for displaying trending topics and spike alerts
4
+ */
5
+
6
+ import React, { useEffect, useState } from 'react';
7
+
8
+ interface TrendingTopic {
9
+ topic: string;
10
+ momentum: number;
11
+ is_spike: boolean;
12
+ count_current_hour?: number;
13
+ avg_count?: number;
14
+ }
15
+
16
+ interface TrendingData {
17
+ status: string;
18
+ trending_topics: TrendingTopic[];
19
+ spike_alerts: TrendingTopic[];
20
+ total_trending: number;
21
+ total_spikes: number;
22
+ }
23
+
24
+ export const TrendingTopics: React.FC = () => {
25
+ const [data, setData] = useState<TrendingData | null>(null);
26
+ const [loading, setLoading] = useState(true);
27
+ const [error, setError] = useState<string | null>(null);
28
+
29
+ useEffect(() => {
30
+ const fetchTrending = async () => {
31
+ try {
32
+ const response = await fetch('/api/trending');
33
+ const result = await response.json();
34
+ setData(result);
35
+ setError(null);
36
+ } catch (err) {
37
+ setError('Failed to fetch trending data');
38
+ console.error('Trending fetch error:', err);
39
+ } finally {
40
+ setLoading(false);
41
+ }
42
+ };
43
+
44
+ fetchTrending();
45
+ // Refresh every 30 seconds
46
+ const interval = setInterval(fetchTrending, 30000);
47
+ return () => clearInterval(interval);
48
+ }, []);
49
+
50
+ const getMomentumColor = (momentum: number) => {
51
+ if (momentum >= 10) return 'text-red-500';
52
+ if (momentum >= 5) return 'text-orange-500';
53
+ if (momentum >= 2) return 'text-yellow-500';
54
+ return 'text-gray-400';
55
+ };
56
+
57
+ const getMomentumBg = (momentum: number) => {
58
+ if (momentum >= 10) return 'bg-red-500/20';
59
+ if (momentum >= 5) return 'bg-orange-500/20';
60
+ if (momentum >= 2) return 'bg-yellow-500/20';
61
+ return 'bg-gray-500/10';
62
+ };
63
+
64
+ if (loading) {
65
+ return (
66
+ <div className="bg-gradient-to-br from-gray-900/90 to-gray-800/90 backdrop-blur-lg rounded-2xl p-6 border border-gray-700/50 shadow-xl">
67
+ <div className="flex items-center gap-3 mb-4">
68
+ <div className="w-10 h-10 rounded-xl bg-gradient-to-br from-purple-500 to-pink-500 flex items-center justify-center">
69
+ <svg className="w-5 h-5 text-white animate-pulse" fill="none" viewBox="0 0 24 24" stroke="currentColor">
70
+ <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M13 7h8m0 0v8m0-8l-8 8-4-4-6 6" />
71
+ </svg>
72
+ </div>
73
+ <div>
74
+ <h3 className="text-lg font-bold text-white">Trending Topics</h3>
75
+ <p className="text-xs text-gray-400">Loading...</p>
76
+ </div>
77
+ </div>
78
+ <div className="animate-pulse space-y-3">
79
+ {[1, 2, 3].map((i) => (
80
+ <div key={i} className="h-10 bg-gray-700/50 rounded-lg"></div>
81
+ ))}
82
+ </div>
83
+ </div>
84
+ );
85
+ }
86
+
87
+ if (error || !data) {
88
+ return (
89
+ <div className="bg-gradient-to-br from-gray-900/90 to-gray-800/90 backdrop-blur-lg rounded-2xl p-6 border border-red-700/50 shadow-xl">
90
+ <div className="flex items-center gap-3">
91
+ <div className="w-10 h-10 rounded-xl bg-red-500/20 flex items-center justify-center">
92
+ <svg className="w-5 h-5 text-red-400" fill="none" viewBox="0 0 24 24" stroke="currentColor">
93
+ <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M12 9v2m0 4h.01m-6.938 4h13.856c1.54 0 2.502-1.667 1.732-3L13.732 4c-.77-1.333-2.694-1.333-3.464 0L3.34 16c-.77 1.333.192 3 1.732 3z" />
94
+ </svg>
95
+ </div>
96
+ <div>
97
+ <h3 className="text-lg font-bold text-white">Trending Topics</h3>
98
+ <p className="text-xs text-red-400">{error || 'No data available'}</p>
99
+ </div>
100
+ </div>
101
+ </div>
102
+ );
103
+ }
104
+
105
+ return (
106
+ <div className="bg-gradient-to-br from-gray-900/90 to-gray-800/90 backdrop-blur-lg rounded-2xl p-6 border border-gray-700/50 shadow-xl">
107
+ {/* Header */}
108
+ <div className="flex items-center justify-between mb-4">
109
+ <div className="flex items-center gap-3">
110
+ <div className="w-10 h-10 rounded-xl bg-gradient-to-br from-purple-500 to-pink-500 flex items-center justify-center">
111
+ <svg className="w-5 h-5 text-white" fill="none" viewBox="0 0 24 24" stroke="currentColor">
112
+ <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M13 7h8m0 0v8m0-8l-8 8-4-4-6 6" />
113
+ </svg>
114
+ </div>
115
+ <div>
116
+ <h3 className="text-lg font-bold text-white">Trending Topics</h3>
117
+ <p className="text-xs text-gray-400">{data.total_trending} trending • {data.total_spikes} spikes</p>
118
+ </div>
119
+ </div>
120
+ {data.total_spikes > 0 && (
121
+ <span className="px-2 py-1 bg-red-500/20 text-red-400 text-xs font-medium rounded-lg animate-pulse">
122
+ 🔥 {data.total_spikes} SPIKE{data.total_spikes > 1 ? 'S' : ''}
123
+ </span>
124
+ )}
125
+ </div>
126
+
127
+ {/* Spike Alerts */}
128
+ {data.spike_alerts.length > 0 && (
129
+ <div className="mb-4 p-3 bg-red-500/10 rounded-xl border border-red-500/30">
130
+ <h4 className="text-sm font-semibold text-red-400 mb-2 flex items-center gap-2">
131
+ <span>🔥</span> SPIKE ALERTS
132
+ </h4>
133
+ <div className="flex flex-wrap gap-2">
134
+ {data.spike_alerts.slice(0, 5).map((spike, idx) => (
135
+ <span
136
+ key={idx}
137
+ className="px-3 py-1 bg-red-500/20 text-red-300 text-sm font-medium rounded-full border border-red-500/30"
138
+ >
139
+ {spike.topic} <span className="text-red-400 font-bold">{spike.momentum.toFixed(0)}x</span>
140
+ </span>
141
+ ))}
142
+ </div>
143
+ </div>
144
+ )}
145
+
146
+ {/* Trending Topics List */}
147
+ <div className="space-y-2">
148
+ {data.trending_topics.length === 0 ? (
149
+ <div className="text-center py-8 text-gray-500">
150
+ <svg className="w-12 h-12 mx-auto mb-2 opacity-50" fill="none" viewBox="0 0 24 24" stroke="currentColor">
151
+ <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M13 7h8m0 0v8m0-8l-8 8-4-4-6 6" />
152
+ </svg>
153
+ <p>No trending topics yet</p>
154
+ <p className="text-xs mt-1">Topics will appear as data flows in</p>
155
+ </div>
156
+ ) : (
157
+ data.trending_topics.slice(0, 8).map((topic, idx) => (
158
+ <div
159
+ key={idx}
160
+ className={`flex items-center justify-between p-3 rounded-xl ${getMomentumBg(topic.momentum)} border border-gray-700/30 transition-all hover:scale-[1.02]`}
161
+ >
162
+ <div className="flex items-center gap-3">
163
+ <span className="text-lg font-bold text-gray-500">#{idx + 1}</span>
164
+ <div>
165
+ <p className="font-semibold text-white capitalize">{topic.topic}</p>
166
+ <p className="text-xs text-gray-400">
167
+ {topic.is_spike ? '🔥 Spiking' : 'Trending'}
168
+ </p>
169
+ </div>
170
+ </div>
171
+ <div className="text-right">
172
+ <p className={`text-lg font-bold ${getMomentumColor(topic.momentum)}`}>
173
+ {topic.momentum.toFixed(0)}x
174
+ </p>
175
+ <p className="text-xs text-gray-500">momentum</p>
176
+ </div>
177
+ </div>
178
+ ))
179
+ )}
180
+ </div>
181
+
182
+ {/* Footer */}
183
+ <div className="mt-4 pt-4 border-t border-gray-700/50">
184
+ <p className="text-xs text-gray-500 text-center">
185
+ Momentum = current hour mentions / avg last 6 hours
186
+ </p>
187
+ </div>
188
+ </div>
189
+ );
190
+ };
191
+
192
+ export default TrendingTopics;
main.py CHANGED
@@ -913,6 +913,113 @@ def get_currency_history(days: int = 7):
913
  }
914
 
915
 
916
  # ============================================
917
  # ANOMALY DETECTION ENDPOINTS
918
  # ============================================
 
913
  }
914
 
915
 
916
+ # ============================================
917
+ # TRENDING DETECTION ENDPOINTS
918
+ # ============================================
919
+
920
+ @app.get("/api/trending")
921
+ def get_trending_topics(limit: int = 10):
922
+ """
923
+ Get currently trending topics.
924
+
925
+ Returns topics with momentum > 2x (gaining traction).
926
+ """
927
+ try:
928
+ from src.utils.trending_detector import get_trending_now, get_spikes
929
+
930
+ trending = get_trending_now(limit=limit)
931
+ spikes = get_spikes()
932
+
933
+ return {
934
+ "status": "success",
935
+ "trending_topics": trending,
936
+ "spike_alerts": spikes,
937
+ "total_trending": len(trending),
938
+ "total_spikes": len(spikes)
939
+ }
940
+
941
+ except Exception as e:
942
+ logger.error(f"[TrendingAPI] Error: {e}")
943
+ return {
944
+ "status": "error",
945
+ "error": str(e),
946
+ "trending_topics": [],
947
+ "spike_alerts": []
948
+ }
949
+
950
+
951
+ @app.get("/api/trending/topic/{topic}")
952
+ def get_topic_history(topic: str, hours: int = 24):
953
+ """
954
+ Get hourly mention history for a specific topic.
955
+
956
+ Args:
957
+ topic: Topic name to get history for
958
+ hours: Number of hours of history to return (default 24)
959
+ """
960
+ try:
961
+ from src.utils.trending_detector import get_trending_detector
962
+
963
+ detector = get_trending_detector()
964
+ history = detector.get_topic_history(topic, hours=hours)
965
+ momentum = detector.get_momentum(topic)
966
+ is_spike = detector.is_spike(topic)
967
+
968
+ return {
969
+ "status": "success",
970
+ "topic": topic,
971
+ "momentum": momentum,
972
+ "is_spike": is_spike,
973
+ "history": history
974
+ }
975
+
976
+ except Exception as e:
977
+ logger.error(f"[TrendingAPI] Error getting history for {topic}: {e}")
978
+ return {
979
+ "status": "error",
980
+ "error": str(e),
981
+ "topic": topic,
982
+ "momentum": 1.0,
983
+ "is_spike": False,
984
+ "history": []
985
+ }
986
+
987
+
988
+ @app.post("/api/trending/record")
989
+ def record_topic_mention(topic: str, source: str = "manual", domain: str = "general"):
990
+ """
991
+ Record a topic mention (for testing/manual tracking).
992
+
993
+ Args:
994
+ topic: Topic/keyword being mentioned
995
+ source: Source of the mention (twitter, news, etc.)
996
+ domain: Domain category (political, economical, etc.)
997
+ """
998
+ try:
999
+ from src.utils.trending_detector import record_topic_mention as record_mention
1000
+
1001
+ record_mention(topic=topic, source=source, domain=domain)
1002
+
1003
+ # Get updated momentum
1004
+ from src.utils.trending_detector import get_trending_detector
1005
+ detector = get_trending_detector()
1006
+ momentum = detector.get_momentum(topic)
1007
+
1008
+ return {
1009
+ "status": "success",
1010
+ "message": f"Recorded mention for '{topic}'",
1011
+ "current_momentum": momentum,
1012
+ "is_spike": detector.is_spike(topic)
1013
+ }
1014
+
1015
+ except Exception as e:
1016
+ logger.error(f"[TrendingAPI] Error recording mention: {e}")
1017
+ return {
1018
+ "status": "error",
1019
+ "error": str(e)
1020
+ }
1021
+
1022
+
1023
  # ============================================
1024
  # ANOMALY DETECTION ENDPOINTS
1025
  # ============================================
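These handlers import `get_trending_detector`, `record_topic_mention`, `get_trending_now`, and `get_spikes` from `src.utils.trending_detector`, which is not included in this commit view. For orientation only, a minimal in-memory sketch of the interface they depend on might look like the following (hour bucketing, thresholds, and storage are assumptions, not the project's actual module):

```python
# Assumed shape of src/utils/trending_detector.py -- NOT the real module.
from collections import defaultdict
from datetime import datetime
from typing import Any, Dict, List

class TrendingDetector:
    def __init__(self, window_hours: int = 6, spike_threshold: float = 3.0):
        self.window_hours = window_hours
        self.spike_threshold = spike_threshold
        # topic -> {hour_key: mention count}
        self.counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))

    def record(self, topic: str, source: str = "manual", domain: str = "general") -> None:
        hour_key = datetime.now().strftime("%Y-%m-%d %H:00")
        self.counts[topic][hour_key] += 1

    def get_momentum(self, topic: str) -> float:
        hours = sorted(self.counts[topic])
        if not hours:
            return 1.0
        current = self.counts[topic][hours[-1]]
        previous = [self.counts[topic][h] for h in hours[-1 - self.window_hours:-1]]
        baseline = sum(previous) / len(previous) if previous else 1.0
        return current / max(baseline, 1.0)

    def is_spike(self, topic: str) -> bool:
        return self.get_momentum(topic) > self.spike_threshold

    def get_topic_history(self, topic: str, hours: int = 24) -> List[Dict[str, Any]]:
        keys = sorted(self.counts[topic])[-hours:]
        return [{"hour": k, "count": self.counts[topic][k]} for k in keys]

_detector = TrendingDetector()

def get_trending_detector() -> TrendingDetector:
    return _detector

def record_topic_mention(topic: str, source: str = "manual", domain: str = "general") -> None:
    _detector.record(topic, source, domain)

def get_trending_now(limit: int = 10) -> List[Dict[str, Any]]:
    topics = [
        {"topic": t, "momentum": _detector.get_momentum(t), "is_spike": _detector.is_spike(t)}
        for t in _detector.counts
    ]
    trending = [t for t in topics if t["momentum"] > 2.0]
    return sorted(trending, key=lambda t: t["momentum"], reverse=True)[:limit]

def get_spikes() -> List[Dict[str, Any]]:
    return [t for t in get_trending_now(limit=100) if t["is_spike"]]
```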
models/anomaly-detection/src/components/model_trainer.py CHANGED
@@ -472,7 +472,41 @@ class ModelTrainer:
472
  optuna_study_name=None
473
  )
474
 
475
- logger.info(f"[ModelTrainer] Training complete in {duration:.1f}s")
476
  logger.info(f"[ModelTrainer] Best model: {best_model['name'] if best_model else 'N/A'}")
477
 
478
  return artifact
 
472
  optuna_study_name=None
473
  )
474
 
475
+ logger.info(f"[ModelTrainer] Training complete in {duration:.1f}s")
476
  logger.info(f"[ModelTrainer] Best model: {best_model['name'] if best_model else 'N/A'}")
477
 
478
+ # ============================================
479
+ # TRAIN EMBEDDING-ONLY MODEL FOR LIVE INFERENCE
480
+ # ============================================
481
+ # The Vectorizer Agent only has 768-dim embeddings at inference time
482
+ # (no temporal/engagement features), so we train a separate model
483
+ try:
484
+ # Check if features include extra metadata (> 768 dims)
485
+ if X.shape[1] > 768:
486
+ logger.info(f"[ModelTrainer] Training embedding-only model for Vectorizer Agent...")
487
+
488
+ # Extract only the first 768 dimensions (BERT embeddings)
489
+ X_embeddings_only = X[:, :768]
490
+ logger.info(f"[ModelTrainer] Embedding-only shape: {X_embeddings_only.shape}")
491
+
492
+ # Train Isolation Forest on embeddings only
493
+ embedding_model = IsolationForest(
494
+ contamination=0.1,
495
+ n_estimators=100,
496
+ random_state=42,
497
+ n_jobs=-1
498
+ )
499
+ embedding_model.fit(X_embeddings_only)
500
+
501
+ # Save to a dedicated path for the Vectorizer Agent
502
+ embedding_model_path = Path(self.config.output_directory) / "isolation_forest_embeddings_only.joblib"
503
+ joblib.dump(embedding_model, embedding_model_path)
504
+
505
+ logger.info(f"[ModelTrainer] Embedding-only model saved: {embedding_model_path}")
506
+ logger.info(f"[ModelTrainer] This model is for real-time inference by Vectorizer Agent")
507
+ else:
508
+ logger.info(f"[ModelTrainer] Features are already embedding-only ({X.shape[1]} dims)")
509
+ except Exception as e:
510
+ logger.warning(f"[ModelTrainer] Embedding-only model training failed: {e}")
511
+
512
  return artifact
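Once the embedding-only model is written out, consumers such as the Vectorizer Agent can load it directly. A hedged loading/scoring example follows; the 0-1 normalization shown is one plausible mapping of `decision_function`, not necessarily the one used inside the agent:

```python
# Illustrative only: load the embedding-only Isolation Forest saved above and
# score a single 768-dim embedding. The score normalization is an assumption.
import joblib
import numpy as np

model_path = "models/anomaly-detection/artifacts/model_trainer/isolation_forest_embeddings_only.joblib"
model = joblib.load(model_path)

embedding = np.random.rand(768)               # stand-in for a real BERT embedding
vector = embedding.reshape(1, -1)

label = model.predict(vector)[0]              # -1 = anomaly, 1 = normal
raw = model.decision_function(vector)[0]      # lower values are more anomalous
score = float(np.clip(0.5 - raw, 0.0, 1.0))   # rough mapping into [0, 1]

print(f"label={label}, anomaly_score={score:.3f}")
```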
src/api/vectorization_api.py CHANGED
@@ -72,6 +72,10 @@ class VectorizationResponse(BaseModel):
72
  domain_insights: List[Dict[str, Any]]
73
  processing_time_seconds: float
74
  vectors: Optional[List[Dict[str, Any]]] = None
 
 
 
 
75
 
76
 
77
  class HealthResponse(BaseModel):
@@ -169,6 +173,9 @@ async def vectorize_texts(request: VectorizationRequest):
169
  vectors=(
170
  result.get("vector_embeddings") if request.include_vectors else None
171
  ),
 
 
 
172
  )
173
 
174
  return response
 
72
  domain_insights: List[Dict[str, Any]]
73
  processing_time_seconds: float
74
  vectors: Optional[List[Dict[str, Any]]] = None
75
+ # Anomaly Detection Results
76
+ anomaly_results: Optional[Dict[str, Any]] = None
77
+ # Trending Detection Results
78
+ trending_results: Optional[Dict[str, Any]] = None
79
 
80
 
81
  class HealthResponse(BaseModel):
 
173
  vectors=(
174
  result.get("vector_embeddings") if request.include_vectors else None
175
  ),
176
+ # Include anomaly & trending detection results
177
+ anomaly_results=result.get("anomaly_results"),
178
+ trending_results=result.get("trending_results"),
179
  )
180
 
181
  return response
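For reference, a plausible shape of the two new optional fields once the graph has populated them; the values below are made up, but the key names mirror what the node's anomaly and trending steps return:

```python
# Illustrative payloads for the new response fields (values invented; keys follow
# src/nodes/vectorizationAgentNode.py).
example_anomaly_results = {
    "status": "success",
    "model_used": "isolation_forest_embeddings_only",
    "total_analyzed": 12,
    "anomalies_found": 1,
    "anomalies": [
        {
            "post_id": "EN_005",
            "anomaly_score": 0.81,
            "is_anomaly": True,
            "language": "english",
            "detection_method": "isolation_forest",
        }
    ],
}

example_trending_results = {
    "status": "success",
    "entities_extracted": 17,
    "trending_topics": [{"topic": "earthquake", "momentum": 4.5, "is_spike": True}],
    "spike_alerts": [{"topic": "earthquake", "momentum": 4.5, "is_spike": True}],
}
```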
src/graphs/vectorizationAgentGraph.py CHANGED
@@ -17,8 +17,9 @@ class VectorizationGraphBuilder:
17
  Step 1: Language Detection (FastText/lingua-py)
18
  Step 2: Text Vectorization (SinhalaBERTo/Tamil-BERT/DistilBERT)
19
  Step 3: Anomaly Detection (Isolation Forest on vectors)
20
- Step 4: Expert Summary (GroqLLM)
21
- Step 5: Format Output
 
22
  """
23
 
24
  def __init__(self, llm=None):
@@ -29,7 +30,7 @@ class VectorizationGraphBuilder:
29
  Build the vectorization agent graph.
30
 
31
  Flow:
32
- detect_languages → vectorize_texts → anomaly_detection → expert_summary → format_output → END
33
  """
34
  node = VectorizationAgentNode(self.llm)
35
 
@@ -40,16 +41,18 @@ class VectorizationGraphBuilder:
40
  graph.add_node("detect_languages", node.detect_languages)
41
  graph.add_node("vectorize_texts", node.vectorize_texts)
42
  graph.add_node("anomaly_detection", node.run_anomaly_detection)
 
43
  graph.add_node("generate_expert_summary", node.generate_expert_summary)
44
  graph.add_node("format_output", node.format_final_output)
45
 
46
  # Set entry point
47
  graph.set_entry_point("detect_languages")
48
 
49
- # Sequential flow with anomaly detection
50
  graph.add_edge("detect_languages", "vectorize_texts")
51
  graph.add_edge("vectorize_texts", "anomaly_detection")
52
- graph.add_edge("anomaly_detection", "generate_expert_summary")
 
53
  graph.add_edge("generate_expert_summary", "format_output")
54
  graph.add_edge("format_output", END)
55
 
@@ -60,12 +63,13 @@ class VectorizationGraphBuilder:
60
  print("\n" + "=" * 60)
61
  print("[BRAIN] BUILDING VECTORIZATION AGENT GRAPH")
62
  print("=" * 60)
63
- print("Architecture: 5-Step Sequential Pipeline")
64
  print(" Step 1: Language Detection (FastText/Unicode)")
65
  print(" Step 2: Text Vectorization (SinhalaBERTo/Tamil-BERT/DistilBERT)")
66
  print(" Step 3: Anomaly Detection (Isolation Forest)")
67
- print(" Step 4: Expert Summary (GroqLLM)")
68
- print(" Step 5: Format Output")
 
69
  print("-" * 60)
70
 
71
  llm = GroqLLM().get_llm()
@@ -73,3 +77,4 @@ graph = VectorizationGraphBuilder(llm).build_graph()
73
 
74
  print("[OK] Vectorization Agent Graph compiled successfully")
75
  print("=" * 60 + "\n")
 
 
17
  Step 1: Language Detection (FastText/lingua-py)
18
  Step 2: Text Vectorization (SinhalaBERTo/Tamil-BERT/DistilBERT)
19
  Step 3: Anomaly Detection (Isolation Forest on vectors)
20
+ Step 4: Trending Detection (Velocity/Spike tracking)
21
+ Step 5: Expert Summary (GroqLLM)
22
+ Step 6: Format Output
23
  """
24
 
25
  def __init__(self, llm=None):
 
30
  Build the vectorization agent graph.
31
 
32
  Flow:
33
+ detect_languages → vectorize_texts → anomaly_detection → trending_detection → expert_summary → format_output → END
34
  """
35
  node = VectorizationAgentNode(self.llm)
36
 
 
41
  graph.add_node("detect_languages", node.detect_languages)
42
  graph.add_node("vectorize_texts", node.vectorize_texts)
43
  graph.add_node("anomaly_detection", node.run_anomaly_detection)
44
+ graph.add_node("trending_detection", node.run_trending_detection)
45
  graph.add_node("generate_expert_summary", node.generate_expert_summary)
46
  graph.add_node("format_output", node.format_final_output)
47
 
48
  # Set entry point
49
  graph.set_entry_point("detect_languages")
50
 
51
+ # Sequential flow with anomaly + trending detection
52
  graph.add_edge("detect_languages", "vectorize_texts")
53
  graph.add_edge("vectorize_texts", "anomaly_detection")
54
+ graph.add_edge("anomaly_detection", "trending_detection")
55
+ graph.add_edge("trending_detection", "generate_expert_summary")
56
  graph.add_edge("generate_expert_summary", "format_output")
57
  graph.add_edge("format_output", END)
58
 
 
63
  print("\n" + "=" * 60)
64
  print("[BRAIN] BUILDING VECTORIZATION AGENT GRAPH")
65
  print("=" * 60)
66
+ print("Architecture: 6-Step Sequential Pipeline")
67
  print(" Step 1: Language Detection (FastText/Unicode)")
68
  print(" Step 2: Text Vectorization (SinhalaBERTo/Tamil-BERT/DistilBERT)")
69
  print(" Step 3: Anomaly Detection (Isolation Forest)")
70
+ print(" Step 4: Trending Detection (Velocity/Spikes)")
71
+ print(" Step 5: Expert Summary (GroqLLM)")
72
+ print(" Step 6: Format Output")
73
  print("-" * 60)
74
 
75
  llm = GroqLLM().get_llm()
 
77
 
78
  print("[OK] Vectorization Agent Graph compiled successfully")
79
  print("=" * 60 + "\n")
80
+
src/nodes/vectorizationAgentNode.py CHANGED
@@ -324,6 +324,13 @@ class VectorizationAgentNode:
324
  import joblib
325
 
326
  model_paths = [
327
  MODELS_PATH / "output" / "isolation_forest_model.joblib",
328
  MODELS_PATH
329
  / "artifacts"
@@ -337,7 +344,7 @@ class VectorizationAgentNode:
337
  anomaly_model = joblib.load(model_path)
338
  model_name = model_path.stem
339
  logger.info(
340
- f"[VectorizationAgent] Loaded anomaly model: {model_path.name}"
341
  )
342
  break
343
 
@@ -361,18 +368,36 @@ class VectorizationAgentNode:
361
  }
362
 
363
  # Run inference on each embedding
364
  anomalies = []
365
  normal_count = 0
 
366
 
367
  for emb in embeddings:
368
  try:
369
  vector = emb.get("vector", [])
370
  post_id = emb.get("post_id", "")
 
371
 
372
  if not vector or len(vector) != 768:
373
  continue
374
 
375
- # Reshape for sklearn
376
  vector_array = np.array(vector).reshape(1, -1)
377
 
378
  # Predict: -1 = anomaly, 1 = normal
@@ -395,7 +420,8 @@ class VectorizationAgentNode:
395
  "post_id": post_id,
396
  "anomaly_score": float(normalized_score),
397
  "is_anomaly": True,
398
- "language": emb.get("language", "unknown"),
 
399
  }
400
  )
401
  else:
@@ -407,7 +433,8 @@ class VectorizationAgentNode:
407
  )
408
 
409
  logger.info(
410
- f"[VectorizationAgent] Anomaly detection: {len(anomalies)} anomalies, {normal_count} normal"
 
411
  )
412
 
413
  return {
@@ -423,6 +450,180 @@ class VectorizationAgentNode:
423
  },
424
  }
425
 
426
  def generate_expert_summary(self, state: VectorizationAgentState) -> Dict[str, Any]:
427
  """
428
  Step 3: Use GroqLLM to generate expert summary combining all insights.
 
324
  import joblib
325
 
326
  model_paths = [
327
+ # Embedding-only model (768-dim) - compatible with Vectorizer Agent
328
+ MODELS_PATH
329
+ / "artifacts"
330
+ / "model_trainer"
331
+ / "isolation_forest_embeddings_only.joblib",
332
+ # Full-feature models (may have different dimensions)
333
+ MODELS_PATH / "output" / "isolation_forest_embeddings_only.joblib",
334
  MODELS_PATH / "output" / "isolation_forest_model.joblib",
335
  MODELS_PATH
336
  / "artifacts"
 
344
  anomaly_model = joblib.load(model_path)
345
  model_name = model_path.stem
346
  logger.info(
347
+ f"[VectorizationAgent] Loaded anomaly model: {model_path.name}"
348
  )
349
  break
350
 
 
368
  }
369
 
370
  # Run inference on each embedding
371
+ # IMPORTANT: The anomaly model was trained primarily on English embeddings.
372
+ # Different BERT models (SinhalaBERTo, Tamil-BERT, DistilBERT) produce embeddings
373
+ # in completely different vector spaces, so non-English texts would incorrectly
374
+ # appear as anomalies. We handle this by:
375
+ # 1. Only running the model on English texts
376
+ # 2. Skipping non-English texts entirely (counted as normal, not anomalous)
377
  anomalies = []
378
  normal_count = 0
379
+ skipped_non_english = 0
380
 
381
  for emb in embeddings:
382
  try:
383
  vector = emb.get("vector", [])
384
  post_id = emb.get("post_id", "")
385
+ language = emb.get("language", "english")
386
 
387
  if not vector or len(vector) != 768:
388
  continue
389
 
390
+ # For non-English languages, skip anomaly detection
391
+ # The ML model was trained on English embeddings only.
392
+ # Different BERT models (SinhalaBERTo, Tamil-BERT) have completely
393
+ # different embedding spaces - Tamil embeddings have magnitude ~0.64
394
+ # while English has ~7.5 and Sinhala ~13.7. They cannot be compared.
395
+ if language in ["sinhala", "tamil"]:
396
+ skipped_non_english += 1
397
+ normal_count += 1 # Treat as normal (not anomalous)
398
+ continue
399
+
400
+ # For English texts, use the trained ML model
401
  vector_array = np.array(vector).reshape(1, -1)
402
 
403
  # Predict: -1 = anomaly, 1 = normal
 
420
  "post_id": post_id,
421
  "anomaly_score": float(normalized_score),
422
  "is_anomaly": True,
423
+ "language": language,
424
+ "detection_method": "isolation_forest",
425
  }
426
  )
427
  else:
 
433
  )
434
 
435
  logger.info(
436
+ f"[VectorizationAgent] Anomaly detection: {len(anomalies)} anomalies, "
437
+ f"{normal_count} normal, {skipped_non_english} non-English (skipped)"
438
  )
439
 
440
  return {
 
450
  },
451
  }
452
 
453
+ def run_trending_detection(self, state: VectorizationAgentState) -> Dict[str, Any]:
454
+ """
455
+ Step 2.6: Detect trending topics from the input texts.
456
+
457
+ Extracts key entities/topics and tracks their mention velocity.
458
+ Identifies:
459
+ - Trending topics (momentum > 2x normal)
460
+ - Spike alerts (volume > 3x normal)
461
+ - Topics with increasing momentum
462
+ """
463
+ logger.info("[VectorizationAgent] STEP 2.6: Trending Detection")
464
+
465
+ detection_results = state.get("language_detection_results", [])
466
+
467
+ if not detection_results:
468
+ logger.warning("[VectorizationAgent] No texts for trending detection")
469
+ return {
470
+ "current_step": "trending_detection",
471
+ "trending_results": {
472
+ "status": "skipped",
473
+ "reason": "no_texts",
474
+ "trending_topics": [],
475
+ "spike_alerts": [],
476
+ },
477
+ }
478
+
479
+ # Import trending detector
480
+ try:
481
+ from src.utils.trending_detector import (
482
+ get_trending_detector,
483
+ record_topic_mention,
484
+ get_trending_now,
485
+ get_spikes,
486
+ )
487
+ TRENDING_AVAILABLE = True
488
+ except ImportError as e:
489
+ logger.warning(f"[VectorizationAgent] Trending detector not available: {e}")
490
+ TRENDING_AVAILABLE = False
491
+
492
+ if not TRENDING_AVAILABLE:
493
+ return {
494
+ "current_step": "trending_detection",
495
+ "trending_results": {
496
+ "status": "unavailable",
497
+ "reason": "trending_detector_not_installed",
498
+ "trending_topics": [],
499
+ "spike_alerts": [],
500
+ },
501
+ }
502
+
503
+ # Extract entities and record mentions
504
+ entities_found = []
505
+
506
+ for item in detection_results:
507
+ text = item.get("text", "") # Field is 'text', not 'original_text'
508
+ language = item.get("language", "english")
509
+ post_id = item.get("post_id", "")
510
+
511
+ # Simple entity extraction (keywords, capitalized words, etc.)
512
+ # In production, you'd use NER or more sophisticated extraction
513
+ extracted = self._extract_entities(text, language)
514
+
515
+ for entity in extracted:
516
+ try:
517
+ # Record mention with trending detector
518
+ record_topic_mention(
519
+ topic=entity["text"],
520
+ source=entity.get("source", "feed"),
521
+ domain=entity.get("domain", "general"),
522
+ )
523
+ entities_found.append({
524
+ "entity": entity["text"],
525
+ "type": entity.get("type", "keyword"),
526
+ "post_id": post_id,
527
+ "language": language,
528
+ })
529
+ except Exception as e:
530
+ logger.debug(f"[VectorizationAgent] Failed to record mention: {e}")
531
+
532
+ # Get current trending topics and spikes
533
+ try:
534
+ trending_topics = get_trending_now(limit=10)
535
+ spike_alerts = get_spikes()
536
+ except Exception as e:
537
+ logger.warning(f"[VectorizationAgent] Failed to get trending data: {e}")
538
+ trending_topics = []
539
+ spike_alerts = []
540
+
541
+ logger.info(
542
+ f"[VectorizationAgent] Trending detection: {len(entities_found)} entities, "
543
+ f"{len(trending_topics)} trending, {len(spike_alerts)} spikes"
544
+ )
545
+
546
+ return {
547
+ "current_step": "trending_detection",
548
+ "trending_results": {
549
+ "status": "success",
550
+ "entities_extracted": len(entities_found),
551
+ "entities": entities_found[:20], # Limit for state size
552
+ "trending_topics": trending_topics,
553
+ "spike_alerts": spike_alerts,
554
+ },
555
+ }
556
+
557
+ def _extract_entities(self, text: str, language: str = "english") -> List[Dict[str, Any]]:
558
+ """
559
+ Extract entities/topics from text for trending tracking.
560
+
561
+ Uses simple heuristics:
562
+ - Capitalized words/phrases (potential proper nouns)
563
+ - Hashtags
564
+ - Common news keywords
565
+
566
+ In production, integrate with NER model for better extraction.
567
+ """
568
+ entities = []
569
+
570
+ if not text:
571
+ return entities
572
+
573
+ import re
574
+
575
+ # Extract hashtags
576
+ hashtags = re.findall(r'#(\w+)', text)
577
+ for tag in hashtags:
578
+ entities.append({
579
+ "text": tag.lower(),
580
+ "type": "hashtag",
581
+ "source": "hashtag",
582
+ "domain": "social",
583
+ })
584
+
585
+ # Extract capitalized phrases (potential proper nouns)
586
+ # Match 1-4 consecutive capitalized words
587
+ cap_phrases = re.findall(r'\b([A-Z][a-z]+(?: [A-Z][a-z]+){0,3})\b', text)
588
+ for phrase in cap_phrases:
589
+ # Skip common words
590
+ if phrase.lower() not in ['the', 'a', 'an', 'is', 'are', 'was', 'were', 'i', 'he', 'she', 'it']:
591
+ entities.append({
592
+ "text": phrase,
593
+ "type": "proper_noun",
594
+ "source": "text",
595
+ "domain": "general",
596
+ })
597
+
598
+ # News/event keywords
599
+ news_keywords = [
600
+ 'breaking', 'urgent', 'alert', 'emergency', 'crisis',
601
+ 'earthquake', 'flood', 'tsunami', 'election', 'protest',
602
+ 'strike', 'scandal', 'corruption', 'price', 'inflation',
603
+ ]
604
+
605
+ text_lower = text.lower()
606
+ for keyword in news_keywords:
607
+ if keyword in text_lower:
608
+ entities.append({
609
+ "text": keyword,
610
+ "type": "news_keyword",
611
+ "source": "keyword_match",
612
+ "domain": "news",
613
+ })
614
+
615
+ # Deduplicate by text
616
+ seen = set()
617
+ unique_entities = []
618
+ for e in entities:
619
+ key = e["text"].lower()
620
+ if key not in seen:
621
+ seen.add(key)
622
+ unique_entities.append(e)
623
+
624
+ return unique_entities[:15] # Limit entities per text
625
+
626
+
627
  def generate_expert_summary(self, state: VectorizationAgentState) -> Dict[str, Any]:
628
  """
629
  Step 3: Use GroqLLM to generate expert summary combining all insights.
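The inline comments above attribute the language split to embedding magnitude: the three BERT models emit vectors on very different scales (per the comment, roughly 0.64 for Tamil, 7.5 for English, 13.7 for Sinhala), so a single Isolation Forest cannot score them jointly. A small sketch for checking that on a batch of the agent's embeddings; the item structure (`vector`, `language`) follows the `vectorize_texts` output, but the helper itself is not part of this commit:

```python
# Sketch: average L2 norm of embeddings per language, using the same item
# structure ("vector", "language") produced by the vectorize_texts step.
import numpy as np

def norms_by_language(embeddings):
    norms = {}
    for emb in embeddings:
        vec = np.asarray(emb.get("vector", []), dtype=float)
        if vec.size != 768:
            continue
        norms.setdefault(emb.get("language", "unknown"), []).append(float(np.linalg.norm(vec)))
    return {lang: round(float(np.mean(vals)), 2) for lang, vals in norms.items()}

# Expected pattern per the comment above: english ~7.5, sinhala ~13.7, tamil ~0.64
# print(norms_by_language(state["vector_embeddings"]))
```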
src/rag.py CHANGED
@@ -246,19 +246,54 @@ class RogerRAG:
246
  logger.error(f"[RAG] LLM initialization error: {e}")
247
 
248
  def _format_context(self, docs: List[Dict[str, Any]]) -> str:
249
- """Format retrieved documents as context for LLM"""
250
  if not docs:
251
  return "No relevant intelligence data found."
252
 
253
  context_parts = []
 
 
254
  for i, doc in enumerate(docs[:5], 1): # Top 5 docs
255
  meta = doc.get("metadata", {})
256
  domain = meta.get("domain", "unknown")
257
  platform = meta.get("platform", "")
258
  timestamp = meta.get("timestamp", "")
259
 
260
  context_parts.append(
261
- f"[Source {i}] Domain: {domain} | Platform: {platform} | Time: {timestamp}\n"
 
262
  f"{doc['content']}\n"
263
  )
264
 
@@ -344,18 +379,34 @@ class RogerRAG:
344
  "question": question,
345
  }
346
 
347
- # RAG prompt
 
348
  rag_prompt = ChatPromptTemplate.from_messages(
349
  [
350
  (
351
  "system",
352
- """You are Roger, an AI intelligence analyst for Sri Lanka.
353
- Answer questions based ONLY on the provided intelligence context.
354
- Be concise but informative. Cite sources when possible.
355
- If the context doesn't contain relevant information, say so.
356
 
357
- Context:
358
- {context}""",
359
  ),
360
  MessagesPlaceholder(variable_name="history"),
361
  ("human", "{question}"),
 
246
  logger.error(f"[RAG] LLM initialization error: {e}")
247
 
248
  def _format_context(self, docs: List[Dict[str, Any]]) -> str:
249
+ """Format retrieved documents as context for LLM with temporal awareness"""
250
  if not docs:
251
  return "No relevant intelligence data found."
252
 
253
  context_parts = []
254
+ now = datetime.now()
255
+
256
  for i, doc in enumerate(docs[:5], 1): # Top 5 docs
257
  meta = doc.get("metadata", {})
258
  domain = meta.get("domain", "unknown")
259
  platform = meta.get("platform", "")
260
  timestamp = meta.get("timestamp", "")
261
 
262
+ # Calculate age of the source
263
+ age_str = "unknown date"
264
+ if timestamp:
265
+ try:
266
+ # Try to parse various timestamp formats
267
+ for fmt in [
268
+ "%Y-%m-%d %H:%M:%S",
269
+ "%Y-%m-%dT%H:%M:%S",
270
+ "%Y-%m-%d",
271
+ "%d/%m/%Y",
272
+ ]:
273
+ try:
274
+ ts_date = datetime.strptime(timestamp[:19], fmt)
275
+ days_old = (now - ts_date).days
276
+ if days_old == 0:
277
+ age_str = "TODAY"
278
+ elif days_old == 1:
279
+ age_str = "1 day ago"
280
+ elif days_old < 7:
281
+ age_str = f"{days_old} days ago"
282
+ elif days_old < 30:
283
+ age_str = f"{days_old // 7} weeks ago"
284
+ elif days_old < 365:
285
+ age_str = f"{days_old // 30} months ago (⚠️ POTENTIALLY OUTDATED)"
286
+ else:
287
+ age_str = f"{days_old // 365} years ago (⚠️ OUTDATED)"
288
+ break
289
+ except ValueError:
290
+ continue
291
+ except Exception:
292
+ age_str = f"Date: {timestamp}"
293
+
294
  context_parts.append(
295
+ f"[Source {i}] Domain: {domain} | Platform: {platform}\n"
296
+ f"📅 TIMESTAMP: {timestamp} ({age_str})\n"
297
  f"{doc['content']}\n"
298
  )
299
 
 
379
  "question": question,
380
  }
381
 
382
+ # RAG prompt with temporal awareness
383
+ current_date = datetime.now().strftime("%B %d, %Y")
384
  rag_prompt = ChatPromptTemplate.from_messages(
385
  [
386
  (
387
  "system",
388
+ f"""You are Roger, an AI intelligence analyst for Sri Lanka.
389
+
390
+ TODAY'S DATE: {current_date}
391
+
392
+ CRITICAL TEMPORAL AWARENESS INSTRUCTIONS:
393
+ 1. ALWAYS check the timestamp/date of each source before using information
394
+ 2. For questions about "current" situations, ONLY use sources from the last 30 days
395
+ 3. If sources are outdated (more than 30 days old), explicitly mention this: "Based on data from [date], which may be outdated..."
396
+ 4. For political leadership questions, verify information is from recent sources
397
+ 5. If you find conflicting information from different time periods, prefer the most recent source
398
+ 6. Never present old information as current fact without temporal qualification
399
+
400
+ IMPORTANT POLITICAL CONTEXT:
401
+ - Presidential elections were held in Sri Lanka in September 2024
402
+ - Always verify any claims about political leadership against the most recent sources
403
+
404
+ Answer questions based ONLY on the provided intelligence context.
405
+ Be concise but informative. Always cite source timestamps when available.
406
+ If the context doesn't contain relevant RECENT information for current-state questions, say so.
407
 
408
+ Context (check timestamps carefully):
409
+ {{context}}""",
410
  ),
411
  MessagesPlaceholder(variable_name="history"),
412
  ("human", "{question}"),
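The timestamp handling added to `_format_context` can be read as a small standalone helper; a distilled version for illustration (not code from this commit, same formats and age buckets as the inline logic above):

```python
# Distilled, standalone version of the age-labelling logic added above.
from datetime import datetime
from typing import Optional

def age_label(timestamp: str, now: Optional[datetime] = None) -> str:
    now = now or datetime.now()
    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d", "%d/%m/%Y"):
        try:
            days = (now - datetime.strptime(timestamp[:19], fmt)).days
        except ValueError:
            continue
        if days == 0:
            return "TODAY"
        if days == 1:
            return "1 day ago"
        if days < 7:
            return f"{days} days ago"
        if days < 30:
            return f"{days // 7} weeks ago"
        if days < 365:
            return f"{days // 30} months ago (⚠️ POTENTIALLY OUTDATED)"
        return f"{days // 365} years ago (⚠️ OUTDATED)"
    return "unknown date"

# age_label("2024-09-21", now=datetime(2025, 1, 1))  ->  "3 months ago (⚠️ POTENTIALLY OUTDATED)"
```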
src/states/vectorizationAgentState.py CHANGED
@@ -33,6 +33,10 @@ class VectorizationAgentState(TypedDict, total=False):
33
  clustering_results: Optional[Dict[str, Any]]
34
  anomaly_results: Optional[Dict[str, Any]]
35
 
 
 
 
 
36
  # ===== EXPERT ANALYSIS =====
37
  expert_summary: Optional[str] # LLM-generated summary combining all insights
38
  opportunities: List[Dict[str, Any]] # Detected opportunities
 
33
  clustering_results: Optional[Dict[str, Any]]
34
  anomaly_results: Optional[Dict[str, Any]]
35
 
36
+ # ===== TRENDING DETECTION =====
37
+ trending_results: Optional[Dict[str, Any]]
38
+ # {trending_topics, spike_alerts, entities_extracted}
39
+
40
  # ===== EXPERT ANALYSIS =====
41
  expert_summary: Optional[str] # LLM-generated summary combining all insights
42
  opportunities: List[Dict[str, Any]] # Detected opportunities
test_multilingual_anomaly.py ADDED
@@ -0,0 +1,54 @@
1
+ """
2
+ test_multilingual_anomaly.py
3
+ Test the multilingual anomaly detection fix.
4
+ """
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ if sys.platform == 'win32':
9
+ sys.stdout.reconfigure(encoding='utf-8')
10
+
11
+ sys.path.insert(0, str(Path('.').resolve()))
12
+
13
+ from src.graphs.vectorizationAgentGraph import graph
14
+ from datetime import datetime
15
+
16
+ test_texts = [
17
+ {"text": "URGENT: Massive landslide in Ratnapura!", "post_id": "EN_001"},
18
+ {"text": "Normal stock market day", "post_id": "EN_002"},
19
+ {"text": "ආර්ථික අර්බුදය නිසා ජනතාව දුෂ්කරතාවන්ට මුහුණ දෙයි", "post_id": "SI_001"},
20
+ {"text": "கொழும்பில் பெரும் மழை பெய்தது", "post_id": "TA_001"},
21
+ {"text": "Breaking news about corruption scandal", "post_id": "EN_003"},
22
+ ]
23
+
24
+ result = graph.invoke({
25
+ "input_texts": test_texts,
26
+ "batch_id": datetime.now().strftime("%Y%m%d_%H%M%S"),
27
+ })
28
+
29
+ print("=" * 60)
30
+ print("MULTILINGUAL ANOMALY DETECTION TEST")
31
+ print("=" * 60)
32
+
33
+ anomaly_results = result.get("anomaly_results", {})
34
+ print(f"\nStatus: {anomaly_results.get('status')}")
35
+ print(f"Model: {anomaly_results.get('model_used')}")
36
+ print(f"Total analyzed: {anomaly_results.get('total_analyzed')}")
37
+
38
+ anomalies = anomaly_results.get("anomalies", [])
39
+ print(f"\nAnomalies found: {len(anomalies)}")
40
+ for a in anomalies:
41
+ method = a.get("detection_method", "unknown")
42
+ print(f" - {a.get('post_id')}: {a.get('language')} | method: {method} | score: {a.get('anomaly_score', 0):.2f}")
43
+
44
+ lang_results = result.get("language_detection_results", [])
45
+ print(f"\nLanguage Detection:")
46
+ for lr in lang_results:
47
+ print(f" - {lr.get('post_id')}: {lr.get('language')} (conf: {lr.get('confidence', 0):.2f})")
48
+
49
+ # Summary
50
+ print("\n" + "=" * 60)
51
+ print("The fix ensures:")
52
+ print(" - English texts: Isolation Forest ML model")
53
+ print(" - Sinhala/Tamil: Skipped (incompatible embedding spaces, avoids false positives)")
54
+ print("=" * 60)
test_trending_integration.py ADDED
@@ -0,0 +1,80 @@
1
+ """
2
+ test_trending_integration.py
3
+ Test the trending detection integration in the vectorizer pipeline.
4
+ """
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ if sys.platform == 'win32':
9
+ sys.stdout.reconfigure(encoding='utf-8')
10
+
11
+ sys.path.insert(0, str(Path('.').resolve()))
12
+
13
+ from src.graphs.vectorizationAgentGraph import graph
14
+ from datetime import datetime
15
+
16
+ print("=" * 60)
17
+ print("TESTING TRENDING DETECTION INTEGRATION")
18
+ print("=" * 60)
19
+
20
+ # Test with multiple mentions of the same topic to trigger trending
21
+ test_texts = [
22
+ {"text": "URGENT: Major earthquake hits Colombo, buildings damaged!", "post_id": "EN_001"},
23
+ {"text": "Breaking news: Earthquake in Colombo measuring 5.5 magnitude", "post_id": "EN_002"},
24
+ {"text": "Colombo earthquake causes panic, residents evacuated", "post_id": "EN_003"},
25
+ {"text": "Sri Lanka Cricket team wins against India in thrilling match", "post_id": "EN_004"},
26
+ {"text": "Stock market shows bullish trends in JKH", "post_id": "EN_005"},
27
+ {"text": "Another earthquake aftershock reported in Colombo area", "post_id": "EN_006"},
28
+ ]
29
+
30
+ print(f"\nProcessing {len(test_texts)} texts with repeated topics...")
31
+
32
+ result = graph.invoke({
33
+ "input_texts": test_texts,
34
+ "batch_id": datetime.now().strftime("%Y%m%d_%H%M%S"),
35
+ })
36
+
37
+ # Show trending results
38
+ print("\n" + "=" * 60)
39
+ print("TRENDING DETECTION RESULTS")
40
+ print("=" * 60)
41
+
42
+ trending_results = result.get("trending_results", {})
43
+ print(f"\nStatus: {trending_results.get('status', 'N/A')}")
44
+ print(f"Entities extracted: {trending_results.get('entities_extracted', 0)}")
45
+
46
+ # Show extracted entities
47
+ entities = trending_results.get("entities", [])
48
+ print(f"\nExtracted Entities ({len(entities)}):")
49
+ for e in entities[:10]:
50
+ print(f" - {e.get('entity')} ({e.get('type')}) from {e.get('post_id')}")
51
+
52
+ # Show trending topics
53
+ trending_topics = trending_results.get("trending_topics", [])
54
+ print(f"\nTrending Topics ({len(trending_topics)}):")
55
+ if trending_topics:
56
+ for t in trending_topics:
57
+ print(f" - {t.get('topic')}: momentum={t.get('momentum', 0):.2f}, is_spike={t.get('is_spike', False)}")
58
+ else:
59
+ print(" (No trending topics yet - need more historical data)")
60
+
61
+ # Show spike alerts
62
+ spike_alerts = trending_results.get("spike_alerts", [])
63
+ print(f"\nSpike Alerts ({len(spike_alerts)}):")
64
+ if spike_alerts:
65
+ for s in spike_alerts:
66
+ print(f" - {s.get('topic')}: momentum={s.get('momentum', 0):.2f}")
67
+ else:
68
+ print(" (No spike alerts)")
69
+
70
+ # Show anomaly results
71
+ print("\n" + "=" * 60)
72
+ print("ANOMALY DETECTION RESULTS")
73
+ print("=" * 60)
74
+ anomaly_results = result.get("anomaly_results", {})
75
+ print(f"Status: {anomaly_results.get('status', 'N/A')}")
76
+ print(f"Anomalies found: {anomaly_results.get('anomalies_found', 0)}")
77
+
78
+ print("\n" + "=" * 60)
79
+ print("PIPELINE COMPLETE - 6-Step Architecture Working!")
80
+ print("=" * 60)
test_vectorizer_and_anomaly.py ADDED
@@ -0,0 +1,379 @@
1
+ """
2
+ test_vectorizer_and_anomaly.py
3
+ Test script to run the Vectorizer Agent and Anomaly Detection pipeline
4
+ Generates visualizations of the results
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import json
10
+ import numpy as np
11
+ from pathlib import Path
12
+ from datetime import datetime
13
+
14
+ # Fix Windows encoding
15
+ if sys.platform == "win32":
16
+ sys.stdout.reconfigure(encoding='utf-8')
17
+
18
+ # Add project root to path
19
+ PROJECT_ROOT = Path(__file__).parent
20
+ sys.path.insert(0, str(PROJECT_ROOT))
21
+
22
+ # Load environment
23
+ try:
24
+ from dotenv import load_dotenv
25
+ load_dotenv()
26
+ except ImportError:
27
+ pass
28
+
29
+ print("=" * 70)
30
+ print(" VECTORIZER AGENT & ANOMALY DETECTION TEST")
31
+ print("=" * 70)
32
+ print()
33
+
34
+ # ============================================
35
+ # STEP 1: TEST VECTORIZER AGENT
36
+ # ============================================
37
+ print("\n" + "=" * 50)
38
+ print("STEP 1: Testing Vectorizer Agent")
39
+ print("=" * 50)
40
+
41
+ # Sample multilingual test data
42
+ test_texts = [
43
+ {"text": "The political situation in Colombo is tense with protests happening", "post_id": "EN_001"},
44
+ {"text": "Stock market shows bullish trends in JKH and Commercial Bank", "post_id": "EN_002"},
45
+ {"text": "Heavy rainfall expected in Southern Province causing flood warnings", "post_id": "EN_003"},
46
+ {"text": "Economic reforms by the government receive mixed public response", "post_id": "EN_004"},
47
+ {"text": "URGENT: Massive landslide in Ratnapura district, evacuations underway!", "post_id": "EN_005"},
48
+ {"text": "Normal day, nothing much happening, just regular news", "post_id": "EN_006"},
49
+ {"text": "Coffee prices remain stable in local markets", "post_id": "EN_007"},
50
+ {"text": "BREAKING: Major corruption scandal exposed in government ministry", "post_id": "EN_008"},
51
+ {"text": "Sri Lanka cricket team wins against India in thrilling match", "post_id": "EN_009"},
52
+ {"text": "Warning: Tsunami alert issued for coastal areas - immediate evacuation!", "post_id": "EN_010"},
53
+ ]
54
+
55
+ # Add some Sinhala text samples (using romanized for simplicity)
56
+ sinhala_texts = [
57
+ {"text": "කොළඹ නගරයේ අද මහ වර්ෂාවක් ඇති විය", "post_id": "SI_001"},
58
+ {"text": "ආර්ථික අර්බුදය හේතුවෙන් ජනතාව දුෂ්කරතාවන්ට මුහුණ දෙයි", "post_id": "SI_002"},
59
+ ]
60
+
61
+ # Add Tamil text samples
62
+ tamil_texts = [
63
+ {"text": "கொழும்பில் பெரும் மழை பெய்தது", "post_id": "TA_001"},
64
+ ]
65
+
66
+ all_texts = test_texts + sinhala_texts + tamil_texts
67
+
68
+ print(f"📝 Testing with {len(all_texts)} sample texts")
69
+ print(f" - English: {len(test_texts)}")
70
+ print(f" - Sinhala: {len(sinhala_texts)}")
71
+ print(f" - Tamil: {len(tamil_texts)}")
72
+
73
+ # Run the vectorizer agent
74
+ try:
75
+ from src.graphs.vectorizationAgentGraph import graph as vectorizer_graph
76
+
77
+ initial_state = {
78
+ "input_texts": all_texts,
79
+ "batch_id": datetime.now().strftime("%Y%m%d_%H%M%S"),
80
+ }
81
+
82
+ print("\n🔄 Running Vectorizer Agent Graph...")
83
+ result = vectorizer_graph.invoke(initial_state)
84
+
85
+ print("\n✅ Vectorizer Agent Results:")
86
+ print("-" * 40)
87
+
88
+ # Language detection results
89
+ lang_results = result.get("language_detection_results", [])
90
+ print(f"\n📊 Language Detection:")
91
+ lang_stats = {}
92
+ for item in lang_results:
93
+ lang = item.get("language", "unknown")
94
+ lang_stats[lang] = lang_stats.get(lang, 0) + 1
95
+ print(f" - {item.get('post_id')}: {lang} (conf: {item.get('confidence', 0):.2f})")
96
+
97
+ print(f"\n📈 Language Distribution: {lang_stats}")
98
+
99
+ # Vector embeddings
100
+ embeddings = result.get("vector_embeddings", [])
101
+ print(f"\n🔢 Vector Embeddings Generated: {len(embeddings)}")
102
+ if embeddings:
103
+ sample = embeddings[0]
104
+ print(f" Sample vector dim: {sample.get('vector_dim', 0)}")
105
+ print(f" Models used: {set(e.get('model_used', '') for e in embeddings)}")
106
+
107
+ # Anomaly detection results
108
+ anomaly_results = result.get("anomaly_results", {})
109
+ print(f"\n🔍 Anomaly Detection:")
110
+ print(f" Status: {anomaly_results.get('status', 'unknown')}")
111
+ print(f" Model: {anomaly_results.get('model_used', 'none')}")
112
+ print(f" Total Analyzed: {anomaly_results.get('total_analyzed', 0)}")
113
+ print(f" Anomalies Found: {anomaly_results.get('anomalies_found', 0)}")
114
+
115
+ anomalies = anomaly_results.get("anomalies", [])
116
+ if anomalies:
117
+ print(f"\n⚠️ Detected Anomalies:")
118
+ for a in anomalies:
119
+ print(f" - {a.get('post_id')}: score={a.get('anomaly_score', 0):.3f}")
120
+
121
+ # Expert summary
122
+ expert_summary = result.get("expert_summary", "")
123
+ if expert_summary:
124
+ print(f"\n📋 Expert Summary (first 500 chars):")
125
+ print(f" {expert_summary[:500]}...")
126
+
127
+ # Domain insights
128
+ domain_insights = result.get("domain_insights", [])
129
+ print(f"\n💡 Domain Insights Generated: {len(domain_insights)}")
130
+
131
+ except Exception as e:
132
+ print(f"❌ Vectorizer Agent Error: {e}")
133
+ import traceback
134
+ traceback.print_exc()
135
+
136
+ # ============================================
137
+ # STEP 2: RUN ANOMALY DETECTION PIPELINE
138
+ # ============================================
139
+ print("\n\n" + "=" * 50)
140
+ print("STEP 2: Running Anomaly Detection ML Pipeline")
141
+ print("=" * 50)
142
+
143
+ MODELS_PATH = PROJECT_ROOT / "models" / "anomaly-detection"
144
+ sys.path.insert(0, str(MODELS_PATH))
145
+
146
+ try:
147
+ from src.pipeline.training_pipeline import TrainingPipeline
148
+
149
+ print("\n🔄 Running Anomaly Detection Training Pipeline...")
150
+ pipeline = TrainingPipeline()
151
+ artifacts = pipeline.run()
152
+
153
+ print("\n✅ Training Pipeline Results:")
154
+ print("-" * 40)
155
+
156
+ if artifacts.get("model_trainer"):
157
+ trainer_artifact = artifacts["model_trainer"]
158
+ print(f" Best Score: {getattr(trainer_artifact, 'best_score', 'N/A')}")
159
+ print(f" Best Model: {getattr(trainer_artifact, 'best_model', 'N/A')}")
160
+
161
+ # Check for model files
162
+ model_path = MODELS_PATH / "output"
163
+ if model_path.exists():
164
+ models = list(model_path.glob("*.joblib"))
165
+ print(f"\n📁 Saved Models: {len(models)}")
166
+ for m in models:
167
+ print(f" - {m.name}")
168
+
169
+ except ImportError as e:
170
+ print(f"⚠️ Could not import training pipeline: {e}")
171
+ print(" Running standalone model training instead...")
172
+
173
+ try:
174
+ # Try running the main.py directly
175
+ os.chdir(MODELS_PATH)
176
+ exec(open(MODELS_PATH / "main.py").read())
177
+ except Exception as e2:
178
+ print(f"❌ Standalone training error: {e2}")
179
+ except Exception as e:
180
+ print(f"❌ Pipeline Error: {e}")
181
+ import traceback
182
+ traceback.print_exc()
183
+
184
+ # ============================================
185
+ # STEP 3: VISUALIZATION
186
+ # ============================================
187
+ print("\n\n" + "=" * 50)
188
+ print("STEP 3: Generating Visualizations")
189
+ print("=" * 50)
190
+
191
+ try:
192
+ import matplotlib
193
+ matplotlib.use('Agg') # Non-interactive backend
194
+ import matplotlib.pyplot as plt
195
+ from sklearn.decomposition import PCA
196
+
197
+ # Only visualize if we have embeddings
198
+ if 'embeddings' in dir() and embeddings:
199
+ # Extract vectors
200
+ vectors = []
201
+ labels = []
202
+ for emb in embeddings:
203
+ vec = emb.get("vector", [])
204
+ if len(vec) == 768:
205
+ vectors.append(vec)
206
+ labels.append(emb.get("post_id", ""))
207
+
208
+ if vectors:
209
+ X = np.array(vectors)
210
+
211
+ # PCA for visualization
212
+ pca = PCA(n_components=2)
213
+ X_2d = pca.fit_transform(X)
214
+
215
+ # Create figure
216
+ fig, axes = plt.subplots(1, 2, figsize=(14, 6))
217
+
218
+ # Plot 1: Language distribution scatter
219
+ ax1 = axes[0]
220
+ colors = {'english': 'blue', 'sinhala': 'green', 'tamil': 'orange', 'unknown': 'gray'}
221
+
222
+ for i, emb in enumerate(embeddings):
223
+ if i < len(X_2d):
224
+ lang = emb.get("language", "unknown")
225
+ ax1.scatter(X_2d[i, 0], X_2d[i, 1], c=colors.get(lang, 'gray'),
226
+ s=100, alpha=0.7, label=lang if lang not in [e.get('language') for e in embeddings[:i]] else "")
227
+
228
+ ax1.set_title("Text Embeddings by Language (PCA 2D)")
229
+ ax1.set_xlabel("PC1")
230
+ ax1.set_ylabel("PC2")
231
+
232
+ # Add legend (unique labels only)
233
+ handles, legend_labels = ax1.get_legend_handles_labels()
234
+ by_label = dict(zip(legend_labels, handles))
235
+ ax1.legend(by_label.values(), by_label.keys())
236
+
237
+ # Plot 2: Anomaly scores
238
+ ax2 = axes[1]
239
+ if anomalies:
240
+ anomaly_ids = [a.get("post_id", "") for a in anomalies]
241
+
242
+ for i, emb in enumerate(embeddings):
243
+ if i < len(X_2d):
244
+ is_anomaly = emb.get("post_id", "") in anomaly_ids
245
+ color = 'red' if is_anomaly else 'blue'
246
+ marker = 'X' if is_anomaly else 'o'
247
+ ax2.scatter(X_2d[i, 0], X_2d[i, 1], c=color, marker=marker,
248
+ s=150 if is_anomaly else 80, alpha=0.7)
249
+
250
+ ax2.scatter([], [], c='red', marker='X', s=150, label='Anomaly')
251
+ ax2.scatter([], [], c='blue', marker='o', s=80, label='Normal')
252
+ ax2.legend()
253
+ else:
254
+ ax2.scatter(X_2d[:, 0], X_2d[:, 1], c='blue', s=80, alpha=0.7)
255
+ ax2.text(0.5, 0.5, "No anomalies detected\n(Model may not be trained)",
256
+ ha='center', va='center', transform=ax2.transAxes)
257
+
258
+ ax2.set_title("Anomaly Detection Results (PCA 2D)")
259
+ ax2.set_xlabel("PC1")
260
+ ax2.set_ylabel("PC2")
261
+
262
+ plt.tight_layout()
263
+
264
+ # Save figure
265
+ output_path = PROJECT_ROOT / "vectorizer_anomaly_visualization.png"
266
+ plt.savefig(output_path, dpi=150, bbox_inches='tight')
267
+ print(f"\n✅ Visualization saved to: {output_path}")
268
+
269
+ plt.close()
270
+ else:
271
+ print("⚠️ No embeddings available for visualization")
272
+
273
+ except ImportError as e:
274
+ print(f"⚠️ Matplotlib not available for visualization: {e}")
275
+ except Exception as e:
276
+ print(f"❌ Visualization Error: {e}")
277
+ import traceback
278
+ traceback.print_exc()
279
+
280
+ # ============================================
281
+ # STEP 4: GRAPH FLOW VISUALIZATION
282
+ # ============================================
283
+ print("\n\n" + "=" * 50)
284
+ print("STEP 4: Generating Graph Flow Diagram")
285
+ print("=" * 50)
286
+
287
+ try:
288
+ # Create a simple ASCII graph visualization
289
+ graph_viz = """
290
+ ╔═══════════════════════════════════════════════════════════════════╗
291
+ ║ VECTORIZATION AGENT GRAPH FLOW ║
292
+ ╠═══════════════════════════════════════════════════════════════════╣
293
+ ║ ║
294
+ ║ ┌─────────────────┐ ║
295
+ ║ │ INPUT TEXTS │ (Multilingual: EN, SI, TA) ║
296
+ ║ └────────┬────────┘ ║
297
+ ║ │ ║
298
+ ║ ▼ ║
299
+ ║ ┌─────────────────────────────────────────────────────┐ ║
300
+ ║ │ STEP 1: LANGUAGE DETECTION │ ║
301
+ ║ │ ├─ FastText (primary) │ ║
302
+ ║ │ └─ Unicode Script Analysis (fallback) │ ║
303
+ ║ └────────┬────────────────────────────────────────────┘ ║
304
+ ║ │ ║
305
+ ║ ▼ ║
306
+ ║ ┌─────────────────────────────────────────────────────┐ ║
307
+ ║ │ STEP 2: TEXT VECTORIZATION │ ║
308
+ ║ │ ├─ English → DistilBERT (768-dim) │ ║
309
+ ║ │ ├─ Sinhala → SinhalaBERTo (768-dim) │ ║
310
+ ║ │ └─ Tamil → Tamil-BERT (768-dim) │ ║
311
+ ║ └────────┬────────────────────────────────────────────┘ ║
312
+ ║ │ ║
313
+ ║ ▼ ║
314
+ ║ ┌─────────────────────────────────────────────────────┐ ║
315
+ ║ │ STEP 3: ANOMALY DETECTION │ ║
316
+ ║ │ ├─ Model: Isolation Forest / LOF │ ║
317
+ ║ │ ├─ Input: 768-dim embedding vectors │ ║
318
+ ║ │ └─ Output: anomaly_score (0-1), is_anomaly flag │ ║
319
+ ║ └────────┬────────────────────────────────────────────┘ ║
320
+ ║ │ ║
321
+ ║ ▼ ║
322
+ ║ ┌─────────────────────────────────────────────────────┐ ║
323
+ ║ │ STEP 4: EXPERT SUMMARY (GroqLLM) │ ║
324
+ ║ │ ├─ Opportunity Detection │ ║
325
+ ║ │ └─ Threat Detection │ ║
326
+ ║  └────────┬────────────────────────────────────────────┘          ║
327
+ ║ │ ║
328
+ ║ ▼ ║
329
+ ║ ┌─────────────────────────────────────────────────────┐ ║
330
+ ║ │ STEP 5: FORMAT OUTPUT │ ║
331
+ ║ │ └─ domain_insights[] for Combined Agent │ ║
332
+ ║ └────────┬────────────────────────────────────────────┘ ║
333
+ ║ │ ║
334
+ ║ ▼ ║
335
+ ║ ┌─────────────────┐ ║
336
+ ║ │ END │ → Passed to Feed Aggregator ║
337
+ ║ └─────────────────┘ ║
338
+ ║ ║
339
+ ╚═══════════════════════════════════════════════════════════════════╝
340
+ """
341
+ print(graph_viz)
342
+
343
+ # Save as text file
344
+ graph_path = PROJECT_ROOT / "vectorizer_graph_flow.txt"
345
+ with open(graph_path, "w", encoding="utf-8") as f:
346
+ f.write(graph_viz)
347
+ print(f"✅ Graph flow saved to: {graph_path}")
348
+
349
+ except Exception as e:
350
+ print(f"❌ Error: {e}")
351
+
352
+ # ============================================
353
+ # SUMMARY
354
+ # ============================================
355
+ print("\n\n" + "=" * 70)
356
+ print(" TEST SUMMARY")
357
+ print("=" * 70)
358
+
359
+ print("""
360
+ 📊 VECTORIZER AGENT ARCHITECTURE:
361
+ ├── 5-Step Sequential Pipeline
362
+ ├── Multilingual Support: English, Sinhala, Tamil
363
+ ├── BERT Models: DistilBERT, SinhalaBERTo, Tamil-BERT
364
+ └── Output: 768-dimensional embeddings
365
+
366
+ 🔍 ANOMALY DETECTION:
367
+ ├── Algorithm: Isolation Forest / LOF
368
+ ├── Training: Optuna hyperparameter optimization
369
+ ├── MLflow: Experiment tracking (DagsHub)
370
+ └── Integration: Real-time inference on every graph cycle
371
+
372
+ 📁 OUTPUT FILES:
373
+ ├── vectorizer_anomaly_visualization.png (if matplotlib available)
374
+ └── vectorizer_graph_flow.txt (graph architecture)
375
+ """)
376
+
377
+ print("=" * 70)
378
+ print(" TEST COMPLETE")
379
+ print("=" * 70)
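
For readers following the anomaly fields the test above prints (`anomaly_score`, `anomalies_found`), here is a minimal, self-contained sketch of how an Isolation Forest can map 768-dim embeddings to 0-1 anomaly scores. The function name and the score normalization are illustrative assumptions, not the project's actual `TrainingPipeline` API.

```python
# Minimal sketch (assumptions): map Isolation Forest decision scores over
# 768-dim embeddings to a 0-1 anomaly_score, higher = more anomalous.
# This is NOT the project's TrainingPipeline; names are illustrative.
import numpy as np
from sklearn.ensemble import IsolationForest

def score_embeddings(vectors, contamination=0.1):
    X = np.asarray(vectors, dtype=np.float32)
    model = IsolationForest(contamination=contamination, random_state=42)
    model.fit(X)
    raw = -model.decision_function(X)                             # higher = more anomalous
    scores = (raw - raw.min()) / (raw.max() - raw.min() + 1e-9)   # normalize to 0-1
    flags = model.predict(X) == -1                                # -1 marks anomalies
    return scores, flags

# 20 clustered vectors plus one obvious outlier
vectors = np.random.normal(0, 0.1, size=(20, 768)).tolist()
vectors.append(np.random.normal(5, 0.1, size=768).tolist())
scores, flags = score_embeddings(vectors)
print(f"anomalies_found: {int(flags.sum())}, max anomaly_score: {scores.max():.3f}")
```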
trending_detection_visualization.png ADDED

Git LFS Details

  • SHA256: 906c6fafc9c7b95270a27abd5381eeca38407ced3d6b3e0f9ef967205e42af3b
  • Pointer size: 131 Bytes
  • Size of remote file: 190 kB
vectorizer_anomaly_visualization.png ADDED

Git LFS Details

  • SHA256: 5f0bc1c338b312268601e3e62ea34d3f8301745b3519cec96f3362becc030813
  • Pointer size: 131 Bytes
  • Size of remote file: 182 kB
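
The test script above only lists whatever `*.joblib` artifacts exist under `models/anomaly-detection/output`. Below is a hedged sketch of reusing such an artifact at inference time; the file name `isolation_forest.joblib` is a hypothetical placeholder, not a confirmed artifact name.

```python
# Sketch (assumptions): load a previously trained model artifact and score new
# embeddings. "isolation_forest.joblib" is a hypothetical file name.
from pathlib import Path
import numpy as np
import joblib

model_path = Path("models/anomaly-detection/output/isolation_forest.joblib")
if model_path.exists():
    model = joblib.load(model_path)
    X = np.random.normal(0, 0.1, size=(5, 768))   # stand-in for real embeddings
    print("predictions:", model.predict(X))       # -1 = anomaly, 1 = normal
else:
    print("No trained model found; the test above would report zero anomalies.")
```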
vectorizer_graph_flow.txt ADDED
@@ -0,0 +1,52 @@
1
+
2
+ ╔═══════════════════════════════════════════════════════════════════╗
3
+ ║ VECTORIZATION AGENT GRAPH FLOW ║
4
+ ╠═══════════════════════════════════════════════════════════════════╣
5
+ ║ ║
6
+ ║ ┌─────────────────┐ ║
7
+ ║ │ INPUT TEXTS │ (Multilingual: EN, SI, TA) ║
8
+ ║ └────────┬────────┘ ║
9
+ ║ │ ║
10
+ ║ ▼ ║
11
+ ║ ┌─────────────────────────────────────────────────────┐ ║
12
+ ║ │ STEP 1: LANGUAGE DETECTION │ ║
13
+ ║ │ ├─ FastText (primary) │ ║
14
+ ║ │ └─ Unicode Script Analysis (fallback) │ ║
15
+ ║ └────────┬────────────────────────────────────────────┘ ║
16
+ ║ │ ║
17
+ ║ ▼ ║
18
+ ║ ┌─────────────────────────────────────────────────────┐ ║
19
+ ║ │ STEP 2: TEXT VECTORIZATION │ ║
20
+ ║ │ ├─ English → DistilBERT (768-dim) │ ║
21
+ ║ │ ├─ Sinhala → SinhalaBERTo (768-dim) │ ║
22
+ ║ │ └─ Tamil → Tamil-BERT (768-dim) │ ║
23
+ ║ └────────┬────────────────────────────────────────────┘ ║
24
+ ║ │ ║
25
+ ║ ▼ ║
26
+ ║ ┌─────────────────────────────────────────────────────┐ ║
27
+ ║ │ STEP 3: ANOMALY DETECTION │ ║
28
+ ║ │ ├─ Model: Isolation Forest / LOF │ ║
29
+ ║ │ ├─ Input: 768-dim embedding vectors │ ║
30
+ ║ │ └─ Output: anomaly_score (0-1), is_anomaly flag │ ║
31
+ ║ └────────┬────────────────────────────────────────────┘ ║
32
+ ║ │ ║
33
+ ║ ▼ ║
34
+ ║ ┌─────────────────────────────────────────────────────┐ ║
35
+ ║ │ STEP 4: EXPERT SUMMARY (GroqLLM) │ ║
36
+ ║ │ ├─ Opportunity Detection │ ║
37
+ ║ │ └─ Threat Detection │ ║
38
+ ║ └────────┬────────────────────────────────────────────┘ ║
39
+ ║ │ ║
40
+ ║ ▼ ║
41
+ ║ ┌─────────────────────────────────────────────────────┐ ║
42
+ ║ │ STEP 5: FORMAT OUTPUT │ ║
43
+ ║ │ └─ domain_insights[] for Combined Agent │ ║
44
+ ║ └────────┬────────────────────────────────────────────┘ ║
45
+ ║ │ ║
46
+ ║ ▼ ║
47
+ ║ ┌─────────────────┐ ║
48
+ ║ │ END │ → Passed to Feed Aggregator ║
49
+ ║ └─────────────────┘ ║
50
+ ║ ║
51
+ ╚═══════════════════════════════════════════════════════════════════╝
52
+
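
The flow above names "Unicode Script Analysis (fallback)" as the backup language detector behind FastText. A rough, illustrative sketch of such a fallback (not the agent's actual implementation) can be as simple as counting characters per Unicode block:

```python
# Illustrative fallback only: classify text as sinhala / tamil / english by
# counting characters in each script's Unicode block. Labels and tie-breaking
# are assumptions; the primary FastText detector is not shown.
def detect_language_by_script(text: str) -> str:
    sinhala = sum(1 for ch in text if "\u0D80" <= ch <= "\u0DFF")   # Sinhala block
    tamil = sum(1 for ch in text if "\u0B80" <= ch <= "\u0BFF")     # Tamil block
    latin = sum(1 for ch in text if ch.isascii() and ch.isalpha())  # basic Latin letters
    counts = {"sinhala": sinhala, "tamil": tamil, "english": latin}
    best = max(counts, key=counts.get)
    return best if counts[best] > 0 else "unknown"

print(detect_language_by_script("ශ්‍රී ලංකාව"))   # sinhala
print(detect_language_by_script("இலங்கை"))        # tamil
print(detect_language_by_script("Sri Lanka"))     # english
```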
visualize_trending.py ADDED
@@ -0,0 +1,173 @@
1
+ """
2
+ visualize_trending.py
3
+ Creates visual graphs for trending detection results
4
+ """
5
+ import sys
6
+ import matplotlib
7
+ matplotlib.use('Agg')
8
+ import matplotlib.pyplot as plt
9
+ import numpy as np
10
+ from pathlib import Path
11
+ from datetime import datetime
12
+
13
+ if sys.platform == 'win32':
14
+ sys.stdout.reconfigure(encoding='utf-8')
15
+
16
+ sys.path.insert(0, str(Path('.').resolve()))
17
+
18
+ print("=" * 60)
19
+ print("GENERATING TRENDING DETECTION VISUALIZATION")
20
+ print("=" * 60)
21
+
22
+ # Run the vectorizer to get fresh data
23
+ from src.graphs.vectorizationAgentGraph import graph
24
+
25
+ test_texts = [
26
+ {"text": "URGENT: Major earthquake hits Colombo, buildings damaged!", "post_id": "EN_001"},
27
+ {"text": "Breaking news: Earthquake in Colombo measuring 5.5 magnitude", "post_id": "EN_002"},
28
+ {"text": "Colombo earthquake causes panic, residents evacuated", "post_id": "EN_003"},
29
+ {"text": "Sri Lanka Cricket team wins against India in thrilling match", "post_id": "EN_004"},
30
+ {"text": "Stock market shows bullish trends in JKH and Commercial Bank", "post_id": "EN_005"},
31
+ {"text": "Another earthquake aftershock reported in Colombo area", "post_id": "EN_006"},
32
+ {"text": "President announces relief fund for earthquake victims", "post_id": "EN_007"},
33
+ {"text": "Heavy rainfall expected in Southern Province", "post_id": "EN_008"},
34
+ ]
35
+
36
+ print(f"\nProcessing {len(test_texts)} texts...")
37
+
38
+ result = graph.invoke({
39
+ "input_texts": test_texts,
40
+ "batch_id": datetime.now().strftime("%Y%m%d_%H%M%S"),
41
+ })
42
+
43
+ trending_results = result.get("trending_results", {})
44
+ anomaly_results = result.get("anomaly_results", {})
45
+
46
+ # Get trending data
47
+ trending_topics = trending_results.get("trending_topics", [])
48
+ spike_alerts = trending_results.get("spike_alerts", [])
49
+ entities = trending_results.get("entities", [])
50
+
51
+ print(f"Trending topics: {len(trending_topics)}")
52
+ print(f"Spike alerts: {len(spike_alerts)}")
53
+
54
+ # Create visualization
55
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
56
+ fig.suptitle('Vectorizer Agent: Trending Detection Dashboard', fontsize=16, fontweight='bold')
57
+
58
+ # ===== PLOT 1: Trending Topics Momentum =====
59
+ ax1 = axes[0, 0]
60
+ if trending_topics:
61
+ topics = [t.get('topic', '')[:15] for t in trending_topics[:10]]
62
+ momentums = [t.get('momentum', 0) for t in trending_topics[:10]]
63
+ colors = ['#e74c3c' if m > 30 else '#f39c12' if m > 10 else '#3498db' for m in momentums]
64
+
65
+ bars = ax1.barh(topics, momentums, color=colors, edgecolor='black', linewidth=0.5)
66
+ ax1.set_xlabel('Momentum Score', fontsize=11)
67
+ ax1.set_title('Top Trending Topics by Momentum', fontsize=12, fontweight='bold')
68
+ ax1.axvline(x=3, color='orange', linestyle='--', alpha=0.7, label='Spike Threshold (3x)')
69
+ ax1.axvline(x=2, color='green', linestyle='--', alpha=0.7, label='Trending Threshold (2x)')
70
+ ax1.legend(loc='lower right', fontsize=8)
71
+
72
+ # Add value labels
73
+ for bar, val in zip(bars, momentums):
74
+ ax1.text(val + 1, bar.get_y() + bar.get_height()/2, f'{val:.0f}x',
75
+ va='center', fontsize=9, fontweight='bold')
76
+ else:
77
+ ax1.text(0.5, 0.5, 'No trending topics', ha='center', va='center', fontsize=12)
78
+ ax1.set_title('Top Trending Topics', fontsize=12, fontweight='bold')
79
+
80
+ # ===== PLOT 2: Entity Types Distribution =====
81
+ ax2 = axes[0, 1]
82
+ if entities:
83
+ entity_types = {}
84
+ for e in entities:
85
+ t = e.get('type', 'unknown')
86
+ entity_types[t] = entity_types.get(t, 0) + 1
87
+
88
+ labels = list(entity_types.keys())
89
+ sizes = list(entity_types.values())
90
+ colors = ['#3498db', '#2ecc71', '#e74c3c', '#9b59b6', '#f1c40f'][:len(labels)]
91
+ explode = [0.05] * len(labels)
92
+
93
+ ax2.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
94
+ colors=colors, shadow=True, startangle=90)
95
+ ax2.set_title('Extracted Entity Types', fontsize=12, fontweight='bold')
96
+ else:
97
+ ax2.text(0.5, 0.5, 'No entities', ha='center', va='center', fontsize=12)
98
+ ax2.set_title('Extracted Entity Types', fontsize=12, fontweight='bold')
99
+
100
+ # ===== PLOT 3: Spike Alerts =====
101
+ ax3 = axes[1, 0]
102
+ if spike_alerts:
103
+ spike_topics = [s.get('topic', '')[:12] for s in spike_alerts[:8]]
104
+ spike_moms = [s.get('momentum', 0) for s in spike_alerts[:8]]
105
+
106
+ bars = ax3.bar(spike_topics, spike_moms, color='#e74c3c', edgecolor='black', linewidth=1)
107
+ ax3.set_ylabel('Momentum', fontsize=11)
108
+ ax3.set_title('🔥 SPIKE ALERTS (>3x Normal Volume)', fontsize=12, fontweight='bold', color='#c0392b')
109
+ ax3.axhline(y=3, color='orange', linestyle='--', alpha=0.7)
110
+ ax3.set_xticklabels(spike_topics, rotation=45, ha='right', fontsize=9)
111
+
112
+ # Add explosion effect
113
+ for bar, val in zip(bars, spike_moms):
114
+ ax3.text(bar.get_x() + bar.get_width()/2, val + 2, f'{val:.0f}x',
115
+ ha='center', fontsize=10, fontweight='bold', color='#c0392b')
116
+ else:
117
+ ax3.text(0.5, 0.5, 'No spike alerts', ha='center', va='center', fontsize=12)
118
+ ax3.set_title('Spike Alerts', fontsize=12, fontweight='bold')
119
+
120
+ # ===== PLOT 4: Pipeline Summary =====
121
+ ax4 = axes[1, 1]
122
+ ax4.axis('off')
123
+
124
+ # Create a summary box
125
+ summary_text = f"""
126
+ ╔══════════════════════════════════════════════════╗
127
+ ║ VECTORIZER AGENT PIPELINE SUMMARY ║
128
+ ╠══════════════════════════════════════════════════╣
129
+ ║ ║
130
+ ║ 📝 Texts Processed: {len(test_texts):>5} ║
131
+ ║ 🌐 Entities Extracted: {len(entities):>5} ║
132
+ ║ 📈 Trending Topics: {len(trending_topics):>5} ║
133
+ ║ 🔥 Spike Alerts: {len(spike_alerts):>5} ║
134
+ ║ ⚠️ Anomalies Detected: {anomaly_results.get('anomalies_found', 0):>5} ║
135
+ ║ ║
136
+ ╠══════════════════════════════════════════════════╣
137
+ ║ Top Trending: ║
138
+ """
139
+
140
+ if trending_topics:
141
+ for i, t in enumerate(trending_topics[:3]):
142
+ topic = t.get('topic', 'N/A')[:20]
143
+ mom = t.get('momentum', 0)
144
+ summary_text += f"║ {i+1}. {topic:<20} ({mom:.0f}x) ║\n"
145
+ else:
146
+ summary_text += "║ (No trending topics) ║\n"
147
+
148
+ summary_text += """╚══════════════════════════════════════════════════╝"""
149
+
150
+ ax4.text(0.5, 0.5, summary_text, family='monospace', fontsize=9,
151
+ ha='center', va='center',
152
+ bbox=dict(boxstyle='round', facecolor='#ecf0f1', edgecolor='#2c3e50'))
153
+
154
+ plt.tight_layout()
155
+ plt.subplots_adjust(top=0.93)
156
+
157
+ # Save
158
+ output_path = Path('trending_detection_visualization.png')
159
+ plt.savefig(output_path, dpi=150, bbox_inches='tight', facecolor='white')
160
+ print(f"\n✅ Visualization saved: {output_path}")
161
+
162
+ # Also save to artifacts
163
+ artifacts_dir = Path(r'C:\Users\LENOVO\.gemini\antigravity\brain\b892f63f-afbc-4c4a-bbf1-37195faf04a5')
164
+ if artifacts_dir.exists():
165
+ artifacts_output = artifacts_dir / 'trending_visualization.png'
166
+ plt.savefig(str(artifacts_output), dpi=150, bbox_inches='tight', facecolor='white')
167
+ print(f"✅ Also saved to: {artifacts_output}")
168
+
169
+ plt.close()
170
+
171
+ print("\n" + "=" * 60)
172
+ print("VISUALIZATION COMPLETE")
173
+ print("=" * 60)
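
As context for the momentum values plotted above (with the 2x trending and 3x spike reference lines), a minimal sketch of momentum-style spike detection might look like the following. The function and field names are illustrative assumptions, not the agent's actual code.

```python
# Sketch (assumptions): momentum = newest hourly mention count divided by the
# average of the preceding hours; the 2x / 3x cutoffs mirror the dashboard's
# reference lines.
def compute_momentum(hourly_counts: list[int]) -> float:
    if len(hourly_counts) < 2:
        return 0.0
    current = hourly_counts[-1]
    baseline = sum(hourly_counts[:-1]) / len(hourly_counts[:-1])
    return current / max(baseline, 1e-9)

def classify_topic(topic: str, hourly_counts: list[int]) -> dict:
    m = compute_momentum(hourly_counts)
    return {
        "topic": topic,
        "momentum": round(m, 2),
        "is_trending": m >= 2.0,  # trending threshold (2x)
        "is_spike": m >= 3.0,     # spike threshold (3x)
    }

print(classify_topic("earthquake", [1, 2, 1, 2, 1, 1, 12]))
# {'topic': 'earthquake', 'momentum': 9.0, 'is_trending': True, 'is_spike': True}
```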