Upload folder using huggingface_hub
- .gitattributes +2 -0
- README.md +24 -10
- create_visualization.py +120 -0
- frontend/app/components/dashboard/TrendingTopics.tsx +192 -0
- main.py +107 -0
- models/anomaly-detection/src/components/model_trainer.py +35 -1
- src/api/vectorization_api.py +7 -0
- src/graphs/vectorizationAgentGraph.py +13 -8
- src/nodes/vectorizationAgentNode.py +205 -4
- src/rag.py +60 -9
- src/states/vectorizationAgentState.py +4 -0
- test_multilingual_anomaly.py +54 -0
- test_trending_integration.py +80 -0
- test_vectorizer_and_anomaly.py +379 -0
- trending_detection_visualization.png +3 -0
- vectorizer_anomaly_visualization.png +3 -0
- vectorizer_graph_flow.txt +52 -0
- visualize_trending.py +173 -0
.gitattributes
CHANGED
@@ -34,3 +34,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 ModelX[[:space:]]Final[[:space:]]Problem.pdf filter=lfs diff=lfs merge=lfs -text
+trending_detection_visualization.png filter=lfs diff=lfs merge=lfs -text
+vectorizer_anomaly_visualization.png filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -345,7 +345,7 @@ DELETE /api/intel/config/remove?target_type=profile&value=CompetitorX&platform=t
 ---
 
 ### 8. Vectorization Agent Graph (`vectorizationAgentGraph.py`) 🆕
-**Multilingual
+**6-Step Multilingual NLP Pipeline with Anomaly + Trending Detection**
 
 ```
 ┌─────────────────────────────────────────────────┐
@@ -366,28 +366,42 @@ DELETE /api/intel/config/remove?target_type=profile&value=CompetitorX&platform=t
                   │
                   ▼
 ┌─────────────────────────────────────────────────┐
-│ Step 3: Anomaly Detection (Isolation Forest)
-│ -
+│ Step 3: Anomaly Detection (Isolation Forest)    │
+│ - English: ML model inference                   │
+│ - Sinhala/Tamil: Skipped (incompatible vectors) │
 │ - Outputs anomaly_score (0-1)                   │
-│ - Graceful fallback if model not trained        │
 └─────────────────┬───────────────────────────────┘
                   │
                   ▼
 ┌─────────────────────────────────────────────────┐
-│ Step 4:
-│ -
-│ -
+│ Step 4: Trending Detection 🆕                    │
+│ - Entity extraction (hashtags, proper nouns)    │
+│ - Momentum: current_hour / avg_last_6_hours     │
+│ - Spike alerts when momentum > 3x               │
+└─────────────────┬───────────────────────────────┘
+                  │
+                  ▼
+┌─────────────────────────────────────────────────┐
+│ Step 5: Expert Summary (GroqLLM)                │
+│ - Opportunity & threat identification           │
 │ - Sentiment analysis                            │
 └─────────────────┬───────────────────────────────┘
                   │
                   ▼
 ┌─────────────────────────────────────────────────┐
-│ Step
-│ - Includes anomaly
-│ - Passes results to parent graph                │
+│ Step 6: Format Output                           │
+│ - Includes anomaly + trending in domain_insights│
 └─────────────────────────────────────────────────┘
 ```
 
+**Trending Detection API Endpoints:**
+
+| Endpoint | Method | Description |
+|----------|--------|-------------|
+| `/api/trending` | GET | Get trending topics & spike alerts |
+| `/api/trending/topic/{topic}` | GET | Get hourly history for a topic |
+| `/api/trending/record` | POST | Record a topic mention (testing) |
+
 ---
 
 ### 10. Weather Prediction Pipeline (`models/weather-prediction/`) 🆕
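The momentum figure referenced in the diagram and the endpoint table is a ratio of current-hour mentions to the rolling average of the previous six hours, with a spike declared above 3x. A minimal sketch of that calculation is below; the `TopicCounter` class and its in-memory buckets are illustrative assumptions, not the project's actual `trending_detector` implementation.

```python
from collections import defaultdict, deque
from datetime import datetime
from typing import Optional


class TopicCounter:
    """Illustrative sketch of momentum/spike scoring (not the real trending_detector)."""

    def __init__(self, window_hours: int = 6, spike_threshold: float = 3.0):
        # Per-topic hourly buckets: topic -> deque of (hour_key, count)
        self.counts = defaultdict(lambda: deque(maxlen=window_hours + 1))
        self.spike_threshold = spike_threshold

    def record(self, topic: str, when: Optional[datetime] = None) -> None:
        """Increment the current hour's bucket for a topic."""
        hour_key = (when or datetime.now()).strftime("%Y-%m-%d %H")
        bucket = self.counts[topic]
        if bucket and bucket[-1][0] == hour_key:
            bucket[-1] = (hour_key, bucket[-1][1] + 1)
        else:
            bucket.append((hour_key, 1))

    def momentum(self, topic: str) -> float:
        """momentum = current_hour / avg_last_6_hours (floored at 1 mention/hour)."""
        bucket = list(self.counts[topic])
        if not bucket:
            return 0.0
        current = bucket[-1][1]
        previous = [count for _, count in bucket[:-1]]
        avg_prev = sum(previous) / len(previous) if previous else 1.0
        return current / max(avg_prev, 1.0)

    def is_spike(self, topic: str) -> bool:
        """Spike alert when momentum exceeds the 3x threshold."""
        return self.momentum(topic) > self.spike_threshold
```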
create_visualization.py
ADDED
@@ -0,0 +1,120 @@
"""
create_visualization.py
Creates visualization of multilingual embeddings and anomaly detection results
using actual training data.
"""
import sys
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.decomposition import PCA
import joblib

if sys.platform == 'win32':
    sys.stdout.reconfigure(encoding='utf-8')

print("=" * 60)
print(" CREATING VECTORIZER VISUALIZATION")
print("=" * 60)

# Load saved embeddings from the training pipeline
embeddings_path = Path('models/anomaly-detection/artifacts/data_transformation')

# Find the latest embeddings file
emb_files = list(embeddings_path.glob('embeddings_*.npy'))
if emb_files:
    latest_emb = sorted(emb_files)[-1]
    embeddings = np.load(latest_emb)
    print(f'Loaded embeddings: {embeddings.shape}')
else:
    print('No embeddings found')
    sys.exit(1)

# Load transformed data to get language info
import pandas as pd
data_files = list(embeddings_path.glob('transformed_*.parquet'))
if data_files:
    latest_data = sorted(data_files)[-1]
    df = pd.read_parquet(latest_data)
    languages = df['language'].values
    lang_counts = df['language'].value_counts().to_dict()
    print(f'Languages: {lang_counts}')
else:
    languages = ['english'] * len(embeddings)
    lang_counts = {'english': len(embeddings)}

# Load anomaly model and predict
model_path = Path('models/anomaly-detection/artifacts/model_trainer/isolation_forest_embeddings_only.joblib')
model = joblib.load(model_path)
predictions = model.predict(embeddings)
anomaly_mask = predictions == -1

print(f'Total samples: {len(embeddings)}')
print(f'Anomalies detected: {anomaly_mask.sum()}')
print(f'Normal samples: {(~anomaly_mask).sum()}')

# PCA for visualization
print('\nRunning PCA...')
pca = PCA(n_components=2)
X_2d = pca.fit_transform(embeddings)
print(f'Explained variance: {pca.explained_variance_ratio_.sum():.2%}')

# Create figure with 2 subplots
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Plot 1: By Language
ax1 = axes[0]
colors = {'english': '#3498db', 'sinhala': '#2ecc71', 'tamil': '#e74c3c', 'unknown': '#95a5a6'}

for lang in colors:
    mask = np.array(languages) == lang
    if mask.any():
        ax1.scatter(X_2d[mask, 0], X_2d[mask, 1],
                    c=colors[lang], label=f'{lang.capitalize()} ({mask.sum()})',
                    alpha=0.7, s=60, edgecolors='white', linewidth=0.5)

ax1.set_title('Text Embeddings by Language (PCA Projection)', fontsize=14, fontweight='bold')
ax1.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
ax1.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
ax1.legend(loc='best', framealpha=0.9)
ax1.grid(True, alpha=0.3)

# Plot 2: Anomalies
ax2 = axes[1]
normal_mask = ~anomaly_mask

# Plot normal points first (so anomalies are on top)
ax2.scatter(X_2d[normal_mask, 0], X_2d[normal_mask, 1],
            c='#3498db', label=f'Normal ({normal_mask.sum()})', alpha=0.6, s=60,
            edgecolors='white', linewidth=0.5)

# Plot anomalies with X markers
ax2.scatter(X_2d[anomaly_mask, 0], X_2d[anomaly_mask, 1],
            c='#e74c3c', marker='X', label=f'Anomaly ({anomaly_mask.sum()})',
            alpha=0.9, s=120, edgecolors='black', linewidth=0.5)

ax2.set_title('Anomaly Detection Results (Isolation Forest)', fontsize=14, fontweight='bold')
ax2.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
ax2.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
ax2.legend(loc='best', framealpha=0.9)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
output_path = 'vectorizer_anomaly_visualization.png'
plt.savefig(output_path, dpi=150, bbox_inches='tight', facecolor='white')
print(f'\nVisualization saved: {output_path}')

# Also create the visualization in artifacts dir
artifacts_dir = Path(r'C:\Users\LENOVO\.gemini\antigravity\brain\b892f63f-afbc-4c4a-bbf1-37195faf04a5')
if artifacts_dir.exists():
    artifacts_output = artifacts_dir / 'vectorizer_visualization.png'
    plt.savefig(str(artifacts_output), dpi=150, bbox_inches='tight', facecolor='white')
    print(f'Also saved to: {artifacts_output}')

plt.close()

print("\n" + "=" * 60)
print(" VISUALIZATION COMPLETE")
print("=" * 60)
frontend/app/components/dashboard/TrendingTopics.tsx
ADDED
@@ -0,0 +1,192 @@
/**
 * TrendingTopics.tsx
 * Dashboard component for displaying trending topics and spike alerts
 */

import React, { useEffect, useState } from 'react';

interface TrendingTopic {
  topic: string;
  momentum: number;
  is_spike: boolean;
  count_current_hour?: number;
  avg_count?: number;
}

interface TrendingData {
  status: string;
  trending_topics: TrendingTopic[];
  spike_alerts: TrendingTopic[];
  total_trending: number;
  total_spikes: number;
}

export const TrendingTopics: React.FC = () => {
  const [data, setData] = useState<TrendingData | null>(null);
  const [loading, setLoading] = useState(true);
  const [error, setError] = useState<string | null>(null);

  useEffect(() => {
    const fetchTrending = async () => {
      try {
        const response = await fetch('/api/trending');
        const result = await response.json();
        setData(result);
        setError(null);
      } catch (err) {
        setError('Failed to fetch trending data');
        console.error('Trending fetch error:', err);
      } finally {
        setLoading(false);
      }
    };

    fetchTrending();
    // Refresh every 30 seconds
    const interval = setInterval(fetchTrending, 30000);
    return () => clearInterval(interval);
  }, []);

  const getMomentumColor = (momentum: number) => {
    if (momentum >= 10) return 'text-red-500';
    if (momentum >= 5) return 'text-orange-500';
    if (momentum >= 2) return 'text-yellow-500';
    return 'text-gray-400';
  };

  const getMomentumBg = (momentum: number) => {
    if (momentum >= 10) return 'bg-red-500/20';
    if (momentum >= 5) return 'bg-orange-500/20';
    if (momentum >= 2) return 'bg-yellow-500/20';
    return 'bg-gray-500/10';
  };

  if (loading) {
    return (
      <div className="bg-gradient-to-br from-gray-900/90 to-gray-800/90 backdrop-blur-lg rounded-2xl p-6 border border-gray-700/50 shadow-xl">
        <div className="flex items-center gap-3 mb-4">
          <div className="w-10 h-10 rounded-xl bg-gradient-to-br from-purple-500 to-pink-500 flex items-center justify-center">
            <svg className="w-5 h-5 text-white animate-pulse" fill="none" viewBox="0 0 24 24" stroke="currentColor">
              <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M13 7h8m0 0v8m0-8l-8 8-4-4-6 6" />
            </svg>
          </div>
          <div>
            <h3 className="text-lg font-bold text-white">Trending Topics</h3>
            <p className="text-xs text-gray-400">Loading...</p>
          </div>
        </div>
        <div className="animate-pulse space-y-3">
          {[1, 2, 3].map((i) => (
            <div key={i} className="h-10 bg-gray-700/50 rounded-lg"></div>
          ))}
        </div>
      </div>
    );
  }

  if (error || !data) {
    return (
      <div className="bg-gradient-to-br from-gray-900/90 to-gray-800/90 backdrop-blur-lg rounded-2xl p-6 border border-red-700/50 shadow-xl">
        <div className="flex items-center gap-3">
          <div className="w-10 h-10 rounded-xl bg-red-500/20 flex items-center justify-center">
            <svg className="w-5 h-5 text-red-400" fill="none" viewBox="0 0 24 24" stroke="currentColor">
              <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M12 9v2m0 4h.01m-6.938 4h13.856c1.54 0 2.502-1.667 1.732-3L13.732 4c-.77-1.333-2.694-1.333-3.464 0L3.34 16c-.77 1.333.192 3 1.732 3z" />
            </svg>
          </div>
          <div>
            <h3 className="text-lg font-bold text-white">Trending Topics</h3>
            <p className="text-xs text-red-400">{error || 'No data available'}</p>
          </div>
        </div>
      </div>
    );
  }

  return (
    <div className="bg-gradient-to-br from-gray-900/90 to-gray-800/90 backdrop-blur-lg rounded-2xl p-6 border border-gray-700/50 shadow-xl">
      {/* Header */}
      <div className="flex items-center justify-between mb-4">
        <div className="flex items-center gap-3">
          <div className="w-10 h-10 rounded-xl bg-gradient-to-br from-purple-500 to-pink-500 flex items-center justify-center">
            <svg className="w-5 h-5 text-white" fill="none" viewBox="0 0 24 24" stroke="currentColor">
              <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M13 7h8m0 0v8m0-8l-8 8-4-4-6 6" />
            </svg>
          </div>
          <div>
            <h3 className="text-lg font-bold text-white">Trending Topics</h3>
            <p className="text-xs text-gray-400">{data.total_trending} trending • {data.total_spikes} spikes</p>
          </div>
        </div>
        {data.total_spikes > 0 && (
          <span className="px-2 py-1 bg-red-500/20 text-red-400 text-xs font-medium rounded-lg animate-pulse">
            🔥 {data.total_spikes} SPIKE{data.total_spikes > 1 ? 'S' : ''}
          </span>
        )}
      </div>

      {/* Spike Alerts */}
      {data.spike_alerts.length > 0 && (
        <div className="mb-4 p-3 bg-red-500/10 rounded-xl border border-red-500/30">
          <h4 className="text-sm font-semibold text-red-400 mb-2 flex items-center gap-2">
            <span>🔥</span> SPIKE ALERTS
          </h4>
          <div className="flex flex-wrap gap-2">
            {data.spike_alerts.slice(0, 5).map((spike, idx) => (
              <span
                key={idx}
                className="px-3 py-1 bg-red-500/20 text-red-300 text-sm font-medium rounded-full border border-red-500/30"
              >
                {spike.topic} <span className="text-red-400 font-bold">{spike.momentum.toFixed(0)}x</span>
              </span>
            ))}
          </div>
        </div>
      )}

      {/* Trending Topics List */}
      <div className="space-y-2">
        {data.trending_topics.length === 0 ? (
          <div className="text-center py-8 text-gray-500">
            <svg className="w-12 h-12 mx-auto mb-2 opacity-50" fill="none" viewBox="0 0 24 24" stroke="currentColor">
              <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M13 7h8m0 0v8m0-8l-8 8-4-4-6 6" />
            </svg>
            <p>No trending topics yet</p>
            <p className="text-xs mt-1">Topics will appear as data flows in</p>
          </div>
        ) : (
          data.trending_topics.slice(0, 8).map((topic, idx) => (
            <div
              key={idx}
              className={`flex items-center justify-between p-3 rounded-xl ${getMomentumBg(topic.momentum)} border border-gray-700/30 transition-all hover:scale-[1.02]`}
            >
              <div className="flex items-center gap-3">
                <span className="text-lg font-bold text-gray-500">#{idx + 1}</span>
                <div>
                  <p className="font-semibold text-white capitalize">{topic.topic}</p>
                  <p className="text-xs text-gray-400">
                    {topic.is_spike ? '🔥 Spiking' : 'Trending'}
                  </p>
                </div>
              </div>
              <div className="text-right">
                <p className={`text-lg font-bold ${getMomentumColor(topic.momentum)}`}>
                  {topic.momentum.toFixed(0)}x
                </p>
                <p className="text-xs text-gray-500">momentum</p>
              </div>
            </div>
          ))
        )}
      </div>

      {/* Footer */}
      <div className="mt-4 pt-4 border-t border-gray-700/50">
        <p className="text-xs text-gray-500 text-center">
          Momentum = current hour mentions / avg last 6 hours
        </p>
      </div>
    </div>
  );
};

export default TrendingTopics;
main.py
CHANGED
@@ -913,6 +913,113 @@ def get_currency_history(days: int = 7):
     }
 
 
+# ============================================
+# TRENDING DETECTION ENDPOINTS
+# ============================================
+
+@app.get("/api/trending")
+def get_trending_topics(limit: int = 10):
+    """
+    Get currently trending topics.
+
+    Returns topics with momentum > 2x (gaining traction).
+    """
+    try:
+        from src.utils.trending_detector import get_trending_now, get_spikes
+
+        trending = get_trending_now(limit=limit)
+        spikes = get_spikes()
+
+        return {
+            "status": "success",
+            "trending_topics": trending,
+            "spike_alerts": spikes,
+            "total_trending": len(trending),
+            "total_spikes": len(spikes)
+        }
+
+    except Exception as e:
+        logger.error(f"[TrendingAPI] Error: {e}")
+        return {
+            "status": "error",
+            "error": str(e),
+            "trending_topics": [],
+            "spike_alerts": []
+        }
+
+
+@app.get("/api/trending/topic/{topic}")
+def get_topic_history(topic: str, hours: int = 24):
+    """
+    Get hourly mention history for a specific topic.
+
+    Args:
+        topic: Topic name to get history for
+        hours: Number of hours of history to return (default 24)
+    """
+    try:
+        from src.utils.trending_detector import get_trending_detector
+
+        detector = get_trending_detector()
+        history = detector.get_topic_history(topic, hours=hours)
+        momentum = detector.get_momentum(topic)
+        is_spike = detector.is_spike(topic)
+
+        return {
+            "status": "success",
+            "topic": topic,
+            "momentum": momentum,
+            "is_spike": is_spike,
+            "history": history
+        }
+
+    except Exception as e:
+        logger.error(f"[TrendingAPI] Error getting history for {topic}: {e}")
+        return {
+            "status": "error",
+            "error": str(e),
+            "topic": topic,
+            "momentum": 1.0,
+            "is_spike": False,
+            "history": []
+        }
+
+
+@app.post("/api/trending/record")
+def record_topic_mention(topic: str, source: str = "manual", domain: str = "general"):
+    """
+    Record a topic mention (for testing/manual tracking).
+
+    Args:
+        topic: Topic/keyword being mentioned
+        source: Source of the mention (twitter, news, etc.)
+        domain: Domain category (political, economical, etc.)
+    """
+    try:
+        from src.utils.trending_detector import record_topic_mention as record_mention
+
+        record_mention(topic=topic, source=source, domain=domain)
+
+        # Get updated momentum
+        from src.utils.trending_detector import get_trending_detector
+        detector = get_trending_detector()
+        momentum = detector.get_momentum(topic)
+
+        return {
+            "status": "success",
+            "message": f"Recorded mention for '{topic}'",
+            "current_momentum": momentum,
+            "is_spike": detector.is_spike(topic)
+        }
+
+    except Exception as e:
+        logger.error(f"[TrendingAPI] Error recording mention: {e}")
+        return {
+            "status": "error",
+            "error": str(e)
+        }
+
+
 # ============================================
 # ANOMALY DETECTION ENDPOINTS
 # ============================================
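Once the FastAPI app is running, the three new routes can be exercised directly. The snippet below is a usage sketch only; the base URL and the `requests` dependency are assumptions for illustration, and the record endpoint takes its arguments as query parameters.

```python
# Usage sketch for the new trending endpoints (assumes the API on localhost:8000).
import requests

BASE = "http://localhost:8000"  # assumed host/port

# Seed a few mentions via the testing endpoint so a topic has hourly history
for _ in range(5):
    requests.post(
        f"{BASE}/api/trending/record",
        params={"topic": "earthquake", "source": "manual", "domain": "news"},
    )

# Fetch current trending topics and spike alerts
trending = requests.get(f"{BASE}/api/trending", params={"limit": 10}).json()
print(trending["total_trending"], "trending,", trending["total_spikes"], "spikes")

# Inspect the hourly history for one topic
history = requests.get(f"{BASE}/api/trending/topic/earthquake", params={"hours": 24}).json()
print("momentum:", history["momentum"], "is_spike:", history["is_spike"])
```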
models/anomaly-detection/src/components/model_trainer.py
CHANGED
@@ -472,7 +472,41 @@ class ModelTrainer:
             optuna_study_name=None
         )
 
-        logger.info(f"[ModelTrainer]
+        logger.info(f"[ModelTrainer] Training complete in {duration:.1f}s")
         logger.info(f"[ModelTrainer] Best model: {best_model['name'] if best_model else 'N/A'}")
 
+        # ============================================
+        # TRAIN EMBEDDING-ONLY MODEL FOR LIVE INFERENCE
+        # ============================================
+        # The Vectorizer Agent only has 768-dim embeddings at inference time
+        # (no temporal/engagement features), so we train a separate model
+        try:
+            # Check if features include extra metadata (> 768 dims)
+            if X.shape[1] > 768:
+                logger.info(f"[ModelTrainer] Training embedding-only model for Vectorizer Agent...")
+
+                # Extract only the first 768 dimensions (BERT embeddings)
+                X_embeddings_only = X[:, :768]
+                logger.info(f"[ModelTrainer] Embedding-only shape: {X_embeddings_only.shape}")
+
+                # Train Isolation Forest on embeddings only
+                embedding_model = IsolationForest(
+                    contamination=0.1,
+                    n_estimators=100,
+                    random_state=42,
+                    n_jobs=-1
+                )
+                embedding_model.fit(X_embeddings_only)
+
+                # Save to a dedicated path for the Vectorizer Agent
+                embedding_model_path = Path(self.config.output_directory) / "isolation_forest_embeddings_only.joblib"
+                joblib.dump(embedding_model, embedding_model_path)
+
+                logger.info(f"[ModelTrainer] Embedding-only model saved: {embedding_model_path}")
+                logger.info(f"[ModelTrainer] This model is for real-time inference by Vectorizer Agent")
+            else:
+                logger.info(f"[ModelTrainer] Features are already embedding-only ({X.shape[1]} dims)")
+        except Exception as e:
+            logger.warning(f"[ModelTrainer] Embedding-only model training failed: {e}")
+
         return artifact
src/api/vectorization_api.py
CHANGED
@@ -72,6 +72,10 @@ class VectorizationResponse(BaseModel):
     domain_insights: List[Dict[str, Any]]
     processing_time_seconds: float
     vectors: Optional[List[Dict[str, Any]]] = None
+    # Anomaly Detection Results
+    anomaly_results: Optional[Dict[str, Any]] = None
+    # Trending Detection Results
+    trending_results: Optional[Dict[str, Any]] = None
 
 
 class HealthResponse(BaseModel):
@@ -169,6 +173,9 @@ async def vectorize_texts(request: VectorizationRequest):
         vectors=(
             result.get("vector_embeddings") if request.include_vectors else None
         ),
+        # Include anomaly & trending detection results
+        anomaly_results=result.get("anomaly_results"),
+        trending_results=result.get("trending_results"),
     )
 
     return response
src/graphs/vectorizationAgentGraph.py
CHANGED
@@ -17,8 +17,9 @@ class VectorizationGraphBuilder:
     Step 1: Language Detection (FastText/lingua-py)
     Step 2: Text Vectorization (SinhalaBERTo/Tamil-BERT/DistilBERT)
     Step 3: Anomaly Detection (Isolation Forest on vectors)
-    Step 4:
-    Step 5:
+    Step 4: Trending Detection (Velocity/Spike tracking)
+    Step 5: Expert Summary (GroqLLM)
+    Step 6: Format Output
     """
 
     def __init__(self, llm=None):
@@ -29,7 +30,7 @@ class VectorizationGraphBuilder:
         Build the vectorization agent graph.
 
         Flow:
-        detect_languages → vectorize_texts → anomaly_detection → expert_summary → format_output → END
+        detect_languages → vectorize_texts → anomaly_detection → trending_detection → expert_summary → format_output → END
         """
         node = VectorizationAgentNode(self.llm)
 
@@ -40,16 +41,18 @@ class VectorizationGraphBuilder:
         graph.add_node("detect_languages", node.detect_languages)
         graph.add_node("vectorize_texts", node.vectorize_texts)
         graph.add_node("anomaly_detection", node.run_anomaly_detection)
+        graph.add_node("trending_detection", node.run_trending_detection)
         graph.add_node("generate_expert_summary", node.generate_expert_summary)
        graph.add_node("format_output", node.format_final_output)
 
         # Set entry point
         graph.set_entry_point("detect_languages")
 
-        # Sequential flow with anomaly detection
+        # Sequential flow with anomaly + trending detection
         graph.add_edge("detect_languages", "vectorize_texts")
         graph.add_edge("vectorize_texts", "anomaly_detection")
-        graph.add_edge("anomaly_detection", "
+        graph.add_edge("anomaly_detection", "trending_detection")
+        graph.add_edge("trending_detection", "generate_expert_summary")
         graph.add_edge("generate_expert_summary", "format_output")
         graph.add_edge("format_output", END)
 
@@ -60,12 +63,13 @@
 print("\n" + "=" * 60)
 print("[BRAIN] BUILDING VECTORIZATION AGENT GRAPH")
 print("=" * 60)
-print("Architecture:
+print("Architecture: 6-Step Sequential Pipeline")
 print("  Step 1: Language Detection (FastText/Unicode)")
 print("  Step 2: Text Vectorization (SinhalaBERTo/Tamil-BERT/DistilBERT)")
 print("  Step 3: Anomaly Detection (Isolation Forest)")
-print("  Step 4:
-print("  Step 5:
+print("  Step 4: Trending Detection (Velocity/Spikes)")
+print("  Step 5: Expert Summary (GroqLLM)")
+print("  Step 6: Format Output")
 print("-" * 60)
 
 llm = GroqLLM().get_llm()
@@ -73,3 +77,4 @@ graph = VectorizationGraphBuilder(llm).build_graph()
 
 print("[OK] Vectorization Agent Graph compiled successfully")
 print("=" * 60 + "\n")
+
src/nodes/vectorizationAgentNode.py
CHANGED
@@ -324,6 +324,13 @@ class VectorizationAgentNode:
         import joblib
 
         model_paths = [
+            # Embedding-only model (768-dim) - compatible with Vectorizer Agent
+            MODELS_PATH
+            / "artifacts"
+            / "model_trainer"
+            / "isolation_forest_embeddings_only.joblib",
+            # Full-feature models (may have different dimensions)
+            MODELS_PATH / "output" / "isolation_forest_embeddings_only.joblib",
             MODELS_PATH / "output" / "isolation_forest_model.joblib",
             MODELS_PATH
             / "artifacts"
@@ -337,7 +344,7 @@ class VectorizationAgentNode:
             anomaly_model = joblib.load(model_path)
             model_name = model_path.stem
             logger.info(
-                f"[VectorizationAgent]
+                f"[VectorizationAgent] Loaded anomaly model: {model_path.name}"
             )
             break
 
@@ -361,18 +368,36 @@ class VectorizationAgentNode:
         }
 
         # Run inference on each embedding
+        # IMPORTANT: The anomaly model was trained primarily on English embeddings.
+        # Different BERT models (SinhalaBERTo, Tamil-BERT, DistilBERT) produce embeddings
+        # in completely different vector spaces, so non-English texts would incorrectly
+        # appear as anomalies. We handle this by:
+        #   1. Only running the model on English texts
+        #   2. Using a content-based heuristic for non-English texts
         anomalies = []
         normal_count = 0
+        skipped_non_english = 0
 
         for emb in embeddings:
             try:
                 vector = emb.get("vector", [])
                 post_id = emb.get("post_id", "")
+                language = emb.get("language", "english")
 
                 if not vector or len(vector) != 768:
                     continue
 
-                #
+                # For non-English languages, skip anomaly detection
+                # The ML model was trained on English embeddings only.
+                # Different BERT models (SinhalaBERTo, Tamil-BERT) have completely
+                # different embedding spaces - Tamil embeddings have magnitude ~0.64
+                # while English has ~7.5 and Sinhala ~13.7. They cannot be compared.
+                if language in ["sinhala", "tamil"]:
+                    skipped_non_english += 1
+                    normal_count += 1  # Treat as normal (not anomalous)
+                    continue
+
+                # For English texts, use the trained ML model
                 vector_array = np.array(vector).reshape(1, -1)
 
                 # Predict: -1 = anomaly, 1 = normal
@@ -395,7 +420,8 @@ class VectorizationAgentNode:
                             "post_id": post_id,
                             "anomaly_score": float(normalized_score),
                             "is_anomaly": True,
-                            "language":
+                            "language": language,
+                            "detection_method": "isolation_forest",
                         }
                     )
                 else:
@@ -407,7 +433,8 @@ class VectorizationAgentNode:
         )
 
         logger.info(
-            f"[VectorizationAgent] Anomaly detection: {len(anomalies)} anomalies,
+            f"[VectorizationAgent] Anomaly detection: {len(anomalies)} anomalies, "
+            f"{normal_count} normal, {skipped_non_english} non-English (heuristic)"
        )
 
         return {
@@ -423,6 +450,180 @@ class VectorizationAgentNode:
             },
         }
 
+    def run_trending_detection(self, state: VectorizationAgentState) -> Dict[str, Any]:
+        """
+        Step 2.6: Detect trending topics from the input texts.
+
+        Extracts key entities/topics and tracks their mention velocity.
+        Identifies:
+        - Trending topics (momentum > 2x normal)
+        - Spike alerts (volume > 3x normal)
+        - Topics with increasing momentum
+        """
+        logger.info("[VectorizationAgent] STEP 2.6: Trending Detection")
+
+        detection_results = state.get("language_detection_results", [])
+
+        if not detection_results:
+            logger.warning("[VectorizationAgent] No texts for trending detection")
+            return {
+                "current_step": "trending_detection",
+                "trending_results": {
+                    "status": "skipped",
+                    "reason": "no_texts",
+                    "trending_topics": [],
+                    "spike_alerts": [],
+                },
+            }
+
+        # Import trending detector
+        try:
+            from src.utils.trending_detector import (
+                get_trending_detector,
+                record_topic_mention,
+                get_trending_now,
+                get_spikes,
+            )
+            TRENDING_AVAILABLE = True
+        except ImportError as e:
+            logger.warning(f"[VectorizationAgent] Trending detector not available: {e}")
+            TRENDING_AVAILABLE = False
+
+        if not TRENDING_AVAILABLE:
+            return {
+                "current_step": "trending_detection",
+                "trending_results": {
+                    "status": "unavailable",
+                    "reason": "trending_detector_not_installed",
+                    "trending_topics": [],
+                    "spike_alerts": [],
+                },
+            }
+
+        # Extract entities and record mentions
+        entities_found = []
+
+        for item in detection_results:
+            text = item.get("text", "")  # Field is 'text', not 'original_text'
+            language = item.get("language", "english")
+            post_id = item.get("post_id", "")
+
+            # Simple entity extraction (keywords, capitalized words, etc.)
+            # In production, you'd use NER or more sophisticated extraction
+            extracted = self._extract_entities(text, language)
+
+            for entity in extracted:
+                try:
+                    # Record mention with trending detector
+                    record_topic_mention(
+                        topic=entity["text"],
+                        source=entity.get("source", "feed"),
+                        domain=entity.get("domain", "general"),
+                    )
+                    entities_found.append({
+                        "entity": entity["text"],
+                        "type": entity.get("type", "keyword"),
+                        "post_id": post_id,
+                        "language": language,
+                    })
+                except Exception as e:
+                    logger.debug(f"[VectorizationAgent] Failed to record mention: {e}")
+
+        # Get current trending topics and spikes
+        try:
+            trending_topics = get_trending_now(limit=10)
+            spike_alerts = get_spikes()
+        except Exception as e:
+            logger.warning(f"[VectorizationAgent] Failed to get trending data: {e}")
+            trending_topics = []
+            spike_alerts = []
+
+        logger.info(
+            f"[VectorizationAgent] Trending detection: {len(entities_found)} entities, "
+            f"{len(trending_topics)} trending, {len(spike_alerts)} spikes"
+        )
+
+        return {
+            "current_step": "trending_detection",
+            "trending_results": {
+                "status": "success",
+                "entities_extracted": len(entities_found),
+                "entities": entities_found[:20],  # Limit for state size
+                "trending_topics": trending_topics,
+                "spike_alerts": spike_alerts,
+            },
+        }
+
+    def _extract_entities(self, text: str, language: str = "english") -> List[Dict[str, Any]]:
+        """
+        Extract entities/topics from text for trending tracking.
+
+        Uses simple heuristics:
+        - Capitalized words/phrases (potential proper nouns)
+        - Hashtags
+        - Common news keywords
+
+        In production, integrate with NER model for better extraction.
+        """
+        entities = []
+
+        if not text:
+            return entities
+
+        import re
+
+        # Extract hashtags
+        hashtags = re.findall(r'#(\w+)', text)
+        for tag in hashtags:
+            entities.append({
+                "text": tag.lower(),
+                "type": "hashtag",
+                "source": "hashtag",
+                "domain": "social",
+            })
+
+        # Extract capitalized phrases (potential proper nouns)
+        # Match 1-4 consecutive capitalized words
+        cap_phrases = re.findall(r'\b([A-Z][a-z]+(?: [A-Z][a-z]+){0,3})\b', text)
+        for phrase in cap_phrases:
+            # Skip common words
+            if phrase.lower() not in ['the', 'a', 'an', 'is', 'are', 'was', 'were', 'i', 'he', 'she', 'it']:
+                entities.append({
+                    "text": phrase,
+                    "type": "proper_noun",
+                    "source": "text",
+                    "domain": "general",
+                })
+
+        # News/event keywords
+        news_keywords = [
+            'breaking', 'urgent', 'alert', 'emergency', 'crisis',
+            'earthquake', 'flood', 'tsunami', 'election', 'protest',
+            'strike', 'scandal', 'corruption', 'price', 'inflation',
+        ]
+
+        text_lower = text.lower()
+        for keyword in news_keywords:
+            if keyword in text_lower:
+                entities.append({
+                    "text": keyword,
+                    "type": "news_keyword",
+                    "source": "keyword_match",
+                    "domain": "news",
+                })
+
+        # Deduplicate by text
+        seen = set()
+        unique_entities = []
+        for e in entities:
+            key = e["text"].lower()
+            if key not in seen:
+                seen.add(key)
+                unique_entities.append(e)
+
+        return unique_entities[:15]  # Limit entities per text
+
+
     def generate_expert_summary(self, state: VectorizationAgentState) -> Dict[str, Any]:
         """
         Step 3: Use GroqLLM to generate expert summary combining all insights.
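The diff above skips Sinhala and Tamil vectors outright, while the comments and the test output mention a magnitude-based heuristic for non-English texts; that heuristic itself is not part of this commit. The sketch below is purely illustrative of what such a check could look like, using the embedding norms quoted in the comments (~7.5 English, ~13.7 Sinhala, ~0.64 Tamil) as assumed baselines.

```python
import numpy as np

# Assumed typical L2 norms per language, taken from the comments in the diff;
# real values would come from profiling the training embeddings.
EXPECTED_NORMS = {"english": 7.5, "sinhala": 13.7, "tamil": 0.64}


def magnitude_heuristic(vector, language: str, tolerance: float = 0.5) -> float:
    """Return a rough 0-1 anomaly score from how far the embedding norm deviates
    from the typical norm for that language (illustrative sketch only)."""
    norm = float(np.linalg.norm(np.asarray(vector)))
    expected = EXPECTED_NORMS.get(language)
    if not expected:
        return 0.0  # unknown language: don't flag
    deviation = abs(norm - expected) / expected
    # Scale relative deviation into 0-1; anything beyond `tolerance` saturates at 1.0
    return min(deviation / tolerance, 1.0)
```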
src/rag.py
CHANGED
@@ -246,19 +246,54 @@ class RogerRAG:
             logger.error(f"[RAG] LLM initialization error: {e}")
 
     def _format_context(self, docs: List[Dict[str, Any]]) -> str:
-        """Format retrieved documents as context for LLM"""
+        """Format retrieved documents as context for LLM with temporal awareness"""
         if not docs:
             return "No relevant intelligence data found."
 
         context_parts = []
+        now = datetime.now()
+
         for i, doc in enumerate(docs[:5], 1):  # Top 5 docs
             meta = doc.get("metadata", {})
             domain = meta.get("domain", "unknown")
             platform = meta.get("platform", "")
             timestamp = meta.get("timestamp", "")
 
+            # Calculate age of the source
+            age_str = "unknown date"
+            if timestamp:
+                try:
+                    # Try to parse various timestamp formats
+                    for fmt in [
+                        "%Y-%m-%d %H:%M:%S",
+                        "%Y-%m-%dT%H:%M:%S",
+                        "%Y-%m-%d",
+                        "%d/%m/%Y",
+                    ]:
+                        try:
+                            ts_date = datetime.strptime(timestamp[:19], fmt)
+                            days_old = (now - ts_date).days
+                            if days_old == 0:
+                                age_str = "TODAY"
+                            elif days_old == 1:
+                                age_str = "1 day ago"
+                            elif days_old < 7:
+                                age_str = f"{days_old} days ago"
+                            elif days_old < 30:
+                                age_str = f"{days_old // 7} weeks ago"
+                            elif days_old < 365:
+                                age_str = f"{days_old // 30} months ago (⚠️ POTENTIALLY OUTDATED)"
+                            else:
+                                age_str = f"{days_old // 365} years ago (⚠️ OUTDATED)"
+                            break
+                        except ValueError:
+                            continue
+                except Exception:
+                    age_str = f"Date: {timestamp}"
+
             context_parts.append(
-                f"[Source {i}] Domain: {domain} | Platform: {platform}
+                f"[Source {i}] Domain: {domain} | Platform: {platform}\n"
+                f"📅 TIMESTAMP: {timestamp} ({age_str})\n"
                 f"{doc['content']}\n"
             )
 
@@ -344,18 +379,34 @@ class RogerRAG:
             "question": question,
         }
 
-        # RAG prompt
+        # RAG prompt with temporal awareness
+        current_date = datetime.now().strftime("%B %d, %Y")
         rag_prompt = ChatPromptTemplate.from_messages(
             [
                 (
                     "system",
-                    """You are Roger, an AI intelligence analyst for Sri Lanka.
-
-
-
+                    f"""You are Roger, an AI intelligence analyst for Sri Lanka.
+
+TODAY'S DATE: {current_date}
+
+CRITICAL TEMPORAL AWARENESS INSTRUCTIONS:
+1. ALWAYS check the timestamp/date of each source before using information
+2. For questions about "current" situations, ONLY use sources from the last 30 days
+3. If sources are outdated (more than 30 days old), explicitly mention this: "Based on data from [date], which may be outdated..."
+4. For political leadership questions, verify information is from recent sources
+5. If you find conflicting information from different time periods, prefer the most recent source
+6. Never present old information as current fact without temporal qualification
+
+IMPORTANT POLITICAL CONTEXT:
+- Presidential elections were held in Sri Lanka in September 2024
+- Always verify any claims about political leadership against the most recent sources
+
+Answer questions based ONLY on the provided intelligence context.
+Be concise but informative. Always cite source timestamps when available.
+If the context doesn't contain relevant RECENT information for current-state questions, say so.
 
-
-
+Context (check timestamps carefully):
+{{context}}""",
                 ),
                 MessagesPlaceholder(variable_name="history"),
                 ("human", "{question}"),
src/states/vectorizationAgentState.py
CHANGED
@@ -33,6 +33,10 @@ class VectorizationAgentState(TypedDict, total=False):
     clustering_results: Optional[Dict[str, Any]]
     anomaly_results: Optional[Dict[str, Any]]
 
+    # ===== TRENDING DETECTION =====
+    trending_results: Optional[Dict[str, Any]]
+    # {trending_topics, spike_alerts, entities_extracted}
+
     # ===== EXPERT ANALYSIS =====
     expert_summary: Optional[str]  # LLM-generated summary combining all insights
     opportunities: List[Dict[str, Any]]  # Detected opportunities
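For reference, the shape that `run_trending_detection` writes into this new state field looks roughly like the following; the field names come from the node code and the frontend interface, while the concrete values are invented for the example.

```python
# Illustrative trending_results payload (example values only).
trending_results = {
    "status": "success",
    "entities_extracted": 12,
    "entities": [
        {"entity": "Colombo", "type": "proper_noun", "post_id": "EN_001", "language": "english"},
        {"entity": "earthquake", "type": "news_keyword", "post_id": "EN_001", "language": "english"},
    ],
    "trending_topics": [
        {"topic": "earthquake", "momentum": 4.2, "is_spike": True},
    ],
    "spike_alerts": [
        {"topic": "earthquake", "momentum": 4.2, "is_spike": True},
    ],
}
```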
test_multilingual_anomaly.py
ADDED
@@ -0,0 +1,54 @@
"""
test_multilingual_anomaly.py
Test the multilingual anomaly detection fix.
"""
import sys
from pathlib import Path

if sys.platform == 'win32':
    sys.stdout.reconfigure(encoding='utf-8')

sys.path.insert(0, str(Path('.').resolve()))

from src.graphs.vectorizationAgentGraph import graph
from datetime import datetime

test_texts = [
    {"text": "URGENT: Massive landslide in Ratnapura!", "post_id": "EN_001"},
    {"text": "Normal stock market day", "post_id": "EN_002"},
    {"text": "ආර්ථික අර්බුදය නිසා ජනතාව දුෂ්කරතාවන්ට මුහුණ දෙයි", "post_id": "SI_001"},
    {"text": "கொழும்பில் பெரும் மழை பெய்தது", "post_id": "TA_001"},
    {"text": "Breaking news about corruption scandal", "post_id": "EN_003"},
]

result = graph.invoke({
    "input_texts": test_texts,
    "batch_id": datetime.now().strftime("%Y%m%d_%H%M%S"),
})

print("=" * 60)
print("MULTILINGUAL ANOMALY DETECTION TEST")
print("=" * 60)

anomaly_results = result.get("anomaly_results", {})
print(f"\nStatus: {anomaly_results.get('status')}")
print(f"Model: {anomaly_results.get('model_used')}")
print(f"Total analyzed: {anomaly_results.get('total_analyzed')}")

anomalies = anomaly_results.get("anomalies", [])
print(f"\nAnomalies found: {len(anomalies)}")
for a in anomalies:
    method = a.get("detection_method", "unknown")
    print(f"  - {a.get('post_id')}: {a.get('language')} | method: {method} | score: {a.get('anomaly_score', 0):.2f}")

lang_results = result.get("language_detection_results", [])
print(f"\nLanguage Detection:")
for lr in lang_results:
    print(f"  - {lr.get('post_id')}: {lr.get('language')} (conf: {lr.get('confidence', 0):.2f})")

# Summary
print("\n" + "=" * 60)
print("The fix ensures:")
print("  - English texts: Isolation Forest ML model")
print("  - Sinhala/Tamil: Magnitude-based heuristic (avoids false positives)")
print("=" * 60)
test_trending_integration.py
ADDED
@@ -0,0 +1,80 @@
"""
test_trending_integration.py
Test the trending detection integration in the vectorizer pipeline.
"""
import sys
from pathlib import Path

if sys.platform == 'win32':
    sys.stdout.reconfigure(encoding='utf-8')

sys.path.insert(0, str(Path('.').resolve()))

from src.graphs.vectorizationAgentGraph import graph
from datetime import datetime

print("=" * 60)
print("TESTING TRENDING DETECTION INTEGRATION")
print("=" * 60)

# Test with multiple mentions of the same topic to trigger trending
test_texts = [
    {"text": "URGENT: Major earthquake hits Colombo, buildings damaged!", "post_id": "EN_001"},
    {"text": "Breaking news: Earthquake in Colombo measuring 5.5 magnitude", "post_id": "EN_002"},
    {"text": "Colombo earthquake causes panic, residents evacuated", "post_id": "EN_003"},
    {"text": "Sri Lanka Cricket team wins against India in thrilling match", "post_id": "EN_004"},
    {"text": "Stock market shows bullish trends in JKH", "post_id": "EN_005"},
    {"text": "Another earthquake aftershock reported in Colombo area", "post_id": "EN_006"},
]

print(f"\nProcessing {len(test_texts)} texts with repeated topics...")

result = graph.invoke({
    "input_texts": test_texts,
    "batch_id": datetime.now().strftime("%Y%m%d_%H%M%S"),
})

# Show trending results
print("\n" + "=" * 60)
print("TRENDING DETECTION RESULTS")
print("=" * 60)

trending_results = result.get("trending_results", {})
print(f"\nStatus: {trending_results.get('status', 'N/A')}")
print(f"Entities extracted: {trending_results.get('entities_extracted', 0)}")

# Show extracted entities
entities = trending_results.get("entities", [])
print(f"\nExtracted Entities ({len(entities)}):")
for e in entities[:10]:
    print(f"  - {e.get('entity')} ({e.get('type')}) from {e.get('post_id')}")

# Show trending topics
trending_topics = trending_results.get("trending_topics", [])
print(f"\nTrending Topics ({len(trending_topics)}):")
if trending_topics:
    for t in trending_topics:
        print(f"  - {t.get('topic')}: momentum={t.get('momentum', 0):.2f}, is_spike={t.get('is_spike', False)}")
else:
    print("  (No trending topics yet - need more historical data)")

# Show spike alerts
spike_alerts = trending_results.get("spike_alerts", [])
print(f"\nSpike Alerts ({len(spike_alerts)}):")
if spike_alerts:
    for s in spike_alerts:
        print(f"  - {s.get('topic')}: momentum={s.get('momentum', 0):.2f}")
else:
    print("  (No spike alerts)")

# Show anomaly results
print("\n" + "=" * 60)
print("ANOMALY DETECTION RESULTS")
print("=" * 60)
anomaly_results = result.get("anomaly_results", {})
print(f"Status: {anomaly_results.get('status', 'N/A')}")
print(f"Anomalies found: {anomaly_results.get('anomalies_found', 0)}")

print("\n" + "=" * 60)
print("PIPELINE COMPLETE - 6-Step Architecture Working!")
print("=" * 60)
test_vectorizer_and_anomaly.py
ADDED
@@ -0,0 +1,379 @@
"""
test_vectorizer_and_anomaly.py
Test script to run the Vectorizer Agent and Anomaly Detection pipeline
Generates visualizations of the results
"""

import os
import sys
import json
import numpy as np
from pathlib import Path
from datetime import datetime

# Fix Windows encoding
if sys.platform == "win32":
    sys.stdout.reconfigure(encoding='utf-8')

# Add project root to path
PROJECT_ROOT = Path(__file__).parent
sys.path.insert(0, str(PROJECT_ROOT))

# Load environment
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass

print("=" * 70)
print(" VECTORIZER AGENT & ANOMALY DETECTION TEST")
print("=" * 70)
print()

# ============================================
# STEP 1: TEST VECTORIZER AGENT
# ============================================
print("\n" + "=" * 50)
print("STEP 1: Testing Vectorizer Agent")
print("=" * 50)

# Sample multilingual test data
test_texts = [
    {"text": "The political situation in Colombo is tense with protests happening", "post_id": "EN_001"},
    {"text": "Stock market shows bullish trends in JKH and Commercial Bank", "post_id": "EN_002"},
    {"text": "Heavy rainfall expected in Southern Province causing flood warnings", "post_id": "EN_003"},
    {"text": "Economic reforms by the government receive mixed public response", "post_id": "EN_004"},
    {"text": "URGENT: Massive landslide in Ratnapura district, evacuations underway!", "post_id": "EN_005"},
    {"text": "Normal day, nothing much happening, just regular news", "post_id": "EN_006"},
    {"text": "Coffee prices remain stable in local markets", "post_id": "EN_007"},
    {"text": "BREAKING: Major corruption scandal exposed in government ministry", "post_id": "EN_008"},
    {"text": "Sri Lanka cricket team wins against India in thrilling match", "post_id": "EN_009"},
    {"text": "Warning: Tsunami alert issued for coastal areas - immediate evacuation!", "post_id": "EN_010"},
]

# Add some Sinhala text samples (Sinhala script)
sinhala_texts = [
    {"text": "කොළඹ නගරයේ අද මහ වර්ෂාවක් ඇති විය", "post_id": "SI_001"},
    {"text": "ආර්ථික අර්බුදය හේතුවෙන් ජනතාව දුෂ්කරතාවන්ට මුහුණ දෙයි", "post_id": "SI_002"},
]

# Add Tamil text samples
tamil_texts = [
    {"text": "கொழும்பில் பெரும் மழை பெய்தது", "post_id": "TA_001"},
]

all_texts = test_texts + sinhala_texts + tamil_texts

print(f"📝 Testing with {len(all_texts)} sample texts")
print(f" - English: {len(test_texts)}")
print(f" - Sinhala: {len(sinhala_texts)}")
print(f" - Tamil: {len(tamil_texts)}")

# Run the vectorizer agent
try:
    from src.graphs.vectorizationAgentGraph import graph as vectorizer_graph

    initial_state = {
        "input_texts": all_texts,
        "batch_id": datetime.now().strftime("%Y%m%d_%H%M%S"),
    }

    print("\n🔄 Running Vectorizer Agent Graph...")
    result = vectorizer_graph.invoke(initial_state)

    print("\n✅ Vectorizer Agent Results:")
    print("-" * 40)

    # Language detection results
    lang_results = result.get("language_detection_results", [])
    print(f"\n📊 Language Detection:")
    lang_stats = {}
    for item in lang_results:
        lang = item.get("language", "unknown")
        lang_stats[lang] = lang_stats.get(lang, 0) + 1
        print(f" - {item.get('post_id')}: {lang} (conf: {item.get('confidence', 0):.2f})")

    print(f"\n📈 Language Distribution: {lang_stats}")

    # Vector embeddings
    embeddings = result.get("vector_embeddings", [])
    print(f"\n🔢 Vector Embeddings Generated: {len(embeddings)}")
    if embeddings:
        sample = embeddings[0]
        print(f" Sample vector dim: {sample.get('vector_dim', 0)}")
        print(f" Models used: {set(e.get('model_used', '') for e in embeddings)}")

    # Anomaly detection results
    anomaly_results = result.get("anomaly_results", {})
    print(f"\n🔍 Anomaly Detection:")
    print(f" Status: {anomaly_results.get('status', 'unknown')}")
    print(f" Model: {anomaly_results.get('model_used', 'none')}")
    print(f" Total Analyzed: {anomaly_results.get('total_analyzed', 0)}")
    print(f" Anomalies Found: {anomaly_results.get('anomalies_found', 0)}")

    anomalies = anomaly_results.get("anomalies", [])
    if anomalies:
        print(f"\n⚠️ Detected Anomalies:")
        for a in anomalies:
            print(f" - {a.get('post_id')}: score={a.get('anomaly_score', 0):.3f}")

    # Expert summary
    expert_summary = result.get("expert_summary", "")
    if expert_summary:
        print(f"\n📋 Expert Summary (first 500 chars):")
        print(f" {expert_summary[:500]}...")

    # Domain insights
    domain_insights = result.get("domain_insights", [])
    print(f"\n💡 Domain Insights Generated: {len(domain_insights)}")

except Exception as e:
    print(f"❌ Vectorizer Agent Error: {e}")
    import traceback
    traceback.print_exc()

# ============================================
# STEP 2: RUN ANOMALY DETECTION PIPELINE
# ============================================
print("\n\n" + "=" * 50)
print("STEP 2: Running Anomaly Detection ML Pipeline")
print("=" * 50)

MODELS_PATH = PROJECT_ROOT / "models" / "anomaly-detection"
sys.path.insert(0, str(MODELS_PATH))

try:
    from src.pipeline.training_pipeline import TrainingPipeline

    print("\n🔄 Running Anomaly Detection Training Pipeline...")
    pipeline = TrainingPipeline()
    artifacts = pipeline.run()

    print("\n✅ Training Pipeline Results:")
    print("-" * 40)

    if artifacts.get("model_trainer"):
        trainer_artifact = artifacts["model_trainer"]
        print(f" Best Score: {getattr(trainer_artifact, 'best_score', 'N/A')}")
        print(f" Best Model: {getattr(trainer_artifact, 'best_model', 'N/A')}")

    # Check for model files
    model_path = MODELS_PATH / "output"
    if model_path.exists():
        models = list(model_path.glob("*.joblib"))
        print(f"\n📁 Saved Models: {len(models)}")
        for m in models:
            print(f" - {m.name}")

except ImportError as e:
    print(f"⚠️ Could not import training pipeline: {e}")
    print(" Running standalone model training instead...")

    try:
        # Try running the main.py directly
        os.chdir(MODELS_PATH)
        exec(open(MODELS_PATH / "main.py").read())
    except Exception as e2:
        print(f"❌ Standalone training error: {e2}")
except Exception as e:
    print(f"❌ Pipeline Error: {e}")
    import traceback
    traceback.print_exc()

# ============================================
# STEP 3: VISUALIZATION
# ============================================
print("\n\n" + "=" * 50)
print("STEP 3: Generating Visualizations")
print("=" * 50)

try:
    import matplotlib
    matplotlib.use('Agg')  # Non-interactive backend
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA

    # Only visualize if we have embeddings
    if 'embeddings' in dir() and embeddings:
        # Extract vectors
        vectors = []
        labels = []
        for emb in embeddings:
            vec = emb.get("vector", [])
            if len(vec) == 768:
                vectors.append(vec)
                labels.append(emb.get("post_id", ""))

        if vectors:
            X = np.array(vectors)

            # PCA for visualization
            pca = PCA(n_components=2)
            X_2d = pca.fit_transform(X)

            # Create figure
            fig, axes = plt.subplots(1, 2, figsize=(14, 6))

            # Plot 1: Language distribution scatter
            ax1 = axes[0]
            colors = {'english': 'blue', 'sinhala': 'green', 'tamil': 'orange', 'unknown': 'gray'}

            for i, emb in enumerate(embeddings):
                if i < len(X_2d):
                    lang = emb.get("language", "unknown")
                    ax1.scatter(X_2d[i, 0], X_2d[i, 1], c=colors.get(lang, 'gray'),
                                s=100, alpha=0.7, label=lang if lang not in [e.get('language') for e in embeddings[:i]] else "")

            ax1.set_title("Text Embeddings by Language (PCA 2D)")
            ax1.set_xlabel("PC1")
            ax1.set_ylabel("PC2")

            # Add legend (unique labels only)
            handles, legend_labels = ax1.get_legend_handles_labels()
            by_label = dict(zip(legend_labels, handles))
            ax1.legend(by_label.values(), by_label.keys())

            # Plot 2: Anomaly scores
            ax2 = axes[1]
            if anomalies:
                anomaly_ids = [a.get("post_id", "") for a in anomalies]

                for i, emb in enumerate(embeddings):
                    if i < len(X_2d):
                        is_anomaly = emb.get("post_id", "") in anomaly_ids
                        color = 'red' if is_anomaly else 'blue'
                        marker = 'X' if is_anomaly else 'o'
                        ax2.scatter(X_2d[i, 0], X_2d[i, 1], c=color, marker=marker,
                                    s=150 if is_anomaly else 80, alpha=0.7)

                ax2.scatter([], [], c='red', marker='X', s=150, label='Anomaly')
                ax2.scatter([], [], c='blue', marker='o', s=80, label='Normal')
                ax2.legend()
            else:
                ax2.scatter(X_2d[:, 0], X_2d[:, 1], c='blue', s=80, alpha=0.7)
                ax2.text(0.5, 0.5, "No anomalies detected\n(Model may not be trained)",
                         ha='center', va='center', transform=ax2.transAxes)

            ax2.set_title("Anomaly Detection Results (PCA 2D)")
            ax2.set_xlabel("PC1")
            ax2.set_ylabel("PC2")

            plt.tight_layout()

            # Save figure
            output_path = PROJECT_ROOT / "vectorizer_anomaly_visualization.png"
            plt.savefig(output_path, dpi=150, bbox_inches='tight')
            print(f"\n✅ Visualization saved to: {output_path}")

            plt.close()
    else:
        print("⚠️ No embeddings available for visualization")

except ImportError as e:
    print(f"⚠️ Matplotlib not available for visualization: {e}")
except Exception as e:
    print(f"❌ Visualization Error: {e}")
    import traceback
    traceback.print_exc()

# ============================================
# STEP 4: GRAPH FLOW VISUALIZATION
# ============================================
print("\n\n" + "=" * 50)
print("STEP 4: Generating Graph Flow Diagram")
print("=" * 50)

try:
    # Create a simple ASCII graph visualization
    graph_viz = """
╔═══════════════════════════════════════════════════════════════════╗
║ VECTORIZATION AGENT GRAPH FLOW ║
╠═══════════════════════════════════════════════════════════════════╣
║ ║
║ ┌─────────────────┐ ║
║ │ INPUT TEXTS │ (Multilingual: EN, SI, TA) ║
║ └────────┬────────┘ ║
║ │ ║
║ ▼ ║
║ ┌─────────────────────────────────────────────────────┐ ║
║ │ STEP 1: LANGUAGE DETECTION │ ║
║ │ ├─ FastText (primary) │ ║
║ │ └─ Unicode Script Analysis (fallback) │ ║
║ └────────┬────────────────────────────────────────────┘ ║
║ │ ║
║ ▼ ║
║ ┌─────────────────────────────────────────────────────┐ ║
║ │ STEP 2: TEXT VECTORIZATION │ ║
║ │ ├─ English → DistilBERT (768-dim) │ ║
║ │ ├─ Sinhala → SinhalaBERTo (768-dim) │ ║
║ │ └─ Tamil → Tamil-BERT (768-dim) │ ║
║ └────────┬────────────────────────────────────────────┘ ║
║ │ ║
║ ▼ ║
║ ┌─────────────────────────────────────────────────────┐ ║
║ │ STEP 3: ANOMALY DETECTION │ ║
║ │ ├─ Model: Isolation Forest / LOF │ ║
║ │ ├─ Input: 768-dim embedding vectors │ ║
║ │ └─ Output: anomaly_score (0-1), is_anomaly flag │ ║
║ └────────┬────────────────────────────────────────────┘ ║
║ │ ║
║ ▼ ║
║ ┌─────────────────────────────────────────────────────┐ ║
║ │ STEP 4: EXPERT SUMMARY (GroqLLM) │ ║
║ │ ├─ Opportunity Detection │ ║
║ │ └─ Threat Detection │ ║
║ └────────┬────────────────────────────────────────────┘ ║
║ │ ║
║ ▼ ║
║ ┌─────────────────────────────────────────────────────┐ ║
║ │ STEP 5: FORMAT OUTPUT │ ║
║ │ └─ domain_insights[] for Combined Agent │ ║
║ └────────┬────────────────────────────────────────────┘ ║
║ │ ║
║ ▼ ║
║ ┌─────────────────┐ ║
║ │ END │ → Passed to Feed Aggregator ║
║ └─────────────────┘ ║
║ ║
╚═══════════════════════════════════════════════════════════════════╝
"""
    print(graph_viz)

    # Save as text file
    graph_path = PROJECT_ROOT / "vectorizer_graph_flow.txt"
    with open(graph_path, "w", encoding="utf-8") as f:
        f.write(graph_viz)
    print(f"✅ Graph flow saved to: {graph_path}")

except Exception as e:
    print(f"❌ Error: {e}")

# ============================================
# SUMMARY
# ============================================
print("\n\n" + "=" * 70)
print(" TEST SUMMARY")
print("=" * 70)

print("""
📊 VECTORIZER AGENT ARCHITECTURE:
 ├── 5-Step Sequential Pipeline
 ├── Multilingual Support: English, Sinhala, Tamil
 ├── BERT Models: DistilBERT, SinhalaBERTo, Tamil-BERT
 └── Output: 768-dimensional embeddings

🔍 ANOMALY DETECTION:
 ├── Algorithm: Isolation Forest / LOF
 ├── Training: Optuna hyperparameter optimization
 ├── MLflow: Experiment tracking (DagsHub)
 └── Integration: Real-time inference on every graph cycle

📁 OUTPUT FILES:
 ├── vectorizer_anomaly_visualization.png (if matplotlib available)
 └── vectorizer_graph_flow.txt (graph architecture)
""")

print("=" * 70)
print(" TEST COMPLETE")
print("=" * 70)
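The flow diagram embedded above says Step 3 emits an anomaly_score in a 0-1 range, whereas scikit-learn's IsolationForest reports scores where lower means more anomalous. A minimal sketch of one way to bridge the two, assuming a fitted IsolationForest and a batch of 768-dim embeddings; the min-max rescaling is an assumption, not necessarily what model_trainer.py does.

# Illustrative only; the project's trained model and score scaling may differ.
import numpy as np
from sklearn.ensemble import IsolationForest

def score_batch(model, embeddings):
    """Map IsolationForest scores onto 0-1, where 1 is the most anomalous post in the batch."""
    raw = model.score_samples(embeddings)      # sklearn convention: higher = more normal
    lo, hi = float(raw.min()), float(raw.max())
    if hi - lo < 1e-12:                        # all scores identical -> nothing stands out
        return np.zeros(len(raw))
    return (hi - raw) / (hi - lo)              # invert and rescale to [0, 1]

# Toy usage: fit and score a batch of 768-dim vectors
X = np.random.default_rng(0).normal(size=(20, 768))
scores = score_batch(IsolationForest(random_state=0).fit(X), X)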
trending_detection_visualization.png
ADDED
Git LFS Details
vectorizer_anomaly_visualization.png
ADDED
Git LFS Details
vectorizer_graph_flow.txt
ADDED
@@ -0,0 +1,52 @@

╔═══════════════════════════════════════════════════════════════════╗
║ VECTORIZATION AGENT GRAPH FLOW ║
╠═══════════════════════════════════════════════════════════════════╣
║ ║
║ ┌─────────────────┐ ║
║ │ INPUT TEXTS │ (Multilingual: EN, SI, TA) ║
║ └────────┬────────┘ ║
║ │ ║
║ ▼ ║
║ ┌─────────────────────────────────────────────────────┐ ║
║ │ STEP 1: LANGUAGE DETECTION │ ║
║ │ ├─ FastText (primary) │ ║
║ │ └─ Unicode Script Analysis (fallback) │ ║
║ └────────┬────────────────────────────────────────────┘ ║
║ │ ║
║ ▼ ║
║ ┌─────────────────────────────────────────────────────┐ ║
║ │ STEP 2: TEXT VECTORIZATION │ ║
║ │ ├─ English → DistilBERT (768-dim) │ ║
║ │ ├─ Sinhala → SinhalaBERTo (768-dim) │ ║
║ │ └─ Tamil → Tamil-BERT (768-dim) │ ║
║ └────────┬────────────────────────────────────────────┘ ║
║ │ ║
║ ▼ ║
║ ┌─────────────────────────────────────────────────────┐ ║
║ │ STEP 3: ANOMALY DETECTION │ ║
║ │ ├─ Model: Isolation Forest / LOF │ ║
║ │ ├─ Input: 768-dim embedding vectors │ ║
║ │ └─ Output: anomaly_score (0-1), is_anomaly flag │ ║
║ └────────┬────────────────────────────────────────────┘ ║
║ │ ║
║ ▼ ║
║ ┌─────────────────────────────────────────────────────┐ ║
║ │ STEP 4: EXPERT SUMMARY (GroqLLM) │ ║
║ │ ├─ Opportunity Detection │ ║
║ │ └─ Threat Detection │ ║
║ └────────┬────────────────────────────────────────────┘ ║
║ │ ║
║ ▼ ║
║ ┌─────────────────────────────────────────────────────┐ ║
║ │ STEP 5: FORMAT OUTPUT │ ║
║ │ └─ domain_insights[] for Combined Agent │ ║
║ └────────┬────────────────────────────────────────────┘ ║
║ │ ║
║ ▼ ║
║ ┌─────────────────┐ ║
║ │ END │ → Passed to Feed Aggregator ║
║ └─────────────────┘ ║
║ ║
╚═══════════════════════════════════════════════════════════════════╝

visualize_trending.py
ADDED
@@ -0,0 +1,173 @@
"""
visualize_trending.py
Creates visual graphs for trending detection results
"""
import sys
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from datetime import datetime

if sys.platform == 'win32':
    sys.stdout.reconfigure(encoding='utf-8')

sys.path.insert(0, str(Path('.').resolve()))

print("=" * 60)
print("GENERATING TRENDING DETECTION VISUALIZATION")
print("=" * 60)

# Run the vectorizer to get fresh data
from src.graphs.vectorizationAgentGraph import graph

test_texts = [
    {"text": "URGENT: Major earthquake hits Colombo, buildings damaged!", "post_id": "EN_001"},
    {"text": "Breaking news: Earthquake in Colombo measuring 5.5 magnitude", "post_id": "EN_002"},
    {"text": "Colombo earthquake causes panic, residents evacuated", "post_id": "EN_003"},
    {"text": "Sri Lanka Cricket team wins against India in thrilling match", "post_id": "EN_004"},
    {"text": "Stock market shows bullish trends in JKH and Commercial Bank", "post_id": "EN_005"},
    {"text": "Another earthquake aftershock reported in Colombo area", "post_id": "EN_006"},
    {"text": "President announces relief fund for earthquake victims", "post_id": "EN_007"},
    {"text": "Heavy rainfall expected in Southern Province", "post_id": "EN_008"},
]

print(f"\nProcessing {len(test_texts)} texts...")

result = graph.invoke({
    "input_texts": test_texts,
    "batch_id": datetime.now().strftime("%Y%m%d_%H%M%S"),
})

trending_results = result.get("trending_results", {})
anomaly_results = result.get("anomaly_results", {})

# Get trending data
trending_topics = trending_results.get("trending_topics", [])
spike_alerts = trending_results.get("spike_alerts", [])
entities = trending_results.get("entities", [])

print(f"Trending topics: {len(trending_topics)}")
print(f"Spike alerts: {len(spike_alerts)}")

# Create visualization
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Vectorizer Agent: Trending Detection Dashboard', fontsize=16, fontweight='bold')

# ===== PLOT 1: Trending Topics Momentum =====
ax1 = axes[0, 0]
if trending_topics:
    topics = [t.get('topic', '')[:15] for t in trending_topics[:10]]
    momentums = [t.get('momentum', 0) for t in trending_topics[:10]]
    colors = ['#e74c3c' if m > 30 else '#f39c12' if m > 10 else '#3498db' for m in momentums]

    bars = ax1.barh(topics, momentums, color=colors, edgecolor='black', linewidth=0.5)
    ax1.set_xlabel('Momentum Score', fontsize=11)
    ax1.set_title('Top Trending Topics by Momentum', fontsize=12, fontweight='bold')
    ax1.axvline(x=3, color='orange', linestyle='--', alpha=0.7, label='Spike Threshold (3x)')
    ax1.axvline(x=2, color='green', linestyle='--', alpha=0.7, label='Trending Threshold (2x)')
    ax1.legend(loc='lower right', fontsize=8)

    # Add value labels
    for bar, val in zip(bars, momentums):
        ax1.text(val + 1, bar.get_y() + bar.get_height()/2, f'{val:.0f}x',
                 va='center', fontsize=9, fontweight='bold')
else:
    ax1.text(0.5, 0.5, 'No trending topics', ha='center', va='center', fontsize=12)
    ax1.set_title('Top Trending Topics', fontsize=12, fontweight='bold')

# ===== PLOT 2: Entity Types Distribution =====
ax2 = axes[0, 1]
if entities:
    entity_types = {}
    for e in entities:
        t = e.get('type', 'unknown')
        entity_types[t] = entity_types.get(t, 0) + 1

    labels = list(entity_types.keys())
    sizes = list(entity_types.values())
    colors = ['#3498db', '#2ecc71', '#e74c3c', '#9b59b6', '#f1c40f'][:len(labels)]
    explode = [0.05] * len(labels)

    ax2.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
            colors=colors, shadow=True, startangle=90)
    ax2.set_title('Extracted Entity Types', fontsize=12, fontweight='bold')
else:
    ax2.text(0.5, 0.5, 'No entities', ha='center', va='center', fontsize=12)
    ax2.set_title('Extracted Entity Types', fontsize=12, fontweight='bold')

# ===== PLOT 3: Spike Alerts =====
ax3 = axes[1, 0]
if spike_alerts:
    spike_topics = [s.get('topic', '')[:12] for s in spike_alerts[:8]]
    spike_moms = [s.get('momentum', 0) for s in spike_alerts[:8]]

    bars = ax3.bar(spike_topics, spike_moms, color='#e74c3c', edgecolor='black', linewidth=1)
    ax3.set_ylabel('Momentum', fontsize=11)
    ax3.set_title('🔥 SPIKE ALERTS (>3x Normal Volume)', fontsize=12, fontweight='bold', color='#c0392b')
    ax3.axhline(y=3, color='orange', linestyle='--', alpha=0.7)
    ax3.set_xticklabels(spike_topics, rotation=45, ha='right', fontsize=9)

    # Add value labels above the spike bars
    for bar, val in zip(bars, spike_moms):
        ax3.text(bar.get_x() + bar.get_width()/2, val + 2, f'{val:.0f}x',
                 ha='center', fontsize=10, fontweight='bold', color='#c0392b')
else:
    ax3.text(0.5, 0.5, 'No spike alerts', ha='center', va='center', fontsize=12)
    ax3.set_title('Spike Alerts', fontsize=12, fontweight='bold')

# ===== PLOT 4: Pipeline Summary =====
ax4 = axes[1, 1]
ax4.axis('off')

# Create a summary box
summary_text = f"""
╔══════════════════════════════════════════════════╗
║ VECTORIZER AGENT PIPELINE SUMMARY ║
╠══════════════════════════════════════════════════╣
║ ║
║ 📝 Texts Processed: {len(test_texts):>5} ║
║ 🌐 Entities Extracted: {len(entities):>5} ║
║ 📈 Trending Topics: {len(trending_topics):>5} ║
║ 🔥 Spike Alerts: {len(spike_alerts):>5} ║
║ ⚠️ Anomalies Detected: {anomaly_results.get('anomalies_found', 0):>5} ║
║ ║
╠══════════════════════════════════════════════════╣
║ Top Trending: ║
"""

if trending_topics:
    for i, t in enumerate(trending_topics[:3]):
        topic = t.get('topic', 'N/A')[:20]
        mom = t.get('momentum', 0)
        summary_text += f"║ {i+1}. {topic:<20} ({mom:.0f}x) ║\n"
else:
    summary_text += "║ (No trending topics) ║\n"

summary_text += """╚══════════════════════════════════════════════════╝"""

ax4.text(0.5, 0.5, summary_text, family='monospace', fontsize=9,
         ha='center', va='center',
         bbox=dict(boxstyle='round', facecolor='#ecf0f1', edgecolor='#2c3e50'))

plt.tight_layout()
plt.subplots_adjust(top=0.93)

# Save
output_path = Path('trending_detection_visualization.png')
plt.savefig(output_path, dpi=150, bbox_inches='tight', facecolor='white')
print(f"\n✅ Visualization saved: {output_path}")

# Also save to artifacts
artifacts_dir = Path(r'C:\Users\LENOVO\.gemini\antigravity\brain\b892f63f-afbc-4c4a-bbf1-37195faf04a5')
if artifacts_dir.exists():
    artifacts_output = artifacts_dir / 'trending_visualization.png'
    plt.savefig(str(artifacts_output), dpi=150, bbox_inches='tight', facecolor='white')
    print(f"✅ Also saved to: {artifacts_output}")

plt.close()

print("\n" + "=" * 60)
print("VISUALIZATION COMPLETE")
print("=" * 60)
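The 2x trending and 3x spike thresholds drawn in the first plot assume a momentum value computed by the trending step of the pipeline, which is not part of this diff. A minimal sketch of a momentum calculation consistent with those thresholds, assuming per-topic mention counts for the current batch and a historical per-window average; compute_momentum and its inputs are illustrative names, not the repository's API.

# Illustrative sketch; the actual trending logic lives in the vectorizer agent nodes.
from collections import Counter

def compute_momentum(current_counts, baseline_avg):
    """Compare current mention counts against a historical per-window average."""
    results = []
    for topic, count in current_counts.items():
        base = max(baseline_avg.get(topic, 0.0), 1.0)    # avoid dividing by zero for new topics
        momentum = count / base                           # e.g. 6 mentions vs. an average of 2 -> 3.0x
        results.append({
            "topic": topic,
            "momentum": momentum,
            "is_trending": momentum >= 2.0,               # trending threshold from the plot
            "is_spike": momentum >= 3.0,                  # spike threshold from the plot
        })
    return sorted(results, key=lambda r: r["momentum"], reverse=True)

# Toy usage: "colombo earthquake" mentioned 4 times now vs. a baseline of 1 per window
print(compute_momentum(Counter({"colombo earthquake": 4}), {"colombo earthquake": 1.0}))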