File size: 1,591 Bytes
ff3017c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
import logging
import sys
from pathlib import Path
from collections import Counter
# Append the current working directory to sys.path so that the `src.*`
# imports below can be resolved. This assumes the script is launched from
# the project root -- TODO confirm.
sys.path.append(str(Path.cwd()))
# NOTE(review): `config` is not referenced in the visible code; presumably
# imported for its side effects -- confirm before removing.
from src.storage.config import config
from src.storage.chromadb_store import ChromaDBStore
# Configure root logging at ERROR level so that lower-severity log records
# do not clutter the printed report.
logging.basicConfig(level=logging.ERROR)
def inspect() -> None:
    """Print a summary of the metadata stored in the ChromaDB collection.

    Opens a ``ChromaDBStore``, prints the total document count and then,
    for every non-empty metadata record, tallies the ``domain``,
    ``platform`` and ``impact_type`` fields (missing fields are counted
    as ``"unknown"``). Each tally is printed most-frequent first.

    Returns early, after printing a message, when the store exposes no
    collection, and returns silently after the count when it is empty.
    """
    print("Connecting to ChromaDB...")
    store = ChromaDBStore()
    if not store.collection:
        print("Could not connect to collection.")
        return

    total = store.collection.count()
    print(f"Total documents: {total}")
    if total == 0:
        return

    # All metadata is fetched in a single call; the original author sized
    # this for a collection of roughly 2000 documents.
    print("Fetching metadata...")
    response = store.collection.get(include=["metadatas"])

    # (heading, metadata key) pairs, in the order they are reported.
    report_sections = (
        ("\n--- Domain Distribution ---", "domain"),
        ("\n--- Source/Platform Distribution ---", "platform"),
        ("\n--- Impact Type Distribution ---", "impact_type"),
    )
    tallies = {field: Counter() for _, field in report_sections}

    for record in response["metadatas"]:
        if record:  # records with no metadata are skipped
            for field, tally in tallies.items():
                tally[record.get(field, "unknown")] += 1

    for heading, field in report_sections:
        print(heading)
        for label, occurrences in tallies[field].most_common():
            print(f"{label}: {occurrences}")
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    inspect()
|