# File size: 1,591 Bytes
# ff3017c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62

import logging
import sys
from pathlib import Path
from collections import Counter

# Add the current working directory to the import search path so that the
# `src` package imported just below can be resolved. This has to run before
# those imports.
# NOTE(review): presumably the script is meant to be launched from the
# project root -- confirm.
sys.path.append(str(Path.cwd()))

from src.storage.config import config
from src.storage.chromadb_store import ChromaDBStore

# Configure logging at ERROR level so that lower-severity log records do not
# clutter the printed report.
logging.basicConfig(level=logging.ERROR)

def inspect():
    """Print a distribution report of the metadata stored in ChromaDB.

    Creates a ``ChromaDBStore``, prints the number of documents in its
    collection and then, for each of the metadata keys ``domain``,
    ``platform`` and ``impact_type``, prints every distinct value with its
    number of occurrences, most frequent first.

    Entries whose metadata is empty or missing are left out of the tallies;
    an entry that lacks one of the keys is counted under ``"unknown"`` for
    that key.

    Returns:
        None. All results are written to standard output. The function
        returns early when the store has no usable collection or the
        collection is empty.
    """
    print("Connecting to ChromaDB...")
    store = ChromaDBStore()

    # Guard: nothing to report when the store exposes no usable collection.
    if not store.collection:
        print("Could not connect to collection.")
        return

    total = store.collection.count()
    print(f"Total documents: {total}")

    # Guard: an empty collection has no metadata worth fetching.
    if total == 0:
        return

    # The metadata is requested in a single call, without ids or a limit, so
    # the whole result is held in memory at once.
    # NOTE(review): the original author sized this for roughly 2000
    # documents; a much larger collection may call for paging.
    print("Fetching metadata...")
    fetched = store.collection.get(include=["metadatas"])

    # Drop empty/None metadata entries up front so the tallies below only
    # ever see usable mappings.
    records = [entry for entry in fetched["metadatas"] if entry]

    # Build every tally before printing anything, one Counter per metadata
    # key, each paired with the heading it is reported under.
    report = [
        (heading, Counter(entry.get(key, "unknown") for entry in records))
        for heading, key in (
            ("\n--- Domain Distribution ---", "domain"),
            ("\n--- Source/Platform Distribution ---", "platform"),
            ("\n--- Impact Type Distribution ---", "impact_type"),
        )
    ]

    for heading, tally in report:
        print(heading)
        for value, occurrences in tally.most_common():
            print(f"{value}: {occurrences}")

# Run the report only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    inspect()