"""Experiment P3: Breaking the 20K 80% ceiling. Hypothesis: NN pre-filter (top-20) misses the correct cue at large scale. Tests: 1. Oracle analysis: is the correct cue in top-K? What K is needed? 2. Hierarchical memory: cluster memories, route query to relevant cluster 3. Re-ranking: top-K NN → cross-similarity re-rank → Hopfield on re-ranked 4. Multiple projections: ensemble of NN lookups with different random projections """ import sys import time from pathlib import Path import torch import torch.nn as nn import numpy as np DEVICE = "cuda" PAIRS = [ ("What's the weather like today?", "User checks weather every morning"), ("Let's deploy the new version", "Deployment uses GitHub Actions with k3s"), ("The database is slow again", "Missing index on users table"), ("I need to fix the authentication bug", "JWT tokens with 24h expiry in Redis"), ("The API returns 500 errors", "OOM in the Python worker"), ("Let's set up monitoring", "Prometheus + Grafana on OCI"), ("Tests failing in CI", "CI needs postgres service container"), ("Memory usage too high", "Leak in websocket handler"), ("Help with Docker setup", "docker-compose for dev, k3s for prod"), ("Log files too large", "Logs rotate daily, shipped to Loki"), ] PARAPHRASES = [ "How's the weather?", "Ship the release", "DB performance terrible", "Fix the login issue", "Server errors everywhere", "Need observability", "CI tests breaking", "Service using too much RAM", "Docker config help", "Logs eating disk space", ] def cosine(a, b): return nn.functional.cosine_similarity(a.unsqueeze(0), b.unsqueeze(0)).item() def load_model(): from sentence_transformers import SentenceTransformer return SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE) def build_memory(model, n_bg): """Build memory with test pairs + background.""" cue_embs = model.encode([p[0] for p in PAIRS], convert_to_tensor=True, normalize_embeddings=True, device=DEVICE) target_embs = model.encode([p[1] for p in PAIRS], convert_to_tensor=True, normalize_embeddings=True, device=DEVICE) para_embs = model.encode(PARAPHRASES, convert_to_tensor=True, normalize_embeddings=True, device=DEVICE) all_cues = list(cue_embs) all_targets = list(target_embs) all_mids = list(range(len(PAIRS))) if n_bg > 0: topics = ["server", "db", "api", "fe", "be", "cache", "queue", "net", "store", "auth", "docker", "k8s"] bg_cues = [f"The {topics[i%len(topics)]} has issue {i}" for i in range(n_bg)] bg_targets = [f"Fix {topics[i%len(topics)]} issue {i}" for i in range(n_bg)] bg_c = model.encode(bg_cues, convert_to_tensor=True, normalize_embeddings=True, device=DEVICE, batch_size=256) bg_t = model.encode(bg_targets, convert_to_tensor=True, normalize_embeddings=True, device=DEVICE, batch_size=256) for i in range(n_bg): all_cues.append(bg_c[i]) all_targets.append(bg_t[i]) all_mids.append(100 + i) cue_mat = torch.stack(all_cues) target_mat = torch.stack(all_targets) return cue_mat, target_mat, all_mids, cue_embs, target_embs, para_embs def test_topk_coverage(model, n_bg_list): """Is the correct cue in top-K? What K do we need?""" print("=== Test 1: Top-K Coverage Analysis ===\n") for n_bg in n_bg_list: cue_mat, target_mat, mids, cue_embs, target_embs, para_embs = build_memory(model, n_bg) for K in [5, 10, 20, 50, 100, 200]: in_topk = 0 for i in range(len(PARAPHRASES)): sims = para_embs[i] @ cue_mat.T _, top_idx = sims.topk(min(K, len(mids))) top_mids = [mids[j] for j in top_idx.tolist()] if i in top_mids: in_topk += 1 n = len(PARAPHRASES) print(f" N={n_bg+len(PAIRS):>6}, K={K:>3}: " f"{in_topk}/{n} ({in_topk/n:.0%}) correct cue in top-K") print() def test_two_stage_topk(model, n_bg): """Vary K in two-stage Hopfield to find optimal.""" print(f"\n=== Test 2: Two-Stage K Optimization (bg={n_bg}) ===\n") cue_mat, target_mat, mids, cue_embs, target_embs, para_embs = build_memory(model, n_bg) for K in [5, 10, 20, 50, 100, 200]: correct = 0 for i in range(len(PARAPHRASES)): sims = para_embs[i] @ cue_mat.T k = min(K, len(mids)) _, top_idx = sims.topk(k) cand_cues = cue_mat[top_idx] cand_targets = target_mat[top_idx] cand_mids = [mids[j] for j in top_idx.tolist()] # Hopfield settle xi = para_embs[i] for _ in range(3): scores = 16.0 * (xi @ cand_cues.T) attn = torch.softmax(scores, dim=0) xi = attn @ cand_cues xi = nn.functional.normalize(xi, dim=0) scores = 16.0 * (xi @ cand_cues.T) attn = torch.softmax(scores, dim=0) mid_scores = {} for j, mid in enumerate(cand_mids): mid_scores[mid] = mid_scores.get(mid, 0) + attn[j].item() best_mid = max(mid_scores, key=mid_scores.get) if best_mid == i: correct += 1 n = len(PARAPHRASES) print(f" K={K:>3}: {correct}/{n} ({correct/n:.0%})") def test_hierarchical(model, n_bg): """Cluster memories by topic, route query to relevant cluster.""" print(f"\n=== Test 3: Hierarchical Memory (bg={n_bg}) ===\n") cue_mat, target_mat, mids, cue_embs, target_embs, para_embs = build_memory(model, n_bg) # Simple clustering: k-means on cue embeddings from torch import cdist n_clusters = max(10, (n_bg + len(PAIRS)) // 100) # K-means (simple implementation) N = cue_mat.shape[0] centroids = cue_mat[torch.randperm(N)[:n_clusters]].clone() for _ in range(20): dists = 1 - cue_mat @ centroids.T # cosine distance assignments = dists.argmin(dim=1) for c in range(n_clusters): mask = assignments == c if mask.sum() > 0: centroids[c] = nn.functional.normalize(cue_mat[mask].mean(dim=0), dim=0) # Route query to top-3 clusters, then Hopfield within correct = 0 for i in range(len(PARAPHRASES)): # Find relevant clusters cluster_sims = para_embs[i] @ centroids.T top_clusters = cluster_sims.topk(3).indices # Gather candidates from top clusters cand_idx = [] for c in top_clusters: cluster_members = (assignments == c).nonzero().squeeze(-1).tolist() cand_idx.extend(cluster_members) cand_idx = list(set(cand_idx)) if not cand_idx: continue # Hopfield on candidates cand_cues = cue_mat[cand_idx] cand_targets = target_mat[cand_idx] cand_mids = [mids[j] for j in cand_idx] K = min(20, len(cand_idx)) sims = para_embs[i] @ cand_cues.T _, top_local = sims.topk(K) local_cues = cand_cues[top_local] local_mids = [cand_mids[j] for j in top_local.tolist()] xi = para_embs[i] for _ in range(3): scores = 16.0 * (xi @ local_cues.T) attn = torch.softmax(scores, dim=0) xi = attn @ local_cues xi = nn.functional.normalize(xi, dim=0) scores = 16.0 * (xi @ local_cues.T) attn = torch.softmax(scores, dim=0) mid_scores = {} for j, mid in enumerate(local_mids): mid_scores[mid] = mid_scores.get(mid, 0) + attn[j].item() best_mid = max(mid_scores, key=mid_scores.get) if best_mid == i: correct += 1 n = len(PARAPHRASES) print(f" Hierarchical (clusters={n_clusters}): {correct}/{n} ({correct/n:.0%})") def main(): print("=" * 60) print("Experiment P3: Breaking the 20K Ceiling") print("=" * 60) model = load_model() # Test 1: Top-K coverage test_topk_coverage(model, [0, 500, 2000, 5000, 10000, 20000]) # Test 2: K optimization for bg in [2000, 10000, 20000]: test_two_stage_topk(model, bg) # Test 3: Hierarchical for bg in [2000, 10000, 20000]: test_hierarchical(model, bg) if __name__ == "__main__": main()