# Hopfield + Hebbian hybrid memory system for LLMs.
# Two nights of experiments (16 iterations), validated on LongMemEval (ICLR 2025).
#
# Architecture:
# - Single-hop: Two-Stage Hopfield (NN top-20 → softmax settle)
# - Multi-hop: Hebbian W matrix with WTA pattern separation
# - 64% on LongMemEval (500 questions), retrieval-only, no LLM dependency
# - 4ms latency @ 20K memories, ~1GB VRAM
#
# Key findings:
# - Hopfield attention solved noise tolerance (20% → 100% vs flat Hebbian)
# - WTA pattern separation enables 20K+ capacity
# - Multi-hop associative chains (6 hops, CosSim=1.0) — RAG can't do this
# - MiniLM-L6 is optimal (discrimination gap > absolute similarity)
# - Paraphrase cue augmentation: 55% → 100% on synthetic, 36% → 64% on benchmark
# - SNN encoder viable (CosSim 0.99) but not needed for current architecture
"""Experiment 4c: Find optimal config for real-world use.

From exp04b: k=50 gives 95% paraphrase recall (best).
Need to verify capacity is still sufficient at k=50.
Also: test with more realistic memory counts (100-1000).
"""

import sys
|
|
import time
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import torch
|
|
import torch.nn as nn
|
|
import numpy as np
|
|
|
|
DEVICE = "cuda"
|
|
RESULTS_DIR = Path(__file__).parent.parent / "doc"
|
|
|
|
|
|
def cosine(a, b):
|
|
if a.norm() == 0 or b.norm() == 0:
|
|
return 0.0
|
|
return nn.functional.cosine_similarity(a.unsqueeze(0), b.unsqueeze(0)).item()
|
|
|
|
|
|
def winner_take_all(x, k):
|
|
_, idx = x.topk(k, dim=-1)
|
|
out = torch.zeros_like(x)
|
|
out.scatter_(-1, idx, 1.0)
|
|
return out
|
|
|
|
|
|
class UnifiedHebbianMemory:
|
|
def __init__(self, input_dim, code_dim, k):
|
|
self.k = k
|
|
self.code_dim = code_dim
|
|
self.proj = (torch.randn(input_dim, code_dim, device=DEVICE)
|
|
* (1.0 / input_dim**0.5))
|
|
self.W = torch.zeros(code_dim, code_dim, device=DEVICE)
|
|
|
|
def sep(self, x):
|
|
return winner_take_all(x @ self.proj, self.k)
|
|
|
|
def learn(self, cue_emb, target_emb):
|
|
self.W += torch.outer(self.sep(target_emb), self.sep(cue_emb))
|
|
|
|
def recall(self, query_emb):
|
|
code = self.sep(query_emb)
|
|
raw = self.W @ code
|
|
return winner_take_all(raw, self.k)
|
|
|
|
|
|
def test_capacity_with_real_embeddings(model, code_dim, k, max_memories=2000):
|
|
"""Generate lots of diverse sentence pairs and test recall."""
|
|
from sentence_transformers import SentenceTransformer
|
|
|
|
# Generate diverse sentences programmatically
|
|
topics = [
|
|
"deploy", "database", "API", "testing", "monitoring", "security",
|
|
"frontend", "backend", "caching", "logging", "backup", "server",
|
|
"CI/CD", "Docker", "Kubernetes", "microservice", "authentication",
|
|
"performance", "debugging", "refactoring"
|
|
]
|
|
actions = [
|
|
"is broken", "needs updating", "has a bug", "was configured wrong",
|
|
"needs optimization", "requires migration", "should be refactored",
|
|
"has a memory leak", "is timing out", "needs documentation"
|
|
]
|
|
facts = [
|
|
"was fixed last week by adding an index",
|
|
"uses the new v3 API endpoint",
|
|
"is scheduled for maintenance on Friday",
|
|
"requires admin access to modify",
|
|
"has a known issue with large payloads",
|
|
"was migrated from AWS to GCP",
|
|
"needs Python 3.12 or higher",
|
|
"uses Redis for session storage",
|
|
"has rate limiting at 1000 req/min",
|
|
"is monitored by PagerDuty"
|
|
]
|
|
|
|
cue_sentences = []
|
|
target_sentences = []
|
|
for i in range(max_memories):
|
|
topic = topics[i % len(topics)]
|
|
action = actions[i % len(actions)]
|
|
fact = facts[i % len(facts)]
|
|
idx = i // (len(topics) * len(actions))
|
|
|
|
cue_sentences.append(f"The {topic} system {action} (issue #{i})")
|
|
target_sentences.append(f"{topic} {fact}, ticket #{i}, priority {idx}")
|
|
|
|
embed_dim = model.get_sentence_embedding_dimension()
|
|
mem = UnifiedHebbianMemory(embed_dim, code_dim, k)
|
|
|
|
# Encode in batches
|
|
batch_size = 256
|
|
checkpoints = [50, 100, 200, 500, 1000, 2000]
|
|
all_cue_embs = []
|
|
all_target_embs = []
|
|
|
|
print(f" Config: code_dim={code_dim}, k={k}")
|
|
|
|
for start in range(0, max_memories, batch_size):
|
|
end = min(start + batch_size, max_memories)
|
|
cue_embs = model.encode(cue_sentences[start:end],
|
|
convert_to_tensor=True,
|
|
normalize_embeddings=True, device=DEVICE)
|
|
target_embs = model.encode(target_sentences[start:end],
|
|
convert_to_tensor=True,
|
|
normalize_embeddings=True, device=DEVICE)
|
|
|
|
for i in range(cue_embs.shape[0]):
|
|
mem.learn(cue_embs[i], target_embs[i])
|
|
all_cue_embs.append(cue_embs[i])
|
|
all_target_embs.append(target_embs[i])
|
|
|
|
total = len(all_cue_embs)
|
|
if total in checkpoints:
|
|
# Test on random sample
|
|
sample_n = min(100, total)
|
|
indices = torch.randperm(total)[:sample_n].tolist()
|
|
|
|
correct = 0
|
|
for idx in indices:
|
|
recalled = mem.recall(all_cue_embs[idx])
|
|
target_code = mem.sep(all_target_embs[idx])
|
|
if cosine(recalled, target_code) > 0.5:
|
|
correct += 1
|
|
|
|
w_norm = mem.W.norm().item()
|
|
print(f" N={total:>5}: Recall={correct}/{sample_n} "
|
|
f"({correct/sample_n:.0%}), W_norm={w_norm:.0f}")
|
|
|
|
|
|
def test_paraphrase_at_scale(model, code_dim, k, n_memories):
|
|
"""Add many memories, then test paraphrase recall on a subset."""
|
|
embed_dim = model.get_sentence_embedding_dimension()
|
|
mem = UnifiedHebbianMemory(embed_dim, code_dim, k)
|
|
|
|
# Add background memories (noise)
|
|
bg_cues = [f"Background task number {i} about topic {i%20}" for i in range(n_memories)]
|
|
bg_targets = [f"Background fact {i} with detail {i%10}" for i in range(n_memories)]
|
|
|
|
bg_cue_embs = model.encode(bg_cues, convert_to_tensor=True,
|
|
normalize_embeddings=True, device=DEVICE,
|
|
batch_size=256)
|
|
bg_target_embs = model.encode(bg_targets, convert_to_tensor=True,
|
|
normalize_embeddings=True, device=DEVICE,
|
|
batch_size=256)
|
|
|
|
for i in range(n_memories):
|
|
mem.learn(bg_cue_embs[i], bg_target_embs[i])
|
|
|
|
# Now add our specific test memories
|
|
test_pairs = [
|
|
("What's the weather like today?", "User prefers to check weather every morning"),
|
|
("Let's deploy the new version", "The deployment pipeline uses GitHub Actions with k3s"),
|
|
("The database is slow again", "Missing index on users table caused slowdown last time"),
|
|
("I need to fix the auth bug", "Auth service uses JWT tokens with 24h expiry in Redis"),
|
|
("The API returns 500 errors", "Last 500 was caused by OOM in the Python worker"),
|
|
]
|
|
paraphrases = [
|
|
"How's the weather outside?",
|
|
"We should push the new release",
|
|
"DB performance is terrible",
|
|
"There's a login bug to fix",
|
|
"Getting internal server errors",
|
|
]
|
|
|
|
test_cue_embs = model.encode([p[0] for p in test_pairs],
|
|
convert_to_tensor=True,
|
|
normalize_embeddings=True, device=DEVICE)
|
|
test_target_embs = model.encode([p[1] for p in test_pairs],
|
|
convert_to_tensor=True,
|
|
normalize_embeddings=True, device=DEVICE)
|
|
para_embs = model.encode(paraphrases, convert_to_tensor=True,
|
|
normalize_embeddings=True, device=DEVICE)
|
|
|
|
for i in range(len(test_pairs)):
|
|
mem.learn(test_cue_embs[i], test_target_embs[i])
|
|
|
|
# Test exact recall
|
|
exact_correct = 0
|
|
for i in range(len(test_pairs)):
|
|
recalled = mem.recall(test_cue_embs[i])
|
|
tc = mem.sep(test_target_embs[i])
|
|
if cosine(recalled, tc) > 0.5:
|
|
exact_correct += 1
|
|
|
|
# Test paraphrase recall
|
|
para_correct = 0
|
|
for i in range(len(paraphrases)):
|
|
recalled = mem.recall(para_embs[i])
|
|
tc = mem.sep(test_target_embs[i])
|
|
if cosine(recalled, tc) > 0.5:
|
|
para_correct += 1
|
|
|
|
n = len(test_pairs)
|
|
print(f" bg={n_memories}, code={code_dim}, k={k}: "
|
|
f"Exact={exact_correct}/{n}, Para={para_correct}/{n}")
|
|
return exact_correct / n, para_correct / n
|
|
|
|
|
|
def main():
|
|
print("=" * 60)
|
|
print("Experiment 4c: Optimal Config + Scale Testing")
|
|
print("=" * 60)
|
|
|
|
from sentence_transformers import SentenceTransformer
|
|
model = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
|
|
|
|
# Test 1: Capacity with real embeddings
|
|
print("\n=== Capacity Test ===")
|
|
for code_dim, k in [(8192, 50), (16384, 50), (16384, 20), (32768, 50)]:
|
|
test_capacity_with_real_embeddings(model, code_dim, k, max_memories=2000)
|
|
print()
|
|
|
|
# Test 2: Paraphrase at scale
|
|
print("\n=== Paraphrase Recall at Scale ===")
|
|
for n_bg in [0, 100, 500, 1000]:
|
|
for code_dim, k in [(8192, 50), (16384, 50)]:
|
|
test_paraphrase_at_scale(model, code_dim, k, n_bg)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|