Files
noc/mem/benchmarks/noise_vs_scale.py
Fam Zheng 7000ccda0f add nocmem: auto memory recall + ingest via NuoNuo hippocampal network
- nocmem Python service (mem/): FastAPI wrapper around NuoNuo's
  Hopfield-Hebbian memory, with /recall, /ingest, /store, /stats endpoints
- NOC integration: auto recall after user message (injected as system msg),
  async ingest after LLM response (fire-and-forget)
- Recall: cosine pre-filter (threshold 0.35) + Hopfield attention (β=32),
  top_k=3, KV-cache friendly (appended after user msg, not in system prompt)
- Ingest: LLM extraction + paraphrase augmentation, heuristic fallback
- Wired into main.rs, life.rs (agent done), http.rs (api chat)
- Config: optional `nocmem.endpoint` in config.yaml
- Includes benchmarks: LongMemEval (R@5=94.0%), efficiency, noise vs scale
- Design doc: doc/nocmem.md
2026-04-11 12:24:48 +01:00

179 lines
6.1 KiB
Python

"""Does recall noise decrease as memory count grows?
At various scales, measure:
1. Recall accuracy (R@3) for relevant queries
2. Max cosine similarity for irrelevant queries
3. Separation gap between relevant and irrelevant
If nocmem works well at scale, the gap should widen — relevant queries
should score much higher than irrelevant ones as the memory pool grows.
"""
import json
import time
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from nuonuo.hippocampus import HippocampalMemory
# Runtime configuration for the benchmark.
DEVICE = "cuda"  # both the encoder and HippocampalMemory run on GPU
EMBED_DIM = 384  # output dim of all-MiniLM-L6-v2
DATA_FILE = "benchmarks/longmemeval.json"  # LongMemEval haystack dump
# Queries that should match nothing in the memory pool; used to measure
# the noise floor (max cosine / attention for off-topic inputs).
IRRELEVANT_QUERIES = [
"今天天气怎么样",
"你喜欢吃什么",
"",  # NOTE(review): empty-string entry — presumably accidental; confirm
"讲个笑话",
"明天会下雨吗",
"你觉得猫可爱还是狗可爱",
"人生的意义是什么",
"帮我写一首诗",
"地球到月球有多远",
"如何学会游泳",
]
# Hopfield inverse-temperature values to sweep (higher = sharper attention).
BETA_CONFIGS = [16.0, 32.0, 64.0]
# Memory-pool sizes to test; filtered to what the dataset can supply.
SCALES = [50, 200, 500, 1000, 3000]
def main():
    """Measure recall noise vs memory-pool scale.

    For each β in BETA_CONFIGS and each scale n in SCALES, build a
    HippocampalMemory over the first n unique LongMemEval chunks, then report:
    R@3 for relevant queries, max cosine similarity for relevant vs irrelevant
    queries (and their gap), and the top-1 Hopfield attention weight for both.
    Prints a table per β; returns nothing.
    """
    print("noise vs scale benchmark\n")
    print("loading encoder...")
    encoder = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)

    def emb(text):
        # Single-text embedding; normalized so dot product == cosine similarity.
        return encoder.encode([text], convert_to_tensor=True,
                              normalize_embeddings=True, device=DEVICE)[0]

    def emb_batch(texts):
        # Batched variant of emb() for the large chunk/query lists.
        return encoder.encode(texts, convert_to_tensor=True,
                              normalize_embeddings=True, device=DEVICE,
                              batch_size=256, show_progress_bar=False)

    # load data
    print("loading data...")
    with open(DATA_FILE) as f:
        data = json.load(f)

    # Collect unique (user, assistant) turn pairs with their source question
    # index and session id. Sessions are shared across questions, so dedupe on
    # (session_id, turn_index).
    all_chunks = []  # (text, question_idx, session_id)
    seen = set()
    for qi, item in enumerate(data):
        for sid, sess in zip(item["haystack_session_ids"], item["haystack_sessions"]):
            # Step by 2: turns alternate user/assistant.
            for i in range(0, len(sess) - 1, 2):
                key = (sid, i)
                if key in seen:
                    continue
                seen.add(key)
                user = sess[i]["content"]
                asst = sess[i + 1]["content"] if i + 1 < len(sess) else ""
                text = f"{user}\n{asst}"[:1000]  # cap chunk length
                all_chunks.append((text, qi, sid))
    print(f" {len(all_chunks)} unique chunks")

    # pre-embed irrelevant queries
    irrel_embs = [emb(q) for q in IRRELEVANT_QUERIES]

    # Collect relevant queries: each question carries the ids of the sessions
    # that contain its answer. Take the first 100 questions.
    relevant_queries = []  # (question_text, answer_session_id_set)
    for item in data[:100]:
        answer_sids = set(item["answer_session_ids"])
        relevant_queries.append((item["question"], answer_sids))
    rel_query_embs = emb_batch([q for q, _ in relevant_queries])
    print(f" {len(relevant_queries)} relevant queries")
    print(f" {len(IRRELEVANT_QUERIES)} irrelevant queries")

    # filter scales to what we have
    scales = [s for s in SCALES if s <= len(all_chunks)]

    for beta in BETA_CONFIGS:
        print(f"\n{'='*70}")
        print(f" β = {beta}")
        print(f"{'='*70}")
        print(f"{'Scale':>7} | {'R@3':>6} | {'Rel maxcos':>10} {'Irrel maxcos':>12} {'Gap':>8} | {'Rel attn':>9} {'Irrel attn':>11}")
        print("-" * 80)
        for n in scales:
            subset = all_chunks[:n]
            texts = [c[0] for c in subset]
            sids = [c[2] for c in subset]
            # Embed the subset and build a fresh memory for this scale.
            embeddings = emb_batch(texts)
            hip = HippocampalMemory(
                embed_dim=EMBED_DIM, beta=beta, hopfield_top_k=10, device=DEVICE,
            )
            for i in range(n):
                # Cue and content are the same embedding (auto-association).
                hip.store(embeddings[i], embeddings[i],
                          metadata={"session_id": sids[i]})
            cue_mat = hip._get_cue_matrix()
            # Hoisted out of the query loop: invariant for a given subset
            # (the original rebuilt this set once per query).
            subset_sids = set(sids)

            # --- relevant queries ---
            rel_max_cos = []
            rel_top_attn = []
            hits = 0
            tested = 0
            for (_, answer_sids), qe in zip(relevant_queries, rel_query_embs):
                # Only score questions whose answer session is in this subset;
                # otherwise R@3 would be unattainable by construction.
                if not (answer_sids & subset_sids):
                    continue
                tested += 1
                # Normalized embeddings: dot product is cosine similarity.
                cos_sims = qe @ cue_mat.T
                rel_max_cos.append(cos_sims.max().item())
                # Recall through Hopfield attention.
                results = hip.recall(qe, top_k=3)
                top_attn = results[0].similarity if results else 0
                rel_top_attn.append(top_attn)
                recalled_sids = {r.metadata["session_id"] for r in results}
                if answer_sids & recalled_sids:
                    hits += 1
            r3 = hits / tested * 100 if tested > 0 else 0
            avg_rel_cos = np.mean(rel_max_cos) if rel_max_cos else 0
            avg_rel_attn = np.mean(rel_top_attn) if rel_top_attn else 0

            # --- irrelevant queries ---
            irrel_max_cos = []
            irrel_top_attn = []
            for qe in irrel_embs:
                cos_sims = qe @ cue_mat.T
                irrel_max_cos.append(cos_sims.max().item())
                results = hip.recall(qe, top_k=3)
                top_attn = results[0].similarity if results else 0
                irrel_top_attn.append(top_attn)
            avg_irrel_cos = np.mean(irrel_max_cos)
            avg_irrel_attn = np.mean(irrel_top_attn)
            gap = avg_rel_cos - avg_irrel_cos
            print(f"{n:>7,} | {r3:>5.1f}% | {avg_rel_cos:>10.3f} {avg_irrel_cos:>12.3f} {gap:>8.3f} | {avg_rel_attn:>8.0%} {avg_irrel_attn:>10.0%}")
            # Free GPU memory before the next, larger scale.
            del hip
            torch.cuda.empty_cache()

    print(f"\n── 解读 ──")
    print(f"Rel maxcos: 相关查询的最大余弦相似度(越高越好)")
    print(f"Irrel maxcos: 无关查询的最大余弦相似度(越低越好)")
    print(f"Gap: 两者之差(越大越好 = 越容易区分)")
    print(f"Rel attn: 相关查询 top1 的 Hopfield attention 权重")
    print(f"Irrel attn: 无关查询 top1 的 Hopfield attention 权重(越低 = 越少噪音)")


if __name__ == "__main__":
    main()