RandomCoder-lab · RandomCoder-lab · May 22, 2026 · May 21, 2026 · May 21, 2026 · May 21, 2026
diff --git a/experiments/transformerless_lm/activations_substrate.py b/experiments/transformerless_lm/activations_substrate.py
diff --git a/experiments/transformerless_lm/corpus.py b/experiments/transformerless_lm/corpus.py
@@ -46,13 +46,19 @@ def make_dataset(seq_len: int = 64, source: str = "embedded"):
                     fast smoke tests and the original tiny-bench)
       - "tinyshakespeare": load tinyshakespeare.txt (1.1 MB) — used
                            by the scale experiment
+      - "omc": load omc_codebase.txt (~4 MB of OMC source: .py/.rs/.md/.toml).
+               More diverse than English prose; 210 unique chars.
     """
     import os
     import torch
     if source == "tinyshakespeare":
         path = os.path.join(os.path.dirname(__file__), "tinyshakespeare.txt")
         with open(path, "r") as f:
             text = f.read()
+    elif source == "omc":
+        path = os.path.join(os.path.dirname(__file__), "omc_codebase.txt")
+        with open(path, "r") as f:
+            text = f.read()
     else:
         text = CORPUS
     chars = sorted(set(text))

diff --git a/experiments/transformerless_lm/corpus_word.py b/experiments/transformerless_lm/corpus_word.py
@@ -0,0 +1,88 @@
+"""Word-level tokenizer for TinyShakespeare.
+
+The char-level vocab (65 chars) requires the model to learn that
+letters form words before it can learn word structure. Word-level
+tokenization gives the model atomic semantic units directly — the
+model's per-step prediction is a meaningful WORD, not a letter.
+
+Splits on whitespace and punctuation. Punctuation marks and whitespace
+runs are kept as separate tokens, and alphabetic tokens are lowercased
+to keep the vocab small (so 'ROMEO:' becomes ['romeo', ':']).
+
+For TinyShakespeare (1.1 MB) the word vocab is roughly 25K unique
+tokens — much larger than 65 chars but each token carries more
+semantic weight per step.
+"""
+
+import os
+import re
+
+import torch
+
+
+# Tokenizer alternation; at each position the alternatives are tried left
+# to right:
+#   [A-Za-z]+        a run of ASCII letters
+#   [0-9]+           a run of ASCII digits
+#   [^A-Za-z0-9\s]   exactly one other non-whitespace character
+#                    (punctuation, underscore, non-ASCII characters)
+#   \n+              a run of newlines
+#   \s+              any other whitespace run; because \s also matches
+#                    "\n", a run that starts with a space or tab absorbs
+#                    the newlines that follow it
+_TOKEN_PATTERN = re.compile(r"[A-Za-z]+|[0-9]+|[^A-Za-z0-9\s]|\n+|\s+")
+
+
+def tokenize_text(text: str) -> list[str]:
+    """Break ``text`` into the token stream used by the word-level model.
+
+    Every match of ``_TOKEN_PATTERN`` becomes one token, in order of
+    appearance. Tokens for which ``str.isalpha()`` is true are lowercased
+    to shrink the vocabulary; every other token (digits, punctuation,
+    whitespace and newline runs) is passed through unchanged, so line
+    structure stays visible to the model.
+    """
+    matched = (m.group(0) for m in _TOKEN_PATTERN.finditer(text))
+    return [tok if not tok.isalpha() else tok.lower() for tok in matched]
+
+
+def make_word_dataset(source: str = "tinyshakespeare"):
+    """Load a corpus file and encode it as word-level token ids.
+
+    Args:
+        source: "tinyshakespeare" (reads tinyshakespeare.txt) or "omc"
+            (reads omc_codebase.txt). Both files are looked up in the
+            directory that contains this module.
+
+    Returns:
+        (vocab, stoi, itos, encoded) where
+        vocab: list of unique tokens, sorted
+        stoi: token -> int (the token's index in vocab)
+        itos: int -> token
+        encoded: 1-D torch.long tensor of token ids for the whole corpus
+
+    Raises:
+        ValueError: if ``source`` is not one of the known corpus names.
+    """
+    corpus_files = {
+        "tinyshakespeare": "tinyshakespeare.txt",
+        "omc": "omc_codebase.txt",
+    }
+    if source not in corpus_files:
+        raise ValueError(f"unknown source: {source}")
+    path = os.path.join(os.path.dirname(__file__), corpus_files[source])
+    # Decode explicitly as UTF-8: the default text encoding depends on the
+    # platform/locale, so a corpus containing non-ASCII characters could
+    # otherwise decode differently (or fail) from one machine to the next.
+    with open(path, encoding="utf-8") as f:
+        text = f.read()
+    tokens = tokenize_text(text)
+    vocab = sorted(set(tokens))
+    stoi = {t: i for i, t in enumerate(vocab)}
+    itos = dict(enumerate(vocab))
+    encoded = torch.tensor([stoi[t] for t in tokens], dtype=torch.long)
+    return vocab, stoi, itos, encoded
+
+
+def detokenize(token_ids, itos) -> str:
+    """Turn a sequence of token ids back into readable text.
+
+    Each id is converted with ``int()`` and looked up in ``itos``; the
+    resulting tokens are concatenated in order. A single space is
+    inserted only between two consecutive tokens that are both
+    alphanumeric (``str.isalnum()``), so adjacent words do not fuse.
+    Punctuation, whitespace and newline tokens are emitted exactly as
+    stored, which keeps the line structure of the output intact.
+    """
+    pieces = []
+    last_was_word = False
+    for token_id in token_ids:
+        token = itos[int(token_id)]
+        is_word = token.isalnum()
+        if is_word and last_was_word:
+            # Two word-like tokens in a row: separate them.
+            pieces.append(" ")
+        pieces.append(token)
+        last_was_word = is_word
+    return "".join(pieces)
+
+
+if __name__ == "__main__":
+    # Smoke test when run as a script: tokenize each known corpus and
+    # print its size statistics plus a short detokenized preview.
+    for src in ("tinyshakespeare", "omc"):
+        vocab, stoi, itos, enc = make_word_dataset(src)
+        print(f"{src}:")
+        print(f"  total tokens: {enc.numel():,}")
+        print(f"  unique vocab: {len(vocab):,}")
+        # Preview: the first 30 token ids turned back into text.
+        sample = detokenize(enc[:30].tolist(), itos)
+        print(f"  first 30 detok: {sample!r}")
+        print()
diff --git a/experiments/transformerless_lm/creativity_score.py b/experiments/transformerless_lm/creativity_score.py
@@ -0,0 +1,247 @@
+"""Shakespeare-aware creativity scoring.
+
+Replaces val=CE-on-next-token (which only rewards exact reproduction)
+with metrics that measure whether GENERATED text is Shakespeare-LIKE
+without being identical:
+
+  - n-gram overlap: fraction of n-char windows in generated text that
+    appear ANYWHERE in the corpus. Measures Shakespearean character
+    patterns without exact-word requirement.
+  - vocab overlap: fraction of generated tokens (whitespace-separated)
+    that match corpus vocabulary. Real English/Shakespeare words even
+    if not in the same sentence.
+  - line structure: line count, mean and std of line length, and the
+    L1 distance between generated and corpus line-length histograms.
+    Captures stanza/line-break patterns.
+  - vowel-consonant transition rate: English alternates v/c; random
+    text doesn't. Score the alternation pattern.
+
+Use these to evaluate creative output of substrate-aligned model. A
+model that produces statistically-Shakespearean GIBBERISH gets ~0;
+a model that produces creative but recognizable English gets high.
+"""
+
+import string
+from collections import Counter
+
+
+
+
+# Character classes shared by the scoring helpers below.
+# Vowels in both cases; "y"/"Y" is not included.
+VOWELS = set("aeiouAEIOU")
+# ASCII letters only (a-z, A-Z); accented/non-ASCII letters are not members.
+LETTERS = set(string.ascii_letters)
+# Space, newline and tab.
+# NOTE(review): not referenced in the visible part of this file -- confirm
+# whether it is still needed.
+WHITESPACE = set(" \n\t")
+
+
+def char_ngram_overlap(generated: str, corpus_text: str, n: int) -> float:
+    """Share of length-``n`` character windows of ``generated`` that also
+    occur somewhere in ``corpus_text``.
+
+    Every window position in ``generated`` counts (duplicates included).
+    Returns 0.0 when ``generated`` is shorter than ``n``; otherwise a
+    value in [0, 1], where higher means more corpus-like character
+    patterns.
+    """
+    if len(generated) < n:
+        return 0.0
+    known = {corpus_text[k:k + n] for k in range(len(corpus_text) - n + 1)}
+    # Past the guard above there is always at least one window.
+    window_total = len(generated) - n + 1
+    hits = sum(1 for k in range(window_total) if generated[k:k + n] in known)
+    return hits / window_total
+
+
+def vocab_overlap(generated: str, corpus_text: str) -> float:
+    """Share of words in ``generated`` that belong to the corpus vocabulary.
+
+    Words are whitespace-separated chunks, lowercased, with leading and
+    trailing ASCII punctuation stripped; chunks that end up empty are
+    ignored on both sides. Returns 0.0 when ``generated`` contains no
+    usable words.
+    """
+    def normalized_words(text):
+        stripped = (raw.lower().strip(string.punctuation) for raw in text.split())
+        return [word for word in stripped if word]
+
+    known = set(normalized_words(corpus_text))
+    candidates = normalized_words(generated)
+    if not candidates:
+        return 0.0
+    hits = sum(1 for word in candidates if word in known)
+    return hits / len(candidates)
+
+
+def line_structure_stats(generated: str) -> dict:
+    """Summarize the line lengths of ``generated``.
+
+    Only lines that contain something other than whitespace are counted;
+    a counted line's length is its full character count (surrounding
+    whitespace included). Returns a dict with ``n_lines``,
+    ``mean_line_len`` and ``std_line_len`` (population standard
+    deviation), all zero when there are no such lines.
+    """
+    lengths = [len(line) for line in generated.split("\n") if line.strip()]
+    count = len(lengths)
+    if count == 0:
+        return {"n_lines": 0, "mean_line_len": 0.0, "std_line_len": 0.0}
+    mean = sum(lengths) / count
+    variance = sum((length - mean) ** 2 for length in lengths) / count
+    return {
+        "n_lines": count,
+        "mean_line_len": mean,
+        "std_line_len": variance ** 0.5,
+    }
+
+
+def vc_alternation_rate(generated: str) -> float:
+    """Rate at which consecutive letters switch between vowel and
+    non-vowel.
+
+    Characters not in ``LETTERS`` are dropped first, so letters on either
+    side of a space or punctuation mark count as adjacent. Returns the
+    fraction of adjacent letter pairs whose members fall in different
+    classes (one in ``VOWELS``, the other not), or 0.0 when fewer than
+    two letters remain.
+    """
+    vowel_flags = [ch in VOWELS for ch in generated if ch in LETTERS]
+    pair_total = len(vowel_flags) - 1
+    if pair_total < 1:
+        return 0.0
+    switches = sum(
+        1 for left, right in zip(vowel_flags, vowel_flags[1:]) if left != right
+    )
+    return switches / pair_total
+
+
+def line_length_match(generated: str, corpus_text: str) -> float:
+    """L1 distance between the line-length distributions of ``generated``
+    and ``corpus_text``.
+
+    For each text, non-blank lines are binned by length (lengths above
+    80 share the last bin) and the bins are normalized to sum to 1. A
+    text with no non-blank lines yields an all-zero distribution. The
+    result is the sum of absolute per-bin differences, so LOWER means a
+    closer match to the corpus's line structure.
+    """
+    cap = 80
+
+    def distribution(text):
+        bins = [0] * (cap + 1)
+        for line in text.split("\n"):
+            if line.strip():
+                bins[min(len(line), cap)] += 1
+        denom = sum(bins) or 1  # avoid dividing by zero when no lines counted
+        return [count / denom for count in bins]
+
+    paired = zip(distribution(generated), distribution(corpus_text))
+    return sum(abs(ours - theirs) for ours, theirs in paired)
+
+
+def real_word_fraction(generated: str, corpus_text: str,
+                          min_word_len: int = 3) -> float:
+    """Among generated words of at least ``min_word_len`` characters,
+    the share that occur in the corpus vocabulary.
+
+    Words are whitespace-separated chunks, lowercased, with leading and
+    trailing ASCII punctuation stripped. Shorter words are left out of
+    both numerator and denominator because they match too easily by
+    chance. Returns 0.0 when no generated word is long enough.
+    """
+    def normalized_words(text):
+        stripped = (raw.lower().strip(string.punctuation) for raw in text.split())
+        return [word for word in stripped if word]
+
+    known = set(normalized_words(corpus_text))
+    eligible = [
+        word for word in normalized_words(generated)
+        if len(word) >= min_word_len
+    ]
+    if not eligible:
+        return 0.0
+    hits = sum(1 for word in eligible if word in known)
+    return hits / len(eligible)
+
+
+def common_word_presence(generated: str, corpus_text: str,
+                            top_k: int = 50) -> float:
+    """Share of the corpus's ``top_k`` most frequent words that show up
+    at least once in ``generated``.
+
+    Words are whitespace-separated chunks, lowercased, with leading and
+    trailing ASCII punctuation stripped. The denominator is the number
+    of top words actually available (fewer than ``top_k`` for a small
+    corpus); returns 0.0 when there are none.
+    """
+    def normalized_words(text):
+        stripped = (raw.lower().strip(string.punctuation) for raw in text.split())
+        return [word for word in stripped if word]
+
+    frequencies = Counter(normalized_words(corpus_text))
+    top_words = {word for word, _count in frequencies.most_common(top_k)}
+    if not top_words:
+        return 0.0
+    seen = set(normalized_words(generated))
+    return len(seen & top_words) / len(top_words)
+
+
+def avg_word_length_match(generated: str, corpus_text: str) -> float:
+    """Closeness of the mean word length in ``generated`` to that of
+    ``corpus_text``.
+
+    Words are whitespace-separated chunks, lowercased, with leading and
+    trailing ASCII punctuation stripped. Returns
+    ``1 - |gen_mean - corpus_mean| / corpus_mean`` clamped below at 0.0,
+    or 0.0 when the corpus has no words.
+    """
+    def mean_word_len(text):
+        stripped = (raw.lower().strip(string.punctuation) for raw in text.split())
+        lengths = [len(word) for word in stripped if word]
+        if not lengths:
+            return 0.0
+        return sum(lengths) / len(lengths)
+
+    generated_mean = mean_word_len(generated)
+    corpus_mean = mean_word_len(corpus_text)
+    if corpus_mean == 0:
+        return 0.0
+    relative_gap = abs(generated_mean - corpus_mean) / corpus_mean
+    return max(0.0, 1.0 - relative_gap)
+
+
+def ngram_diversity(generated: str, n: int = 3) -> float:
+    """Ratio of distinct character n-grams to total n-gram windows in
+    ``generated``.
+
+    1.0 means every window is different (maximum diversity); the value
+    falls toward 0 as the same n-grams repeat. Returns 0.0 when the text
+    is shorter than ``n``. Acts as a counter-Goodhart check against a
+    model gaming overlap metrics through repetition.
+    """
+    if len(generated) < n:
+        return 0.0
+    # Past the guard above there is always at least one window.
+    window_total = len(generated) - n + 1
+    distinct = {generated[k:k + n] for k in range(window_total)}
+    return len(distinct) / window_total
+
+
+def repetition_penalty(generated: str, n: int = 4,
+                         max_freq_threshold: int = 3) -> float:
+    """Penalty in [0, 1] for over-repeated character n-grams; 0 means
+    no penalty.
+
+    Each n-gram that occurs more than ``max_freq_threshold`` times
+    contributes its excess occurrences; the summed excess is divided by
+    the total number of n-gram windows and capped at 1.0. Returns 0.0
+    when the text is shorter than ``n``. Targets the
+    'fan fan, fan, fan' failure mode.
+    """
+    if len(generated) < n:
+        return 0.0
+    window_total = len(generated) - n + 1
+    frequencies = Counter(generated[k:k + n] for k in range(window_total))
+    overflow = sum(
+        freq - max_freq_threshold
+        for freq in frequencies.values()
+        if freq > max_freq_threshold
+    )
+    # Scale by the number of windows, then cap.
+    return min(1.0, overflow / max(1, window_total))
+
+
+def lexical_diversity(generated: str) -> float:
+    """Type-token ratio over the words of ``generated``.
+
+    Words are whitespace-separated chunks, lowercased, with leading and
+    trailing ASCII punctuation stripped; chunks that end up empty are
+    ignored. Returns distinct words / total words (higher = more varied
+    vocabulary, lower = repetitive word use), or 0.0 when there are no
+    words.
+    """
+    # Uses the module-level ``string`` import, like the other helpers in
+    # this file, rather than a function-local aliased import.
+    stripped = (raw.lower().strip(string.punctuation) for raw in generated.split())
+    words = [word for word in stripped if word]
+    if not words:
+        return 0.0
+    return len(set(words)) / len(words)
+
+
+def creativity_score(generated: str, corpus_text: str) -> dict:
+    """Score how corpus-like ``generated`` is, with anti-gibberish terms.
+
+    Computes the individual metrics defined in this module and combines a
+    subset of them into one composite, clamped to [0, 1]:
+
+      composite = 0.25 * real_word_fraction
+                + 0.15 * common_word_presence (top 50 corpus words)
+                + 0.15 * vocab_overlap
+                + 0.10 * avg_word_len_match
+                + 0.15 * ngram_3
+                + 0.10 * ngram_4
+                + 0.10 * max(0, 1 - line_dist)
+                - 0.30 * repetition_penalty
+
+    The repetition penalty is taken over 4-char n-grams with a frequency
+    threshold of max(2, len(generated) // 50).
+
+    ngram_2, vc_alternation and line_stats are returned for inspection
+    but do not contribute to the composite.
+
+    NOTE(review): an earlier version of this docstring listed
+    ngram_diversity and lexical_diversity as multipliers; neither is
+    called in this function. Confirm whether that omission is intended.
+
+    Returns:
+        dict with keys ngram_2, ngram_3, ngram_4, vocab_overlap,
+        common_word_presence, real_word_fraction, avg_word_len_match,
+        vc_alternation, line_dist, line_stats (itself a dict),
+        repetition_penalty and creativity_score (the composite).
+    """
+    n2 = char_ngram_overlap(generated, corpus_text, 2)
+    n3 = char_ngram_overlap(generated, corpus_text, 3)
+    n4 = char_ngram_overlap(generated, corpus_text, 4)
+    vocab = vocab_overlap(generated, corpus_text)
+    vc = vc_alternation_rate(generated)
+    # line_dist is a distance (lower = closer); it is inverted below.
+    line_dist = line_length_match(generated, corpus_text)
+    line_stats = line_structure_stats(generated)
+    # Strong anti-gibberish: common-word, real-word, and word-length.
+    cw = common_word_presence(generated, corpus_text, top_k=50)
+    rw = real_word_fraction(generated, corpus_text, min_word_len=3)
+    awl = avg_word_length_match(generated, corpus_text)
+    # Repetition penalty: only severe excess counts now (threshold scales
+    # with text length so real text's natural repetition doesn't penalize).
+    threshold = max(2, len(generated) // 50)
+    rep_pen = repetition_penalty(generated, n=4, max_freq_threshold=threshold)
+
+    # The positive weights sum to 1.0; the penalty is subtracted afterwards.
+    composite = (
+        0.25 * rw +              # real-word fraction (HARDEST anti-gibberish)
+        0.15 * cw +              # common-word presence
+        0.15 * vocab +           # any vocab overlap (short tokens count)
+        0.10 * awl +             # word-length sanity
+        0.15 * n3 +              # 3-gram match (corpus patterns)
+        0.10 * n4 +              # 4-gram match (longer patterns)
+        0.10 * max(0.0, 1.0 - line_dist)   # line structure
+    ) - 0.3 * rep_pen
+    # Clamp: the subtraction can push the value below zero.
+    composite = max(0.0, min(1.0, composite))
+    return {
+        "ngram_2": n2,
+        "ngram_3": n3,
+        "ngram_4": n4,
+        "vocab_overlap": vocab,
+        "common_word_presence": cw,
+        "real_word_fraction": rw,
+        "avg_word_len_match": awl,
+        "vc_alternation": vc,
+        "line_dist": line_dist,
+        "line_stats": line_stats,
+        "repetition_penalty": rep_pen,
+        "creativity_score": composite,
+    }