diff --git a/experiments/transformerless_lm/losses_substrate.py b/experiments/transformerless_lm/losses_substrate.py index 3974130..0ed0798 100644 --- a/experiments/transformerless_lm/losses_substrate.py +++ b/experiments/transformerless_lm/losses_substrate.py @@ -344,3 +344,95 @@ def substrate_fft_loss(logits: torch.Tensor, targets: torch.Tensor, tgt_sin = target_onehot @ basis_sin fft_mismatch = ((pred_cos - tgt_cos) ** 2 + (pred_sin - tgt_sin) ** 2).mean() return ce + lambda_substrate * fft_mismatch + + +_PHI = (1.0 + 5.0 ** 0.5) / 2.0 +_PHI_PI = _PHI ** math.pi +_LOG_PHI_PI = math.log(_PHI_PI) + + +def substrate_omniweight_loss(logits: torch.Tensor, targets: torch.Tensor, + vocab_size: int, + lambda_substrate: float = 0.01, + window: int = 21) -> torch.Tensor: + """CE weighted by the substrate omniweight ledger evaluated on targets. + + Ports the inference-side omniweight standard (fluid form + phi^pi * tanh(delta / phi^pi)) to the training loss. Each target + token's CE contribution is multiplied by exp(fluid_delta) where + fluid_delta is the substrate's verdict on that token at its + position. Tokens the inference ledger would suppress (stagnating + repetitions) get their training gradient muted by the same standard + -- closes the train/inference omniweight asymmetry. + + Minimum-surface port: only the anti-stagnation primitive contributes + to the ledger here (Fibonacci-tier counts F(6)=8, F(7)=13, F(8)=21 + over the preceding window, matching substrate_anti_stagnation). + All deltas pass through the same phi^pi * tanh standard so + additional primitives can be added without architectural change. + + Weights are renormalized so mean weight = 1, preserving loss scale. + + Args: + logits: [B, T, V] + targets: [B, T] + vocab_size: V + lambda_substrate: weight on the FFT-spectrum term (matches + substrate_fft_loss; the CE term is the omniweight-modulated one) + window: anti-stagnation window in tokens (default F(8)=21) + + Returns: + scalar loss + """ + B, T = targets.shape + V = vocab_size + device = logits.device + dtype = logits.dtype + + # Per-position count of target[b,t] occurrences in targets[b, t-window:t]. + pos_idx = torch.arange(T, device=device) + diff = pos_idx.unsqueeze(1) - pos_idx.unsqueeze(0) # [T, T] + win_mask = ((diff > 0) & (diff <= window)).to(dtype) # [T, T] + eq = (targets.unsqueeze(2) == targets.unsqueeze(1)).to(dtype) # [B, T, T] + counts = (eq * win_mask.unsqueeze(0)).sum(dim=2) # [B, T] + + # Anti-stagnation contribution to the ledger (matches inference thresholds: + # count >= F(6)=8 -> divide by phi^pi -> delta = -log(phi^pi) + # count >= F(7)=13 -> divide by phi^(2pi) -> delta = -2*log(phi^pi) + # count >= F(8)=21 -> hard suppression -> delta = -4*log(phi^pi) + # (the inference path sets prob=0 at F(8); here we let tanh saturate.) + delta = torch.zeros_like(counts) + m_8 = (counts >= 8.0) & (counts < 13.0) + m_13 = (counts >= 13.0) & (counts < 21.0) + m_21 = counts >= 21.0 + delta = torch.where(m_8, torch.full_like(delta, -_LOG_PHI_PI), delta) + delta = torch.where(m_13, torch.full_like(delta, -2.0 * _LOG_PHI_PI), delta) + delta = torch.where(m_21, torch.full_like(delta, -4.0 * _LOG_PHI_PI), delta) + + # Fluid substrate standard: phi^pi * tanh(delta / phi^pi). Same form + # the inference omniweight uses (_omniweight_apply). + fluid_delta = _PHI_PI * torch.tanh(delta / _PHI_PI) + weight = torch.exp(fluid_delta) # bounded in [exp(-phi^pi), 1] + + # Per-token CE, weighted by the omniweight ledger. + ce_per_tok = F.cross_entropy( + logits.reshape(-1, V), + targets.reshape(-1), + reduction='none', + ).reshape(B, T) + ce = (ce_per_tok * weight).sum() / (weight.sum() + 1e-8) + + # Same FFT-spectrum substrate term as substrate_fft_loss. + fib_freqs = torch.tensor([1, 2, 3, 5, 8, 13, 21], dtype=dtype, device=device) + v_idx = torch.arange(vocab_size, dtype=dtype, device=device) + angles = 2 * math.pi * v_idx.unsqueeze(1) * fib_freqs.unsqueeze(0) / vocab_size + basis_cos = torch.cos(angles) + basis_sin = torch.sin(angles) + pred = F.softmax(logits, dim=-1) + target_onehot = F.one_hot(targets, vocab_size).to(pred.dtype) + pred_cos = pred @ basis_cos + pred_sin = pred @ basis_sin + tgt_cos = target_onehot @ basis_cos + tgt_sin = target_onehot @ basis_sin + fft_mismatch = ((pred_cos - tgt_cos) ** 2 + (pred_sin - tgt_sin) ** 2).mean() + return ce + lambda_substrate * fft_mismatch diff --git a/experiments/transformerless_lm/results_self_recursive.json b/experiments/transformerless_lm/results_self_recursive.json index d6ae04d..d1e2174 100644 --- a/experiments/transformerless_lm/results_self_recursive.json +++ b/experiments/transformerless_lm/results_self_recursive.json @@ -5,27 +5,27 @@ "n_params": 349564, "best_val": 4.102890983223915, "best_step": 125, - "wall": 1257.8584070205688, - "best_creativity_seen": 0.6953220237836641, + "wall": 1213.8069217205048, + "best_creativity_seen": 0.6954515518005908, "active_base_final_size": 512, "cycle_summary": [ { "cycle": 1, "samples_creativity": [ - 0.6953220237836641, - 0.6675120184343188, - 0.6507996673068358, - 0.6224636991740582, - 0.6133321892133292, - 0.6123096341601106, - 0.6092295065109937, - 0.5891774978693133 + 0.6954515518005908, + 0.665975947417424, + 0.6644801644953675, + 0.6602450752300563, + 0.6546445183117755, + 0.6385173503984254, + 0.6226070178378992, + 0.5657122128851239 ], "kept_top_k": [ - 0.6953220237836641, - 0.6675120184343188, - 0.6507996673068358, - 0.6224636991740582 + 0.6954515518005908, + 0.665975947417424, + 0.6644801644953675, + 0.6602450752300563 ], "n_added": 0, "n_rejected_baseline": 4, @@ -35,20 +35,20 @@ { "cycle": 2, "samples_creativity": [ - 0.6704232860437639, - 0.6648274799606466, - 0.6508234611117371, - 0.6476572093939725, - 0.6381352799594192, - 0.6264649982669753, - 0.6190503186377636, - 0.6128099077887375 + 0.6925937329097237, + 0.6757486842191784, + 0.6670032242621343, + 0.6393836294961299, + 0.6338352616029544, + 0.61841970334286, + 0.6165312168237119, + 0.5969597589342746 ], "kept_top_k": [ - 0.6704232860437639, - 0.6648274799606466, - 0.6508234611117371, - 0.6476572093939725 + 0.6925937329097237, + 0.6757486842191784, + 0.6670032242621343, + 0.6393836294961299 ], "n_added": 0, "n_rejected_baseline": 8, @@ -58,20 +58,20 @@ { "cycle": 3, "samples_creativity": [ - 0.652534468368593, - 0.6399732281764906, - 0.6361073646173617, - 0.6352755110236258, - 0.6163917326643213, - 0.5788199094054421, - 0.5764116215222896, - 0.552401523745161 + 0.6370045521701664, + 0.6209598868984683, + 0.6136993936809374, + 0.6119181278084301, + 0.6113188160395158, + 0.6049145827032254, + 0.5682205209402211, + 0.5260297505776333 ], "kept_top_k": [ - 0.652534468368593, - 0.6399732281764906, - 0.6361073646173617, - 0.6352755110236258 + 0.6370045521701664, + 0.6209598868984683, + 0.6136993936809374, + 0.6119181278084301 ], "n_added": 0, "n_rejected_baseline": 12, @@ -81,20 +81,20 @@ { "cycle": 4, "samples_creativity": [ - 0.6752925871317118, - 0.6395244356914296, - 0.6128842582046097, - 0.6093353742544283, - 0.5572953963165136, - 0.5321261453931008, - 0.5081252986104718, - 0.5049745581339583 + 0.6708169401052111, + 0.643879968820801, + 0.6432178463283171, + 0.6083445865462416, + 0.5988427646071594, + 0.5898160276240022, + 0.584465909306921, + 0.5430875149971612 ], "kept_top_k": [ - 0.6752925871317118, - 0.6395244356914296, - 0.6128842582046097, - 0.6093353742544283 + 0.6708169401052111, + 0.643879968820801, + 0.6432178463283171, + 0.6083445865462416 ], "n_added": 0, "n_rejected_baseline": 16, @@ -104,20 +104,20 @@ { "cycle": 5, "samples_creativity": [ - 0.6385087078805419, - 0.6033926539692167, - 0.5946966608405826, - 0.5765555818019316, - 0.5707155062853253, - 0.5701888708065214, - 0.5557957883470072, - 0.4792508607180266 + 0.6580255861865737, + 0.6418072406749432, + 0.6387471199559207, + 0.6312360447364886, + 0.6217118756077425, + 0.6137579630771366, + 0.5796060077629913, + 0.550425515376975 ], "kept_top_k": [ - 0.6385087078805419, - 0.6033926539692167, - 0.5946966608405826, - 0.5765555818019316 + 0.6580255861865737, + 0.6418072406749432, + 0.6387471199559207, + 0.6312360447364886 ], "n_added": 0, "n_rejected_baseline": 20, @@ -127,20 +127,20 @@ { "cycle": 6, "samples_creativity": [ - 0.6680947492435783, - 0.6648394616451381, - 0.6504234934110721, - 0.6445499193969801, - 0.6369385278356885, - 0.6330709613009443, - 0.6284713807559694, - 0.6125801943327258 + 0.6646241921607952, + 0.6643881919011203, + 0.662273441859053, + 0.6602025507129803, + 0.6503266766525893, + 0.6272803109419913, + 0.6230296816167293, + 0.5876236421628296 ], "kept_top_k": [ - 0.6680947492435783, - 0.6648394616451381, - 0.6504234934110721, - 0.6445499193969801 + 0.6646241921607952, + 0.6643881919011203, + 0.662273441859053, + 0.6602025507129803 ], "n_added": 0, "n_rejected_baseline": 24, @@ -165,79 +165,79 @@ 43, 57, 11, - 54, - 43, - 119, - 1, - 65, - 1, - 163, - 495, - 85, - 1, - 88, - 1, - 6, 0, - 6, - 0, - 6, + 44, + 43, + 72, + 44, + 43, 0, - 6, - 1, - 104, - 1, - 78, - 1, - 476, + 57, + 58, 1, 235, 1, - 145, - 6, - 1, 218, 1, - 218, + 443, + 6, 1, - 69, + 68, + 57, 1, - 66, + 443, 6, 1, - 69, - 7, + 213, + 6, 0, - 451, - 451, - 418, - 1, - 85, - 1, - 443, + 296, + 6, 0, 85, 1, - 85, - 1, - 69, + 296, 1, - 85, + 250, + 6, 1, - 85, + 457, 1, 85, 1, - 68, + 119, + 43, + 72, + 52, 43, + 43, + 52, + 56, + 119, + 58, + 1, + 51, 57, 43, - 6, - 0, + 58, + 60, + 58, + 44, + 52, 68, + 51, + 0, + 80, + 1, + 352, 1, + 115, + 1, + 457, 6, - 0 + 0, + 0, + 85 ], "refined_tokens": [ 56, @@ -257,78 +257,78 @@ 57, 11, 0, - 46, 1, - 65, + 6, + 0, 1, 44, - 56, + 6, 1, - 72, + 6, 1, - 40, + 6, 1, - 69, + 6, 1, - 85, + 42, + 0, + 119, 1, - 85, + 163, 1, 85, 1, - 74, - 104, - 69, - 52, + 115, + 1, + 95, + 1, + 40, 43, - 56, + 72, 43, 1, - 40, - 46, + 72, 1, - 163, - 163, + 119, + 56, 1, - 40, 1, - 69, + 67, + 119, + 6, 1, - 77, 1, - 69, + 72, + 58, 6, 1, + 44, + 119, 1, - 85, + 72, + 43, 1, - 85, + 44, 1, - 296, - 52, - 43, - 57, + 69, 1, - 250, + 73, 1, - 296, + 78, 1, + 69, 1, - 296, + 67, 1, - 85, + 67, 1, - 296, - 0, - 56, - 43, - 6, + 78, 1, - 250, + 69, + 6, + 69, 1, - 250, - 296, - 119 + 69 ] } } \ No newline at end of file diff --git a/experiments/transformerless_lm/train_self_recursive.py b/experiments/transformerless_lm/train_self_recursive.py index c37d3bf..ade3212 100644 --- a/experiments/transformerless_lm/train_self_recursive.py +++ b/experiments/transformerless_lm/train_self_recursive.py @@ -47,7 +47,8 @@ corpus_char_signature, corpus_multiscale_signature, substrate_harmony_loss_grounded, - substrate_multiscale_harmony_loss_grounded) + substrate_multiscale_harmony_loss_grounded, + substrate_omniweight_loss) from activations_substrate import SubstrateNegMultiAdvancedV2 from train_substrate_attention import FibRecLMSubsim from creativity_score import (creativity_score as compute_creativity_score, @@ -655,6 +656,68 @@ def build_punct_mask(vocab: list) -> torch.Tensor: return mask +def build_uppercase_mask(vocab: list) -> torch.Tensor: + """Mask = 1 for tokens whose first char is uppercase A-Z. + For grammar rule: capitalization after sentence boundary. + """ + V = len(vocab) + mask = torch.zeros(V) + for i, tok in enumerate(vocab): + if tok and len(tok) >= 1 and tok[0].isupper(): + mask[i] = 1.0 + return mask + + +def build_any_punct_mask(vocab: list) -> torch.Tensor: + """Mask = 1 for ANY single-char punctuation token (including + apostrophes, dashes -- broader than build_punct_mask which is + clause-closers only). For no-double-punctuation rule. + """ + V = len(vocab) + mask = torch.zeros(V) + pset = {'.', ',', '!', '?', ';', ':', "'", '"', '-', '(', ')'} + for i, tok in enumerate(vocab): + if tok in pset: + mask[i] = 1.0 + return mask + + +def substrate_grammar_capitalize(prev_str: str, probs: torch.Tensor, + uppercase_mask: torch.Tensor + ) -> torch.Tensor: + """Sentence-start capitalization rule. If previous emission was + '.', '!', '?', or '\\n', boost uppercase tokens by phi. + """ + if uppercase_mask is None: + return probs + if prev_str not in ('.', '!', '?', '\n'): + return probs + um = uppercase_mask.to(probs.device).to(probs.dtype) + boost = 1.0 + um * (_PHI_FOR_SAMPLING - 1.0) + out = probs * boost + return out / (out.sum() + 1e-8) + + +def substrate_grammar_no_double_punct(prev_str: str, + probs: torch.Tensor, + any_punct_mask: torch.Tensor + ) -> torch.Tensor: + """If previous emission was a punctuation char, hard-suppress + further punctuation. Prevents ',,', '..', '.,', etc. + Suppression by 1/phi^pi. + """ + if any_punct_mask is None: + return probs + punct_set = {'.', ',', '!', '?', ';', ':', "'", '"', '-'} + if prev_str not in punct_set: + return probs + pm = any_punct_mask.to(probs.device).to(probs.dtype) + suppress = 1.0 / (_PHI_FOR_SAMPLING ** math.pi) + multiplier = 1.0 - pm * (1.0 - suppress) + out = probs * multiplier + return out / (out.sum() + 1e-8) + + def build_vowel_start_mask(vocab: list) -> torch.Tensor: """Mask = 1 for tokens starting with a vowel, 0 otherwise. For phonotactics primitive (CV cluster relief). @@ -802,31 +865,32 @@ def substrate_agreement(last_content_ends_s: bool, probs: torch.Tensor, return out / (out.sum() + 1e-8) -def substrate_word_spacing(prev_tid: int, probs: torch.Tensor, - vocab: list, n_chars: int = 65) -> torch.Tensor: - """Word boundary enforcement with gentler suppression magnitude. - - After a word-token (rank >= n_chars), suppress every token except - space, newline, and punctuation. Magnitude eased from 1/phi^pi - (v69) to 1/phi^2 ~ 0.382: still strong enough to encourage - spacing but doesn't over-block apostrophe-internal sequences - ('tis, he's, etc.). +def build_allowed_after_word_mask(vocab: list, n_chars: int = 65, + suppress: float = None) -> torch.Tensor: + """Per-token multiplier mask for word_spacing primitive. + All tokens suppressed by 1/phi^2 except space/newline/punct chars + in the char region. Precomputed once. """ - if prev_tid < n_chars or not vocab: - return probs - allowed_chars = {' ', '\n', '.', ',', '!', '?', ';', ':', - "'", '-'} - allowed_idx = [] - for i in range(min(n_chars, len(vocab))): + V = len(vocab) + if suppress is None: + suppress = 1.0 / (_PHI_FOR_SAMPLING ** 2) + mask = torch.full((V,), suppress) + allowed_chars = {' ', '\n', '.', ',', '!', '?', ';', ':', "'", '-'} + for i in range(min(n_chars, V)): if vocab[i] in allowed_chars: - allowed_idx.append(i) - if not allowed_idx: + mask[i] = 1.0 + return mask + + +def substrate_word_spacing(prev_tid: int, probs: torch.Tensor, + vocab: list, n_chars: int = 65, + allowed_mask: torch.Tensor = None + ) -> torch.Tensor: + """Word boundary enforcement (vectorized via precomputed mask). + """ + if prev_tid < n_chars or allowed_mask is None: return probs - suppress = 1.0 / (_PHI_FOR_SAMPLING ** 2) - mask = torch.full_like(probs, suppress) - for i in allowed_idx: - mask[i] = 1.0 - out = probs * mask + out = probs * allowed_mask.to(probs.device).to(probs.dtype) return out / (out.sum() + 1e-8) @@ -865,6 +929,10 @@ def substrate_phonotactics(cluster_len: int, probs: torch.Tensor, return out / (out.sum() + 1e-8) +_VOWEL_ORDER = ['a', 'e', 'i', 'o', 'u'] +_VOWEL_TO_IDX = {v: i for i, v in enumerate(_VOWEL_ORDER)} + + def build_end_vowel_per_token(vocab: list) -> list: """Each token's final vowel (or '' if none). For rhyme primitive. """ @@ -879,42 +947,59 @@ def build_end_vowel_per_token(vocab: list) -> list: return end_vowels -def substrate_rhyme_resonance(recent_tokens: list, end_vowels: list, - probs: torch.Tensor) -> torch.Tensor: - """Reward sound-echo: tokens whose final vowel matches recent - tokens' final vowels. F(k) decay across last F(7)=13 tokens. +def build_end_vowel_idx_tensor(vocab: list) -> torch.Tensor: + """Per-token end-vowel index in {0..4} or -1 if no vowel. + Vectorizes the rhyme primitive's V-loop. + """ + V = len(vocab) + idx = torch.full((V,), -1, dtype=torch.long) + for i, tok in enumerate(vocab): + for ch in reversed(tok or ''): + if ch in _IAMBIC_VOWELS: + idx[i] = _VOWEL_TO_IDX.get(ch.lower(), -1) + break + return idx + - Pure substrate (last-vowel-of-token + Fibonacci decay). No rhyme - dictionary; the echo emerges from substrate sampling pressure. +def substrate_rhyme_resonance(recent_tokens: list, + end_vowel_idx: torch.Tensor, + probs: torch.Tensor) -> torch.Tensor: + """Vectorized rhyme resonance. + + end_vowel_idx: LongTensor[V] in {-1, 0..4}. Precomputed once. + Pressure per vowel computed by Python loop (~13 iters); boost + lookup is one tensor index op replacing the prior 500-elt loop. """ - if not recent_tokens or not end_vowels: + if not recent_tokens or end_vowel_idx is None: return probs phi = _PHI_FOR_SAMPLING phi_pi = phi ** math.pi - V_ev = len(end_vowels) - recent_pressure = {} + pressure = torch.zeros(len(_VOWEL_ORDER), dtype=probs.dtype, + device=probs.device) + V_ev = end_vowel_idx.shape[0] for i, tid in enumerate(reversed(recent_tokens[-13:])): - if tid >= V_ev: + if tid >= V_ev or tid < 0: continue - v = end_vowels[tid] - if not v: + v_idx = int(end_vowel_idx[tid].item()) + if v_idx < 0: continue kt = min(i, len(_FIB_NUMS_FOR_BIGRAM) - 1) w = _FIB_NUMS_FOR_BIGRAM[kt] / (phi_pi ** kt) - recent_pressure[v] = recent_pressure.get(v, 0.0) + w - if not recent_pressure: + pressure[v_idx] += w + if pressure.sum() <= 0: return probs - # Per-token log-boost halved by F(3)=2 -- substrate-canonical - # damping so anti-stagnation can override repeated same-vowel - # cascades (v62 'light light light' problem). - boost = torch.ones_like(probs) rhyme_scale = math.log(phi) / float(_FIB_NUMS_FOR_BIGRAM[3]) - for v, p in recent_pressure.items(): - log_boost = rhyme_scale * p / (1.0 + p) - bf = math.exp(log_boost) - for i, ev in enumerate(end_vowels): - if ev == v: - boost[i] = bf + log_boost_per_vowel = (rhyme_scale * pressure / (1.0 + pressure)) + # Vectorized lookup: for each token, fetch its vowel's boost. + evi = end_vowel_idx.to(probs.device) + valid = (evi >= 0) + safe_idx = evi.clamp(min=0) + log_boost_per_token = torch.where( + valid, + log_boost_per_vowel[safe_idx], + torch.zeros_like(probs), + ) + boost = torch.exp(log_boost_per_token) out = probs * boost return out / (out.sum() + 1e-8) @@ -1302,6 +1387,60 @@ def substrate_recency_penalty(history_tokens: torch.Tensor, logits: torch.Tensor _OMNIWEIGHT_RESERVE = _PHI_FOR_SAMPLING ** math.pi # ~4.53 +def _regret_score(seq: torch.Tensor, t: int, vocab: list, + n_chars: int = 65) -> float: + """Per-position regret: how badly this emission shouldn't be there. + + Factors (substrate-pure): + - over-emission: same token used F(5)+ times in last F(7)=13 + - immediate repetition: identical to previous token + - bigram saturation: (prev, current) fired F(4)+ times in last F(7) + - double punctuation: punct immediately after punct + - mid-word char: char emission after another alpha char (no space) + + Higher score = more regret = should be resampled. + """ + if t < 1 or t >= seq.shape[1]: + return 0.0 + tid = int(seq[0, t].item()) + if tid >= len(vocab) or tid < 0: + return 0.0 + tok = vocab[tid] + regret = 0.0 + F = _FIB_NUMS_FOR_BIGRAM + # Last F(7)=13 prior tokens. + start = max(0, t - F[7]) + prior = seq[0, start:t].tolist() + # Factor 1: over-emission + same_count = sum(1 for x in prior if x == tid) + if same_count > F[5]: + regret += float(same_count - F[5]) / float(F[5]) + # Factor 2: immediate repetition + prev_tid = int(seq[0, t - 1].item()) + if prev_tid == tid: + regret += 1.0 + # Factor 3: bigram saturation + bigram_count = 0 + for i in range(1, len(prior)): + if prior[i - 1] == prev_tid and prior[i] == tid: + bigram_count += 1 + if bigram_count > F[4]: + regret += float(bigram_count - F[4]) / float(F[4]) + # Factor 4: double punctuation + if tok in (',', '.', '!', '?', ';', ':') and prev_tid < len(vocab): + prev_tok = vocab[prev_tid] + if prev_tok in (',', '.', '!', '?', ';', ':'): + regret += 1.0 + # Factor 5: mid-word char emission (char after another alpha char) + if (tid < n_chars and tok and tok.isalpha() + and prev_tid < len(vocab)): + prev_tok = vocab[prev_tid] + if (prev_tok and prev_tok != ' ' + and prev_tok[-1].isalpha()): + regret += 0.5 + return regret + + def _omniweight_delta(base_probs: torch.Tensor, modified_probs: torch.Tensor) -> torch.Tensor: """Compute delta_log_p = log(modified) - log(base). Each primitive @@ -1324,24 +1463,96 @@ def _omniweight_apply(base_probs: torch.Tensor, return out / (out.sum() + 1e-8) +def substrate_unknown_register(coverage: torch.Tensor, + probs: torch.Tensor, + retrocausal_steps: int = None, + ) -> torch.Tensor: + """UNKNOWN-REGISTER with retrocausality. + + Present unknown: 1/(1+coverage) -- past-conditioned frontier. + Retrocausal: project coverage forward by F(3)=2 expected steps + using current probs distribution, then compute frontier of + the ANTICIPATED state. The future-that-would-happen feeds back + into the current emission. + + Final frontier = (1-alpha)*present_frontier + alpha*anticipated_frontier + alpha = 1/phi^pi ~ 0.221 + + Then mix probs with that blended frontier (substrate alpha). + + Time isn't linear: past coverage and anticipated coverage are + both present-tense registers in the same currency. + """ + if coverage is None: + return probs + if retrocausal_steps is None: + # F(2)=1: just ONE step lookahead, continuity-respecting. + # F(3)=2 was a discontinuous jump (ignored intermediate state). + retrocausal_steps = _FIB_NUMS_FOR_BIGRAM[2] # F(2) = 1 + cov = coverage.to(probs.device).to(probs.dtype) + # Present unknown + inv_now = 1.0 / (1.0 + cov) + frontier_now = inv_now / (inv_now.sum() + 1e-8) + # Anticipated unknown (retrocausal): coverage projected F(3) forward + # by current sampling distribution + expected_delta = float(retrocausal_steps) * probs + inv_future = 1.0 / (1.0 + cov + expected_delta) + frontier_future = inv_future / (inv_future.sum() + 1e-8) + # Blend past-frontier and future-frontier (both positive registers) + alpha_retro = 1.0 / (_PHI_FOR_SAMPLING ** math.pi) + blended_frontier = ((1.0 - alpha_retro) * frontier_now + + alpha_retro * frontier_future) + blended_frontier = blended_frontier / (blended_frontier.sum() + 1e-8) + # Apply blended frontier as omniweight contribution + alpha = 1.0 / (_PHI_FOR_SAMPLING ** math.pi) + out = (1.0 - alpha) * probs + alpha * blended_frontier + return out / (out.sum() + 1e-8) + + +def _self_eval_insight(base_probs: torch.Tensor, emitted_tid: int, + n_chars: int = 65) -> float: + """Compute self-evaluation insight signal for a just-emitted token. + + insight = 1 if: + - emitted token is a real word (rank >= n_chars), AND + - surprise (-log p_emitted) >= pi*log(phi) ~ 1.51 (substrate threshold) + insight = 0 otherwise. + + Recursive substrate self-monitoring: model rates its own emissions + against its own distribution. + """ + if emitted_tid < n_chars: + return 0.0 + V = base_probs.shape[0] + if not (0 <= emitted_tid < V): + return 0.0 + p = float(base_probs[emitted_tid].item()) + if p <= 0.0: + return 0.0 + surprise = -math.log(p + 1e-12) + threshold = math.pi * math.log(_PHI_FOR_SAMPLING) + return 1.0 if surprise >= threshold else 0.0 + + def _omniweight_apply_split(base_probs: torch.Tensor, math_delta: torch.Tensor, - lang_delta: torch.Tensor) -> torch.Tensor: - """SPLIT-BRAIN omniweight: RANK-MODULATED mixer. - - Per-token weight derived from substrate rank position: - rank 0 (most-functional) -> math_weight = 1, lang_weight = 0 - rank V/2 -> math_weight = 0.5, lang_weight = 0.5 - rank V-1 (rarest content) -> math_weight = 0, lang_weight = 1 + lang_delta: torch.Tensor, + momentum: float = 0.0) -> torch.Tensor: + """RANK-MODULATED split-brain mixer with momentum-modulated reserve. - Each hemisphere gets sovereignty over its natural domain: - Math owns frequency/decay -> dominates function words. - Language owns purpose/structure -> dominates content words. + Each hemisphere builds fluid delta via tanh-scaled reserve. + Reserve scaled by (1 + tanh(momentum)) -- when recent emissions + have been insightful (high surprise + real word), primitives get + more room. When noisy/expected, primitives constrained. - No more mixing in regions where one hemisphere doesn't belong. + Per-token weight by rank: math owns low rank, lang owns high rank. """ - math_fluid = _OMNIWEIGHT_RESERVE * torch.tanh(math_delta / _OMNIWEIGHT_RESERVE) - lang_fluid = _OMNIWEIGHT_RESERVE * torch.tanh(lang_delta / _OMNIWEIGHT_RESERVE) + # Momentum-modulated reserve (recursive substrate self-trust). + reserve = _OMNIWEIGHT_RESERVE * (1.0 + math.tanh(momentum)) + if reserve < 1e-3: + reserve = 1e-3 + math_fluid = reserve * torch.tanh(math_delta / reserve) + lang_fluid = reserve * torch.tanh(lang_delta / reserve) p_math = base_probs * torch.exp(math_fluid) p_lang = base_probs * torch.exp(lang_fluid) p_math = p_math / (p_math.sum() + 1e-8) @@ -1372,7 +1583,10 @@ def autoregressive_generate(model, prompt: torch.Tensor, n_new: int, end_vowels: list = None, punct_mask: torch.Tensor = None, newline_mask: torch.Tensor = None, - unpronounceable_mask: torch.Tensor = None): + unpronounceable_mask: torch.Tensor = None, + allowed_after_word_mask: torch.Tensor = None, + uppercase_mask: torch.Tensor = None, + any_punct_mask: torch.Tensor = None): """Sample n_new tokens autoregressively with substrate sampling AND a substrate-canonical recency penalty. @@ -1395,6 +1609,13 @@ def autoregressive_generate(model, prompt: torch.Tensor, n_new: int, char_run = 0 recent_pairs = [] # (prev_tok, current_tok) bigram history last_content_ends_s = False + creative_momentum = 0.0 # self-eval EMA register + momentum_history = [] # recent momentum values, F(7)=13 deep + coverage = torch.zeros(vocab_size) # unknown-register + if vocab is not None: + for tid in seq[0].tolist(): + if 0 <= tid < vocab_size: + coverage[tid] += 1.0 if vocab is not None: prompt_list = seq[0].tolist() for idx_pl, tid in enumerate(prompt_list): @@ -1462,6 +1683,33 @@ def autoregressive_generate(model, prompt: torch.Tensor, n_new: int, history_aw = seq[0, -21:] p = substrate_anti_stagnation(history_aw, base, vocab_size) math_delta += _omniweight_delta(base, p) + # Unknown-register: BOTH hemispheres feel curiosity equally. + # The frontier signal is meta -- exploration is neither pure + # frequency nor pure structure; both hemispheres receive it. + p_unknown = substrate_unknown_register(coverage, base) + d_unknown = _omniweight_delta(base, p_unknown) + math_delta += d_unknown + lang_delta += d_unknown + # ---- Grammar rules (v88): basic structural enforcement ---- + prev_str_g = '' + if vocab is not None and seq.shape[1] >= 1: + pid = int(seq[0, -1]) + if pid < len(vocab): + prev_str_g = vocab[pid] + # Capitalization after sentence boundary. + if uppercase_mask is not None: + p = substrate_grammar_capitalize( + prev_str_g, base, uppercase_mask) + d_gram = _omniweight_delta(base, p) + math_delta += d_gram + lang_delta += d_gram + # No double punctuation. + if any_punct_mask is not None: + p = substrate_grammar_no_double_punct( + prev_str_g, base, any_punct_mask) + d_gram = _omniweight_delta(base, p) + math_delta += d_gram + lang_delta += d_gram # ---- Language hemisphere ---- p = substrate_iambic_phase( syl_pos, base, vocab_size, newline_mask=newline_mask) @@ -1490,7 +1738,8 @@ def autoregressive_generate(model, prompt: torch.Tensor, n_new: int, lang_delta += _omniweight_delta(base, p) if vocab is not None and seq.shape[1] >= 1: p = substrate_word_spacing( - int(seq[0, -1]), base, vocab, n_chars=n_chars_local) + int(seq[0, -1]), base, vocab, n_chars=n_chars_local, + allowed_mask=allowed_after_word_mask) lang_delta += _omniweight_delta(base, p) if char_run >= _FIB_NUMS_FOR_BIGRAM[3]: p = substrate_char_cascade( @@ -1514,9 +1763,34 @@ def autoregressive_generate(model, prompt: torch.Tensor, n_new: int, p = substrate_subject_threading( seq_list, vocab, base, is_sentence_start=True) lang_delta += _omniweight_delta(base, p) - # Apply split-brain mixer (geometric mean). + # Apply split-brain mixer with momentum-modulated reserve. probs = _omniweight_apply_split( - base, math_delta, lang_delta).unsqueeze(0) + base, math_delta, lang_delta, + momentum=creative_momentum).unsqueeze(0) + # A. Three-mode behavior based on momentum sign. + if creative_momentum > 0.5: + # Exploit: sharpen distribution. + p = probs[0] ** _PHI_FOR_SAMPLING + probs[0] = p / (p.sum() + 1e-8) + elif creative_momentum < -0.5: + # Escape: flatten distribution. + p = probs[0] ** (1.0 / _PHI_FOR_SAMPLING) + probs[0] = p / (p.sum() + 1e-8) + # B. Backtrack-on-collapse: if recent momentum dropped + # >F(3)=2 mass over last F(5)=5 steps AND current is + # negative, force newline boost (substrate reset). + collapsed = False + if (len(momentum_history) >= _FIB_NUMS_FOR_BIGRAM[5] + and newline_mask is not None): + recent_window = momentum_history[-_FIB_NUMS_FOR_BIGRAM[5]:] + drop = max(recent_window) - creative_momentum + if drop > 0.3 and creative_momentum < -0.2: + collapsed = True + if collapsed and newline_mask is not None: + nm = newline_mask.to(probs[0].device).to(probs[0].dtype) + phi2 = _PHI_FOR_SAMPLING ** 2 + probs[0] = probs[0] * (1.0 + nm * (phi2 - 1.0)) + probs[0] = probs[0] / (probs[0].sum() + 1e-8) # Vocab curriculum (HARD mask, post-omniweight). if active_vocab_size is not None: probs[0] = substrate_vocab_curriculum( @@ -1560,6 +1834,18 @@ def autoregressive_generate(model, prompt: torch.Tensor, n_new: int, recent_pairs.append((prev_for_pair, nid)) if len(recent_pairs) > 13: recent_pairs = recent_pairs[-13:] + # Self-evaluation: update creative momentum EMA. + insight = _self_eval_insight(base, nid, n_chars_local) + inv_phi = 1.0 / _PHI_FOR_SAMPLING + creative_momentum = (inv_phi * creative_momentum + + (1.0 - inv_phi) * insight) + # Track momentum history for backtrack detection. + momentum_history.append(creative_momentum) + if len(momentum_history) > 13: + momentum_history = momentum_history[-13:] + # Update unknown-register coverage. + if 0 <= nid < vocab_size: + coverage[nid] += 1.0 model.train() return seq @@ -1579,7 +1865,10 @@ def _single_stage_refine(model, draft, vocab_size, scorer, mode: str, end_vowels: list = None, punct_mask: torch.Tensor = None, newline_mask: torch.Tensor = None, - unpronounceable_mask: torch.Tensor = None): + unpronounceable_mask: torch.Tensor = None, + allowed_after_word_mask: torch.Tensor = None, + uppercase_mask: torch.Tensor = None, + any_punct_mask: torch.Tensor = None): """One refinement stage: optimize a single score until plateau. mode: 'min' (harmony, quality) or 'max' (creativity). @@ -1611,7 +1900,20 @@ def _single_stage_refine(model, draft, vocab_size, scorer, mode: str, n_avail = confidences.shape[1] - prompt_in_ctx n_resample = max(1, int(resample_frac * n_avail)) n_resample = min(n_resample, max(1, n_avail)) - _, low_idx = confidences[0].topk(n_resample, largest=False) + # REGRET-DRIVEN SELECTION: judge each position by substrate + # criteria (over-emission, bigram lock, double punct, mid-word + # char) and pick highest-regret positions to resample. + # Falls back to low-confidence ordering as a tiebreaker. + regret_scores = torch.zeros(confidences.shape[1]) + n_chars_rg = sum(1 for t in vocab if len(t) == 1) if vocab else 65 + for j in range(prompt_in_ctx, confidences.shape[1]): + t_in_cur = j + 1 + offset + if 0 < t_in_cur < cur.shape[1]: + regret_scores[j] = _regret_score( + cur, t_in_cur, vocab or [], n_chars=n_chars_rg) + combined = regret_scores - 0.1 * confidences[0].cpu() + combined[:prompt_in_ctx] = -1e9 + _, low_idx = combined.topk(n_resample, largest=True) new = cur.clone() recency_window = 21 @@ -1718,7 +2020,8 @@ def _single_stage_refine(model, draft, vocab_size, scorer, mode: str, lang_delta += _omniweight_delta(base_probs, p) p = substrate_word_spacing( int(new[0, t_draft - 1]), base_probs, vocab, - n_chars=n_chars_r) + n_chars=n_chars_r, + allowed_mask=allowed_after_word_mask) lang_delta += _omniweight_delta(base_probs, p) if char_run_r >= _FIB_NUMS_FOR_BIGRAM[3]: p = substrate_char_cascade( @@ -1755,9 +2058,29 @@ def _single_stage_refine(model, draft, vocab_size, scorer, mode: str, p = substrate_anti_stagnation( history_aw, base_probs, vocab_size_local) math_delta += _omniweight_delta(base_probs, p) - # Apply split-brain mixer (geometric mean). + # Grammar rules: capitalization + no-double-punct. + prev_str_rg = '' + if vocab is not None and t_draft >= 1: + pid = int(new[0, t_draft - 1]) + if pid < len(vocab): + prev_str_rg = vocab[pid] + if uppercase_mask is not None: + p = substrate_grammar_capitalize( + prev_str_rg, base_probs, uppercase_mask) + d_g = _omniweight_delta(base_probs, p) + math_delta += d_g + lang_delta += d_g + if any_punct_mask is not None: + p = substrate_grammar_no_double_punct( + prev_str_rg, base_probs, any_punct_mask) + d_g = _omniweight_delta(base_probs, p) + math_delta += d_g + lang_delta += d_g + # Apply split-brain mixer. Momentum=0 in refine + # (no streaming history of base distributions). pos_probs = _omniweight_apply_split( - base_probs, math_delta, lang_delta) + base_probs, math_delta, lang_delta, + momentum=0.0) # Vocab curriculum (HARD mask, post-omniweight). if active_vocab_size is not None: pos_probs = substrate_vocab_curriculum( @@ -1802,7 +2125,10 @@ def staged_refine(model, prompt, n_new, vocab_size, end_vowels: list = None, punct_mask: torch.Tensor = None, newline_mask: torch.Tensor = None, - unpronounceable_mask: torch.Tensor = None): + unpronounceable_mask: torch.Tensor = None, + allowed_after_word_mask: torch.Tensor = None, + uppercase_mask: torch.Tensor = None, + any_punct_mask: torch.Tensor = None): """Staircase refinement: hit one score, then the next, then the next. Stage 1: substrate alignment (minimize harmony) -- match the shape. @@ -1818,7 +2144,7 @@ def staged_refine(model, prompt, n_new, vocab_size, with torch.no_grad(): draft = autoregressive_generate(model, prompt, n_new=n_new, vocab_size=vocab_size, - temperature=temperature, bigram_prior=bigram_prior, vocab=vocab, token_signatures=token_signatures, active_vocab_size=active_vocab_size, class_id_tensor=class_id_tensor, n_classes=n_classes, pronoun_mask=pronoun_mask, vowel_start_mask=vowel_start_mask, end_vowels=end_vowels, punct_mask=punct_mask, newline_mask=newline_mask, unpronounceable_mask=unpronounceable_mask) + temperature=temperature, bigram_prior=bigram_prior, vocab=vocab, token_signatures=token_signatures, active_vocab_size=active_vocab_size, class_id_tensor=class_id_tensor, n_classes=n_classes, pronoun_mask=pronoun_mask, vowel_start_mask=vowel_start_mask, end_vowels=end_vowels, punct_mask=punct_mask, newline_mask=newline_mask, unpronounceable_mask=unpronounceable_mask, allowed_after_word_mask=allowed_after_word_mask, uppercase_mask=uppercase_mask, any_punct_mask=any_punct_mask) stages_out = {} stages_out["initial"] = {"seq": draft.clone(), "harmony": harmony_scorer(draft), @@ -1831,7 +2157,7 @@ def staged_refine(model, prompt, n_new, vocab_size, n_iters=n_iters_per_stage, resample_frac=resample_frac, prompt_len=prompt_len, - temperature=temperature, bigram_prior=bigram_prior, vocab=vocab, token_signatures=token_signatures, active_vocab_size=active_vocab_size, class_id_tensor=class_id_tensor, n_classes=n_classes, pronoun_mask=pronoun_mask, vowel_start_mask=vowel_start_mask, end_vowels=end_vowels, punct_mask=punct_mask, newline_mask=newline_mask, unpronounceable_mask=unpronounceable_mask) + temperature=temperature, bigram_prior=bigram_prior, vocab=vocab, token_signatures=token_signatures, active_vocab_size=active_vocab_size, class_id_tensor=class_id_tensor, n_classes=n_classes, pronoun_mask=pronoun_mask, vowel_start_mask=vowel_start_mask, end_vowels=end_vowels, punct_mask=punct_mask, newline_mask=newline_mask, unpronounceable_mask=unpronounceable_mask, allowed_after_word_mask=allowed_after_word_mask, uppercase_mask=uppercase_mask, any_punct_mask=any_punct_mask) stages_out["after_harmony"] = {"seq": draft.clone(), "trajectory": h_traj, "harmony": harmony_scorer(draft), @@ -1844,7 +2170,7 @@ def staged_refine(model, prompt, n_new, vocab_size, n_iters=n_iters_per_stage, resample_frac=resample_frac, prompt_len=prompt_len, - temperature=temperature, bigram_prior=bigram_prior, vocab=vocab, token_signatures=token_signatures, active_vocab_size=active_vocab_size, class_id_tensor=class_id_tensor, n_classes=n_classes, pronoun_mask=pronoun_mask, vowel_start_mask=vowel_start_mask, end_vowels=end_vowels, punct_mask=punct_mask, newline_mask=newline_mask, unpronounceable_mask=unpronounceable_mask) + temperature=temperature, bigram_prior=bigram_prior, vocab=vocab, token_signatures=token_signatures, active_vocab_size=active_vocab_size, class_id_tensor=class_id_tensor, n_classes=n_classes, pronoun_mask=pronoun_mask, vowel_start_mask=vowel_start_mask, end_vowels=end_vowels, punct_mask=punct_mask, newline_mask=newline_mask, unpronounceable_mask=unpronounceable_mask, allowed_after_word_mask=allowed_after_word_mask, uppercase_mask=uppercase_mask, any_punct_mask=any_punct_mask) stages_out["after_quality"] = {"seq": draft.clone(), "trajectory": q_traj, "harmony": harmony_scorer(draft), @@ -1858,7 +2184,7 @@ def staged_refine(model, prompt, n_new, vocab_size, n_iters=n_iters_per_stage, resample_frac=resample_frac, prompt_len=prompt_len, - temperature=temperature, bigram_prior=bigram_prior, vocab=vocab, token_signatures=token_signatures, active_vocab_size=active_vocab_size, class_id_tensor=class_id_tensor, n_classes=n_classes, pronoun_mask=pronoun_mask, vowel_start_mask=vowel_start_mask, end_vowels=end_vowels, punct_mask=punct_mask, newline_mask=newline_mask, unpronounceable_mask=unpronounceable_mask) + temperature=temperature, bigram_prior=bigram_prior, vocab=vocab, token_signatures=token_signatures, active_vocab_size=active_vocab_size, class_id_tensor=class_id_tensor, n_classes=n_classes, pronoun_mask=pronoun_mask, vowel_start_mask=vowel_start_mask, end_vowels=end_vowels, punct_mask=punct_mask, newline_mask=newline_mask, unpronounceable_mask=unpronounceable_mask, allowed_after_word_mask=allowed_after_word_mask, uppercase_mask=uppercase_mask, any_punct_mask=any_punct_mask) stages_out["after_creativity"] = {"seq": draft.clone(), "trajectory": c_traj, "harmony": harmony_scorer(draft), @@ -1892,7 +2218,7 @@ def iterative_refine(model, prompt, n_new, vocab_size, # Step 1: initial draft. draft = autoregressive_generate(model, prompt, n_new=n_new, vocab_size=vocab_size, - temperature=temperature, bigram_prior=bigram_prior, vocab=vocab, token_signatures=token_signatures, active_vocab_size=active_vocab_size, class_id_tensor=class_id_tensor, n_classes=n_classes, pronoun_mask=pronoun_mask, vowel_start_mask=vowel_start_mask, end_vowels=end_vowels, punct_mask=punct_mask, newline_mask=newline_mask, unpronounceable_mask=unpronounceable_mask) + temperature=temperature, bigram_prior=bigram_prior, vocab=vocab, token_signatures=token_signatures, active_vocab_size=active_vocab_size, class_id_tensor=class_id_tensor, n_classes=n_classes, pronoun_mask=pronoun_mask, vowel_start_mask=vowel_start_mask, end_vowels=end_vowels, punct_mask=punct_mask, newline_mask=newline_mask, unpronounceable_mask=unpronounceable_mask, allowed_after_word_mask=allowed_after_word_mask, uppercase_mask=uppercase_mask, any_punct_mask=any_punct_mask) history = [] h0 = harmony_scorer(draft) if harmony_scorer is not None else None q0 = quality_scorer(draft) if quality_scorer is not None else None @@ -2299,7 +2625,11 @@ def quality_fn(seq_tokens): vocab_for_bigram, n_chars=n_chars_local) pronoun_mask = build_pronoun_mask(vocab_for_bigram) vowel_start_mask = build_vowel_start_mask(vocab_for_bigram) - end_vowels = build_end_vowel_per_token(vocab_for_bigram) + end_vowels = build_end_vowel_idx_tensor(vocab_for_bigram) + allowed_after_word_mask = build_allowed_after_word_mask( + vocab_for_bigram, n_chars=n_chars_local) + uppercase_mask = build_uppercase_mask(vocab_for_bigram) + any_punct_mask = build_any_punct_mask(vocab_for_bigram) punct_mask = build_punct_mask(vocab_for_bigram) newline_mask = build_newline_mask(vocab_for_bigram) unpronounceable_mask = build_unpronounceable_mask(vocab_for_bigram) @@ -2310,7 +2640,7 @@ def quality_fn(seq_tokens): f"newline: {int(newline_mask.sum().item())} | " f"unpronounceable: " f"{int(unpronounceable_mask.sum().item())} | " - f"end-vowel: {sum(1 for v in end_vowels if v)}") + f"end-vowel: {int((end_vowels >= 0).sum().item())}") else: class_id_tensor = None n_classes = 0 @@ -2320,6 +2650,9 @@ def quality_fn(seq_tokens): punct_mask = None newline_mask = None unpronounceable_mask = None + allowed_after_word_mask = None + uppercase_mask = None + any_punct_mask = None # Active training base: starts as tiny_seed, GROWS by appending each # cycle's best refined output -- only if (a) creativity > corpus @@ -2356,8 +2689,13 @@ def quality_fn(seq_tokens): x, y = sample_tiny_batch(active_base, args.batch_size, args.seq_len, gen) logits = model(x) - ce_fft = substrate_fft_loss(logits, y, vocab_size, - lambda_substrate=args.lambda_sub) + if getattr(args, 'omniweight_loss', False): + ce_fft = substrate_omniweight_loss( + logits, y, vocab_size, + lambda_substrate=args.lambda_sub) + else: + ce_fft = substrate_fft_loss(logits, y, vocab_size, + lambda_substrate=args.lambda_sub) K_h = K_to_K_harmony(cur_K or args.K_init, K_init=args.K_init, K_min=args.K_min) harmony = compute_harmony_grounded(logits, vocab_size, harmony_kind, @@ -2387,14 +2725,14 @@ def quality_fn(seq_tokens): draft = autoregressive_generate( model, prompt_s, n_new=growth_n_new, vocab_size=vocab_size, temperature=0.8, - bigram_prior=bigram_prior, vocab=vocab, token_signatures=token_signatures, active_vocab_size=active_vocab_size, class_id_tensor=class_id_tensor, n_classes=n_classes, pronoun_mask=pronoun_mask, vowel_start_mask=vowel_start_mask, end_vowels=end_vowels, punct_mask=punct_mask, newline_mask=newline_mask, unpronounceable_mask=unpronounceable_mask) + bigram_prior=bigram_prior, vocab=vocab, token_signatures=token_signatures, active_vocab_size=active_vocab_size, class_id_tensor=class_id_tensor, n_classes=n_classes, pronoun_mask=pronoun_mask, vowel_start_mask=vowel_start_mask, end_vowels=end_vowels, punct_mask=punct_mask, newline_mask=newline_mask, unpronounceable_mask=unpronounceable_mask, allowed_after_word_mask=allowed_after_word_mask, uppercase_mask=uppercase_mask, any_punct_mask=any_punct_mask) refined_s, _ = staged_refine( model, prompt_s, n_new=growth_n_new, vocab_size=vocab_size, harmony_scorer=harmony_fn, quality_scorer=quality_fn, creativity_scorer=creativity_fn, n_iters_per_stage=30, resample_frac=0.35, prompt_len=16, temperature=0.5, - bigram_prior=bigram_prior, vocab=vocab, token_signatures=token_signatures, active_vocab_size=active_vocab_size, class_id_tensor=class_id_tensor, n_classes=n_classes, pronoun_mask=pronoun_mask, vowel_start_mask=vowel_start_mask, end_vowels=end_vowels, punct_mask=punct_mask, newline_mask=newline_mask, unpronounceable_mask=unpronounceable_mask) + bigram_prior=bigram_prior, vocab=vocab, token_signatures=token_signatures, active_vocab_size=active_vocab_size, class_id_tensor=class_id_tensor, n_classes=n_classes, pronoun_mask=pronoun_mask, vowel_start_mask=vowel_start_mask, end_vowels=end_vowels, punct_mask=punct_mask, newline_mask=newline_mask, unpronounceable_mask=unpronounceable_mask, allowed_after_word_mask=allowed_after_word_mask, uppercase_mask=uppercase_mask, any_punct_mask=any_punct_mask) samples.append((refined_s.squeeze(0).clone(), creativity_fn(refined_s))) # Sort by creativity desc, keep top K. @@ -2464,14 +2802,14 @@ def quality_fn(seq_tokens): final_gen = autoregressive_generate(model, prompt, n_new=n_new, vocab_size=vocab_size, temperature=0.8, - bigram_prior=bigram_prior, vocab=vocab, token_signatures=token_signatures, active_vocab_size=active_vocab_size, class_id_tensor=class_id_tensor, n_classes=n_classes, pronoun_mask=pronoun_mask, vowel_start_mask=vowel_start_mask, end_vowels=end_vowels, punct_mask=punct_mask, newline_mask=newline_mask, unpronounceable_mask=unpronounceable_mask) + bigram_prior=bigram_prior, vocab=vocab, token_signatures=token_signatures, active_vocab_size=active_vocab_size, class_id_tensor=class_id_tensor, n_classes=n_classes, pronoun_mask=pronoun_mask, vowel_start_mask=vowel_start_mask, end_vowels=end_vowels, punct_mask=punct_mask, newline_mask=newline_mask, unpronounceable_mask=unpronounceable_mask, allowed_after_word_mask=allowed_after_word_mask, uppercase_mask=uppercase_mask, any_punct_mask=any_punct_mask) final_refined, _ = staged_refine( model, prompt, n_new=n_new, vocab_size=vocab_size, harmony_scorer=harmony_fn, quality_scorer=quality_fn, creativity_scorer=creativity_fn, n_iters_per_stage=200, resample_frac=0.35, prompt_len=16, temperature=0.5, - bigram_prior=bigram_prior, vocab=vocab, token_signatures=token_signatures, active_vocab_size=active_vocab_size, class_id_tensor=class_id_tensor, n_classes=n_classes, pronoun_mask=pronoun_mask, vowel_start_mask=vowel_start_mask, end_vowels=end_vowels, punct_mask=punct_mask, newline_mask=newline_mask, unpronounceable_mask=unpronounceable_mask) + bigram_prior=bigram_prior, vocab=vocab, token_signatures=token_signatures, active_vocab_size=active_vocab_size, class_id_tensor=class_id_tensor, n_classes=n_classes, pronoun_mask=pronoun_mask, vowel_start_mask=vowel_start_mask, end_vowels=end_vowels, punct_mask=punct_mask, newline_mask=newline_mask, unpronounceable_mask=unpronounceable_mask, allowed_after_word_mask=allowed_after_word_mask, uppercase_mask=uppercase_mask, any_punct_mask=any_punct_mask) return {"name": name, "mode": "self_distillation", "n_params": n_params, @@ -2911,6 +3249,11 @@ def main(): default=1.0 / (_PHI_FOR_SAMPLING ** math.pi)) parser.add_argument("--tiny-chars", type=int, default=1024, help="Size of the tiny training seed in chars") + parser.add_argument("--omniweight-loss", action="store_true", + help="Apply the inference-side omniweight standard " + "(phi^pi tanh fluid form) to per-token CE " + "during training. Closes the train/inference " + "asymmetry on the anti-stagnation primitive.") parser.add_argument("--out", type=str, default="results_self_recursive.json") args = parser.parse_args()