diff --git a/codebenders-dashboard/app/discovery/aascu/full/page.module.css b/codebenders-dashboard/app/discovery/aascu/full/page.module.css new file mode 100644 index 0000000..e588ab1 --- /dev/null +++ b/codebenders-dashboard/app/discovery/aascu/full/page.module.css @@ -0,0 +1,600 @@ +.doc { + --paper: #f5efe4; + --paper-deep: #ece3d2; + --ink: #1a1714; + --ink-soft: #3b342c; + --muted: #857a68; + --rule: #d6cbb5; + --accent: #b8531a; + --accent-deep: #8a3d12; + --done: #4f6539; + --partial: #a87f1f; + --gap: #8b2f2a; + --p0: #8b2f2a; + --p1: #a87f1f; + --p2: #5d6b76; + + background-color: var(--paper); + color: var(--ink); + font-family: var(--font-plex-sans), ui-sans-serif, system-ui, sans-serif; + font-weight: 400; + line-height: 1.55; + -webkit-font-smoothing: antialiased; + text-rendering: optimizeLegibility; + min-height: calc(100vh - 48px); + + background-image: + radial-gradient(circle at 20% 10%, rgba(184, 83, 26, 0.04), transparent 40%), + radial-gradient(circle at 80% 80%, rgba(26, 23, 20, 0.05), transparent 50%), + url("data:image/svg+xml;utf8,"); + background-attachment: fixed; +} + +.inner { + max-width: 1180px; + margin: 0 auto; + padding: 64px 56px 120px; +} + +.mast { + border-top: 2px solid var(--ink); + border-bottom: 1px solid var(--ink); + padding: 18px 0 22px; + display: grid; + grid-template-columns: 1fr auto; + align-items: end; + gap: 24px; + animation: fadeUp 0.8s ease-out both; +} + +.lockup { + font-family: var(--font-jb-mono), ui-monospace, monospace; + font-size: 11px; + letter-spacing: 0.18em; + text-transform: uppercase; + color: var(--ink-soft); +} +.lockup b { color: var(--accent); font-weight: 500; } + +.mastMeta { + font-family: var(--font-jb-mono), ui-monospace, monospace; + font-size: 10.5px; + letter-spacing: 0.16em; + text-transform: uppercase; + color: var(--muted); + text-align: right; + line-height: 1.7; +} + +.hero { + padding: 80px 0 64px; + border-bottom: 1px solid var(--rule); + position: relative; +} + +.eyebrow { + 
font-family: var(--font-jb-mono), ui-monospace, monospace; + font-size: 11px; + letter-spacing: 0.22em; + text-transform: uppercase; + color: var(--accent); + margin-bottom: 28px; + animation: fadeUp 0.9s 0.1s ease-out both; +} + +.h1 { + font-family: var(--font-fraunces), ui-serif, Georgia, serif; + font-variation-settings: "opsz" 144, "SOFT" 30; + font-weight: 340; + font-size: clamp(48px, 7.2vw, 108px); + line-height: 0.94; + letter-spacing: -0.025em; + color: var(--ink); + margin-bottom: 32px; + animation: fadeUp 1s 0.15s ease-out both; +} +.h1 em { + font-style: italic; + font-weight: 300; + color: var(--accent); + font-variation-settings: "opsz" 144, "SOFT" 80; +} + +.lede { + font-family: var(--font-fraunces), ui-serif, Georgia, serif; + font-variation-settings: "opsz" 36; + font-weight: 330; + font-size: clamp(20px, 1.7vw, 24px); + line-height: 1.45; + color: var(--ink-soft); + max-width: 780px; + animation: fadeUp 1s 0.25s ease-out both; +} + +.strip { + margin-top: 56px; + display: grid; + grid-template-columns: repeat(4, 1fr); + gap: 0; + border-top: 1px solid var(--rule); + animation: fadeUp 1s 0.35s ease-out both; +} +.strip > div { + padding: 18px 20px 4px 0; + border-right: 1px solid var(--rule); +} +.strip > div:last-child { border-right: none; padding-right: 0; } +.strip dt { + font-family: var(--font-jb-mono), ui-monospace, monospace; + font-size: 10px; + letter-spacing: 0.2em; + text-transform: uppercase; + color: var(--muted); + margin-bottom: 8px; +} +.strip dd { + font-family: var(--font-fraunces), ui-serif, Georgia, serif; + font-variation-settings: "opsz" 60; + font-weight: 340; + font-size: 34px; + line-height: 1; + color: var(--ink); + letter-spacing: -0.01em; +} +.strip dd small { + display: block; + font-family: var(--font-plex-sans); + font-size: 11px; + letter-spacing: 0.04em; + color: var(--muted); + margin-top: 8px; + font-weight: 400; +} + +.chapter { + padding: 88px 0 24px; + border-bottom: 1px solid var(--rule); +} 
+.chapter:last-of-type { border-bottom: none; } + +.chapHead { + display: grid; + grid-template-columns: 120px 1fr; + gap: 48px; + margin-bottom: 56px; + align-items: start; +} +.chapNum { + font-family: var(--font-fraunces), ui-serif, Georgia, serif; + font-variation-settings: "opsz" 144, "SOFT" 0; + font-weight: 300; + font-size: 96px; + line-height: 0.85; + color: var(--accent); + letter-spacing: -0.04em; +} +.chapNum span { + display: block; + font-family: var(--font-jb-mono), ui-monospace, monospace; + font-size: 10px; + letter-spacing: 0.22em; + text-transform: uppercase; + color: var(--muted); + margin-top: 14px; +} +.chapTitle { + font-family: var(--font-fraunces), ui-serif, Georgia, serif; + font-variation-settings: "opsz" 60; + font-weight: 360; + font-size: clamp(32px, 3.6vw, 52px); + line-height: 1.05; + letter-spacing: -0.02em; + color: var(--ink); + max-width: 780px; +} +.chapTitle em { + font-style: italic; + font-weight: 300; + color: var(--accent); + font-variation-settings: "opsz" 60, "SOFT" 80; +} +.chapKicker { + font-family: var(--font-plex-sans); + font-size: 15px; + line-height: 1.6; + color: var(--ink-soft); + margin-top: 18px; + max-width: 680px; +} + +.pains { + display: grid; + grid-template-columns: repeat(2, 1fr); + gap: 1px; + background: var(--rule); + border: 1px solid var(--rule); +} +.pain { + background: var(--paper); + padding: 32px 30px 36px; + position: relative; +} +.painTag { + font-family: var(--font-jb-mono), ui-monospace, monospace; + font-size: 10px; + letter-spacing: 0.2em; + text-transform: uppercase; + color: var(--accent); + margin-bottom: 14px; + display: flex; + gap: 10px; + align-items: center; +} +.painTag::before { + content: ""; + width: 14px; + height: 1px; + background: var(--accent); +} +.painH { + font-family: var(--font-fraunces), ui-serif, Georgia, serif; + font-variation-settings: "opsz" 36; + font-weight: 380; + font-size: 23px; + line-height: 1.2; + color: var(--ink); + margin-bottom: 16px; + 
letter-spacing: -0.01em; +} +.painList { list-style: none; padding: 0; margin: 0; } +.painList li { + position: relative; + padding-left: 18px; + margin-bottom: 10px; + font-size: 14.5px; + line-height: 1.55; + color: var(--ink-soft); +} +.painList li::before { + content: ""; + position: absolute; + left: 0; + top: 0.65em; + width: 6px; + height: 1px; + background: var(--ink-soft); +} +.painList li strong { color: var(--ink); font-weight: 500; } + +.pq { + margin: 64px auto; + max-width: 880px; + padding: 0 24px; + text-align: center; +} +.pq blockquote { + font-family: var(--font-fraunces), ui-serif, Georgia, serif; + font-variation-settings: "opsz" 144, "SOFT" 100; + font-style: italic; + font-weight: 320; + font-size: clamp(26px, 3vw, 40px); + line-height: 1.25; + color: var(--ink); + letter-spacing: -0.015em; +} +.pq blockquote::before { + content: "\201C"; + display: block; + font-size: 96px; + line-height: 0.5; + color: var(--accent); + margin-bottom: 8px; + opacity: 0.7; +} +.pq cite { + display: block; + margin-top: 24px; + font-family: var(--font-jb-mono), ui-monospace, monospace; + font-style: normal; + font-size: 11px; + letter-spacing: 0.2em; + text-transform: uppercase; + color: var(--muted); +} + +.row { + display: grid; + grid-template-columns: 140px 1.2fr 1.4fr 0.7fr; + gap: 32px; + padding: 22px 0; + border-bottom: 1px solid var(--rule); + align-items: start; +} +.row:first-child { border-top: 1px solid var(--ink); } +.rowHead { + padding: 14px 0; + border-bottom: 1px solid var(--ink); +} +.rowHead > div { + font-family: var(--font-jb-mono), ui-monospace, monospace; + font-size: 10px; + letter-spacing: 0.2em; + text-transform: uppercase; + color: var(--muted); +} +.pp { + font-family: var(--font-jb-mono), ui-monospace, monospace; + font-size: 11px; + letter-spacing: 0.16em; + text-transform: uppercase; + color: var(--accent); +} +.name { + font-family: var(--font-fraunces), ui-serif, Georgia, serif; + font-variation-settings: "opsz" 24; + 
font-weight: 400; + font-size: 18px; + line-height: 1.3; + color: var(--ink); + letter-spacing: -0.005em; +} +.ev { + font-size: 13.5px; + line-height: 1.55; + color: var(--ink-soft); +} +.ev code { + font-family: var(--font-jb-mono), ui-monospace, monospace; + font-size: 12px; + background: var(--paper-deep); + padding: 1px 6px; + border-radius: 2px; + color: var(--ink); +} + +.stat { + display: inline-flex; + align-items: center; + gap: 8px; + font-family: var(--font-jb-mono), ui-monospace, monospace; + font-size: 11px; + letter-spacing: 0.16em; + text-transform: uppercase; + font-weight: 500; +} +.stat::before { + content: ""; + width: 8px; + height: 8px; + border-radius: 50%; + flex-shrink: 0; +} +.statDone { color: var(--done); } +.statDone::before { background: var(--done); } +.statPartial { color: var(--partial); } +.statPartial::before { background: var(--partial); } + +.tier { margin-bottom: 56px; } +.tierHead { + display: flex; + align-items: baseline; + gap: 24px; + margin-bottom: 28px; + padding-bottom: 14px; + border-bottom: 1px solid var(--ink); +} +.tierPill { + font-family: var(--font-jb-mono), ui-monospace, monospace; + font-size: 11px; + font-weight: 500; + letter-spacing: 0.18em; + padding: 6px 14px; + border-radius: 2px; + color: var(--paper); +} +.pill0 { background: var(--p0); } +.pill1 { background: var(--p1); } +.pill2 { background: var(--p2); } +.tierH { + font-family: var(--font-fraunces), ui-serif, Georgia, serif; + font-variation-settings: "opsz" 36; + font-weight: 370; + font-size: 24px; + color: var(--ink); + letter-spacing: -0.01em; +} +.tierH em { font-style: italic; color: var(--muted); font-weight: 330; } + +.cards { + display: grid; + grid-template-columns: repeat(3, 1fr); + gap: 1px; + background: var(--rule); + border: 1px solid var(--rule); +} +.cardsTwo { grid-template-columns: repeat(2, 1fr); } + +.card { + background: var(--paper); + padding: 28px 26px 32px; + display: flex; + flex-direction: column; + transition: background 
0.25s ease, transform 0.25s ease; + text-decoration: none; + color: inherit; +} +.card:hover { background: var(--paper-deep); transform: translateY(-2px); } + +.cardNum { + font-family: var(--font-jb-mono), ui-monospace, monospace; + font-size: 10.5px; + letter-spacing: 0.18em; + color: var(--muted); + text-transform: uppercase; + margin-bottom: 14px; + display: flex; + justify-content: space-between; +} +.cardNum b { color: var(--ink); font-weight: 500; letter-spacing: 0.1em; } +.cardH { + font-family: var(--font-fraunces), ui-serif, Georgia, serif; + font-variation-settings: "opsz" 30; + font-weight: 400; + font-size: 21px; + line-height: 1.2; + letter-spacing: -0.01em; + color: var(--ink); + margin-bottom: 14px; +} +.cardTag { + font-family: var(--font-jb-mono), ui-monospace, monospace; + font-size: 10px; + letter-spacing: 0.2em; + text-transform: uppercase; + color: var(--accent); + margin-bottom: 10px; +} +.cardP { + font-size: 14px; + line-height: 1.6; + color: var(--ink-soft); + margin-bottom: auto; +} +.cardLink { + margin-top: 20px; + padding-top: 16px; + border-top: 1px dotted var(--rule); + font-family: var(--font-jb-mono), ui-monospace, monospace; + font-size: 11px; + letter-spacing: 0.14em; + text-transform: uppercase; + color: var(--accent); +} +.cardLink::after { content: " →"; } + +.summary { + display: grid; + grid-template-columns: 1fr 2fr; + gap: 64px; + align-items: start; +} +.summary aside { + border-top: 2px solid var(--ink); + padding-top: 18px; +} +.summary aside h5 { + font-family: var(--font-jb-mono), ui-monospace, monospace; + font-size: 10px; + letter-spacing: 0.22em; + text-transform: uppercase; + color: var(--muted); + margin-bottom: 18px; + font-weight: 500; +} +.summary aside ul { list-style: none; padding: 0; } +.summary aside li { + font-family: var(--font-fraunces), ui-serif, Georgia, serif; + font-variation-settings: "opsz" 24; + font-size: 18px; + line-height: 1.35; + padding: 10px 0; + border-bottom: 1px solid var(--rule); + 
color: var(--ink); +} +.summary aside li::before { + content: attr(data-n); + font-family: var(--font-jb-mono), ui-monospace, monospace; + font-size: 10px; + letter-spacing: 0.2em; + color: var(--accent); + margin-right: 14px; + vertical-align: 2px; +} +.summary .body p { + font-family: var(--font-fraunces), ui-serif, Georgia, serif; + font-variation-settings: "opsz" 24; + font-weight: 340; + font-size: 21px; + line-height: 1.55; + color: var(--ink); + margin-bottom: 18px; + letter-spacing: -0.005em; +} +.summary .body p strong { color: var(--accent); font-weight: 450; } +.summary .body p em { font-style: italic; color: var(--ink-soft); } + +.scope { + margin-top: 48px; + padding: 32px 0 0; + border-top: 1px solid var(--rule); + display: grid; + grid-template-columns: 200px 1fr; + gap: 48px; +} +.scope h6 { + font-family: var(--font-jb-mono), ui-monospace, monospace; + font-size: 10px; + letter-spacing: 0.22em; + text-transform: uppercase; + color: var(--muted); + font-weight: 500; +} +.scope ul { + list-style: none; + padding: 0; + display: grid; + grid-template-columns: repeat(3, 1fr); + gap: 24px; +} +.scope li { + font-size: 13.5px; + line-height: 1.5; + color: var(--ink-soft); + padding-left: 14px; + position: relative; +} +.scope li::before { + content: "\00d7"; + position: absolute; + left: 0; + top: 0; + color: var(--gap); + font-weight: 500; +} + +.colophon { + margin-top: 96px; + padding-top: 24px; + border-top: 2px solid var(--ink); + display: grid; + grid-template-columns: 1fr auto; + gap: 24px; + font-family: var(--font-jb-mono), ui-monospace, monospace; + font-size: 10.5px; + letter-spacing: 0.18em; + text-transform: uppercase; + color: var(--muted); +} +.colophon b { color: var(--ink); font-weight: 500; } + +@keyframes fadeUp { + from { opacity: 0; transform: translateY(14px); } + to { opacity: 1; transform: translateY(0); } +} + +@media (max-width: 920px) { + .inner { padding: 32px 22px 80px; } + .h1 { font-size: 54px; } + .strip { 
grid-template-columns: repeat(2, 1fr); } + .strip > div { padding: 18px 16px 4px 0; } + .strip > div:nth-child(2n) { border-right: none; } + .strip > div:nth-child(-n+2) { border-bottom: 1px solid var(--rule); } + .chapHead { grid-template-columns: 1fr; gap: 20px; } + .chapNum { font-size: 64px; } + .pains { grid-template-columns: 1fr; } + .row { grid-template-columns: 1fr; gap: 8px; padding: 18px 0; } + .summary { grid-template-columns: 1fr; gap: 36px; } + .cards, .cardsTwo { grid-template-columns: 1fr; } + .scope { grid-template-columns: 1fr; gap: 18px; } + .scope ul { grid-template-columns: 1fr; } +} diff --git a/codebenders-dashboard/app/discovery/aascu/full/page.tsx b/codebenders-dashboard/app/discovery/aascu/full/page.tsx new file mode 100644 index 0000000..6766190 --- /dev/null +++ b/codebenders-dashboard/app/discovery/aascu/full/page.tsx @@ -0,0 +1,332 @@ +import type { Metadata } from "next" +import { Fraunces, IBM_Plex_Sans, JetBrains_Mono } from "next/font/google" +import styles from "./page.module.css" + +const fraunces = Fraunces({ + variable: "--font-fraunces", + subsets: ["latin"], + axes: ["opsz", "SOFT"], +}) + +const plexSans = IBM_Plex_Sans({ + variable: "--font-plex-sans", + subsets: ["latin"], + weight: ["300", "400", "500", "600"], + style: ["normal", "italic"], +}) + +const jbMono = JetBrains_Mono({ + variable: "--font-jb-mono", + subsets: ["latin"], + weight: ["400", "500"], +}) + +export const metadata: Metadata = { + title: "AASCU Intermediary Discovery — Gap Analysis", + description: "Editorial report on the AASCU intermediary discovery session: pain points, what the tool already addresses, and gaps filed as issues.", +} + +const ISSUE_BASE = "https://github.com/devcolor/codebenders-datathon/issues" + +const PAINS = [ + { + tag: "A · Data accuracy & trust", + h: "Numbers that don’t add up", + items: [ + <>Incorrect cohort sizes; campuses missing from dashboard dropdowns., + <>Dashboard pulls from the wrong dataset intermittently., + 
<>Submission-time processing corrupts otherwise-clean institutional data., + <>No cross-source verification — intermediaries and institutions take each other’s word., + ], + }, + { + tag: "B · Definitions & terminology", + h: "The glossary that isn’t there", + items: [ + <>“Completion at 3/4/5 years” and retention metrics undefined in-context., + <>Definitions live in external documentation, not the dashboard., + <>Terms diverge from IPEDS and state-compliance vocabulary IR staff already use., + ], + }, + { + tag: "C · Visualization & export", + h: "Charts you can’t show your boss", + items: [ + <>Wrong chart types — line graph for independent cohorts where bar is correct., + <>Charts not presentation-ready; IR staff rebuild in Excel by hand., + <>No data download from dashboard. Analysis-ready file is a $20K paywall., + ], + }, + { + tag: "D · AI & governance", + h: "FERPA is the floor, not the ceiling", + items: [ + <>“Weaponizing data” risk to under-resourced campuses via context-free AI inference., + <>Sensitive populations — immigrant, undocumented, public-aid — need explicit care., + <>Demand for full data lineage, model transparency, storage disclosure., + ], + }, + { + tag: "E · Institutional process", + h: "One person knows; one person retires", + items: [ + <>Submission knowledge silos with one IR staffer per institution., + <>Each campus has wildly different submission rituals — some delist + re-upload everything every cycle., + <>“Best edit resolution” loses to people-and-process bottlenecks., + ], + }, + { + tag: "F · Datathon coordination", + h: "Group by goal, not just SIS", + items: [ + <>AASCU has SIS-by-institution list — can rank by commonality or by need., + <>Last datathon: wildly different institutional incomes ate most of the day., + <>Recommend grouping by shared SIS + shared goal (e.g., advising)., + ], + }, +] + +const COVERAGE: Array<{ + pp: string + name: string + ev: React.ReactNode + status: "done" | "partial" + statusLabel: string +}> 
= [ + { pp: "C · Export", name: "CSV export wired into dashboard", ev: <>components/export-button.tsx · issue #15, status: "done", statusLabel: "Done" }, + { pp: "C · Visualization", name: "Recharts components, types chosen per metric", ev: <>retention-risk-chart.tsx, risk-alert-chart.tsx, readiness-assessment-chart.tsx, status: "done", statusLabel: "Done" }, + { pp: "B · Definitions", name: "Tooltip primitive exists; no centralized glossary yet", ev: <>components/info-popover.tsx, status: "partial", statusLabel: "Partial" }, + { pp: "D · Methodology", name: "How predictions are made — surfaced in-app", ev: <>app/methodology/ route, status: "done", statusLabel: "Done" }, + { pp: "D · FERPA", name: "RBAC, audit log, FERPA-compliant identity resolution", ev: <>Issues #67, #75, #77, #78 (closed), status: "done", statusLabel: "Done" }, + { pp: "D · Automation", name: "Self-service upload (PDP, AR, student, course)", ev: <>Issue #86 (closed) · components/upload/, status: "done", statusLabel: "Done" }, + { pp: "D · Explainability", name: "SHAP narrator — fine-tuning epic in progress", ev: <>Issues #97 — #103 · branch fine-tuning/97-shap-narrator-task-type, status: "partial", statusLabel: "In flight" }, + { pp: "A · Validation", name: "Upload exists; human-readable validation report missing", ev: <>Addressed by new issue #110, status: "partial", statusLabel: "Partial" }, + { pp: "C · Filtering", name: "By cohort, term, demographic, credential type", ev: <>Issues #66, #81 (closed), status: "done", statusLabel: "Done" }, + { pp: "C · Query", name: "Natural-language query against the data", ev: <>lib/prompt-analyzer.ts · issues #17, #61, #88, #90, status: "done", statusLabel: "Done" }, + { pp: "E · Knowledge", name: "Self-service upload reduces single-person dependency", ev: <>In-app submission runbook missing — addressed by #111, status: "partial", statusLabel: "Partial" }, +] + +const TIERS: Array<{ + pri: "p0" | "p1" | "p2" + pillLabel: string + h: React.ReactNode + 
twoCol?: boolean + cards: Array<{ num: number; tag: string; title: string; body: React.ReactNode }> +}> = [ + { + pri: "p0", + pillLabel: "P0 · Differentiators", + h: <>Match the loudest complaints. Datathon-eligible., + cards: [ + { num: 105, tag: "Pain B · Definitions", title: "Metric definitions glossary with IPEDS & state-compliance cross-walks", body: <>Every KPI surfaces a tooltip with PDP, IPEDS, and state-compliance equivalents. Centralized /glossary page indexed by metric. Markdown source-of-truth, versioned with the code. }, + { num: 106, tag: "Pain C · Export", title: "Presentation-ready chart export (PNG / PDF)", body: <>Every chart exports as a polished image with title, definition, source, and date stamp baked in. Eliminates the manual Excel-rebuild workflow IR staff perform daily. }, + { num: 107, tag: "Pain A + D · Lineage", title: "Data lineage view — “where did this number come from”", body: <>Click any number → see source rows, upload event, transformations, and timestamps. The single highest-leverage gap from the session: directly answers trust + governance + differentiation in one feature. }, + ], + }, + { + pri: "p1", + pillLabel: "P1 · Governance hardening", + h: <>Table-stakes for institutional adoption., + cards: [ + { num: 108, tag: "Pain D · Transparency", title: "AI Transparency Page", body: <>Per-model disclosure: features used, training data source, homegrown vs. third-party, where data flows when invoked, retention policy. Reviewable independently by institutional IT & legal. }, + { num: 109, tag: "Pain D · Sensitive populations", title: "Sensitive-population safeguards", body: <>Per-institution feature-exclusion lists. Context warnings on small sub-populations. Audit log entries for any query touching flagged groups. Demoable, not just claimed. }, + { num: 110, tag: "Pain A + E · Validation", title: "Upload validation report — diff vs. 
last upload", body: <>Row-level errors, field coercions, dedup decisions, anomaly flags (“3 campuses dropped from this upload”). Readable by non-technical IR staff. Survives the “person retires” scenario. }, + ], + }, + { + pri: "p2", + pillLabel: "P2 · Process & institutional fit", + h: <>Lower urgency, higher institutional gratitude., + twoCol: true, + cards: [ + { num: 111, tag: "Pain E · Process", title: "Submission runbook generator", body: <>Tool records the upload steps + field mappings that worked, then generates a printable runbook so a successor can replicate without tribal knowledge. Replayable on new files. }, + { num: 112, tag: "Pain F · Datathon coordination", title: "Institution-grouping helper — shared SIS + shared goal", body: <>Operational artifact, not a user feature: cross-reference AASCU’s SIS list with stated institutional goals; output a candidate cohort matrix for the fall datathon. }, + ], + }, +] + +const pillClass = (pri: "p0" | "p1" | "p2") => + pri === "p0" ? styles.pill0 : pri === "p1" ? styles.pill1 : styles.pill2 + +const statClass = (s: "done" | "partial") => + s === "done" ? styles.statDone : styles.statPartial + +export default function AASCUFullPage() { + return ( +
+
+ +
+
+ Codebenders Datathon · Discovery Report № 01 · Bishop State CC +
+
+ Filed 2026·04·29
+ Source: 21-min recording + notes
+ Status: Stakeholder review +
+
+ +
+
AASCU Intermediary Discovery — Gap Analysis
+

The data they have isn’t the data they trust.

+

+ Two AASCU intermediaries described, in their own words, why the Postsecondary Data Partnership dashboard fails the institutions they support — and where a tool built around provable, presentable, governed outputs would land. +

+
+
Recording
20:58minutes of testimony
+
Voices
2intermediaries · IR + data‑eng
+
Pain themes
6mapped across stack
+
Issues filed
8#105 — #112
+
+
+ +
+
+
01Executive summary
+
+

A tool worth adopting is one that institutions can defend — to legal, to leadership, to themselves.

+

The intermediaries don’t need another dashboard. They need outputs they can present upward, prove the provenance of, and govern responsibly when sensitive student populations are involved.

+
+
+
+ +
+

Two AASCU intermediaries described pain in three layers: PDP dashboard quality — inaccurate cohort numbers, buried definitions, wrong chart types, no data export — forcing IR staff into manual Excel rebuilds; AI/governance requirements — FERPA-plus expectations including data lineage, transparency, and explicit safeguards for sensitive student populations; and institutional process gaps — submission knowledge that lives with one person and varies wildly across campuses.

+

Our tool already addresses a meaningful share of layer one — CSV export, sane chart types, NLQ, methodology page — and is in-flight on layer two via the SHAP-narrator work. The biggest unaddressed gaps are a definitions glossary with IPEDS / state-compliance cross-walks, presentation-ready chart export, a data-lineage view that proves where each number came from, and AI transparency + sensitive-population safeguards as a precondition for institutional adoption.

+
+
+
+ +
+
+
02What we heard
+
+

Six themes — raised independently by both intermediaries.

+

Andres (IR-focused) and Dr. Prateek (data-engineering-focused) emphasized different layers, but converged on a common diagnosis: institutions can’t trust, can’t present, and can’t govern what PDP gives them today.

+
+
+ +
+ {PAINS.map((p) => ( +
+
{p.tag}
+

{p.h}

+
    + {p.items.map((it, i) =>
  • {it}
  • )} +
+
+ ))} +
+
+ +
+
Why on earth would they invest any time in learning this new tool?
+ — Intermediary, on PDP’s compounding friction · 21-min recording, 03:42 +
+ +
+
+
03What the tool already does
+
+

A meaningful share of the pain is already addressed in codebenders-dashboard.

+

Mapping each pain point to the closed PRs and shipped components. Status reflects the state of the codebase as of this report.

+
+
+ +
+
+
Pain point
+
Capability
+
Evidence
+
Status
+
+ {COVERAGE.map((row, i) => ( +
+
{row.pp}
+
{row.name}
+
{row.ev}
+
+ + {row.statusLabel} + +
+
+ ))} +
+
+ +
+
I just went by each data point and copied that number into the Excel spreadsheet.
+ — Dr. Prateek, on rebuilding PDP outputs by hand · 04:58 +
+ +
+
+
04Gaps — issues filed
+
+

Eight issues, three priority tiers, one through-line: provable, presentable, governed.

+

Each issue links back to a specific quote or recurring complaint from the discovery session. Out-of-scope items are listed at the end and intentionally not filed.

+
+
+ + {TIERS.map((tier) => ( +
+
+ {tier.pillLabel} +

{tier.h}

+
+
+ {tier.cards.map((c) => ( + +
+ Issue + #{c.num} +
+
{c.tag}
+

{c.title}

+

{c.body}

+
Open issue
+
+ ))} +
+
+ ))} + +
+
Intentionally
out of scope
+
    +
  • PDP-side cohort accuracy and dropdown completeness — that’s PDP’s bug to fix; we shouldn’t build around it.
  • +
  • The $20K analysis-ready file paywall — pricing decision by PDP; not addressable here.
  • +
  • Vendor over-promising on “full automation” — competitive-positioning concern, not a feature.
  • +
+
+
+ + + +
+
+ ) +} diff --git a/codebenders-dashboard/app/discovery/aascu/page.module.css b/codebenders-dashboard/app/discovery/aascu/page.module.css new file mode 100644 index 0000000..f3f0510 --- /dev/null +++ b/codebenders-dashboard/app/discovery/aascu/page.module.css @@ -0,0 +1,250 @@ +.doc { + --bg: #fafaf7; + --ink: #15181a; + --soft: #4a4f53; + --muted: #7d8489; + --rule: #e3e1da; + --accent: #0f5c4d; + --accent-soft: #e8efe9; + --done: #0f5c4d; + --partial: #9a6d12; + --gap: #8b3a3a; + + background: var(--bg); + color: var(--ink); + font-family: var(--font-plex-sans), ui-sans-serif, system-ui, sans-serif; + line-height: 1.55; + -webkit-font-smoothing: antialiased; + min-height: calc(100vh - 48px); +} + +.inner { + max-width: 760px; + margin: 0 auto; + padding: 56px 32px 96px; +} + +.header { + padding-bottom: 32px; + border-bottom: 1px solid var(--ink); + margin-bottom: 48px; +} + +.kicker { + font-family: var(--font-plex-mono), ui-monospace, monospace; + font-size: 11px; + letter-spacing: 0.16em; + text-transform: uppercase; + color: var(--accent); + margin-bottom: 18px; +} + +.h1 { + font-family: var(--font-newsreader), ui-serif, Georgia, serif; + font-weight: 400; + font-size: 42px; + line-height: 1.1; + letter-spacing: -0.02em; + color: var(--ink); + margin-bottom: 14px; +} + +.lede { + font-family: var(--font-newsreader), ui-serif, Georgia, serif; + font-weight: 400; + font-size: 19px; + line-height: 1.5; + color: var(--soft); + max-width: 620px; +} + +.meta { + margin-top: 24px; + display: flex; + gap: 24px; + flex-wrap: wrap; + font-family: var(--font-plex-mono), ui-monospace, monospace; + font-size: 11px; + letter-spacing: 0.08em; + color: var(--muted); + text-transform: uppercase; +} + +.meta b { color: var(--ink); font-weight: 500; } + +.section { margin-bottom: 56px; } + +.h2 { + font-family: var(--font-plex-mono), ui-monospace, monospace; + font-size: 11px; + letter-spacing: 0.18em; + text-transform: uppercase; + color: var(--muted); + margin-bottom: 24px; 
+ padding-bottom: 10px; + border-bottom: 1px solid var(--rule); + font-weight: 500; +} + +.tldr { + background: var(--accent-soft); + border-left: 3px solid var(--accent); + padding: 24px 28px; + border-radius: 2px; +} + +.tldr p { + font-family: var(--font-newsreader), ui-serif, Georgia, serif; + font-size: 18px; + line-height: 1.55; + color: var(--ink); +} + +.tldr p + p { margin-top: 14px; } +.tldr strong { color: var(--accent); font-weight: 500; } + +.item { + display: grid; + grid-template-columns: 1fr auto; + gap: 24px; + padding: 18px 0; + border-bottom: 1px solid var(--rule); + align-items: start; +} + +.item:last-child { border-bottom: none; } + +.lbl { + font-family: var(--font-plex-mono), ui-monospace, monospace; + font-size: 10px; + letter-spacing: 0.16em; + color: var(--muted); + text-transform: uppercase; + margin-bottom: 6px; +} + +.itemH { + font-family: var(--font-newsreader), ui-serif, Georgia, serif; + font-weight: 500; + font-size: 18px; + line-height: 1.3; + color: var(--ink); + letter-spacing: -0.005em; + margin-bottom: 6px; +} + +.itemP { + font-size: 14px; + line-height: 1.55; + color: var(--soft); + max-width: 520px; +} + +.stat { + display: inline-flex; + align-items: center; + gap: 8px; + font-family: var(--font-plex-mono), ui-monospace, monospace; + font-size: 10.5px; + letter-spacing: 0.14em; + text-transform: uppercase; + font-weight: 500; + white-space: nowrap; + padding-top: 4px; +} + +.stat::before { + content: ""; + width: 7px; + height: 7px; + border-radius: 50%; +} + +.statDone { color: var(--done); } +.statDone::before { background: var(--done); } +.statPartial { color: var(--partial); } +.statPartial::before { background: var(--partial); } +.statGap { color: var(--gap); } +.statGap::before { background: var(--gap); } + +.issue { + display: grid; + grid-template-columns: 72px 1fr auto; + gap: 20px; + padding: 18px 0; + border-bottom: 1px solid var(--rule); + align-items: baseline; + text-decoration: none; + color: inherit; + 
transition: background 0.15s ease; +} + +.issue:last-child { border-bottom: none; } +.issue:hover { background: rgba(15, 92, 77, 0.025); } + +.issueNum { + font-family: var(--font-plex-mono), ui-monospace, monospace; + font-size: 13px; + font-weight: 500; + color: var(--accent); + letter-spacing: 0.04em; +} + +.issueH { + font-family: var(--font-newsreader), ui-serif, Georgia, serif; + font-weight: 500; + font-size: 17px; + line-height: 1.3; + color: var(--ink); + margin-bottom: 4px; + letter-spacing: -0.005em; +} + +.issueP { + font-size: 13.5px; + line-height: 1.5; + color: var(--soft); +} + +.pri { + font-family: var(--font-plex-mono), ui-monospace, monospace; + font-size: 10px; + letter-spacing: 0.14em; + color: var(--muted); + text-transform: uppercase; + padding: 3px 9px; + border: 1px solid var(--rule); + border-radius: 2px; + white-space: nowrap; + font-weight: 500; +} + +.pri0 { color: var(--gap); border-color: var(--gap); } +.pri1 { color: var(--partial); border-color: var(--partial); } +.pri2 { color: var(--muted); } + +.footer { + margin-top: 48px; + padding-top: 20px; + border-top: 1px solid var(--rule); + font-family: var(--font-plex-mono), ui-monospace, monospace; + font-size: 10.5px; + letter-spacing: 0.14em; + text-transform: uppercase; + color: var(--muted); + display: flex; + justify-content: space-between; + flex-wrap: wrap; + gap: 14px; +} + +.footer a { color: var(--accent); text-decoration: none; } + +@media (max-width: 640px) { + .inner { padding: 36px 20px 64px; } + .h1 { font-size: 32px; } + .lede { font-size: 17px; } + .item { grid-template-columns: 1fr; } + .issue { grid-template-columns: 1fr; gap: 6px; } + .issueNum { font-size: 12px; } +} diff --git a/codebenders-dashboard/app/discovery/aascu/page.tsx b/codebenders-dashboard/app/discovery/aascu/page.tsx new file mode 100644 index 0000000..9217ce3 --- /dev/null +++ b/codebenders-dashboard/app/discovery/aascu/page.tsx @@ -0,0 +1,145 @@ +import type { Metadata } from "next" +import Link 
from "next/link" +import { Newsreader, IBM_Plex_Sans, IBM_Plex_Mono } from "next/font/google" +import styles from "./page.module.css" + +const newsreader = Newsreader({ + variable: "--font-newsreader", + subsets: ["latin"], + weight: ["400", "500"], +}) + +const plexSans = IBM_Plex_Sans({ + variable: "--font-plex-sans", + subsets: ["latin"], + weight: ["400", "500", "600"], +}) + +const plexMono = IBM_Plex_Mono({ + variable: "--font-plex-mono", + subsets: ["latin"], + weight: ["400", "500"], +}) + +export const metadata: Metadata = { + title: "AASCU Gap Analysis — Brief", + description: "Two-page condensation of the AASCU intermediary discovery session and the issues filed against the dashboard backlog.", +} + +const ISSUE_BASE = "https://github.com/devcolor/codebenders-datathon/issues" + +const ISSUES: Array<{ + num: number + title: string + desc: string + pri: "p0" | "p1" | "p2" +}> = [ + { num: 105, pri: "p0", title: "Metric definitions glossary with IPEDS / state cross-walks", desc: "Hover tooltips on every KPI · centralized /glossary page · markdown source-of-truth." }, + { num: 106, pri: "p0", title: "Presentation-ready chart export (PNG / PDF)", desc: "Title, definition, source, date stamp baked in. Eliminates the manual Excel rebuild." }, + { num: 107, pri: "p0", title: "Data lineage view — “where did this number come from”", desc: "Click any number → source rows, upload event, transformations, timestamps. Highest-leverage gap." }, + { num: 108, pri: "p1", title: "AI Transparency Page", desc: "Per-model disclosure — features, training data, provider, data flow, retention. Reviewable independently." }, + { num: 109, pri: "p1", title: "Sensitive-population safeguards", desc: "Per-institution feature exclusion · low-sample-size context warnings · audit log entries." }, + { num: 110, pri: "p1", title: "Upload validation report — diff vs. last upload", desc: "Row-level errors, coercions, dedup decisions, dropped-campus flags. Readable by non-technical IR staff." 
}, + { num: 111, pri: "p2", title: "Submission runbook generator", desc: "Capture what worked → printable runbook · replayable on new files · survives staff turnover." }, + { num: 112, pri: "p2", title: "Datathon institution grouping (SIS + goal)", desc: "Operational artifact — cross-reference AASCU's SIS list with stated goals; output cohort matrix." }, +] + +const COVERAGE: Array<{ + lbl: string + title: string + desc: string + status: "done" | "partial" | "gap" +}> = [ + { lbl: "A · Data accuracy", status: "partial", title: "Numbers don’t add up; PDP pulls from wrong dataset", desc: "Validation report on upload addresses the institutional side; PDP-side accuracy is out of scope." }, + { lbl: "B · Definitions", status: "gap", title: "Metrics undefined in-context; mismatch with IPEDS / state", desc: "Tooltip primitive exists. Centralized glossary and cross-walks not yet built." }, + { lbl: "C · Visualization", status: "done", title: "Wrong chart types in PDP; ours are sane", desc: "Recharts components, types chosen per metric. Done." }, + { lbl: "C · Export", status: "partial", title: "CSV from dashboard; no presentation-ready chart export", desc: "CSV shipped (#15). PNG/PDF export with embedded definitions is the next step." }, + { lbl: "D · FERPA", status: "done", title: "RBAC, audit log, FERPA-compliant identity resolution", desc: "Issues #67, #75, #77, #78 closed. FERPA basics covered." }, + { lbl: "D · AI governance", status: "gap", title: "Transparency page, lineage view, sensitive-population safeguards", desc: "Methodology page exists. SHAP narrator in flight. Lineage and transparency disclosures unbuilt." }, + { lbl: "E · Process", status: "partial", title: "Knowledge siloed; submission rituals vary per campus", desc: "Self-service upload (#86) helps. Runbook generator would close the loop." }, +] + +const statClass = (s: "done" | "partial" | "gap") => + s === "done" ? styles.statDone : s === "partial" ? 
styles.statPartial : styles.statGap + +const priClass = (p: "p0" | "p1" | "p2") => + p === "p0" ? styles.pri0 : p === "p1" ? styles.pri1 : styles.pri2 + +export default function AASCUBriefPage() { + return ( +
+
+ +
+
AASCU Discovery · Gap Analysis · Brief
+

What the tool already does, what it’s missing, what to build next.

+

+ A two-page condensation of the AASCU intermediary discovery session and the eight issues filed against the codebenders-dashboard backlog. +

+
+ 2026·04·29 + 2 intermediaries + 21 min source + 8 issues filed +
+
+ +
+

The take

+
+

+ Intermediaries describe pain in three layers: PDP dashboard quality (charts, definitions, exports, accuracy), AI & data governance (lineage, transparency, sensitive populations), and institutional process (knowledge silos, inconsistent submission rituals). +

+

+ The tool already addresses meaningful parts of layer one and is in-flight on layer two via the SHAP narrator. The biggest unaddressed gaps are a definitions glossary, presentation-ready chart export, a data-lineage view, and AI transparency + sensitive-population safeguards. +

+
+
+ +
+

Pain × Coverage

+ {COVERAGE.map((row) => ( +
+
+
{row.lbl}
+

{row.title}

+

{row.desc}

+
+ + {row.status === "done" ? "Done" : row.status === "partial" ? "Partial" : "Gap"} + +
+ ))} +
+ +
+

Issues filed (#105 — #112)

+ {ISSUES.map((iss) => ( + +
#{iss.num}
+
+

{iss.title}

+

{iss.desc}

+
+ {iss.pri.toUpperCase()} +
+ ))} +
+ + + +
+
+ ) +} diff --git a/codebenders-dashboard/components/nav-header.tsx b/codebenders-dashboard/components/nav-header.tsx index c86be49..75034a4 100644 --- a/codebenders-dashboard/components/nav-header.tsx +++ b/codebenders-dashboard/components/nav-header.tsx @@ -17,6 +17,7 @@ const NAV_LINKS: Array<{ href: string; label: string; roles?: Role[] }> = [ { href: "/courses", label: "Courses" }, { href: "/students", label: "Students" }, { href: "/query", label: "Query" }, + { href: "/discovery/aascu", label: "Discovery", roles: ["admin", "ir", "leadership"] }, { href: "/admin/upload", label: "Admin", roles: ["admin", "ir"] }, ] diff --git a/codebenders-dashboard/lib/roles.ts b/codebenders-dashboard/lib/roles.ts index 0b1903d..6b4dce4 100644 --- a/codebenders-dashboard/lib/roles.ts +++ b/codebenders-dashboard/lib/roles.ts @@ -13,6 +13,7 @@ export const ROUTE_PERMISSIONS: Array<{ prefix: string; roles: Role[] }> = [ { prefix: "/api/query-history/export", roles: ["admin", "ir"] }, { prefix: "/admin", roles: ["admin", "ir"] }, { prefix: "/api/admin", roles: ["admin", "ir"] }, + { prefix: "/discovery", roles: ["admin", "ir", "leadership"] }, ] export function canAccess(pathname: string, role: Role): boolean { diff --git a/docs/aascu_gap_analysis.html b/docs/aascu_gap_analysis.html new file mode 100644 index 0000000..08b1d3c --- /dev/null +++ b/docs/aascu_gap_analysis.html @@ -0,0 +1,946 @@ + + + + + +AASCU Intermediary Discovery — Gap Analysis + + + + + + +
+ +
+
Codebenders Datathon · Discovery Report № 01 · Bishop State CC
+
+ Filed 2026·04·29
+ Source: 21-min recording + notes
+ Status: Stakeholder review +
+
+ +
+
AASCU Intermediary Discovery — Gap Analysis
+

The data they have isn't the data they trust.

+

+ Two AASCU intermediaries described, in their own words, why the Postsecondary Data Partnership dashboard fails the institutions they support — and where a tool built around provable, presentable, governed outputs would land. +

+
+
Recording
20:58minutes of testimony
+
Voices
2intermediaries · IR + data‑eng
+
Pain themes
6mapped across stack
+
Issues filed
8#105 — #112
+
+
+ + +
+
+
01Executive summary
+
+

A tool worth adopting is one that institutions can defend — to legal, to leadership, to themselves.

+

The intermediaries don't need another dashboard. They need outputs they can present upward, prove the provenance of, and govern responsibly when sensitive student populations are involved.

+
+
+
+ +
+

Two AASCU intermediaries described pain in three layers: PDP dashboard quality — inaccurate cohort numbers, buried definitions, wrong chart types, no data export — forcing IR staff into manual Excel rebuilds; AI/governance requirements — FERPA-plus expectations including data lineage, transparency, and explicit safeguards for sensitive student populations; and institutional process gaps — submission knowledge that lives with one person and varies wildly across campuses.

+

Our tool already addresses a meaningful share of layer one — CSV export, sane chart types, NLQ, methodology page — and is in-flight on layer two via the SHAP-narrator work. The biggest unaddressed gaps are a definitions glossary with IPEDS / state-compliance cross-walks, presentation-ready chart export, a data-lineage view that proves where each number came from, and AI transparency + sensitive-population safeguards as a precondition for institutional adoption.

+
+
+
+ + +
+
+
02What we heard
+
+

Six themes — raised independently by both intermediaries.

+

Andres (IR-focused) and Dr. Prateek (data-engineering-focused) emphasized different layers, but converged on a common diagnosis: institutions can't trust, can't present, and can't govern what PDP gives them today.

+
+
+ +
+
+
A · Data accuracy & trust
+

Numbers that don't add up

+
    +
  • Incorrect cohort sizes; campuses missing from dashboard dropdowns.
  • +
  • Dashboard pulls from the wrong dataset intermittently.
  • +
  • Submission-time processing corrupts otherwise-clean institutional data.
  • +
  • No cross-source verification — intermediaries and institutions take each other's word.
  • +
+
+ +
+
B · Definitions & terminology
+

The glossary that isn't there

+
    +
  • "Completion at 3/4/5 years" and retention metrics undefined in-context.
  • +
  • Definitions live in external documentation, not the dashboard.
  • +
  • Terms diverge from IPEDS and state-compliance vocabulary IR staff already use.
  • +
+
+ +
+
C · Visualization & export
+

Charts you can't show your boss

+
    +
  • Wrong chart types — line graph for independent cohorts where bar is correct.
  • +
  • Charts not presentation-ready; IR staff rebuild in Excel by hand.
  • +
  • No data download from dashboard. Analysis-ready file is a $20K paywall.
  • +
+
+ +
+
D · AI & governance
+

FERPA is the floor, not the ceiling

+
    +
  • "Weaponizing data" risk to under-resourced campuses via context-free AI inference.
  • +
  • Sensitive populations — immigrant, undocumented, public-aid — need explicit care.
  • +
  • Demand for full data lineage, model transparency, storage disclosure.
  • +
+
+ +
+
E · Institutional process
+

One person knows; one person retires

+
    +
  • Submission knowledge silos with one IR staffer per institution.
  • +
  • Each campus has wildly different submission rituals — some delist + re-upload everything every cycle.
  • +
  • "Best edit resolution" loses to people-and-process bottlenecks.
  • +
+
+ +
+
F · Datathon coordination
+

Group by goal, not just SIS

+
    +
  • AASCU has SIS-by-institution list — can rank by commonality or by need.
  • +
  • Last datathon: wildly different institutional incomes ate most of the day.
  • +
  • Recommend grouping by shared SIS + shared goal (e.g., advising).
  • +
+
+
+
+ +
+
+ Why on earth would they invest any time in learning this new tool? +
+ — Intermediary, on PDP's compounding friction · 21-min recording, 03:42 +
+ + +
+
+
03What the tool already does
+
+

A meaningful share of the pain is already addressed in `codebenders-dashboard`.

+

Mapping each pain point to the closed PRs and shipped components. Status reflects the state of the codebase as of this report.

+
+
+ +
+
+
Pain point
+
Capability
+
Evidence
+
Status
+
+ +
+
C · Export
+
CSV export wired into dashboard
+
components/export-button.tsx · issue #15
+
Done
+
+ +
+
C · Visualization
+
Recharts components, types chosen per metric
+
retention-risk-chart.tsx, risk-alert-chart.tsx, readiness-assessment-chart.tsx
+
Done
+
+ +
+
B · Definitions
+
Tooltip primitive exists; no centralized glossary yet
+
components/info-popover.tsx
+
Partial
+
+ +
+
D · Methodology
+
How predictions are made — surfaced in-app
+
app/methodology/ route
+
Done
+
+ +
+
D · FERPA
+
RBAC, audit log, FERPA-compliant identity resolution
+
Issues #67, #75, #77, #78 (closed)
+
Done
+
+ +
+
D · Automation
+
Self-service upload (PDP, AR, student, course)
+
Issue #86 (closed) · components/upload/
+
Done
+
+ +
+
D · Explainability
+
SHAP narrator — fine-tuning epic in progress
+
Issues #97 — #103 · branch fine-tuning/97-shap-narrator-task-type
+
In flight
+
+ +
+
A · Validation
+
Upload exists; human-readable validation report missing
+
Addressed by new issue #110
+
Partial
+
+ +
+
C · Filtering
+
By cohort, term, demographic, credential type
+
Issues #66, #81 (closed)
+
Done
+
+ +
+
C · Query
+
Natural-language query against the data
+
lib/prompt-analyzer.ts · issues #17, #61, #88, #90
+
Done
+
+ +
+
E · Knowledge
+
Self-service upload reduces single-person dependency
+
In-app submission runbook missing — addressed by #111
+
Partial
+
+
+
+ +
+
+ I just went by each data point and copied that number into the Excel spreadsheet. +
+ — Dr. Prateek, on rebuilding PDP outputs by hand · 04:58 +
+ + +
+
+
04Gaps — issues filed
+
+

Eight issues, three priority tiers, one through-line: provable, presentable, governed.

+

Each issue links back to a specific quote or recurring complaint from the discovery session. Out-of-scope items are listed at the end and intentionally not filed.

+
+
+ + +
+
+ P0 · Differentiators +

Match the loudest complaints. Datathon-eligible.

+
+
+
+
Issue#105
+
Pain B · Definitions
+

Metric definitions glossary with IPEDS & state-compliance cross-walks

+

Every KPI surfaces a tooltip with PDP, IPEDS, and state-compliance equivalents. Centralized /glossary page indexed by metric. Markdown source-of-truth, versioned with the code.

+ Open issue +
+
+
Issue#106
+
Pain C · Export
+

Presentation-ready chart export (PNG / PDF)

+

Every chart exports as a polished image with title, definition, source, and date stamp baked in. Eliminates the manual Excel-rebuild workflow IR staff perform daily.

+ Open issue +
+
+
Issue#107
+
Pain A + D · Lineage
+

Data lineage view — "where did this number come from"

+

Click any number → see source rows, upload event, transformations, and timestamps. The single highest-leverage gap from the session: directly answers trust + governance + differentiation in one feature.

+ Open issue +
+
+
+ + +
+
+ P1 · Governance hardening +

Table-stakes for institutional adoption.

+
+
+
+
Issue#108
+
Pain D · Transparency
+

AI Transparency Page

+

Per-model disclosure: features used, training data source, homegrown vs. third-party, where data flows when invoked, retention policy. Reviewable independently by institutional IT & legal.

+ Open issue +
+
+
Issue#109
+
Pain D · Sensitive populations
+

Sensitive-population safeguards

+

Per-institution feature-exclusion lists. Context warnings on small sub-populations. Audit log entries for any query touching flagged groups. Demoable, not just claimed.

+ Open issue +
+
+
Issue#110
+
Pain A + E · Validation
+

Upload validation report — diff vs. last upload

+

Row-level errors, field coercions, dedup decisions, anomaly flags ("3 campuses dropped from this upload"). Readable by non-technical IR staff. Survives the "person retires" scenario.

+ Open issue +
+
+
+ + +
+
+ P2 · Process & institutional fit +

Lower urgency, higher institutional gratitude.

+
+
+
+
Issue#111
+
Pain E · Process
+

Submission runbook generator

+

Tool records the upload steps + field mappings that worked, then generates a printable runbook so a successor can replicate without tribal knowledge. Replayable on new files.

+ Open issue +
+
+
Issue#112
+
Pain F · Datathon coordination
+

Institution-grouping helper — shared SIS + shared goal

+

Operational artifact, not a user feature: cross-reference AASCU's SIS list with stated institutional goals; output a candidate cohort matrix for the fall datathon.

+ Open issue +
+
+
+ +
+
Intentionally
out of scope
+
    +
  • PDP-side cohort accuracy and dropdown completeness — that's PDP's bug to fix; we shouldn't build around it.
  • +
  • The $20K analysis-ready file paywall — pricing decision by PDP; not addressable here.
  • +
  • Vendor over-promising on "full automation" — competitive-positioning concern, not a feature.
  • +
+
+
+ + + +
+ + diff --git a/docs/aascu_gap_analysis_simple.html b/docs/aascu_gap_analysis_simple.html new file mode 100644 index 0000000..318a0b5 --- /dev/null +++ b/docs/aascu_gap_analysis_simple.html @@ -0,0 +1,411 @@ + + + + + +AASCU Gap Analysis — Brief + + + + + + +
+ +
+
AASCU Discovery · Gap Analysis · Brief
+

What the tool already does, what it's missing, what to build next.

+

A two-page condensation of the AASCU intermediary discovery session and the eight issues filed against the codebenders-dashboard backlog.

+
+ 2026·04·29 + 2 intermediaries + 21 min source + 8 issues filed +
+
+ +
+

The take

+
+

Intermediaries describe pain in three layers: PDP dashboard quality (charts, definitions, exports, accuracy), AI & data governance (lineage, transparency, sensitive populations), and institutional process (knowledge silos, inconsistent submission rituals).

+

The tool already addresses meaningful parts of layer one and is in-flight on layer two via the SHAP narrator. The biggest unaddressed gaps are a definitions glossary, presentation-ready chart export, a data-lineage view, and AI transparency + sensitive-population safeguards.

+
+
+ +
+

Pain × Coverage

+ +
+
+
A · Data accuracy
+

Numbers don't add up; PDP pulls from wrong dataset

+

Validation report on upload addresses the institutional side; PDP-side accuracy is out of scope.

+
+ Partial +
+ +
+
+
B · Definitions
+

Metrics undefined in-context; mismatch with IPEDS / state

+

Tooltip primitive exists. Centralized glossary and cross-walks not yet built.

+
+ Gap +
+ +
+
+
C · Visualization
+

Wrong chart types in PDP; ours are sane

+

Recharts components, types chosen per metric. Done.

+
+ Done +
+ +
+
+
C · Export
+

CSV from dashboard; no presentation-ready chart export

+

CSV shipped (#15). PNG/PDF export with embedded definitions is the next step.

+
+ Partial +
+ +
+
+
D · FERPA
+

RBAC, audit log, FERPA-compliant identity resolution

+

Issues #67, #75, #77, #78 closed. FERPA basics covered.

+
+ Done +
+ +
+
+
D · AI governance
+

Transparency page, lineage view, sensitive-population safeguards

+

Methodology page exists. SHAP narrator in flight. Lineage and transparency disclosures unbuilt.

+
+ Gap +
+ +
+
+
E · Process
+

Knowledge siloed; submission rituals vary per campus

+

Self-service upload (#86) helps. Runbook generator would close the loop.

+
+ Partial +
+
+ +
+

Issues filed (#105 — #112)

+ + +
#105
+
+

Metric definitions glossary with IPEDS / state cross-walks

+

Hover tooltips on every KPI · centralized /glossary page · markdown source-of-truth.

+
+ P0 +
+ + +
#106
+
+

Presentation-ready chart export (PNG / PDF)

+

Title, definition, source, date stamp baked in. Eliminates the manual Excel rebuild.

+
+ P0 +
+ + +
#107
+
+

Data lineage view — "where did this number come from"

+

Click any number → source rows, upload event, transformations, timestamps. Highest-leverage gap.

+
+ P0 +
+ + +
#108
+
+

AI Transparency Page

+

Per-model disclosure — features, training data, provider, data flow, retention. Reviewable independently.

+
+ P1 +
+ + +
#109
+
+

Sensitive-population safeguards

+

Per-institution feature exclusion · low-sample-size context warnings · audit log entries.

+
+ P1 +
+ + +
#110
+
+

Upload validation report — diff vs. last upload

+

Row-level errors, coercions, dedup decisions, dropped-campus flags. Readable by non-technical IR staff.

+
+ P1 +
+ + +
#111
+
+

Submission runbook generator

+

Capture what worked → printable runbook · replayable on new files · survives staff turnover.

+
+ P2 +
+ + +
#112
+
+

Datathon institution grouping (SIS + goal)

+

Operational artifact — cross-reference AASCU's SIS list with stated goals; output cohort matrix.

+
+ P2 +
+
+ + + +
+ + diff --git a/docs/aascu_intermediary_feedback_summary.md b/docs/aascu_intermediary_feedback_summary.md new file mode 100644 index 0000000..debff99 --- /dev/null +++ b/docs/aascu_intermediary_feedback_summary.md @@ -0,0 +1,146 @@ +# AASCU Intermediary Feedback — Summary & Tool Gap Analysis + +**Source:** AASCU Meeting Recording (~21 min) + typed notes from session +**Participants:** Two AASCU Intermediaries (IFS) — one IR-focused (Andres), one data-engineering-focused (Dr. Prateek) +**Date of analysis:** 2026-04-28 + +> **Context — what is an Intermediary (IFS)?** +> Intermediaries are organizations selected and funded by the Gates Foundation to support a network of institutions. They translate institutional needs into context, produce data-informed needs assessments, and connect institutions with support partners. They lean on each other as a peer cohort. Their feedback represents what *institutions they serve* are experiencing with PDP outputs. + +--- + +## 1. Pain Points Raised + +### A. Data accuracy & trust in PDP outputs +- Incorrect cohort sizes and missing institutions/campuses in dashboard dropdowns +- Numbers reported by PDP dashboard appear inaccurate; intermediaries lack confidence in the data +- PDP appears to pull from the wrong dataset intermittently +- Andres: data is being **incorrectly processed at submission time** — institution data ends up with wrong values after going through PDP's pipeline +- No cross-source verification — intermediaries and institutions have to take each other's word + +### B. Definitions & terminology +- Terms like "completion rate at 3/4/5 years" and "first-to-second-year retention" are not clearly explained in-context +- Definitions are buried in external documentation +- PDP terminology differs from what institutions report for state compliance and IPEDS, creating cognitive friction + +### C. 
Visualization & data export +- Wrong chart types (e.g., line chart used to compare independent cohorts — should be bar) +- Charts are not presentation-ready; IR staff can't show them to supervisors as-is +- **No data download from the dashboard.** Dr. Prateek had to manually copy individual numbers into Excel to build his own visualizations +- The "analysis-ready file" is a paid add-on (~$20K), so most institutions don't have programmatic access to their own data + +### D. AI/governance constraints (for any new tool, including ours) +- **FERPA compliance** is table-stakes +- Beyond FERPA: concerns about "weaponizing data" via AI — broad analyses without institutional context can harm under-resourced campuses +- Sensitive student populations (immigrant, undocumented, public-aid students) need special handling — clarity on which data points are used, and why +- Need full **transparency** — homegrown vs. vendor model, what inputs are used, where data is stored +- Need full **data lineage / governance** — "where did this number come from" must be answerable end-to-end +- Vendors that promise "full automation" frequently underdeliver — so claims must be defensible + +### E. Institutional/process challenges (people, not tech) +- Knowledge tends to live with **one person**; when they retire, no one else knows how the institution submits its data +- Each campus has wildly different submission processes (e.g., one campus delists and re-uploads everything every cycle) +- Institutions have very different sizes, types (university vs. junior college vs. multi-campus system), and goals — generic tools struggle to fit + +### F. Datathon coordination feedback +- AASCU has a list of every institution's SIS — they can group institutions by **most common SIS** OR **most need** +- Recommendation: group by **shared institutional goals** (e.g., "advising") in addition to shared SIS — that produced the strongest cross-talk in their program + +--- + +## 2. 
What the Tool Already Addresses + +Mapping the pain points to what's already built or in progress in `codebenders-dashboard`: + +| Pain point | Already in tool | Evidence | +|---|---|---| +| **C. No data download** | ✅ CSV export wired into the dashboard | `components/export-button.tsx`, issue #15 (closed) | +| **C. Wrong chart types** | ✅ Recharts-based, chart types chosen per metric | `retention-risk-chart.tsx`, `risk-alert-chart.tsx`, `readiness-assessment-chart.tsx` | +| **B. Buried definitions** | 🟡 Partial — info popovers exist; not yet a full glossary | `components/info-popover.tsx` | +| **D. AI transparency / methodology** | ✅ Methodology page documenting how predictions are made | `app/methodology/` | +| **D. FERPA compliance** | ✅ RBAC, audit log, student-detail FERPA-compliant identity resolution | Issues #67, #75, #77, #78 (all closed) | +| **D. Self-service & automation** | ✅ Self-service data upload (PDP, AR files, student/course data) | Issue #86 (closed), `components/upload/` | +| **D. Explainability / "why this number"** | 🟡 In progress — SHAP narrator fine-tuning | Issues #97–103 (open epic), current branch `fine-tuning/97-shap-narrator-task-type` | +| **A. Data validation on upload** | 🟡 Upload exists; validation surface area unknown | Issue #86 (need to verify what error reporting exists) | +| **C. Filtering by cohort/term/demographic** | ✅ Built | Issue #66 (closed), #81 (closed) | +| **C. Natural-language query against the data** | ✅ NLQ interface live | Issues #17, #61, #88, #90 (all closed); `lib/prompt-analyzer.ts` | +| **E. Knowledge siloed in one person** | 🟡 Self-service upload reduces dependency, but no documented submission runbook in-app | | + +**In short:** The tool already addresses a *substantial portion* of the pain — particularly the export/visualization gap (C), AI methodology transparency (D), and FERPA basics (D). The current SHAP narrator work (D) directly speaks to the "lack of context" risk Andres raised. + +--- + +## 3. 
Gaps — Issues to Add + +These are pain points the tool does **not yet** address adequately. Recommended priorities are tentative — final priorities are yours to assign. + +### P0 — Differentiators that match the loudest complaints + +1. **Definitions glossary + inline tooltips for every metric** + *Pain points: B (definitions buried, terminology mismatch with IPEDS/state compliance)* + - Every metric/KPI shows a hover tooltip with: PDP definition, IPEDS-equivalent (if any), state-compliance term (if any) + - Centralized `/glossary` page indexed by metric + - Source-of-truth markdown so definitions are versioned with the code + +2. **Presentation-ready chart export (PNG/PDF), not just CSV** + *Pain point: C (charts can't be shown to supervisors)* + - "Export as PNG / PDF / PPTX-ready slide" on every chart + - Includes title, definitions, data source, and date stamp baked into the export + - Removes the manual "copy numbers to Excel and rebuild" workflow Dr. Prateek described + +3. **Data lineage / "where did this number come from" view** + *Pain points: A (trust gap), D (lineage requirement)* + - Click any number → see source rows, which upload it came from, transformations applied, and timestamp + - Critical for the AI-trust story: "we can prove every number" + +### P1 — AI governance hardening (table-stakes for institutional adoption) + +4. **AI Transparency Page** + *Pain point: D (which data points used, where stored, what model)* + - Lists every model (retention, GPA, etc.), the input features, the training data source, model lineage + - Shows whether the model is homegrown or third-party + - Lists the LLM provider for NLQ + SHAP narrator and where prompts/data flow + +5. 
**Sensitive-population safeguards** + *Pain point: D (immigrant/undocumented/public-aid students; weaponization risk)* + - Configurable "do not use" list of demographic features per institution + - When predictions are generated for flagged sub-populations, surface a context warning ("low sample size — interpret with care") + - Audit-log entry for any query/export that touches flagged populations + +6. **Upload validation report with human-readable error surface** + *Pain point: A (data-processing errors at submission), E (one person knows how)* + - After upload: show row-level errors, field coercions, deduplication decisions + - Must be readable by a non-technical IR staffer (so the tool survives the "person retires" scenario) + - "Diff vs. last upload" view so anomalies (e.g., dropped campuses) are caught immediately + +### P2 — Process / institutional fit + +7. **Submission runbook generator** + *Pain point: E (knowledge siloed in one person)* + - Tool records the exact upload steps + field mappings that "worked" for an institution + - Generates a printable/PDF runbook so a successor can replicate without tribal knowledge + +8. **Institution-grouping helper for shared-goal cohorts** + *Pain point: F (datathon coordination)* + - Operational, not a user-facing tool feature — but worth tracking as an internal issue: + - Pull the SIS-by-institution list from AASCU + - Cross-reference with stated institutional goals (e.g., "advising") + - Output a candidate grouping for the fall datathon + +### Out of scope (should NOT become tool issues) + +- **PDP-side cohort accuracy / dropdown completeness** — that's PDP's bug to fix; we shouldn't build around it +- **The $20K analysis-ready file paywall** — pricing decision by PDP; not addressable in our tool +- **Vendor over-promising on "full automation"** — competitive-positioning concern, not a feature + +--- + +## 4. 
One-paragraph summary (for your notes doc) + +> Two AASCU Intermediaries described pain in three layers: (1) **PDP dashboard quality** — inaccurate cohort numbers, buried definitions, wrong chart types, no data export, forcing IR staff into manual Excel work; (2) **AI/governance requirements** — FERPA-plus expectations including data lineage, transparency on which data points are used and where stored, and explicit safeguards for sensitive student populations; and (3) **institutional process gaps** — submission knowledge typically lives with one person and varies wildly across campuses. Our tool already addresses a meaningful share of (1) — CSV export, sane chart types, NLQ, methodology page — and is in-flight on (2) via the SHAP-narrator work for explainability. The biggest unaddressed gaps are (a) a **definitions glossary with IPEDS/state-compliance cross-walks**, (b) **presentation-ready chart export** (PNG/PDF with definitions baked in), (c) a **data-lineage view** that proves where each number came from, and (d) **AI transparency + sensitive-population safeguards** as a precondition for institutional adoption. For datathon grouping, AASCU recommends pairing institutions by *shared SIS plus shared institutional goal* (e.g., advising) rather than SIS alone. + +--- + +## 5. Suggested next step + +If you confirm the priorities above, I'll draft `gh issue create` commands for each P0/P1 — matching the labeling style of the existing repo (`area:*`, `type:feature`, `priority:*`) — and show them for review before creating anything. 
diff --git a/docs/aascu_intermediary_feedback_summary_gdocs.md b/docs/aascu_intermediary_feedback_summary_gdocs.md new file mode 100644 index 0000000..cda0ec8 --- /dev/null +++ b/docs/aascu_intermediary_feedback_summary_gdocs.md @@ -0,0 +1,156 @@ +# AASCU Intermediary Feedback — Summary & Tool Gap Analysis + +**Source:** AASCU Meeting Recording (~21 min) + typed notes from session + +**Participants:** Two AASCU Intermediaries (IFS) — one IR-focused (Andres), one data-engineering-focused (Dr. Prateek) + +**Date of analysis:** 2026-04-28 + +> **Context — what is an Intermediary (IFS)?** Intermediaries are organizations selected and funded by the Gates Foundation to support a network of institutions. They translate institutional needs into context, produce data-informed needs assessments, and connect institutions with support partners. They lean on each other as a peer cohort. Their feedback represents what *institutions they serve* are experiencing with PDP outputs. + +--- + +## 1. Pain Points Raised + +### A. Data accuracy & trust in PDP outputs + +- Incorrect cohort sizes and missing institutions/campuses in dashboard dropdowns +- Numbers reported by PDP dashboard appear inaccurate; intermediaries lack confidence in the data +- PDP appears to pull from the wrong dataset intermittently +- Andres: data is being **incorrectly processed at submission time** — institution data ends up with wrong values after going through PDP's pipeline +- No cross-source verification — intermediaries and institutions have to take each other's word + +### B. Definitions & terminology + +- Terms like "completion rate at 3/4/5 years" and "first-to-second-year retention" are not clearly explained in-context +- Definitions are buried in external documentation +- PDP terminology differs from what institutions report for state compliance and IPEDS, creating cognitive friction + +### C. 
Visualization & data export + +- Wrong chart types (e.g., line chart used to compare independent cohorts — should be bar) +- Charts are not presentation-ready; IR staff can't show them to supervisors as-is +- **No data download from the dashboard.** Dr. Prateek had to manually copy individual numbers into Excel to build his own visualizations +- The "analysis-ready file" is a paid add-on (~$20K), so most institutions don't have programmatic access to their own data + +### D. AI/governance constraints (for any new tool, including ours) + +- **FERPA compliance** is table-stakes +- Beyond FERPA: concerns about "weaponizing data" via AI — broad analyses without institutional context can harm under-resourced campuses +- Sensitive student populations (immigrant, undocumented, public-aid students) need special handling — clarity on which data points are used, and why +- Need full **transparency** — homegrown vs. vendor model, what inputs are used, where data is stored +- Need full **data lineage / governance** — "where did this number come from" must be answerable end-to-end +- Vendors that promise "full automation" frequently underdeliver — so claims must be defensible + +### E. Institutional/process challenges (people, not tech) + +- Knowledge tends to live with **one person**; when they retire, no one else knows how the institution submits its data +- Each campus has wildly different submission processes (e.g., one campus delists and re-uploads everything every cycle) +- Institutions have very different sizes, types (university vs. junior college vs. multi-campus system), and goals — generic tools struggle to fit + +### F. Datathon coordination feedback + +- AASCU has a list of every institution's SIS — they can group institutions by **most common SIS** OR **most need** +- Recommendation: group by **shared institutional goals** (e.g., "advising") in addition to shared SIS — that produced the strongest cross-talk in their program + +--- + +## 2. 
What the Tool Already Addresses + +Mapping the pain points to what's already built or in progress in `codebenders-dashboard`. Status legend: **Done** = shipped; **In progress** = active branch or open issue; **Partial** = some coverage, room to grow. + +**No data download (pain point C)** — *Done.* CSV export wired into the dashboard via `components/export-button.tsx` (issue #15). + +**Wrong chart types (pain point C)** — *Done.* Recharts-based components with chart types chosen per metric: `retention-risk-chart.tsx`, `risk-alert-chart.tsx`, `readiness-assessment-chart.tsx`. + +**Buried definitions (pain point B)** — *Partial.* `components/info-popover.tsx` exists for tooltips, but no centralized glossary or IPEDS/state cross-walks yet. + +**AI transparency / methodology (pain point D)** — *Done (foundation).* `app/methodology/` page documents how predictions are made. Will be deepened by the new AI Transparency Page issue. + +**FERPA compliance (pain point D)** — *Done.* RBAC (#75), audit log (#67), FERPA-compliant student-detail and SIS deep-link (#77, #78) — all closed. + +**Self-service & automation (pain point D)** — *Done.* Self-service data upload for PDP, AR files, student and course data shipped under issue #86, with UI in `components/upload/`. + +**Explainability / "why this number" (pain point D)** — *In progress.* SHAP narrator fine-tuning epic (issues #97–103, current branch `fine-tuning/97-shap-narrator-task-type`). Directly answers Andres's "broad analysis without context" concern. + +**Data validation on upload (pain point A)** — *Partial.* Upload exists (#86); the human-readable validation report layer is the new issue #110. + +**Filtering by cohort/term/demographic (pain point C)** — *Done.* Issues #66 and #81 (closed) shipped cohort/term/demographic/credential-type filters. + +**Natural-language query against the data (pain point C)** — *Done.* NLQ interface live via `lib/prompt-analyzer.ts`; issues #17, #61, #88, #90 (all closed). 
+ +**Knowledge siloed in one person (pain point E)** — *Partial.* Self-service upload reduces single-person dependency, but no in-app submission runbook yet — addressed by new issue #111. + +**In short:** The tool already addresses a *substantial portion* of the pain — particularly the export/visualization gap (C), AI methodology transparency (D), and FERPA basics (D). The current SHAP narrator work (D) directly speaks to the "lack of context" risk Andres raised. + +--- + +## 3. Gaps — Issues Filed + +These are pain points the tool does **not yet** address adequately. Issues have been created in GitHub. + +### P0 — Differentiators that match the loudest complaints + +**Issue #105 — Metric definitions glossary with IPEDS and state-compliance cross-walks** + +Pain points: B (definitions buried, terminology mismatch with IPEDS/state compliance). + +Every metric/KPI surfaces a hover tooltip with: PDP definition, IPEDS-equivalent (if any), state-compliance term (if any). Centralized `/glossary` page indexed by metric. Source-of-truth markdown so definitions are versioned with the code. + +**Issue #106 — Presentation-ready chart export (PNG/PDF) with definitions baked in** + +Pain point: C (charts can't be shown to supervisors). + +"Export as PNG / PDF" on every chart. Includes title, definitions, data source, and date stamp baked into the export. Removes the manual "copy numbers to Excel and rebuild" workflow. + +**Issue #107 — Data lineage view ("where did this number come from")** + +Pain points: A (trust gap), D (lineage requirement). + +Click any number → see source rows, which upload it came from, transformations applied, and timestamp. Critical for the AI-trust story: "we can prove every number." This is the highest-leverage gap from the session. + +### P1 — AI governance hardening (table-stakes for institutional adoption) + +**Issue #108 — AI Transparency Page** + +Pain point: D (which data points used, where stored, what model). 
+ +Lists every model (retention, GPA, etc.), the input features, the training data source, and model lineage. Shows whether the model is homegrown or third-party. Lists the LLM provider for NLQ + SHAP narrator and where prompts/data flow. + +**Issue #109 — Sensitive-population safeguards** + +Pain point: D (immigrant/undocumented/public-aid students; weaponization risk). + +Configurable "do not use" list of demographic features per institution. When predictions are generated for flagged sub-populations, surface a context warning ("low sample size — interpret with care"). Audit-log entry for any query/export that touches flagged populations. + +**Issue #110 — Upload validation report with human-readable error surface** + +Pain points: A (data-processing errors at submission), E (one person knows how). + +After upload: row-level errors, field coercions, deduplication decisions. Must be readable by a non-technical IR staffer (so the tool survives the "person retires" scenario). "Diff vs. last upload" view so anomalies (e.g., dropped campuses) are caught immediately. + +### P2 — Process / institutional fit + +**Issue #111 — Submission runbook generator** + +Pain point: E (knowledge siloed in one person). + +Tool records the exact upload steps + field mappings that "worked" for an institution. Generates a printable/PDF runbook so a successor can replicate without tribal knowledge. + +**Issue #112 — Institution-grouping helper for shared-goal cohorts** + +Pain point: F (datathon coordination). + +Operational, not a user-facing tool feature. Pull the SIS-by-institution list from AASCU. Cross-reference with stated institutional goals (e.g., "advising"). Output a candidate grouping for the fall datathon. 
+ +### Out of scope (intentionally not filed) + +- **PDP-side cohort accuracy / dropdown completeness** — that's PDP's bug to fix; we shouldn't build around it +- **The $20K analysis-ready file paywall** — pricing decision by PDP; not addressable in our tool +- **Vendor over-promising on "full automation"** — competitive-positioning concern, not a feature + +--- + +## 4. One-paragraph summary + +Two AASCU Intermediaries described pain in three layers: (1) **PDP dashboard quality** — inaccurate cohort numbers, buried definitions, wrong chart types, no data export, forcing IR staff into manual Excel work; (2) **AI/governance requirements** — FERPA-plus expectations including data lineage, transparency on which data points are used and where stored, and explicit safeguards for sensitive student populations; and (3) **institutional process gaps** — submission knowledge typically lives with one person and varies wildly across campuses. Our tool already addresses a meaningful share of (1) — CSV export, sane chart types, NLQ, methodology page — and is in-flight on (2) via the SHAP-narrator work for explainability. The biggest unaddressed gaps are (a) a **definitions glossary with IPEDS/state-compliance cross-walks**, (b) **presentation-ready chart export** (PNG/PDF with definitions baked in), (c) a **data-lineage view** that proves where each number came from, and (d) **AI transparency + sensitive-population safeguards** as a precondition for institutional adoption. For datathon grouping, AASCU recommends pairing institutions by *shared SIS plus shared institutional goal* (e.g., advising) rather than SIS alone. 
diff --git a/docs/aascu_intermediary_postmortem.md b/docs/aascu_intermediary_postmortem.md new file mode 100644 index 0000000..57681e7 --- /dev/null +++ b/docs/aascu_intermediary_postmortem.md @@ -0,0 +1,180 @@ +# AASCU Intermediary Discovery — Postmortem + +**Session:** Discovery conversation with two AASCU Intermediaries (IFS) + +**Participants:** Andres (IR-focused), Dr. Prateek (data-engineering-focused) + +**Source materials:** `AASCU Meeting Recording.m4a` (~21 min) + typed session notes + +--- + +## Postmortem (4L) + +### Liked + +- Both intermediaries showed up candid and specific — concrete examples (line-graph misuse, the $20K AR-file paywall, the retiring-IR-lead anecdote) rather than abstract complaints. +- Strong alignment between the two IFS on the *root* problem ("institutions can't trust or use PDP outputs"), even though they emphasized different layers — that means the pain is real, not idiosyncratic. +- They volunteered AI-governance concerns unprompted, which validates that data lineage and transparency aren't over-engineering — they're table stakes. +- Operational asset surfaced: AASCU has a SIS-by-institution list ready to share for datathon grouping. + +### Learned + +- The $20K analysis-ready file paywall is a real adoption barrier — most institutions are working off the dashboard alone. +- Submission errors at the PDP processing layer (not at the institution) are corrupting downstream numbers — institutions get blamed for problems that aren't theirs. +- Knowledge silos at institutions are *the* fragility point. One person retires and submission capability evaporates. +- IR staff are already manually re-creating PDP charts in Excel before showing them to supervisors. The "rebuild it cleaner" workflow is a daily, repeated tax. +- "Weaponizing data" is the IFS's mental model for AI risk — not technical risk, but institutional-harm risk to under-resourced campuses. 
+- Datathon institutions should be grouped by **shared SIS + shared goal** (e.g., advising), not SIS alone. Last year's "wildly different incomes" cohort cost most of the day to find common ground. + +### Lacked + +- No live screen-share or walk-through of the PDP dashboard during the call — we're working from secondhand descriptions. +- No representative from an actual institution (only intermediaries describing what institutions experience). +- No quantification — we don't know *how many* institutions are affected by each pain point or how often. +- No exposure to the AR file format itself, since neither IFS has paid for it. Our assumptions about its structure are still assumptions. + +### Longed For + +- A side-by-side "PDP says X, our tool says Y" comparison on the same dataset, to make the differentiation tangible. +- A direct line to one or two institutions to validate the IFS-described pain points firsthand. +- Definition cross-walks already published somewhere (state compliance ↔ IPEDS ↔ PDP) we could reuse rather than build. +- Concrete examples of the "data weaponization" cases the IFS feared — to design safeguards against real scenarios, not hypotheticals. + +--- + +## Immediate Takeaways + +### Broad strokes — what we heard + +Three layers of pain, two of them addressable by our tool: + +- **PDP dashboard quality** (charts, definitions, export, accuracy): partially addressable — we already cover much of this, with clear gaps to close. +- **AI/governance expectations** (lineage, transparency, sensitive populations): fully addressable — and it's now table-stakes for institutional adoption. +- **Institutional process gaps** (knowledge silos, submission inconsistency): addressable via tooling that captures and replays institutional knowledge. 
+ +### Common challenges (both IFS independently raised) + +- Data accuracy / trust gap with PDP outputs +- Buried, jargon-heavy definitions that don't match IPEDS or state compliance terms +- Visualizations that aren't presentation-ready +- AI governance: full transparency, full data lineage, sensitive-population care +- Need for full automation — and skepticism that vendors deliver on it + +### Unique challenges + +- **Andres (IR-focused):** emphasized incorrect submission processing at PDP — the bug is in the pipeline, not the source data; and the knowledge-silo fragility (retiring IR lead). +- **Dr. Prateek (data-engineering-focused):** emphasized hands-on friction — manually copying numbers into Excel, no download path, the cost of presentation rebuilds. + +### Clear problem space + +The tool is positioned to be **"PDP outputs you can trust, present, and govern"** — three things institutions can't get from PDP alone today. The differentiator is provable lineage + presentation-ready outputs + AI safeguards, layered on the predictions we already produce. + +--- + +## Flags + +### Red Flags — risk / concerns + +- **AI weaponization risk is real to IFS, not theoretical.** Any misstep on sensitive populations (immigrant, undocumented, public-aid students) torpedoes institutional trust permanently. +- **PDP-side data corruption is outside our control** but could be blamed on us if we don't surface it clearly. We need to make the source-of-truth boundary explicit. +- **Vendors in this space have a credibility deficit** ("vendors promise full automation; that does not happen"). We will be measured against that history. +- **FERPA-plus expectations.** FERPA alone isn't enough — institutions want assurances on AI use, storage, and lineage that go beyond statute. + +### Green Flags — areas of opportunity + +- **Data lineage view** is unmet by PDP and is the single highest-leverage differentiator (see issue #107). Directly answers the trust gap. 
+- **Presentation-ready chart export** (issue #106) eliminates a daily manual workflow for IR staff — high frequency, high pain, low-medium build cost. +- **SHAP narrator** (in-flight, issues #97-103) maps almost word-for-word to the IFS's "AI without context" concern. Strong narrative for the datathon demo. +- **Self-service upload** (#86, shipped) already lowers the knowledge-silo barrier — one of the loudest pain points is partially solved before we walked in. +- **AASCU has the SIS list ready** — we can produce the datathon grouping artifact (#112) on a fast turnaround. + +### Yellow Flags — potential challenges + +- **Definitions cross-walk** (issue #105) requires authoritative IPEDS / state-compliance source data we don't have yet. Could become a research drag if the cross-walks aren't already documented somewhere. +- **Institutional fit varies wildly** — university vs. junior college vs. multi-campus systems have different use cases. A generic tool will fight the same "wildly different incomes" problem the last datathon hit. +- **Scope creep risk on lineage view (#107).** Done well it's a differentiator; done shallowly it's a feature flag. Needs a clear scope cut for datathon. +- **No first-party institution voice yet.** All feedback is via intermediaries. There may be sub-pain-points or different priorities at the institution level we haven't surfaced. + +### Prep — what needs to be true / what we need to find out + +- Confirm with AASCU whether IPEDS / state-compliance ↔ PDP cross-walks already exist anywhere we can reuse. +- Get the SIS-by-institution list from AASCU and overlay institutional goals to draft datathon groupings (issue #112). +- Identify 1-2 institutions willing to do a 30-min validation call before the datathon to confirm the IFS-described pain points firsthand. +- Decide whether to scope a "lineage view MVP" (e.g., one metric only) for the datathon vs. building it post-event. 
+- Get clarity on which AI provider and data-flow story we'll commit to publicly (input for #108 AI Transparency Page). +- Verify the FERPA/RBAC implementation (#75) covers sensitive-population audit-log requirements (#109), or scope the gap. + +--- + +## Post Work — per IFS + +### Andres (IR-focused intermediary) + +**Completed template (discovery meeting):** + +- Role: IR-focused IFS, working with multiple institutions on submission and dashboard interpretation +- Primary pain: data accuracy at submission/processing layer + knowledge silos +- Top quote: *"Maybe the dashboard will sometimes pull from the wrong set"* and *"a lot of the knowledge tends to live with one person"* +- Asks of our tool: provable accuracy, transparency on data flow, FERPA-plus governance + +**Ideation session — finalized problem space (Andres):** + +- "PDP outputs you can prove and audit." Center the data-lineage view (#107) and AI Transparency Page (#108) for his use cases. +- Adjacent: upload validation report (#110) addresses his "submission processing is corrupting data" concern by catching anomalies institutions can act on. + +**Immediate follow-up questions for Andres:** + +1. Can you share examples (sanitized) of the "wrong dataset" PDP pulls — so we can model what verification looks like? +2. Of the institutions you support, which 1-2 would you prioritize for a 30-min validation call before the fall datathon? +3. Has AASCU already documented IPEDS ↔ PDP definition cross-walks anywhere we can reuse? +4. What's the typical IR team size at your institutions, and how does that change what "self-serve" means in practice? + +**Concerns / flags (Andres):** + +- Strong skepticism of vendor over-promising — we should under-promise and demo working features, not slideware. +- The "person retired" example suggests our submission runbook generator (#111) lands on real, recurring pain. 
+ +**Datathon ideas + skills needed (Andres):** + +- Data-engineering / pipeline-focused track: ingestion, validation, lineage tracking +- Skills: SQL, data modeling, audit-log design, Python/pandas for pipeline work + +### Dr. Prateek (data-engineering-focused intermediary) + +**Completed template (discovery meeting):** + +- Role: data-engineering-focused IFS, hands-on with PDP outputs and visualization rebuilds +- Primary pain: friction in extracting, presenting, and trusting data +- Top quote: *"I just went by each data point and then I had to copy that number ... into the Excel spreadsheet"* +- Asks of our tool: easy export, clean visuals, careful AI handling of sensitive populations + +**Ideation session — finalized problem space (Dr. Prateek):** + +- "PDP outputs you can present and reuse." Center the presentation-ready chart export (#106), definitions glossary (#105), and sensitive-population safeguards (#109) for his use cases. +- Adjacent: data-lineage view (#107) supports his "is this number right?" reflex. + +**Immediate follow-up questions for Dr. Prateek:** + +1. Walk us through your last "rebuild it in Excel" cycle — what did you have to do, step by step? (To scope #106 well.) +2. Which sensitive-population categories should we make first-class in our exclusion lists? (Input for #109.) +3. What chart types do you wish PDP offered that it doesn't? +4. When you talk to your supervisors, what one or two charts do they actually care about? (To prioritize export polish.) + +**Concerns / flags (Dr. Prateek):** + +- AI sensitivity around immigrant/undocumented/public-aid students is paramount. We must demo our safeguards, not just claim them. +- "Data is very sensitive" — implies he'll scrutinize our data-flow disclosures hard. The AI Transparency Page (#108) needs to be airtight. + +**Datathon ideas + skills needed (Dr. 
Prateek):** + +- Front-end / visualization / UX track: chart export, glossary tooltips, presentation polish +- Skills: React, Recharts/D3, design sensibility, technical writing for definitions content + +--- + +## What's next (mechanical) + +- 8 GitHub issues filed (#105-#112, see `docs/aascu_intermediary_feedback_summary.md` section 3) +- Validation calls to schedule with 1-2 institutions +- IPEDS / state-compliance cross-walk research before #105 (glossary) starts +- AASCU SIS list + goals overlay to drive #112 (datathon grouping) diff --git a/docs/superpowers/specs/2026-04-02-fine-tuning-student-explainability-design.md b/docs/superpowers/specs/2026-04-02-fine-tuning-student-explainability-design.md new file mode 100644 index 0000000..dd81ea1 --- /dev/null +++ b/docs/superpowers/specs/2026-04-02-fine-tuning-student-explainability-design.md @@ -0,0 +1,305 @@ +# Design Spec: Fine-Tuning for Student Explainability + +**Date:** 2026-04-02 +**Epic label:** `fine-tuning: student-explainability` +**Epic branch:** `fine-tuning/student-explainability` +**Status:** Draft + +--- + +## 1. Goal + +Fine-tune a small language model (Qwen 3.5) on Bishop State domain data to replace GPT-4o-mini for three inference tasks in the dashboard. The primary value is improved explainability: advisors get SHAP-grounded, institution-aware narratives instead of templated rule-engine output. Secondary benefits include FERPA compliance (all inference on-premises), offline deployment, and institutional scalability. 
+ +### Tasks to Fine-Tune + +| Task | Input | Output | Priority | +|------|-------|--------|----------| +| **SHAP Narrator** | SHAP values + student profile + risk factors | Grounded advisor narrative + interventions | Highest (new) | +| **Summarizer** | Query results + original question | Plain-English summary for advisors | Medium (exists) | +| **Explainer** | Course pairing stats (DFWI, delivery, instructor) | Data-driven analysis + recommendation | Medium (exists) | + +### Out of Scope + +- Query Analyzer (NL → SQL) — high risk, deferred to future epic +- Model serving infrastructure (RunPod, dedicated GPU hosting) — use local Ollama for now + +## 2. Prerequisites + +Before the epic branch is created: + +1. **Merge `feature/distillation-pipeline` → `main`** — brings in `training/` pipeline modules, `schools/bishop-state/config.yaml`, seed queries, `model-client.ts` +2. **Merge `feature/shap-explainability` → `main`** — brings in per-student SHAP computation (Step 10b), SHAP-aware `enrich_with_llm()`, student API SHAP exposure, feasibility report + +## 3. 
Epic Structure + +### Branching + +- **Epic branch:** `fine-tuning/student-explainability` (from `main` after prereq merges) +- **Feature branches:** `fine-tuning/issue-N-description` → PR into epic branch +- **Final PR:** epic branch → `main` + +### Issue Breakdown + +``` + +---------------+ + | #1 Prereq: | + | Merge both | + | branches | + +-------+-------+ + | + +------------+------------+ + v v v + +----------+ +----------+ +----------+ + | #2 SHAP | | #3 Colab | | #4 Distill| + | narrator | | notebook | | summarizer| + | task type| | (Unsloth)| | + explain | + +----+-----+ +----+-----+ +----+-----+ + | | | + v | | + +----------+ | | + | #5 Distill| | | + | SHAP | | | + | narrator | | | + +----+-----+ | | + | | | + +-------------+------------+ + v + +----------+ + | #6 Train | + | 4B + 9B | + | evaluate | + +----+-----+ + | + +----+-----+ + v v + +----------+ +----------+ + | #7 Export | | #8 Update| + | + wire | | docs & | + | dashboard| | report | + +----------+ +----------+ +``` + +| # | Title | Description | Depends | Labels | +|---|-------|------------|---------|--------| +| 1 | Merge distillation-pipeline and shap-explainability to main | Merge both feature branches, resolve conflicts, verify CI | — | `type:chore` | +| 2 | Add SHAP narrator task type to training pipeline | New prompt template, output schema, seed data generator, eval metrics | #1 | `type:feature`, `area:ai` | +| 3 | Build Colab training notebook (Unsloth + LoRA) | Single "Run All" notebook, parameterized config, 3-phase training, GGUF export. Replace `training/finetune.py` (MLX) with Unsloth wrapper. | #1 | `type:feature`, `area:ai` | +| 4 | Distill training pairs for summarizer and explainer | Run distillation for both existing tasks (~1,500 pairs each via Claude API). Prepare datasets. | #1 | `type:feature`, `area:ai` | +| 5 | Distill training pairs for SHAP narrator | Generate ~1,500 SHAP narrator pairs from student data + SHAP values. Requires SHAP data in DB. 
| #2 | `type:feature`, `area:ai` | +| 6 | Train and evaluate 4B + 9B models | Run Colab notebook for both model sizes. Evaluate via ship criteria. Compare metrics, pick winner. | #3, #4, #5 | `type:spike`, `area:ai` | +| 7 | Export models and wire into dashboard | GGUF export, Ollama registration, wire `model-client.ts` into consumer routes, update `enrich_with_llm` model string. | #6 | `type:feature`, `area:ai`, `area:frontend` | +| 8 | Update documentation and feasibility report | Update feasibility report with actual results, update README and CLAUDE.md. | #6 | `type:documentation` | + +### Parallelism + +Issues #2, #3, and #4 can proceed concurrently after #1. Issue #5 waits only on #2. Issue #6 is the convergence point. Issues #7 and #8 are parallel after #6. + +## 4. Colab Notebook Design + +### Principles + +- **Single "Run All" execution.** No babysitting. No manual cell-by-cell. +- **Parameterized at the top.** One config cell is the only thing the user edits. +- **Checkpoint and resume.** If Colab disconnects, set `SKIP_DOMAIN_ADAPTATION=True` to resume from Phase 2. +- **Chat template alignment.** Uses `tokenizer.apply_chat_template()` throughout — never manual ChatML tokenization (D4BL's critical lesson). 
+ +### Notebook Structure + +``` +Cell 1: Configuration (ONLY cell the user edits) +------------------------------------------------- +SCHOOL = "bishop-state" +MODEL_SIZES = ["4b", "9b"] +REPO_URL = "https://github.com/codebenders/datathon.git" +REPO_BRANCH = "fine-tuning/student-explainability" +HF_TOKEN = "" # or userdata.get('HF_TOKEN') +PHASE_1_EPOCHS = 1 +PHASE_2_EPOCHS = 7 +SKIP_DOMAIN_ADAPTATION = False # True to reuse cached Phase 1 + +Cell 2+: Fully autonomous +------------------------------------------------- +- GPU detection + validation (assert A100/T4/L4) +- pip install unsloth, trl, peft +- Clone repo, load schools/{SCHOOL}/config.yaml +- For each model size: + - Phase 1: Domain adaptation + - Load base Qwen model via Unsloth (4-bit NF4) + - Train on training_data/{school}/domain.jsonl + - LoRA rank 16, all modules, 1 epoch, lr 2e-4, effective batch 32 + - Save merged checkpoint + - Phase 2: Task adapters (narrator, summarizer, explainer) + - Load Phase 1 checkpoint + - Train LoRA adapter per task + - Eval after each task, print ship-criteria table + - Narrator: LoRA r=16, attention+FFN, 7 epochs, lr 1e-4 + - Summarizer: LoRA r=8, attention only, 7 epochs, lr 1e-4 + - Explainer: LoRA r=16, attention+FFN, 4 epochs, lr 1e-4 + - Phase 3: GGUF export + - Quantize each task adapter to q4_k_m + - Upload to Google Drive (or HF Hub if HF_TOKEN provided) +- Print comparison table: 4B vs 9B metrics across all tasks +- Recommend winner based on ship criteria +``` + +### Training Hyperparameters + +Based on D4BL's proven configurations: + +| Parameter | Phase 1 (Domain) | Phase 2 (Tasks) | +|-----------|------------------|-----------------| +| LoRA rank | 16 | 8-16 (task-dependent) | +| LoRA alpha | 32 | 16-32 | +| Learning rate | 2e-4 | 1e-4 | +| Batch size (per device) | 8 | 4-8 | +| Gradient accumulation | 4 | 2-4 | +| Epochs | 1 | 4-7 | +| Max sequence length | 4096 | 4096-8192 | +| Optimizer | AdamW 8-bit | AdamW 8-bit | +| Precision | bf16 (A100) | bf16 
(A100) | + +### What the Notebook Does NOT Do + +- Does not run distillation (that's local via `python -m training.distill`) +- Does not register Ollama models (local after downloading GGUFs) +- Does not modify the repo (read-only clone for config + training data) + +## 5. SHAP Narrator Task Design + +### New Task Type: `narrator` + +This is the highest-value task — it transforms per-student SHAP attribution data into advisor-facing narratives that explain *why* a student is at risk and *what specifically to do about it*. + +### Input Format (at inference) + +```json +{ + "student_profile": { + "enrollment_intensity": "Part-Time", + "gpa_year1": 1.4, + "math_placement": "R", + "course_completion_rate": 0.55, + "gateway_math_completed": false, + "at_risk_alert": "HIGH", + "retention_probability": 0.28 + }, + "readiness_score": 0.38, + "readiness_level": "low", + "risk_factors": [ + "Low first-year GPA (1.4 / 4.0)", + "Gateway math not completed in Year 1" + ], + "shap": { + "retention": { + "base_value": 0.52, + "top_positive": [ + {"feature": "total_credits_attempted", "shap_value": 0.05, "value": 12.0} + ], + "top_negative": [ + {"feature": "CompletedGatewayMathYear1", "shap_value": -0.18, "value": 0.0}, + {"feature": "Enrollment_Intensity_First_Term", "shap_value": -0.12, "value": 1.0} + ] + }, + "gateway_math": { ... }, + "low_gpa": { ... } + } +} +``` + +### Output Schema + +```json +{ + "narrative": "2-3 sentence explanation grounded in SHAP attribution", + "key_drivers": [ + "Gateway math not completed (-0.18 on retention)", + "Part-time enrollment (-0.12 on retention)" + ], + "recommended_actions": [ + "Priority enrollment in MAT 100 next term", + "Explore full-time enrollment options and financial aid", + "Connect with Math Bootcamp (2x pass rate for participants)" + ], + "data_limitations": [ + "Retention model trained on 2019-2023 cohorts; 2024+ patterns may differ" + ] +} +``` + +### Distillation Strategy + +1. 
Pull ~4K students from `student_level_with_predictions` joined with `llm_recommendations` +2. For each medium/low readiness student (~2K): build input from `shap_explanations` + `input_features` columns +3. Send to Claude (teacher model) with system prompt grounded in Bishop State context from `config.yaml` +4. Validate output JSON schema, deduplicate (drop pairs with a Jaccard similarity of 1.0, i.e. exact duplicates), split 80/10/10 +5. Target: ~1,500 validated training pairs + +### Eval Metrics (Ship Criteria) + +| Metric | Threshold | Blocking? | +|--------|-----------|-----------| +| `json_valid_rate` | >= 95% | Yes | +| `schema_valid_rate` | >= 90% | Yes | +| `shap_grounding_rate` | >= 80% (narrative mentions >= 2 of top-3 SHAP features) | Yes | +| `action_specificity` | LLM-judged: are actions Bishop State-specific? | No | + +## 6. Dashboard Integration + +### Model Client as Single Adapter + +`model-client.ts` becomes the sole inference routing layer. Existing routes (`explain-pairing/route.ts`, `query-summary/route.ts`) that currently instantiate their own OpenAI clients will be refactored to call `generateExplanation()` and `generateSummary()` from `model-client.ts`. + +### Ollama Model Naming + +``` +bishop-state-narrator:{size} # SHAP narrator +bishop-state-summarizer:{size} # Query summary +bishop-state-explainer:{size} # Course pairing +``` + +Where `{size}` is `4b` or `9b` based on evaluation results. + +### SHAP Narrator Integration Point + +`generate_readiness_scores.py` already has `--enrich-with-llm` with the SHAP-aware prompt. 
The only change is the model string: + +```bash +# Before (OpenAI) +python ai_model/generate_readiness_scores.py --enrich-with-llm --llm-model gpt-4o-mini + +# After (fine-tuned) +python ai_model/generate_readiness_scores.py --enrich-with-llm --llm-model ollama/bishop-state-narrator:4b +``` + +### Environment Variables + +```env +MODEL_BACKEND=ollama # or "openai" (fallback) +OLLAMA_BASE_URL=http://localhost:11434 +MODEL_SIZE=4b # set after evaluation picks winner +SCHOOL_CODE=bishop-state +``` + +### Fallback Behavior + +The operator sets `MODEL_BACKEND` to either `ollama` or `openai`. There is no automatic failover — if Ollama is down and `MODEL_BACKEND=ollama`, the route returns an error. This is intentional: silent fallback to OpenAI would send student data to an external service without the operator's knowledge, violating the FERPA benefit. + +## 7. Cost Estimate + +| Item | Cost | +|------|------| +| Claude API distillation (~4,500 pairs across 3 tasks) | $5-10 | +| Colab A100 compute (~4 hours for 2 model sizes) | $8-16 | +| **Total per training run** | **$13-26** | +| Iteration runs (subsequent) | $8-16 each | + +## 8. Success Criteria + +The epic is complete when: + +1. All three tasks pass ship criteria on the winning model size +2. `MODEL_BACKEND=ollama` serves all three tasks in the dashboard without OpenAI +3. SHAP narrator produces grounded narratives that cite specific feature attributions +4. Feasibility report is updated with actual metrics and model selection rationale +5. Colab notebook is documented and reproducible (clone + Run All)