# Source: githubusers/generate_stats.py at main · ArchiveBox/githubusers · GitHub
#!/usr/bin/env python3
"""
generate_stats.py — mine pirate's git history across local clones and GitHub.

Pipeline:
  1. Walk local filesystem for .git dirs (under /Users/squash/Local & Documents).
  2. Mine each repo with `git log --no-merges --numstat` filtered to pirate's
     author identities. Cache per-repo results to JSON.
  3. Query gh API for owned repos. Any owned repo not covered locally is
     pulled via `gh api /repos/.../commits` + per-commit stats fetch.
  4. Query gh search/commits to discover repos pirate has contributed to
     but does not own. Fetch missing commits via API.
  5. Dedupe by SHA across all sources, aggregate by year/repo/day, and
     render a single-file stats.html using an embedded template.

Run:  python3 generate_stats.py            # full run, uses cache
      python3 generate_stats.py --no-api   # local-only, skip GitHub API
      python3 generate_stats.py --render   # re-render HTML from cached data
"""
from __future__ import annotations

import argparse
import json
import os
import re
import subprocess
import sys
import time
from collections import defaultdict
from datetime import datetime, timezone, date, timedelta
from pathlib import Path
from typing import Iterable, Iterator

# ---------------------------------------------------------------------------
# Configuration (mutable at startup via load_config / CLI flags).
# Defaults below are pirate's personal config; override via --user / --config.
# ---------------------------------------------------------------------------

# GitHub login and display name of the user whose history is mined.
# GH_LOGIN is used in API queries, in the local git author filter, and to
# decide whether cache paths get namespaced (see rebind_cache_paths).
GH_LOGIN = "pirate"
GH_NAME = "Nick Sweeting"

# Optional: when set, the script POSTs phase updates to this URL so the
# live "mining…" page can show real-time progress. The Worker's
# /api/progress endpoint checks the Bearer token against GH_DISPATCH_TOKEN.
# Overridable via the STATS_PROGRESS_URL environment variable.
PROGRESS_URL = os.environ.get(
    "STATS_PROGRESS_URL",
    "https://githubusers.archivebox.io/api/progress",
)
# Bearer token read from STATS_PROGRESS_TOKEN; empty string when unset.
PROGRESS_TOKEN = os.environ.get("STATS_PROGRESS_TOKEN", "")

# Known author emails pirate has used over the years.
# mine_local_repo compares the *lowercased* commit author email against this
# set, so entries must be lowercase to ever match.
PIRATE_EMAILS = {
    "nikisweeting@gmail.com",
    "githubpirate@gmail.com",
    "nickwentboom@gmail.com",
    "nick@sweeting.me",
    "git@sweeting.me",
    "github@sweeting.me",
    "git@nicksweeting.com",
    "root@home.sweeting.me",
    "pirate@browserbase.com",
    "511499+pirate@users.noreply.github.com",
    "pirate@users.noreply.github.com",
}

# Substrings that strongly indicate an author entry belongs to pirate.
# mine_local_repo searches for these inside the lowercased "<name> <email>"
# string, so entries must be lowercase as well.
PIRATE_NEEDLES = (
    "sweeting",
    "githubpirate",
    "nickwentboom",
    "pirate@browserbase",
    "511499+pirate",
)

# Whether to render personalized sections (career timeline, company colors).
# Set to False for "generic" runs of arbitrary GH users.
# Also selects which GitHub endpoint the repo listers use: /user/repos
# (authenticated user) when True, /users/<login>/repos when False.
PERSONALIZED = True

# Directory containing this script; all cache/output paths hang off it.
ROOT = Path(__file__).resolve().parent
# Cache paths are namespaced per user (default = pirate's). Re-bound in
# rebind_cache_paths() after load_config_from_file / auto_derive_config.
CACHE = ROOT / "cache"
# Per-repo local mining results, one JSON file per repo (see mine_all_local).
CACHE_REPOS = CACHE / "repos"
# NOTE(review): presumably holds bare clones; not used in the visible part
# of this file — confirm against the code that populates it.
CACHE_BARE = CACHE / "bare"
# Cached GitHub API responses (repo lists, per-repo commit lists, searches).
CACHE_API = CACHE / "api"
CACHE_AGG = CACHE / "commits_all.json"
TEMPLATE_FILE = ROOT / "stats_template.html"
# Rendered HTML report; also rewritten during incremental renders.
OUTPUT_FILE = ROOT / "stats.html"


def rebind_cache_paths() -> None:
    """Point the cache and output globals at per-user locations.

    The default login ("pirate") keeps the original ``cache/`` directory and
    ``stats.html`` for backwards compatibility, so nothing is rebound. Any
    other login gets ``cache_<login>/`` and ``stats_<login>.html`` (with
    characters outside ``[A-Za-z0-9_.-]`` replaced by ``_``) so runs for
    different users never overwrite each other's files.
    """
    global CACHE, CACHE_REPOS, CACHE_BARE, CACHE_API, CACHE_AGG, OUTPUT_FILE
    if GH_LOGIN != "pirate":
        # Sanitize the login before embedding it in file/directory names.
        slug = re.sub(r"[^A-Za-z0-9_.-]", "_", GH_LOGIN)
        user_cache = ROOT / f"cache_{slug}"
        CACHE = user_cache
        CACHE_REPOS = user_cache / "repos"
        CACHE_BARE = user_cache / "bare"
        CACHE_API = user_cache / "api"
        CACHE_AGG = user_cache / "commits_all.json"
        OUTPUT_FILE = ROOT / f"stats_{slug}.html"

# Filesystem roots that are walked for local .git directories.
SEARCH_DIRS = [
    Path("/Users/squash/Local"),
    Path("/Users/squash/Documents"),
]

# Path patterns to skip (vendored / nested git repos that are dependencies).
# find_local_repos drops a repo when any of these substrings occurs in the
# repo directory path with a trailing "/" appended.
EXCLUDE_PATH_PATTERNS = (
    "/node_modules/",
    "/.venv/",
    "/venv/",
    "/.cache/",
    "/__pycache__/",
    "/site-packages/",
    "/data/archive/",      # ArchiveBox snapshot mirrors
    "/lolcommits/",
    "/.bash-prompt/",      # oh-my-zsh themes
    "/.oh-my-zsh/",
    "/.oh-my-fish/",
    "/bash-utils/",
)

# Path-component (directory name) exclusions. We split the file path on
# '/' and exclude if ANY segment exactly matches one of these names —
# anchored matching avoids false positives like 'layout/' triggering on
# 'out/', or 'rebuild/' triggering on 'build/'.
# The comparison is case-sensitive and also applies to the final segment
# (the file's own basename).
EXCLUDE_PATH_COMPONENTS = frozenset({
    # Vendored / dependency directories
    "node_modules", "vendor", "bower_components", "third_party",
    ".venv", "venv", "site-packages", "__pycache__",
    # Build / dist / generated output (anchored so 'rebuild/' is kept)
    "dist", "build", "target", "out",
    ".next", ".nuxt", ".turbo", ".parcel-cache",
    "coverage", "htmlcov", ".pytest_cache", ".mypy_cache",
    ".tox", ".eggs",
    # ArchiveBox snapshot dirs (huge data dumps)
    "archive", "snapshots",
})

# Suffix exclusions for path segments: a file is excluded when ANY
# '/'-delimited segment of its path ends with one of these strings
# (e.g. 'foo.egg-info/PKG-INFO'). Kept separate from the frozenset above
# because that one only supports exact segment matches.
EXCLUDE_PATH_COMPONENT_SUFFIXES = (
    ".egg-info",
)

# File-name (basename) exact matches — lock files etc.
# Compared case-sensitively against the last path segment.
EXCLUDE_FILE_BASENAMES = {
    "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "npm-shrinkwrap.json",
    "poetry.lock", "Pipfile.lock", "uv.lock", "pdm.lock", "pylock.toml",
    "Gemfile.lock", "composer.lock", "Cargo.lock", "go.sum",
    "Podfile.lock", "mix.lock", "flake.lock", "deno.lock", "bun.lockb",
    ".gitmodules", ".DS_Store",
}

# File extensions excluded — binaries, media, archives, minified.
# Matched with str.endswith against the lowercased full path, so entries
# must be lowercase and may be compound suffixes such as ".min.js".
EXCLUDE_FILE_EXTS = {
    # Compiled / bytecode
    ".pyc", ".pyo", ".pyd", ".so", ".o", ".a", ".class", ".dll", ".dylib",
    ".jar", ".war", ".ear",
    # Images
    ".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".tif",
    ".ico", ".heic", ".heif", ".avif", ".raw",
    # Vector graphics (often huge embedded data)
    ".svg",
    # Video / audio
    ".mp4", ".mov", ".webm", ".avi", ".mkv", ".m4v", ".flv",
    ".mp3", ".wav", ".aac", ".m4a", ".ogg", ".flac", ".opus",
    # Archives
    ".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar", ".tgz", ".tbz",
    # Documents (binary)
    ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
    ".odt", ".ods", ".odp",
    # Fonts
    ".woff", ".woff2", ".ttf", ".otf", ".eot",
    # Minified / bundled / sourcemaps
    ".min.js", ".min.css", ".bundle.js", ".chunk.js", ".map",
    # Misc binaries
    ".bin", ".dat", ".db", ".sqlite", ".sqlite3", ".mmdb",
    # Generated/exported data dumps
    ".sql.gz", ".dump", ".pickle", ".pkl", ".parquet", ".feather",
    ".h5", ".hdf5", ".npy", ".npz", ".pt", ".pth", ".onnx",
}


# Lowercase file suffix → coarse language label.
# lang_for() scans this mapping in insertion order and returns the label of
# the first suffix the lowercased path ends with, so if overlapping suffixes
# are ever added (e.g. ".d.ts" next to ".ts") the more specific one must be
# listed first.
EXT_TO_LANG = {
    ".py": "Python", ".pyi": "Python", ".ipynb": "Python",
    ".js": "JavaScript", ".jsx": "JavaScript", ".mjs": "JavaScript", ".cjs": "JavaScript",
    ".ts": "TypeScript", ".tsx": "TypeScript",
    ".go": "Go",
    ".rs": "Rust",
    ".rb": "Ruby", ".erb": "Ruby",
    ".java": "Java",
    ".kt": "Kotlin", ".kts": "Kotlin",
    ".swift": "Swift",
    ".m": "Objective-C", ".mm": "Objective-C",
    ".c": "C", ".h": "C",
    ".cpp": "C++", ".cc": "C++", ".cxx": "C++", ".hpp": "C++", ".hh": "C++",
    ".cs": "C#",
    ".php": "PHP",
    ".scala": "Scala",
    ".clj": "Clojure", ".cljs": "Clojure",
    ".lua": "Lua",
    ".erl": "Erlang", ".hrl": "Erlang",
    ".ex": "Elixir", ".exs": "Elixir",
    ".dart": "Dart",
    ".hs": "Haskell",
    ".html": "HTML", ".htm": "HTML",
    ".css": "CSS", ".scss": "CSS", ".sass": "CSS", ".less": "CSS",
    ".vue": "Vue",
    ".svelte": "Svelte",
    ".astro": "Astro",
    ".md": "Markdown", ".mdx": "Markdown",
    ".rst": "reST",
    ".tex": "TeX",
    ".sh": "Shell", ".bash": "Shell", ".zsh": "Shell", ".fish": "Shell",
    ".ps1": "PowerShell",
    ".sql": "SQL",
    ".yaml": "YAML", ".yml": "YAML",
    ".toml": "TOML",
    ".json": "JSON",
    ".xml": "XML",
    ".proto": "Protobuf",
    ".graphql": "GraphQL", ".gql": "GraphQL",
    ".dockerfile": "Docker",
    ".tf": "Terraform", ".hcl": "HCL",
    ".vim": "Vim script",
    ".nix": "Nix",
    ".zig": "Zig",
    ".cr": "Crystal",
    ".jl": "Julia",
    ".r": "R",
    ".pl": "Perl", ".pm": "Perl",
    ".coffee": "CoffeeScript",
    ".elm": "Elm",
    ".purs": "PureScript",
    ".fs": "F#", ".fsi": "F#", ".fsx": "F#",
    ".ml": "OCaml", ".mli": "OCaml",
    ".re": "ReasonML", ".rei": "ReasonML",
    ".sol": "Solidity",
    ".move": "Move",
    ".cairo": "Cairo",
    ".v": "Verilog", ".sv": "SystemVerilog", ".vhdl": "VHDL",
    ".asm": "Assembly", ".s": "Assembly",
    ".lisp": "Lisp", ".scm": "Scheme",
    ".raku": "Raku",
}


def lang_for(path: str) -> str:
    """Map a file path to a coarse language label.

    Dockerfiles and Makefiles are recognised by basename (case-insensitive,
    with or without a dotted suffix such as ``Dockerfile.dev``). Everything
    else is classified by file suffix via EXT_TO_LANG. Empty or unrecognised
    paths yield ``"Other"``.
    """
    if not path:
        return "Other"
    filename = path.rsplit("/", 1)[-1].lower()
    for stem, label in (("dockerfile", "Docker"), ("makefile", "Make")):
        if filename == stem or filename.startswith(stem + "."):
            return label
    lowered = path.lower()
    # The first suffix (in EXT_TO_LANG insertion order) that the lowercased
    # path ends with decides the label.
    return next(
        (label for suffix, label in EXT_TO_LANG.items()
         if lowered.endswith(suffix)),
        "Other",
    )


def _is_excluded_file(path: str) -> bool:
    """Return True if the file path matches an exclusion pattern."""
    if not path:
        return False
    # Path-component match: any /-delimited segment in EXCLUDE_PATH_COMPONENTS
    # or matching one of the suffix patterns (e.g. *.egg-info).
    segments = path.split("/")
    for seg in segments:
        if seg in EXCLUDE_PATH_COMPONENTS:
            return True
        for suf in EXCLUDE_PATH_COMPONENT_SUFFIXES:
            if seg.endswith(suf):
                return True
    # Basename exact match (lock files etc.)
    base = segments[-1]
    if base in EXCLUDE_FILE_BASENAMES:
        return True
    # Extension match (lowercase, support compound suffixes like .min.js)
    plower = path.lower()
    for ext in EXCLUDE_FILE_EXTS:
        if plower.endswith(ext):
            return True
    return False

# Repo path → display name fallback when remote is unavailable.
def repo_display_name(local_path: Path, remote_url: str | None) -> str:
    """Pick the name a repo is reported under.

    When ``remote_url`` points at github.com (SSH ``git@github.com:o/r.git``
    or HTTPS ``https://github.com/o/r.git``), return ``"owner/repo"`` with any
    ``.git`` suffix and trailing slash removed. Otherwise fall back to the
    local directory name.
    """
    if not remote_url:
        return local_path.name
    found = re.search(r"github\.com[:/]([^/]+/[^/]+?)(?:\.git)?/?$", remote_url)
    return found.group(1) if found else local_path.name


# ---------------------------------------------------------------------------
# Local git mining
# ---------------------------------------------------------------------------

def find_local_repos(search_dirs: list[Path]) -> list[Path]:
    """Locate git working trees below the given roots.

    Each existing root is scanned with ``find`` (max depth 8, 180 s timeout)
    for directories named ``.git``; the parent of each hit is a repo. Repos
    whose path contains an EXCLUDE_PATH_PATTERNS substring are dropped and
    duplicates are collapsed. A root whose scan times out is reported on
    stderr and skipped.

    The result is ordered by ``.git/HEAD`` mtime, newest first, so incremental
    rendering surfaces recent activity early; repos without a readable HEAD
    sort last.
    """
    def head_recency(repo_dir: Path) -> float:
        # Negated mtime → ascending sort yields most recent first.
        try:
            head_file = repo_dir / ".git" / "HEAD"
            return -head_file.stat().st_mtime if head_file.exists() else 0.0
        except Exception:
            return 0.0

    # dict keys double as an insertion-ordered set of repo directories.
    found: dict[Path, None] = {}
    for base in search_dirs:
        if not base.exists():
            continue
        find_cmd = ["find", str(base), "-maxdepth", "8", "-type", "d", "-name", ".git"]
        try:
            listing = subprocess.run(
                find_cmd, capture_output=True, text=True, timeout=180,
            ).stdout
        except subprocess.TimeoutExpired:
            print(f"  ! find timed out on {base}", file=sys.stderr)
            continue
        for entry in listing.splitlines():
            candidate = Path(entry).parent
            # Trailing slash lets "/name/" patterns match the last component.
            probe = str(candidate) + "/"
            if any(pattern in probe for pattern in EXCLUDE_PATH_PATTERNS):
                continue
            found.setdefault(candidate, None)

    return sorted(found, key=head_recency)


def git_remote_url(repo: Path) -> str | None:
    """Return the ``origin`` remote URL configured for ``repo``.

    Yields ``None`` when no URL is configured, or when the ``git config``
    call fails for any reason (missing binary, 5 s timeout, ...).
    """
    cmd = ["git", "-C", str(repo), "config", "--get", "remote.origin.url"]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
        origin = proc.stdout.strip()
    except Exception:
        return None
    return origin if origin else None


def _author_needles() -> list[str]:
    """Return the sorted, de-duplicated identity strings (needles, emails and
    the GH login) used to pre-filter commits by author."""
    candidates = [*PIRATE_NEEDLES, *PIRATE_EMAILS, GH_LOGIN]
    return sorted({s.strip() for s in candidates if s and s.strip()})


def _numstat_new_path(path: str) -> str:
    """Resolve git's numstat rename notation to the post-rename path.

    ``a/{old => new}/f.py`` becomes ``a/new/f.py`` and ``old.py => new.py``
    becomes ``new.py``; any other path is returned unchanged.
    """
    if " => " not in path:
        return path
    braced = re.match(r"^(.*)\{(.*) => (.*)\}(.*)$", path)
    if braced:
        head, _old, new, tail = braced.groups()
        # An empty side ("{old => }") would otherwise leave a doubled slash.
        return f"{head}{new}{tail}".replace("//", "/")
    return path.split(" => ", 1)[1]


def mine_local_repo(repo: Path) -> list[dict]:
    """Return a list of commit records authored by the user in this repo.

    The repo is first probed cheaply (HEAD, then ``--all``) for any commit
    whose author matches one of the configured identities; only on a hit is
    the full ``git log --no-merges --numstat`` run (``--all`` with a 90 s
    budget, falling back to HEAD only). Every commit is then re-checked in
    Python against PIRATE_EMAILS / PIRATE_NEEDLES, which is stricter than the
    git-side filter (that one also matches the bare GH login).

    Each record has the keys ``sha``, ``date`` (ISO 8601 author date),
    ``email``, ``author``, ``subject``, ``additions``, ``deletions``,
    ``files``, ``by_lang`` ({language: [added, deleted]}), ``repo``,
    ``repo_local_path``, ``repo_remote`` and ``source`` ("local"). Files
    matching the exclusion rules are not counted; binary files (numstat
    ``-``) count as a changed file with zero lines.

    Returns an empty list when nothing matches or git fails / times out
    (failures are reported on stderr).
    """
    needles = _author_needles()
    if not needles:
        # No identity configured → nothing can match. Without any --author
        # flag git would return *every* commit, so bail out explicitly.
        return []
    # One --author per needle (git ORs them) matched as fixed strings.
    # git interprets --author as a *basic* regex by default, where escapes
    # produced by Python's re.escape such as "\+" or "\(" are operators rather
    # than literals, so "511499+pirate" would not match itself.
    author_args = ["--fixed-strings", "-i", *(f"--author={n}" for n in needles)]

    # Quick existence check first — "-n 1" makes git stop at the first match.
    def precheck(refs_args: list[str], timeout: int) -> bool | None:
        try:
            r = subprocess.run(
                ["git", "-C", str(repo), "log", *refs_args, *author_args,
                 "-n", "1", "--format=%H"],
                capture_output=True, text=True, timeout=timeout,
            )
        except subprocess.TimeoutExpired:
            return None
        return bool(r.stdout.strip())

    head_match = precheck([], timeout=20)
    if head_match is None:
        print(f"  ! pre-check (HEAD) timed out on {repo}", file=sys.stderr)
        return []
    if head_match is False:
        # Nothing on HEAD; a timeout (None) here is treated like "no match".
        if not precheck(["--all"], timeout=30):
            return []

    def run_log(extra_args: list[str], timeout: int) -> str | None:
        cmd = [
            "git", "-C", str(repo), "log", "--no-merges", *author_args,
            "--pretty=format:__C__%H|%aI|%aE|%aN|%s",
            "--numstat",
        ] + extra_args
        try:
            r = subprocess.run(cmd, capture_output=True, text=True,
                               timeout=timeout, errors="replace")
        except subprocess.TimeoutExpired:
            return None
        if r.returncode != 0:
            return None
        return r.stdout

    # Prefer --all (covers HEAD + all branches/tags). Fall back to HEAD-only
    # if --all is slow (lots of refs / I/O contention).
    output = run_log(["--all"], timeout=90)
    if output is None:
        print(f"  ! --all slow on {repo} — falling back to HEAD", file=sys.stderr)
        output = run_log([], timeout=30)
    if output is None:
        print(f"  ! git log failed on {repo}", file=sys.stderr)
        return []

    remote = git_remote_url(repo)
    name = repo_display_name(repo, remote)

    records: list[dict] = []
    current: dict | None = None

    for line in output.split("\n"):
        if not line:
            continue
        if line.startswith("__C__"):
            # A new commit header: flush the record being accumulated.
            if current is not None:
                records.append(current)
            try:
                # maxsplit=4 keeps any "|" inside the subject intact.
                sha, iso_date, email, author_name, subject = line[5:].split("|", 4)
            except ValueError:
                current = None
                continue
            # Strict identity check; numstat lines of rejected commits are
            # skipped below because current stays None.
            ident = f"{author_name} {email}".lower()
            is_pirate = (
                email.lower() in PIRATE_EMAILS
                or any(n in ident for n in PIRATE_NEEDLES)
            )
            if not is_pirate:
                current = None
                continue
            current = {
                "sha": sha,
                "date": iso_date,
                "email": email,
                "author": author_name,
                "subject": subject,
                "additions": 0,
                "deletions": 0,
                "files": 0,
                "by_lang": {},
                "repo": name,
                "repo_local_path": str(repo),
                "repo_remote": remote,
                "source": "local",
            }
        else:
            if current is None:
                continue
            # numstat line: "<add>\t<del>\t<path>" — add/del may be "-" for binary.
            parts = line.split("\t", 2)
            if len(parts) != 3:
                continue
            a, d, raw_path = parts
            # Renames are printed as "old => new"; classify by the new path.
            path = _numstat_new_path(raw_path)
            # Skip vendored / lock / generated files from line counts.
            if _is_excluded_file(path):
                continue
            add_i = int(a) if a.isdigit() else 0
            del_i = int(d) if d.isdigit() else 0
            current["additions"] += add_i
            current["deletions"] += del_i
            current["files"] += 1
            lang = lang_for(path)
            bl = current["by_lang"].setdefault(lang, [0, 0])
            bl[0] += add_i
            bl[1] += del_i

    if current is not None:
        records.append(current)
    return records


def mine_all_local(
    repos: list[Path], force: bool = False,
    incremental_render_every: int = 0,
) -> dict[str, list[dict]]:
    """Mine each repo, caching results per repo, and return all records.

    Args:
        repos: Repo directories to process, in processing order.
        force: Re-mine every repo in ``repos`` even if a cache file exists.
        incremental_render_every: When non-zero, re-render OUTPUT_FILE after
            every N newly mined repos so partial results are visible early.

    Returns:
        ``{repo_path: records}`` for every repo in ``repos``, plus
        ``{cache_stem: records}`` for cached repos that are *not* in
        ``repos`` (e.g. no longer on disk), so their history is kept.

    All cache files are loaded up front so incremental renders include
    history that has not been (or will not be) processed in this run.
    """
    CACHE_REPOS.mkdir(parents=True, exist_ok=True)
    results: dict[str, list[dict]] = {}

    # Preload every cache file, keyed by file stem for now.
    for cf in CACHE_REPOS.glob("*.json"):
        try:
            results[cf.stem] = json.loads(cf.read_text())
        except (OSError, ValueError) as e:
            # Best effort: an unreadable cache is re-mined if the repo is in
            # `repos`, otherwise its history is simply absent from this run.
            print(f"  ! ignoring unreadable cache {cf.name}: {e}", file=sys.stderr)

    new_count = 0
    for i, repo in enumerate(repos, 1):
        # NOTE: lstrip("__") strips *all* leading underscores (it takes a
        # character set); kept as-is so existing cache file names stay valid.
        key = str(repo).replace("/", "__").lstrip("__")
        cache_file = CACHE_REPOS / f"{key}.json"
        # Take the preloaded entry out from under its stem key: the repo is
        # reported under its path, and leaving the stem entry behind would
        # duplicate its records (or keep stale ones when force=True).
        preloaded = results.pop(key, None)
        if preloaded is not None and not force:
            results[str(repo)] = preloaded
            continue
        print(f"  [{i}/{len(repos)}] {repo}", file=sys.stderr)
        records = mine_local_repo(repo)
        cache_file.write_text(json.dumps(records))
        results[str(repo)] = records
        new_count += 1
        if incremental_render_every and new_count % incremental_render_every == 0:
            # Rendering is a convenience; never let it abort the mining run.
            try:
                all_recs = [r for rs in results.values() for r in rs]
                merged = dedupe_commits(all_recs)
                agg = aggregate(merged)
                OUTPUT_FILE.write_text(render_html(agg))
                print(f"    -- incremental: {len(merged)} commits, {agg['totals']['repos']} repos --",
                      file=sys.stderr)
            except Exception as e:
                print(f"    incremental render failed: {e}", file=sys.stderr)
    return results


# ---------------------------------------------------------------------------
# GitHub API mining
# ---------------------------------------------------------------------------

# Matches ANSI CSI escape sequences (ESC [ params letter), e.g. color codes;
# gh_api() strips these from `gh` output before parsing it as JSON.
ANSI_RE = re.compile(r"\x1b\[[0-9;]*[A-Za-z]")

def gh_api(path: str, *, paginate: bool = False, jq: str | None = None) -> object:
    """Run `gh api` and return parsed JSON.

    Args:
        path: API path (with query string) passed to ``gh api``.
        paginate: Add ``--paginate`` so gh follows all result pages.
        jq: Optional jq filter. It is piped through ``@json`` (unless it
            already mentions it) so gh prints one compact JSON value per
            line — gh's default pretty-printing spans lines, which breaks
            line-by-line parsing.

    Returns:
        - with ``jq``: a list with one parsed value per output line (lines
          that are not valid JSON are skipped); ``[]`` for empty output;
        - without ``jq``: the parsed document; if the output is several
          concatenated JSON values (paginated), a list of them, flattened
          into one list when every value is itself a list;
        - ``[]`` (paginate) or ``None`` for empty output.

    Raises:
        RuntimeError: gh exited non-zero, timed out (600 s), or could not be
            started. Timeouts and launch failures are wrapped so callers that
            handle ``RuntimeError`` cover every failure mode of the call.
    """
    cmd = ["gh", "api", path]
    if paginate:
        cmd.append("--paginate")
    if jq:
        # Force compact one-line output for stream parsing.
        if "@json" not in jq:
            jq = f"{jq} | @json"
        cmd += ["--jq", jq]
    # Ask gh not to colorize; ANSI_RE below is a second line of defence.
    env = {**os.environ, "NO_COLOR": "1", "CLICOLOR": "0", "GH_NO_COLOR": "1"}
    try:
        out = subprocess.run(cmd, capture_output=True, text=True, timeout=600, env=env)
    except (subprocess.TimeoutExpired, OSError) as err:
        raise RuntimeError(f"gh api failed: {path}\n{err}") from err
    if out.returncode != 0:
        raise RuntimeError(f"gh api failed: {path}\n{out.stderr}")
    text = ANSI_RE.sub("", out.stdout).strip()
    if not text:
        # jq mode is documented to return a list, so keep that for no output.
        return [] if (paginate or jq) else None
    if jq:
        # One JSON value per line; undecodable lines are dropped (best effort).
        results = []
        for line in text.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                results.append(json.loads(line))
            except json.JSONDecodeError:
                pass
        return results
    # Try one-shot parse first.
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    # Parse concatenated JSON values via raw_decode, stopping at the first
    # value that fails to decode (whatever was parsed so far is returned).
    dec = json.JSONDecoder()
    pos = 0
    n = len(text)
    parts = []
    while pos < n:
        while pos < n and text[pos] in " \t\n\r":
            pos += 1
        if pos >= n:
            break
        try:
            val, end = dec.raw_decode(text, pos)
        except json.JSONDecodeError:
            break
        parts.append(val)
        pos = end
    # If every part is a list (paginated array endpoint), flatten.
    if parts and all(isinstance(p, list) for p in parts):
        merged = []
        for p in parts:
            merged.extend(p)
        return merged
    return parts


def list_owned_repos() -> list[dict]:
    """List all repos owned by the configured user (public + private if
    authenticated).

    The result is cached in ``CACHE_API/owned_repos.json``; a cache file that
    cannot be read or parsed is ignored and the list is fetched again, the
    same way list_repo_commits_by_author treats its cache.

    Raises:
        RuntimeError: propagated from gh_api when the request fails.
    """
    cache_file = CACHE_API / "owned_repos.json"
    if cache_file.exists():
        try:
            return json.loads(cache_file.read_text())
        except (OSError, ValueError):
            pass  # corrupt / unreadable cache → refetch below
    print(f"  fetching owned repos for @{GH_LOGIN} ...", file=sys.stderr)
    if PERSONALIZED:
        # Authenticated user's own repos.
        data = gh_api("/user/repos?per_page=100&affiliation=owner&sort=updated",
                      paginate=True)
    else:
        # Arbitrary user: query their repo listing by login.
        data = gh_api(f"/users/{GH_LOGIN}/repos?per_page=100&sort=updated",
                      paginate=True)
    # Anything other than a list is treated as "no repos".
    repos = data if isinstance(data, list) else []
    CACHE_API.mkdir(parents=True, exist_ok=True)
    cache_file.write_text(json.dumps(repos))
    return repos


def list_accessible_repos() -> list[dict]:
    """List all repos for the configured user. For the authenticated user
    (PERSONALIZED=True), gets owner+collaborator+organization_member access.
    For other users (--user mode), only their PUBLIC owned repos are
    available via /users/{login}/repos.

    The result is cached in ``CACHE_API/accessible_repos.json``; a cache file
    that cannot be read or parsed is ignored and the list is fetched again,
    the same way list_repo_commits_by_author treats its cache.

    Raises:
        RuntimeError: propagated from gh_api when the request fails.
    """
    cache_file = CACHE_API / "accessible_repos.json"
    if cache_file.exists():
        try:
            return json.loads(cache_file.read_text())
        except (OSError, ValueError):
            pass  # corrupt / unreadable cache → refetch below
    if PERSONALIZED:
        print("  fetching all accessible repos for authenticated user ...",
              file=sys.stderr)
        data = gh_api("/user/repos?per_page=100"
                      "&affiliation=owner,collaborator,organization_member"
                      "&sort=updated", paginate=True)
    else:
        print(f"  fetching public repos for @{GH_LOGIN} ...", file=sys.stderr)
        data = gh_api(f"/users/{GH_LOGIN}/repos?per_page=100&sort=updated",
                      paginate=True)
    # Anything other than a list is treated as "no repos".
    repos = data if isinstance(data, list) else []
    CACHE_API.mkdir(parents=True, exist_ok=True)
    cache_file.write_text(json.dumps(repos))
    return repos


def list_repo_commits_by_author(full_name: str) -> list[dict]:
    """Fetch the commits in ``full_name`` ("owner/repo") authored by GH_LOGIN.

    Uses the repo commits API filtered by author and returns the items as
    delivered (no per-commit stats yet). Results are cached per repo under
    ``CACHE_API/repo_commits``; an unreadable cache file is ignored and the
    list is refetched. An API failure is reported on stderr and yields ``[]``
    without writing a cache file.
    """
    slug = full_name.replace("/", "__")
    cache_file = CACHE_API / "repo_commits" / f"{slug}.json"
    if cache_file.exists():
        try:
            return json.loads(cache_file.read_text())
        except Exception:
            pass  # fall through and refetch

    endpoint = f"/repos/{full_name}/commits?author={GH_LOGIN}&per_page=100"
    try:
        fetched = gh_api(endpoint, paginate=True, jq=".[]")
    except RuntimeError as e:
        print(f"    ! list commits {full_name}: {e}", file=sys.stderr)
        return []

    commits = fetched if fetched else []
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    cache_file.write_text(json.dumps(commits))
    return commits


def mine_github_accessible(known_shas: set[str],
                           max_repos: int = 1000,
                           max_fetches: int = 5000) -> list[dict]:
    """Collect the user's commits from GitHub that are not yet known.

    For each accessible repo (at most ``max_repos``), lists the user's
    commits via the commits API and fetches per-commit stats for every SHA
    missing from ``known_shas``.

    Args:
        known_shas: SHAs already collected from other sources. Mutated in
            place: every SHA recorded here is added to it.
        max_repos: Upper bound on the number of repos examined.
        max_fetches: Upper bound on per-commit stats fetches across all
            repos; once reached, the records gathered so far are returned.

    Returns:
        Records shaped like the local-mining ones, with
        ``source="github_commits_api"`` and ``repo_local_path=None``.
    """
    repos = list_accessible_repos()
    print(f"  {len(repos)} accessible repos", file=sys.stderr)
    records: list[dict] = []
    fetches = 0
    for i, r in enumerate(repos[:max_repos], 1):
        full = r.get("full_name")
        if not full:
            continue
        commits = list_repo_commits_by_author(full)
        if not commits:
            continue
        # Keep unseen SHAs in listing order. Items without a usable "sha" are
        # skipped (indexing them would raise KeyError) and SHAs repeated in
        # the listing are collapsed so they are fetched and recorded once.
        new_shas = list(dict.fromkeys(
            c["sha"] for c in commits
            if isinstance(c, dict) and c.get("sha") and c["sha"] not in known_shas
        ))
        if not new_shas:
            continue
        print(f"    [{i}/{len(repos)}] {full}: {len(new_shas)} new commits",
              file=sys.stderr)
        for sha in new_shas:
            if fetches >= max_fetches:
                print("  ! reached max_fetches", file=sys.stderr)
                return records
            stats_data = fetch_commit_stats(full, sha)
            # Failed fetches count against the budget too.
            fetches += 1
            if not stats_data:
                continue
            add_f, del_f, n_f, bl_f = _stats_from_commit(stats_data)
            commit_meta = stats_data.get("commit") or {}
            author = commit_meta.get("author") or {}
            records.append({
                "sha": sha,
                "date": author.get("date"),
                "email": author.get("email"),
                "author": author.get("name"),
                # Subject = first line of the commit message.
                "subject": (commit_meta.get("message") or "").split("\n")[0],
                "additions": add_f,
                "deletions": del_f,
                "files": n_f,
                "by_lang": bl_f,
                "repo": full,
                "repo_local_path": None,
                "repo_remote": f"https://github.com/{full}.git",
                "source": "github_commits_api",
            })
            known_shas.add(sha)
    return records


def _wait_for_search_quota(min_remaining: int = 11):
    """Block until the GitHub search quota is back to ``min_remaining``.

    Polls ``/rate_limit``; while the search bucket is below the threshold it
    sleeps until shortly after the reported reset time (at least 2 s, at most
    70 s per sleep) and checks again. A failed poll is retried after 5 s.
    """
    while True:
        try:
            limits = gh_api("/rate_limit")
        except RuntimeError:
            time.sleep(5)
            continue
        search = (limits or {}).get("resources", {}).get("search", {})
        # Missing fields default to "plenty left" / "reset already passed".
        remaining = search.get("remaining", 30)
        reset_at = search.get("reset", 0)
        if remaining >= min_remaining:
            return
        delay = max(2, int(reset_at - time.time()) + 2)
        print(f"    search quota low ({remaining}); waiting {delay}s",
              file=sys.stderr)
        # Cap each sleep so the quota is re-checked at least every 70 s.
        time.sleep(min(delay, 70))


def list_search_commits() -> list[dict]:
    """Use search/commits API to find commits authored by GH_LOGIN.

    The search API is capped at 1000 results per query, so we page by year;
    a year that comes back at (or near) the cap is re-paged by month.
    Rate limit is 30 req/min (primary) — we wait when quota gets low.

    Results are written incrementally to a per-year cache
    (search_by_year/<year>.json) so an interrupted run resumes without
    re-querying years that already finished. A year is only cached when
    every query for it succeeded; a query that raises RuntimeError is
    logged to stderr and contributes no items (best-effort). The combined
    list is cached in search_commits.json and returned as-is on later calls.
    """
    final_cache = CACHE_API / "search_commits.json"
    if final_cache.exists():
        return json.loads(final_cache.read_text())
    # Per-year cache → resumable.
    per_year_dir = CACHE_API / "search_by_year"
    per_year_dir.mkdir(parents=True, exist_ok=True)
    print("  searching commits authored by the user ...", file=sys.stderr)
    all_items: list[dict] = []

    def query(date_filter: str) -> list[dict] | None:
        """Run one date-filtered search; return None when the request failed."""
        _wait_for_search_quota(11)
        q = f"author:{GH_LOGIN}+author-date:{date_filter}"
        try:
            r = gh_api(
                f"/search/commits?q={q}&per_page=100&sort=author-date",
                paginate=True, jq=".items[]",
            )
        except RuntimeError as e:
            print(f"    ! query {date_filter} failed: {e}", file=sys.stderr)
            return None
        return r or []

    for year in range(2010, datetime.now().year + 1):
        year_file = per_year_dir / f"{year}.json"
        if year_file.exists():
            # Finished on a previous (possibly interrupted) run.
            all_items.extend(json.loads(year_file.read_text()))
            continue
        year_items = query(f"{year}-01-01..{year}-12-31")
        # Track whether every query for this year succeeded, so a partial
        # result is never persisted as if it were the full year.
        complete = year_items is not None
        items = year_items or []
        if len(items) >= 990:
            # Hit (or nearly hit) the 1000 cap — sub-page by month.
            print(f"    {year}: {len(items)} (capped) — re-paging by month",
                  file=sys.stderr)
            items = []
            for month in range(1, 13):
                # Last day of the month = day before the 1st of the next one.
                if month == 12:
                    next_first = date(year + 1, 1, 1)
                else:
                    next_first = date(year, month + 1, 1)
                last = next_first - timedelta(days=1)
                first = date(year, month, 1)
                month_items = query(f"{first.isoformat()}..{last.isoformat()}")
                if month_items is None:
                    complete = False
                    month_items = []
                print(f"      {first.isoformat()[:7]}: {len(month_items)}",
                      file=sys.stderr)
                items.extend(month_items)
                time.sleep(0.3)
        print(f"    {year}: {len(items)} results", file=sys.stderr)
        if complete:
            year_file.write_text(json.dumps(items))
        all_items.extend(items)
        time.sleep(0.3)
    CACHE_API.mkdir(parents=True, exist_ok=True)
    final_cache.write_text(json.dumps(all_items))
    return all_items


def list_prs_and_issues() -> tuple[list[dict], list[dict]]:
    """Collect every PR and issue authored by the user via search/issues.

    Each kind is searched one calendar year at a time. Every year's result
    goes into its own cache file (so an interrupted run can resume) and the
    merged list per kind is cached too. A RuntimeError that looks like a
    secondary rate limit is retried once after a 90s pause; any other
    failure — or a second failure — stores an empty result for that year.

    Returns (prs, issues).
    """
    def run_search(endpoint: str):
        # Single place for the paginated search call (first try and retry).
        return gh_api(endpoint, paginate=True, jq=".items[]")

    collected: dict[str, list[dict]] = {}
    for kind in ("pr", "issue"):
        merged_path = CACHE_API / f"{kind}s.json"
        if merged_path.exists():
            collected[kind] = json.loads(merged_path.read_text())
            continue
        year_dir = CACHE_API / f"{kind}s_by_year"
        year_dir.mkdir(parents=True, exist_ok=True)
        print(f"  searching {kind}s authored by the user ...", file=sys.stderr)
        merged: list[dict] = []
        for year in range(2010, datetime.now().year + 1):
            year_path = year_dir / f"{year}.json"
            if year_path.exists():
                # Already fetched on an earlier run.
                merged.extend(json.loads(year_path.read_text()))
                continue
            _wait_for_search_quota(11)
            q = (f"author:{GH_LOGIN}+is:{kind}"
                 f"+created:{year}-01-01..{year}-12-31")
            endpoint = f"/search/issues?q={q}&per_page=100"
            try:
                found = run_search(endpoint)
            except RuntimeError as first_err:
                found = []
                reason = str(first_err).lower()
                if "secondary rate" in reason or "403" in reason:
                    print(f"    secondary rate limit on {kind} {year} — sleeping 90s",
                          file=sys.stderr)
                    time.sleep(90)
                    try:
                        found = run_search(endpoint)
                    except RuntimeError as second_err:
                        print(f"    ! {kind} {year} failed twice: {second_err}",
                              file=sys.stderr)
                else:
                    print(f"    ! {kind} {year} failed: {first_err}", file=sys.stderr)
            found = found or []
            print(f"    {kind} {year}: {len(found)}", file=sys.stderr)
            year_path.write_text(json.dumps(found))
            merged.extend(found)
            time.sleep(0.3)
        merged_path.write_text(json.dumps(merged))
        collected[kind] = merged
    return collected["pr"], collected["issue"]


def _stats_from_commit(stats_data: dict) -> tuple[int, int, int, dict]:
    """Compute (additions, deletions, files_count, by_lang) from a GH commit
    detail, excluding vendored/lock/generated files. by_lang = {lang: [add, del]}."""
    files = stats_data.get("files") or []
    if not files:
        s = stats_data.get("stats") or {}
        return int(s.get("additions") or 0), int(s.get("deletions") or 0), 0, {}
    add, dlt, n = 0, 0, 0
    by_lang: dict[str, list[int]] = {}
    for f in files:
        path = f.get("filename", "")
        if _is_excluded_file(path):
            continue
        fa = int(f.get("additions") or 0)
        fd = int(f.get("deletions") or 0)
        add += fa
        dlt += fd
        n += 1
        bl = by_lang.setdefault(lang_for(path), [0, 0])
        bl[0] += fa
        bl[1] += fd
    return add, dlt, n, by_lang


def fetch_commit_stats(full_name: str, sha: str) -> dict | None:
    """Fetch a single commit's detail via the GitHub API, cached on disk.

    full_name is the "owner/repo" name and sha the commit to look up. A
    cached entry is only trusted when it parses to a dict; an unreadable,
    corrupt or wrongly-shaped cache file is ignored and the commit is
    fetched again. Returns None when the API call raises RuntimeError or
    yields something other than a dict.
    """
    cache_file = CACHE_API / "commit_stats" / f"{full_name.replace('/', '__')}__{sha}.json"
    if cache_file.exists():
        try:
            cached = json.loads(cache_file.read_text())
        except (OSError, ValueError):
            # Read error or invalid JSON/encoding: fall through and refetch.
            cached = None
        if isinstance(cached, dict):
            return cached
    try:
        data = gh_api(f"/repos/{full_name}/commits/{sha}")
    except RuntimeError:
        return None
    if not isinstance(data, dict):
        return None
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    cache_file.write_text(json.dumps(data))
    return data


def mine_github_search(items: list[dict], known_shas: set[str],
                       max_fetches: int = 5000) -> list[dict]:
    """Turn search/commits hits into commit records.

    Hits lacking a repo name or SHA, hits whose SHA is in known_shas, and
    repeated (repo, sha) pairs are skipped. Each remaining hit costs one
    fetch_commit_stats call; once max_fetches calls have been made the loop
    stops. Hits whose stats lookup comes back empty yield no record.
    """
    out: list[dict] = []
    visited: set[tuple[str, str]] = set()
    n_fetched = 0
    for hit in items:
        repo_name = (hit.get("repository") or {}).get("full_name")
        commit_sha = hit.get("sha")
        if not (repo_name and commit_sha):
            continue
        # The same search hit can be listed more than once for one repo.
        if commit_sha in known_shas or (repo_name, commit_sha) in visited:
            continue
        visited.add((repo_name, commit_sha))
        if n_fetched >= max_fetches:
            print(f"  ! reached max_fetches={max_fetches}, stopping", file=sys.stderr)
            break
        detail = fetch_commit_stats(repo_name, commit_sha)
        n_fetched += 1
        if n_fetched % 25 == 0:
            print(f"    fetched stats for {n_fetched} commits", file=sys.stderr)
        if not detail:
            continue
        additions, deletions, n_files, lang_breakdown = _stats_from_commit(detail)
        meta = detail.get("commit") or {}
        who = meta.get("author") or {}
        first_line = (meta.get("message") or "").split("\n")[0]
        out.append({
            "sha": commit_sha,
            "date": who.get("date"),
            "email": who.get("email"),
            "author": who.get("name"),
            "subject": first_line,
            "additions": additions,
            "deletions": deletions,
            "files": n_files,
            "by_lang": lang_breakdown,
            "repo": repo_name,
            "repo_local_path": None,
            "repo_remote": f"https://github.com/{repo_name}.git",
            "source": "github_search",
        })
    return out


# ---------------------------------------------------------------------------
# Aggregation
# ---------------------------------------------------------------------------

def _slug(name: str) -> str:
    """Lowercase name with non-word chars → hyphens, stripped."""
    n = name.split("/")[-1]  # strip owner prefix if any
    return re.sub(r"[\W_]+", "-", n).lower().strip("-")


# Manual canonical-name overrides — for repos whose upstream org was
# deleted, so GH no longer knows about the fork relationship.
# Keys are the "owner/repo" names as they appear in the data; values are
# the "owner/repo" name they should be treated as.
MANUAL_CANONICAL = {
    "pirate/drchrono-web": "drchrono/drchrono-web",
    "pirate/DeliveryHeroChina": "DeliveryHero/DeliveryHeroChina",
    "lucase/DeliveryHeroChina": "DeliveryHero/DeliveryHeroChina",
    "pirate/cmdty.ncm-ui": "Monadical-Inc/cmdty.ncm-ui",
}

# Repos to fully exclude from this user's stats — forks where the user
# never actually contributed (commits, PRs, issues, lines all dropped).
# Compared against the canonical name AFTER alias resolution, so entries
# here must be canonical "owner/repo" names.
EXCLUDE_REPOS = {
    "WeKruit/Hand-X",
    "leonardojyanez/redux_time",
}


_RENAME_CACHE_FILE = None  # set in resolve_canonical_name
def resolve_canonical_name(full_name: str) -> str:
    """Query GitHub for the current canonical name of a repo. If the repo
    was renamed, GitHub redirects and returns the new full_name. Cached."""
    global _RENAME_CACHE_FILE
    if _RENAME_CACHE_FILE is None:
        _RENAME_CACHE_FILE = CACHE_API / "renames.json"
    cache: dict[str, str] = {}