From 43488fb4f4f8bbe4444d085f594904a0c5f88604 Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Fri, 17 Apr 2026 10:35:38 -0400 Subject: [PATCH 1/6] Fix header stripping for spatial path on multi-column headers Edge items were collected from the MuPDF path only, so three-column running headers (left/center/right) produced exact-match patterns only for MuPDF's split form. The spatial path merges those into a single line on shared y-coordinate and was left unstripped, polluting the dual-path comparison and flagging regions as uncertain. Union edge items from both paths before detecting repeating patterns. --- lib/pdf/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/pdf/__init__.py b/lib/pdf/__init__.py index 99b7473..13fc923 100644 --- a/lib/pdf/__init__.py +++ b/lib/pdf/__init__.py @@ -116,7 +116,15 @@ def convert_pdf(path: Path) -> tuple[str, str | None]: mupdf_blocks = extract_mupdf(page, pg_num) spatial_blocks = extract_spatial(page, pg_num) - edge_items = get_edge_items(mupdf_blocks, pg_num, page_height) + # Collect edges from both paths: MuPDF splits left/center/right + # headers into separate lines on wide horizontal gaps; spatial + # merges them into one line. Patterns from both forms are needed + # so strip_repeating (which does exact text match) can match + # against either path's segmentation. + edge_items = ( + get_edge_items(mupdf_blocks, pg_num, page_height) + + get_edge_items(spatial_blocks, pg_num, page_height) + ) all_edge_items.append(edge_items) links = collect_links(page) From 59dc6450750e2917a8be13e03d84d17d2fbbd910 Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Fri, 17 Apr 2026 10:51:03 -0400 Subject: [PATCH 2/6] Strip repeating headers at span granularity When the spatial path merges left/center/right header columns into a single line (one span per column), the whole-line text matches no pattern but each column-span does. 
Stripping per-span drops the matched columns and preserves any non-header span that shares the header y-coordinate (e.g. a one-off appendix title on the bibliography page). Also indexes repeating patterns by quantized y so the common no-pattern-near-this-line case short-circuits without scanning, and factors out the y-quantize expression into a shared helper. --- lib/pdf/cleanup.py | 86 +++++++++++++++++++++++++++++++++------------- 1 file changed, 62 insertions(+), 24 deletions(-) diff --git a/lib/pdf/cleanup.py b/lib/pdf/cleanup.py index dc5fe75..e8d9929 100644 --- a/lib/pdf/cleanup.py +++ b/lib/pdf/cleanup.py @@ -17,6 +17,12 @@ _log = logging.getLogger(__name__) +def _y_bucket(bbox: tuple[float, float, float, float]) -> float: + """Quantize a bbox's vertical center to the Y_TOLERANCE grid.""" + y_center = (bbox[1] + bbox[3]) / 2.0 + return round(y_center / Y_TOLERANCE) * Y_TOLERANCE + + def get_edge_items(blocks: list[Block], page_num: int, page_height: float) -> list[PageEdgeItem]: """Get the top N and bottom N text items from a page by y-coordinate. @@ -72,8 +78,7 @@ def detect_repeating(all_edge_items: list[list[PageEdgeItem]], for page_items in all_edge_items: for item in page_items: - y_key = round(item.y / Y_TOLERANCE) * Y_TOLERANCE - y_buckets[y_key].append(item) + y_buckets[_y_bucket(item.bbox)].append(item) repeating = set() for y_key, items in y_buckets.items(): @@ -107,10 +112,36 @@ def detect_repeating(all_edge_items: list[list[PageEdgeItem]], def strip_repeating(blocks: list[Block], repeating: set[tuple[float, str]], ) -> list[Block]: - """Remove lines matching repeating header/footer patterns from blocks.""" + """Remove lines (or individual spans) matching repeating header/footer + patterns from blocks. + + Spatial-path lines merge left/center/right header columns (one span + per column). 
The whole-line text then won't match any single pattern, + but each column-span will; per-span stripping handles that while + preserving non-header content that shares the header y-coordinate + (e.g. a one-off appendix title). + """ if not repeating: return blocks + patterns_by_y: dict[float, list[str]] = defaultdict(list) + for ry, rp in repeating: + patterns_by_y[ry].append(rp) + + def _patterns_near(y_key: float) -> list[str]: + return (patterns_by_y.get(y_key - Y_TOLERANCE, []) + + patterns_by_y.get(y_key, []) + + patterns_by_y.get(y_key + Y_TOLERANCE, [])) + + def _matches(text: str, rpattern: str) -> bool: + if rpattern == text: + return True + if rpattern == "__PAGE_NUM__" and PAGE_NUM_RE.match(text): + return True + if rpattern == "__DOC_NUM__" and DOC_NUM_RE.search(text): + return True + return False + result = [] for block in blocks: kept_lines = [] @@ -119,29 +150,36 @@ def strip_repeating(blocks: list[Block], repeating: set[tuple[float, str]], if not text: kept_lines.append(line) continue - y_center = (line.bbox[1] + line.bbox[3]) / 2.0 - y_key = round(y_center / Y_TOLERANCE) * Y_TOLERANCE - stripped = False - for ry, rpattern in repeating: - if abs(y_key - ry) > Y_TOLERANCE: - continue - if rpattern == text: - stripped = True - break - if rpattern == "__PAGE_NUM__" and PAGE_NUM_RE.match(text): - stripped = True - break - if rpattern == "__DOC_NUM__" and DOC_NUM_RE.search(text): - stripped = True - break - if not stripped: + + line_patterns = _patterns_near(_y_bucket(line.bbox)) + if not line_patterns: kept_lines.append(line) + continue + + if any(_matches(text, rp) for rp in line_patterns): + continue + + kept_spans = [] + stripped_any = False + for span in line.spans: + span_text = span.text.strip() + if not span_text: + kept_spans.append(span) + continue + span_patterns = _patterns_near(_y_bucket(span.bbox)) + if any(_matches(span_text, rp) for rp in span_patterns): + stripped_any = True + continue + kept_spans.append(span) + + if not 
any(sp.text.strip() for sp in kept_spans): + continue + + kept_lines.append( + replace(line, spans=kept_spans) if stripped_any else line) + if kept_lines: - result.append(Block( - lines=kept_lines, - bbox=block.bbox, - page_num=block.page_num, - )) + result.append(replace(block, lines=kept_lines)) return result From c77fc451b29bfe5e5e3aa32bbd189b053e85e996 Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Fri, 17 Apr 2026 12:25:17 -0400 Subject: [PATCH 3/6] Fix body-size detection on code-heavy papers Code-dense papers (especially wording papers) biased the detected body size toward the smaller monospace font, which then pushed body prose into the font-rank heading range. The nesting validator ratcheted consecutive "headings" upward one level at a time, producing depth-6/7/8 cascades where bullet lines should have been list items. Three coordinated changes: - _CAMEL_SPLIT_RE now splits on letter/digit boundaries, so family names with embedded sizes (LMMono9, LMMono10, LMMono8) are recognized by the name-only monospace check. Previously only LMMonoLt10 split cleanly because its modifier separated the digits. - propagate_monospace now requires a majority of a font's spatial characters to be classified monospace before propagating. Short spans of digits or thin chars could false-positive the per-glyph metric, contaminating proportional fonts (e.g. Lato-Light) when any single short span passed the signal. - _detect_body_size excludes monospace spans by default. For wording papers with almost no prose at all, falls back to the overall mode so body size isn't pinned to a tiny outlier (e.g. superscript page numbers at 7pt). 
--- lib/pdf/mono.py | 44 +++++++++++++++++++++++++++++++++----------- lib/pdf/structure.py | 35 +++++++++++++++++++++++++++++------ 2 files changed, 62 insertions(+), 17 deletions(-) diff --git a/lib/pdf/mono.py b/lib/pdf/mono.py index cd3b8b1..c330902 100644 --- a/lib/pdf/mono.py +++ b/lib/pdf/mono.py @@ -33,7 +33,12 @@ _MONO_KEYWORDS = frozenset({"mono", "courier", "code", "consolas", "menlo"}) -_CAMEL_SPLIT_RE = re.compile(r"(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])") +_CAMEL_SPLIT_RE = re.compile( + r"(?<=[a-z])(?=[A-Z])" + r"|(?<=[A-Z])(?=[A-Z][a-z])" + r"|(?<=[A-Za-z])(?=\d)" + r"|(?<=\d)(?=[A-Za-z])" +) _GLYPH_CV_THRESHOLD = 0.15 _FAT_THIN_REJECT_RATIO = 1.3 @@ -170,24 +175,41 @@ def classify_monospace( return False +_PROPAGATE_MONO_MAJORITY = 0.5 + + def propagate_monospace(mupdf_blocks: list[Block], spatial_blocks: list[Block], dominant_font: str) -> None: """Apply spatial path's glyph-width monospace decisions to MuPDF spans. - Collects fonts classified as monospace by the spatial path (which has - per-character data), filters out the dominant body font (the single - most common font by character count), and sets monospace=True on - matching MuPDF spans. The dominant_font must be pre-lowercased. + Only propagates fonts whose spatial spans are mostly classified + monospace (by character count). Short spans of digits or thin + characters can false-positive the per-glyph signal, so requiring a + majority keeps proportional fonts (e.g. a regular text font with a + handful of "1" or "3.1" spans) out of the mono set. + + The dominant_font is still discarded unless it passes a + name-based mono check, to avoid body text leaking into code blocks + when metrics happen to agree across a majority. 
""" - mono_fonts: set[str] = set() + from collections import Counter + mono_chars: Counter[str] = Counter() + total_chars: Counter[str] = Counter() for b in spatial_blocks: for ln in b.lines: for s in ln.spans: - if s.monospace and s.text.strip(): - mono_fonts.add(s.font_name.lower()) - # Only discard the dominant font if it is not itself a known monospace - # family. On code-heavy papers the code font may dominate by character - # count, and discarding it would suppress all code block detection. + if not s.text.strip(): + continue + key = s.font_name.lower() + total_chars[key] += len(s.text) + if s.monospace: + mono_chars[key] += len(s.text) + + mono_fonts = { + f for f, total in total_chars.items() + if total > 0 and mono_chars[f] / total >= _PROPAGATE_MONO_MAJORITY + } + if dominant_font and not classify_monospace(dominant_font): mono_fonts.discard(dominant_font) if not mono_fonts: diff --git a/lib/pdf/structure.py b/lib/pdf/structure.py index c1d4b41..7e48688 100644 --- a/lib/pdf/structure.py +++ b/lib/pdf/structure.py @@ -179,17 +179,40 @@ def compare_extractions(mupdf_blocks: list[Block], return sections +_BODY_PROSE_MIN_CHARS = 500 +_BODY_PROSE_MIN_FRACTION = 0.10 + + def _detect_body_size(sections: list[Section]) -> float: - """Find the most common font size across all sections (= body text).""" - sizes: Counter[float] = Counter() + """Find the most common font size that represents body text. + + Prefers prose (non-monospace) spans so that code-heavy papers don't + bias the body size toward the smaller code font. Falls back to the + overall most common size when prose is insufficient (e.g. wording + papers that are nearly entirely monospace specification text), since + in those papers the spec font *is* the body font. 
+ """ + prose_sizes: Counter[float] = Counter() + all_sizes: Counter[float] = Counter() for sec in sections: for line in sec.lines: for span in line.spans: - if span.text.strip(): - sizes[span.font_size] += len(span.text) - if not sizes: + if not span.text.strip(): + continue + all_sizes[span.font_size] += len(span.text) + if not span.monospace: + prose_sizes[span.font_size] += len(span.text) + + prose_total = sum(prose_sizes.values()) + all_total = sum(all_sizes.values()) + prose_sufficient = ( + prose_total >= _BODY_PROSE_MIN_CHARS + and (all_total == 0 or prose_total >= _BODY_PROSE_MIN_FRACTION * all_total) + ) + source = prose_sizes if prose_sufficient else all_sizes + if not source: return FALLBACK_BODY_SIZE - return sizes.most_common(1)[0][0] + return source.most_common(1)[0][0] def _rank_font_sizes(sections: list[Section], From 8fda9bd6eeacb0f2addfce27e09399e299fe88d7 Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Fri, 17 Apr 2026 12:37:08 -0400 Subject: [PATCH 4/6] Reject heading classification for prose-length first lines Numbered paragraph clauses in wording sections ("1 A fiber is a single flow of control...") and numeric arithmetic results that land on a continuation line ("4 on IEEE-754 double implementations...") both match SECTION_NUM_RE and got classified as H2 headings. The nesting validator then let them cascade downward. Reject the heading classification when the first physical line exceeds _HEADING_MAX_WORDS (12) AND the confidence is LOW (number alone, no font-size or bold confirmation). Real section titles with number + larger-font signal at MEDIUM/HIGH confidence are preserved even when long (e.g. "9.2 Rationale: Why Lane Count (L) is the Sole Coordinate"). Across the batch: 201 -> 76 long misclassified headings in the four most affected papers. 
--- lib/pdf/structure.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/lib/pdf/structure.py b/lib/pdf/structure.py index 7e48688..a1dd200 100644 --- a/lib/pdf/structure.py +++ b/lib/pdf/structure.py @@ -18,6 +18,13 @@ _HEADING_SIZE_RATIO = 1.05 _TITLE_SIZE_RATIO = 1.2 +# Max words in a heading's first physical line. Beyond this the line is +# prose, not a title. Numbered spec clauses ("1 A fiber is a single flow +# of control...") and numeric arithmetic results that happen to start a +# continuation line both trigger SECTION_NUM_RE; this length cap rejects +# them downstream. Real WG21 section titles are 1-8 words. +_HEADING_MAX_WORDS = 12 + _HTML_TAG_RE = re.compile(r"<[^>]+>") _EMAIL_INLINE_RE = re.compile(r"\S+@\S+\.\S+") _MULTI_SPACE_RE = re.compile(r"\s{2,}") @@ -408,7 +415,9 @@ def structure_sections(sections: list[Section], level, conf = heading_confidence( has_number, number_level, font_level, is_bold, is_known) - if level > 0: + is_prose_length = len(first_line.split()) > _HEADING_MAX_WORDS + prose_on_weak_signal = is_prose_length and conf == Confidence.LOW + if level > 0 and not prose_on_weak_signal: sec.kind = SectionKind.HEADING sec.heading_level = level sec.confidence = conf From e61a78711f9e3d03475a6f7b5d743f153df593cb Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Fri, 17 Apr 2026 13:07:19 -0400 Subject: [PATCH 5/6] Treat same-styled consecutive headings as siblings in nesting validation When multiple consecutive headings share an initial styling (e.g., a run of "Changes since P0876R21", "Changes since P0876R20", ... entries), the existing clamp rule would assign each prev_clamped + 1, cascading to ever-deeper levels: level 3, 4, 5, 6, 7, 8, 8... even though they are clearly siblings at the same logical level. Track the previous heading's font size in _validate_nesting. When the current heading's font size matches closely, it's a sibling and gets the previous clamped level, not prev_clamped + 1. 
The tolerance is 0.1pt, which is wider than extraction noise for the same visual style but narrower than the real gap between heading tiers in typical designs. --- lib/pdf/structure.py | 71 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/lib/pdf/structure.py b/lib/pdf/structure.py index a1dd200..03792b7 100644 --- a/lib/pdf/structure.py +++ b/lib/pdf/structure.py @@ -442,6 +442,7 @@ def structure_sections(sections: list[Section], structured = _detect_code_blocks(structured) structured = [s for s in structured if _detect_lang_label(s) is None] structured = _classify_wording_sections(structured) + _demote_repeated_low_confidence_numbers(structured) _validate_nesting(structured) return metadata, structured @@ -841,17 +842,84 @@ def _classify_wording_sections(sections: list[Section]) -> list[Section]: return sections +_PARAGRAPH_NUM_MIN_REPEATS = 3 + + +def _demote_repeated_low_confidence_numbers(sections: list[Section]) -> None: + """Demote LOW-confidence numbered headings when their section_num + repeats often enough to indicate paragraph numbering. + + Real section numbers are unique, but many papers duplicate each one + between a TOC and the body (count 2), and a real section that ends up + at LOW confidence can also collide with paragraph numbering elsewhere. + Paragraph numbering resets per clause ("1 Constraints:", "2 Mandates:", + then later "1 Preconditions:", ...) so the same number typically shows + up 5 or more times. Requiring count >= 3 keeps TOC/body pairs and + section/paragraph 1-on-1 collisions intact while catching the + paragraph-numbering pattern. + + Only LOW-confidence headings are considered, so a real section title + with font-size or bold confirmation is preserved regardless. 
+ """ + counts: Counter[str] = Counter() + nums_by_index: dict[int, str] = {} + for i, sec in enumerate(sections): + if (sec.kind != SectionKind.HEADING + or sec.confidence != Confidence.LOW): + continue + first_line = sec.text.split("\n")[0].strip() + m = SECTION_NUM_RE.match(first_line) + if not m: + continue + num = m.group(1) + nums_by_index[i] = num + counts[num] += 1 + + repeated = {num for num, c in counts.items() + if c >= _PARAGRAPH_NUM_MIN_REPEATS} + if not repeated: + return + + for i, num in nums_by_index.items(): + if num in repeated: + sec = sections[i] + _log.info("Demote repeated low-conf number %r: %r", + num, sec.text[:40]) + sec.kind = SectionKind.PARAGRAPH + sec.heading_level = 0 + + +_SIBLING_FONT_TOL = 0.1 + + def _validate_nesting(sections: list[Section]) -> None: """Ensure heading levels don't skip more than one level deeper. Mutates headings that skip levels: adjusts heading_level and downgrades confidence from HIGH to MEDIUM when corrected. + + Also consolidates runs of same-styled headings as siblings. When + consecutive headings share a font size, they're at the same logical + level; without this check, a run of similar entries (e.g. a dozen + "Changes since P0876RN" items) would each get prev_clamped + 1, + cascading to ever-deeper levels. 
""" prev_level = 0 + prev_font_size: float | None = None for sec in sections: if sec.kind != SectionKind.HEADING: continue - if prev_level > 0 and sec.heading_level > prev_level + 1: + is_sibling = ( + prev_font_size is not None + and abs(sec.font_size - prev_font_size) <= _SIBLING_FONT_TOL + ) + if is_sibling and prev_level > 0 and sec.heading_level > prev_level: + _log.info("Nesting sibling: h%d -> h%d for %r", + sec.heading_level, prev_level, sec.text[:40]) + sec.heading_level = prev_level + if sec.confidence == Confidence.HIGH: + sec.confidence = Confidence.MEDIUM + elif prev_level > 0 and sec.heading_level > prev_level + 1: corrected = prev_level + 1 _log.info("Nesting fix: h%d -> h%d for %r", sec.heading_level, corrected, @@ -860,3 +928,4 @@ def _validate_nesting(sections: list[Section]) -> None: if sec.confidence == Confidence.HIGH: sec.confidence = Confidence.MEDIUM prev_level = sec.heading_level + prev_font_size = sec.font_size From 1e1383d731a0e6252ecd56b018a29ba3c7d145af Mon Sep 17 00:00:00 2001 From: Greg Kaleka Date: Fri, 17 Apr 2026 17:09:11 -0400 Subject: [PATCH 6/6] Add regression tests for PR #9 structure and cleanup fixes Covers body-size prose preference, heading prose-length rejection, repeated-number demotion, sibling nesting clamp, monospace majority propagation, and per-span header stripping. Also documents review findings (sibling false-flatten risk, tight font tolerance, leftover LOW confidence on demoted paragraphs) in issues/pr9-review.md. Fixes stale y-bucket assertions in three existing detect_repeating tests that were broken by the bbox-center refactor in 59dc645. 
--- issues/pr9-review.md | 111 ++++++++++++++++ tests/test_header_footer.py | 114 ++++++++++++++++- tests/test_mono.py | 71 ++++++++++ tests/test_structure.py | 249 +++++++++++++++++++++++++++++++++++- 4 files changed, 538 insertions(+), 7 deletions(-) create mode 100644 issues/pr9-review.md diff --git a/issues/pr9-review.md b/issues/pr9-review.md new file mode 100644 index 0000000..ee1f453 --- /dev/null +++ b/issues/pr9-review.md @@ -0,0 +1,111 @@ +# PR #9 Review Findings + +Code-quality notes from the review of PR #9 (`fix/pdf-structure-classification`). None were blockers; tracked here so they surface when the surrounding code is touched next. + +Each entry names the concern, a concrete example, root cause, and a fix direction (matching the format of `known-issues.md`). + +--- + +## Sibling consolidation may demote legitimate subsection nesting + +**Symptom:** `_validate_nesting` now treats consecutive headings that share a font size as siblings and clamps the second to `prev_level`. On papers that express heading depth through section numbering rather than typography (a common pattern for Markdown-origin PDFs), this flattens real nested subsections. + +**Example sequence:** + +``` +## 2 Motivation (font_size = 14, level = 2) +### 2.1 Background (font_size = 12, level = 3) +#### 2.1.1 History (font_size = 12, level = 4) +``` + +With `_SIBLING_FONT_TOL = 0.1`, `History` is compared to `Background`: same font size, so it is reclassified as a sibling (`level = 3`). The numbering depth (3 dotted parts → level 4) is ignored. Output becomes: + +``` +## 2 Motivation +### 2.1 Background +### 2.1.1 History <-- should be #### +``` + +**Papers likely affected:** any WG21 paper where subsection headings at multiple depths share one font size. Not yet observed in the 52-paper batch as a regression, but the probability rises for papers that pass LOW-confidence heading classification (where section number is the only evidence). 
+ +**Root cause:** `lib/pdf/structure.py:_validate_nesting` (lines 912-921). The sibling check fires whenever `abs(sec.font_size - prev_font_size) <= _SIBLING_FONT_TOL` AND `sec.heading_level > prev_level`. It has no awareness of whether the section number supports the deeper level. + +**Fix direction:** Let the section-number signal veto sibling clamping. When the incoming heading has a `has_number` match AND its `number_level == sec.heading_level == prev_level + 1`, allow the one-level descent through unchanged. Only clamp when the level would skip (`> prev_level + 1`) or when no numbering is present. In CLAUDE.md terms: dotted-decimal section numbering is higher-reliability than font-size agreement; font agreement should not override numbering agreement. + +--- + +## `_SIBLING_FONT_TOL = 0.1` is very tight for real-world font sizes + +**Symptom:** PDFs sometimes specify fractionally different font sizes for the same visual heading tier (e.g. `11.95` and `12.0`, or `14.04` and `14.0`). A tolerance of `0.1` will treat these as different tiers and skip sibling consolidation, reintroducing the cascade this fix was meant to prevent. + +**Example:** + +``` +sec_a.font_size = 11.95 +sec_b.font_size = 12.00 +abs(12.00 - 11.95) = 0.05 # within tol, sibling +``` + +``` +sec_a.font_size = 11.70 +sec_b.font_size = 12.00 +abs(12.00 - 11.70) = 0.30 # exceeds tol, NOT sibling +``` + +Fractions in the 0.1–0.5 range are most common in LaTeX-rendered PDFs and in scanned-then-OCR'd documents. Papers in the 52-doc batch did not happen to hit this, but the test coverage does not guard against it either. + +**Root cause:** `_SIBLING_FONT_TOL = 0.1` in `lib/pdf/structure.py` is an absolute point-size tolerance with no relation to the heading size itself. At 12pt body text the tolerance is ~0.8% of the size; at 8pt captions it is ~1.3%. The constant was chosen without a documented rationale. 
+ +**Fix direction:** Either (a) widen the absolute tolerance to ~0.5 (covers typical fractional PDF size variance while still discriminating between real 10pt / 11pt / 12pt tiers), or (b) express it as a ratio: `abs(a - b) <= max(0.5, 0.02 * max(a, b))`. The ratio form is more defensible across font-size ranges. + +--- + +## Local import of `Counter` inside `propagate_monospace` + +**Symptom:** `lib/pdf/mono.py:195` adds `from collections import Counter` inside the body of `propagate_monospace`. Other modules in the project (e.g. `types.py`, `structure.py`, `cleanup.py`) import `Counter` at module scope. The inline import breaks the project's import-location consistency. + +**Impact:** Cosmetic only. No measurable performance difference for a one-shot call per document, and no correctness risk. + +**Root cause:** The import was likely added at the point of first use while iterating on the fix and never hoisted. + +**Fix direction:** Move the import to the top of `mono.py` alongside `math` and `re`. Zero runtime impact, consistent with the rest of the codebase. Trivial to fold into the next mono.py edit. + +--- + +## Commit 5 bundles two unrelated changes + +**Symptom:** Commit `e61a787` ("Treat same-styled consecutive headings as siblings in nesting validation") contains two logically independent changes: + +1. **Repetition demotion:** `_demote_repeated_low_confidence_numbers`, a new phase that demotes LOW-confidence headings whose `section_num` repeats ≥3 times. +2. **Sibling consolidation:** a new branch inside `_validate_nesting` that clamps runs of same-font-size headings to `prev_level`. + +Either change is independently useful and could be reverted without the other. Bundling them reduces bisect-ability if a future regression traces to one of the two rules. + +**Acknowledgement:** The PR description already flags this ("Includes two changes (should have been two commits)"). 
+ +**Fix direction:** Split before merge via `git rebase -i e61a787` and `git reset HEAD~` + two `git commit -p` rounds. Optional — the description captures the distinction in prose, and the functions themselves are separately named and located. + +--- + +## `_demote_repeated_low_confidence_numbers` leaves `confidence` at LOW on demoted paragraphs + +**Symptom:** When a section is demoted from HEADING → PARAGRAPH, only `kind` and `heading_level` are updated. `confidence` remains at `Confidence.LOW`, the value it carried as a heading. Most PARAGRAPH sections elsewhere in the pipeline carry `Confidence.HIGH` (constructed by `_make_paragraph_section`). + +**Example:** + +```python +sec.kind == SectionKind.HEADING +sec.confidence == Confidence.LOW +sec.heading_level == 2 + +# after _demote_repeated_low_confidence_numbers: +sec.kind == SectionKind.PARAGRAPH +sec.confidence == Confidence.LOW # stays LOW +sec.heading_level == 0 +``` + +**Impact:** None currently observable. A grep for `sec.confidence ==` across `lib/` shows only heading-path consumers inspect confidence (the `HIGH → MEDIUM` downgrade in `_validate_nesting`), and `compare_extractions` has precedent for assigning `Confidence.LOW` to paragraphs (line 182). The emitter ignores `confidence` for PARAGRAPH sections. + +**Root cause:** `lib/pdf/structure.py:_demote_repeated_low_confidence_numbers` only resets the two fields it knows the downstream classifier cares about. Not a bug today, but a future consumer that trusts `confidence` on PARAGRAPH sections would be surprised. + +**Fix direction:** Either (a) reset `sec.confidence = Confidence.HIGH` on demotion to match the provenance of other paragraphs, (b) leave it as LOW and add a short comment stating that confidence is not meaningful for non-HEADING sections, or (c) drop the field entirely from `Section` for non-HEADING kinds. Option (a) is the least risky and aligns with the "most paragraphs are HIGH" default. 
diff --git a/tests/test_header_footer.py b/tests/test_header_footer.py index 8e879ac..737f7d1 100644 --- a/tests/test_header_footer.py +++ b/tests/test_header_footer.py @@ -90,8 +90,8 @@ def test_detect_repeating_exact_text(): [PageEdgeItem(text="Unique Title", y=30.0, page_num=5, bbox=(0, 30, 100, 42))], ] result = detect_repeating(all_edges, total_pages=5) - # y_bucket = round(30 / Y_TOLERANCE) * Y_TOLERANCE = 30.0 (Y_TOLERANCE is 2.0) - assert (30.0, "Running Head") in result + # Bucket is derived from bbox center (30+42)/2 = 36, quantized to Y_TOLERANCE=2. + assert (36.0, "Running Head") in result def test_detect_repeating_skips_below_threshold(): @@ -120,8 +120,8 @@ def test_detect_repeating_page_number_pattern(): for pg in range(1, 6) ] result = detect_repeating(all_edges, total_pages=5) - # y_bucket = round(580 / 2) * 2 = 580.0 - assert (580.0, "__PAGE_NUM__") in result + # Bucket is derived from bbox center (580+592)/2 = 586, quantized to Y_TOLERANCE=2. + assert (586.0, "__PAGE_NUM__") in result def test_detect_repeating_doc_number_pattern(): @@ -135,7 +135,8 @@ def test_detect_repeating_doc_number_pattern(): for i in range(4) ] result = detect_repeating(all_edges, total_pages=4) - assert (30.0, "__DOC_NUM__") in result + # Bucket is derived from bbox center (30+42)/2 = 36, quantized to Y_TOLERANCE=2. + assert (36.0, "__DOC_NUM__") in result # ---- strip_repeating ----------------------------------------------------- @@ -181,4 +182,105 @@ def test_strip_repeating_drops_empty_blocks(): def test_strip_repeating_empty_input(): """Empty repeating set returns blocks unchanged.""" b = _make_block_at_y([("content", 300)]) - assert strip_repeating([b], set()) == [b] \ No newline at end of file + assert strip_repeating([b], set()) == [b] + + +# --------------------------------------------------------------------------- +# Regression coverage for PR #9 per-span stripping. 
+# --------------------------------------------------------------------------- + + +def _multi_span_line(texts, y, *, page_num=0): + """Build a Line with multiple spans at the same y (different x-ranges). + + Models the spatial path merging left/center/right header columns + into one line with one span per column. + """ + x = 50.0 + spans = [] + for t in texts: + w = 8.0 * max(len(t), 1) + spans.append(Span(text=t, font_name="Body", font_size=11.0, + bbox=(x, y, x + w, y + 12.0))) + x += w + 40.0 # large inter-column gap + return Line( + spans=spans, + bbox=(50.0, y, x, y + 12.0), + page_num=page_num, + ) + + +def test_strip_repeating_drops_matched_spans_keeps_the_rest(): + """Spatial-path merged header: strip matched column-spans, keep unique ones.""" + # "Doc P1234R0" (doc-num pattern) | "Appendix: Review" (unique) | "3" (page num) + line = _multi_span_line(["P1234R0", "Appendix: Review", "3"], 30.0) + block = Block(lines=[line], bbox=line.bbox, page_num=0) + + repeating = {(36.0, "__DOC_NUM__"), (36.0, "__PAGE_NUM__")} + result = strip_repeating([block], repeating) + + assert len(result) == 1, "block with surviving span must be kept" + kept_line = result[0].lines[0] + kept_texts = [s.text.strip() for s in kept_line.spans] + assert "P1234R0" not in kept_texts + assert "3" not in kept_texts + assert "Appendix: Review" in kept_texts + + +def test_strip_repeating_drops_all_spans_drops_line(): + """When every column-span matches a repeating pattern, the line is removed.""" + line = _multi_span_line(["P1234R0", "5"], 30.0) + block = Block(lines=[line], bbox=line.bbox, page_num=0) + + repeating = {(36.0, "__DOC_NUM__"), (36.0, "__PAGE_NUM__")} + result = strip_repeating([block], repeating) + + # All spans matched; line is stripped; block becomes empty and is dropped. 
+ assert result == [], "block whose only line is fully stripped must be dropped" + + +def test_strip_repeating_whole_line_match_still_works(): + """A line whose full text matches the repeating exact pattern is stripped. + + Regression: the per-span path must not break the original whole-line + strip that handles single-span lines on the MuPDF path. + """ + # Single span whose text exactly matches the repeating entry. + line = Line( + spans=[Span(text="Running Head", font_name="Body", + font_size=11.0, bbox=(50.0, 30.0, 200.0, 42.0))], + bbox=(50.0, 30.0, 200.0, 42.0), + ) + block = Block(lines=[line], bbox=line.bbox) + + repeating = {(36.0, "Running Head")} + result = strip_repeating([block], repeating) + assert result == [], "whole-line exact match must still strip the block" + + +def test_strip_repeating_span_outside_bucket_preserved(): + """A span whose y-bucket is far from any repeating pattern is preserved + even if another span on the same line is stripped.""" + # Build a line with two spans at different y-buckets (unusual, but + # defensible for floating column-spans with differing baselines). + s_top = Span(text="P1234R0", font_name="Body", font_size=11.0, + bbox=(50.0, 30.0, 110.0, 42.0)) # bucket = 36 + s_body = Span(text="real content", font_name="Body", font_size=11.0, + bbox=(300.0, 300.0, 420.0, 312.0)) # bucket = 306 + line = Line( + spans=[s_top, s_body], + bbox=(50.0, 30.0, 420.0, 312.0), + ) + block = Block(lines=[line], bbox=line.bbox) + + repeating = {(36.0, "__DOC_NUM__")} + result = strip_repeating([block], repeating) + # The line's own y_bucket (from its full bbox center) will land far from + # any repeating pattern; implementation short-circuits via _patterns_near + # returning empty and keeps the whole line untouched. + assert len(result) == 1 + kept = result[0].lines[0] + kept_texts = [s.text for s in kept.spans] + assert "real content" in kept_texts + # Per-span strip does not fire because line_patterns is empty. 
+ assert "P1234R0" in kept_texts \ No newline at end of file diff --git a/tests/test_mono.py b/tests/test_mono.py index 438eaa9..7246a02 100644 --- a/tests/test_mono.py +++ b/tests/test_mono.py @@ -2,6 +2,7 @@ from conftest import make_span, make_line, make_block from lib.pdf.mono import classify_monospace, propagate_monospace +from lib.pdf.types import Span, Line, Block def test_keyword_courier(): @@ -105,3 +106,73 @@ def test_propagate_keeps_dominant_monospace(): propagate_monospace([m_block], [s_block], "courier") assert m_block.lines[0].spans[0].monospace is True + + +# --------------------------------------------------------------------------- +# Regression coverage for PR #9 propagate_monospace majority filter. +# --------------------------------------------------------------------------- + + +def test_propagate_minority_mono_not_adopted(): + """A prose font with <50% monospace-classified chars is NOT propagated. + + Short spans like "3.1" or "42" can false-positive the per-glyph signal, + but they represent a small minority of the font's characters. Requiring + a majority keeps proportional text fonts out of the mono set. + """ + # Lato-Light appears in ~100 chars total, only 3 of which are mono. 
+    prose_lines = [
+        Line(spans=[Span(text="A" * 97, font_name="Lato-Light",
+                         monospace=False)]),
+        Line(spans=[Span(text="3.1", font_name="Lato-Light",
+                         monospace=True)]),  # false-positive
+    ]
+    spatial = [Block(lines=prose_lines)]
+
+    mupdf_line = Line(spans=[Span(text="more prose", font_name="Lato-Light",
+                                  monospace=False)])
+    mupdf = [Block(lines=[mupdf_line])]
+
+    propagate_monospace(mupdf, spatial, "some-dominant-font")
+    assert mupdf[0].lines[0].spans[0].monospace is False, (
+        "minority mono classification must not contaminate a prose font"
+    )
+
+
+def test_propagate_majority_mono_is_adopted():
+    """A font with >=50% monospace-classified chars IS propagated."""
+    code_lines = [
+        Line(spans=[Span(text="int x = 0;", font_name="MyCodeFont",
+                         monospace=True)]),
+        Line(spans=[Span(text="y", font_name="MyCodeFont",
+                         monospace=False)]),  # minority non-mono
+    ]
+    spatial = [Block(lines=code_lines)]
+
+    mupdf_line = Line(spans=[Span(text="return x;", font_name="MyCodeFont",
+                                  monospace=False)])
+    mupdf = [Block(lines=[mupdf_line])]
+
+    propagate_monospace(mupdf, spatial, "arial")
+    assert mupdf[0].lines[0].spans[0].monospace is True, (
+        "majority-monospace font should propagate to MuPDF spans"
+    )
+
+
+def test_propagate_exactly_at_threshold_is_adopted():
+    """50% exactly meets the majority threshold (inclusive)."""
+    # 5 mono chars, 5 non-mono chars -> ratio 0.5 (meets threshold).
+ lines = [ + Line(spans=[Span(text="aaaaa", font_name="EdgeFont", + monospace=True)]), + Line(spans=[Span(text="bbbbb", font_name="EdgeFont", + monospace=False)]), + ] + spatial = [Block(lines=lines)] + + mupdf_line = Line(spans=[Span(text="target", font_name="EdgeFont", + monospace=False)]) + mupdf = [Block(lines=[mupdf_line])] + + propagate_monospace(mupdf, spatial, "arial") + assert mupdf[0].lines[0].spans[0].monospace is True diff --git a/tests/test_structure.py b/tests/test_structure.py index 7f0b794..935a5dc 100644 --- a/tests/test_structure.py +++ b/tests/test_structure.py @@ -1,12 +1,14 @@ """Tests for lib.pdf.structure.""" -from conftest import make_block, make_section +from conftest import make_block, make_line, make_section, make_span from lib.pdf.types import ( Block, Line, Span, Section, SectionKind, Confidence, ) from lib.pdf.structure import ( compare_extractions, structure_sections, heading_confidence, _extract_metadata, + _detect_body_size, _validate_nesting, + _demote_repeated_low_confidence_numbers, ) @@ -295,3 +297,248 @@ def test_line_count_voting(self): # Lines: two at 14, one at 11 -> 14 wins by line count. # Character count would favor 11. assert block.font_size == 14.0 + + +# --------------------------------------------------------------------------- +# Regression coverage for PR #9 fixes. 
+# --------------------------------------------------------------------------- + + +def _mk_section(text, *, font_size=10.0, monospace=False, bold=False, + kind=SectionKind.PARAGRAPH, + confidence=Confidence.HIGH, heading_level=0): + """Build a Section whose single line carries explicit span attributes.""" + span = make_span(text, font_size=font_size, + monospace=monospace, bold=bold) + line = Line(spans=[span]) + return Section(kind=kind, text=text, confidence=confidence, + heading_level=heading_level, lines=[line], + font_size=font_size) + + +class TestDetectBodySizeProsePreference: + """`_detect_body_size` prefers prose over monospace on code-heavy papers.""" + + def test_prose_wins_over_monospace_majority(self): + """Prose beats a more frequent monospace size when it clears the floor.""" + prose = [_mk_section("prose line " + ("x" * 60), font_size=11.0) + for _ in range(10)] + code = [_mk_section("code line " + ("y" * 60), font_size=9.0, + monospace=True) for _ in range(30)] + body = _detect_body_size(prose + code) + assert body == 11.0, ( + "prose font should be picked as body even when monospace spans " + "hold more characters overall" + ) + + def test_fallback_to_all_when_prose_too_small(self): + """When prose is scarce, the overall most-common size wins (wording papers).""" + tiny_prose = [_mk_section("hi", font_size=11.0)] # <<500 chars + code = [_mk_section("code " + ("y" * 200), font_size=9.0, + monospace=True) for _ in range(10)] + body = _detect_body_size(tiny_prose + code) + assert body == 9.0, ( + "with insufficient prose, body falls back to the most common size" + ) + + def test_empty_sections_fall_back(self): + """No data at all returns the FALLBACK_BODY_SIZE.""" + from lib.pdf.types import FALLBACK_BODY_SIZE + assert _detect_body_size([]) == FALLBACK_BODY_SIZE + + +class TestHeadingProseLengthRejection: + """Long numbered first lines don't become headings at LOW confidence.""" + + def test_long_numbered_line_demoted(self): + """A numbered prose 
line with >12 words and no font/bold signal is a paragraph.""" + long_line = ("1 A fiber is a single flow of control with a " + "private stack and an associated execution context.") + sec = _mk_section(long_line, font_size=10.0) + # Flank it with body text at the same size so `_detect_body_size` + # agrees that 10.0 is body and no font-level signal fires. + body_fill = [_mk_section("ordinary body " + ("x" * 80), font_size=10.0) + for _ in range(10)] + _, result = structure_sections(body_fill + [sec], has_title=True) + demoted = [s for s in result if long_line in s.text] + assert demoted, "expected the long numbered line to appear in output" + assert all(s.kind != SectionKind.HEADING for s in demoted), ( + "prose-length first line should not become a heading at LOW conf" + ) + + def test_long_numbered_line_kept_with_font_signal(self): + """A long numbered line at a heading font size is preserved as a heading.""" + long_title = ("1 A fiber is a single flow of control with a " + "private stack and an associated execution context") + heading = _mk_section(long_title, font_size=14.0) + body_fill = [_mk_section("plain body " + ("x" * 80), font_size=10.0) + for _ in range(10)] + _, result = structure_sections(body_fill + [heading], has_title=True) + matches = [s for s in result if long_title in s.text] + assert matches and any(s.kind == SectionKind.HEADING for s in matches), ( + "long numbered line at heading font size must stay a HEADING " + "(MEDIUM or HIGH confidence survives the length cap)" + ) + + +class TestDemoteRepeatedLowConfidenceNumbers: + """Paragraph-number resets collapse to PARAGRAPH; TOC/body pairs do not.""" + + def _heading(self, num, text, *, confidence=Confidence.LOW): + return _mk_section(f"{num} {text}", font_size=10.0, + kind=SectionKind.HEADING, + confidence=confidence, heading_level=2) + + def test_three_repeats_demoted(self): + """section_num repeating >=3 times at LOW confidence becomes PARAGRAPH.""" + sections = [ + self._heading("1", 
"Constraints: first"), + self._heading("2", "Mandates: one"), + self._heading("1", "Preconditions: second"), + self._heading("1", "Effects: third"), + ] + _demote_repeated_low_confidence_numbers(sections) + ones = [s for s in sections if s.text.startswith("1 ")] + assert all(s.kind == SectionKind.PARAGRAPH for s in ones), ( + "three or more occurrences of number '1' at LOW conf should demote" + ) + assert all(s.heading_level == 0 for s in ones) + + def test_two_repeats_preserved(self): + """A TOC/body pair (count == 2) is NOT demoted.""" + sections = [ + self._heading("1", "Introduction"), + self._heading("1", "Introduction"), # second copy (body) + ] + _demote_repeated_low_confidence_numbers(sections) + assert all(s.kind == SectionKind.HEADING for s in sections), ( + "pair-count (TOC + body) should be left alone" + ) + + def test_medium_confidence_not_demoted(self): + """MEDIUM/HIGH confidence headings are never touched, even if they repeat.""" + sections = [ + self._heading("1", "a", confidence=Confidence.MEDIUM), + self._heading("1", "b", confidence=Confidence.MEDIUM), + self._heading("1", "c", confidence=Confidence.MEDIUM), + ] + _demote_repeated_low_confidence_numbers(sections) + assert all(s.kind == SectionKind.HEADING for s in sections) + + def test_demoted_confidence_left_low(self): + """Pins current behavior: demoted sections keep Confidence.LOW. + + Not currently observed as a problem; documented in + issues/pr9-review.md. Any change to the demotion path should + consciously revisit this pin. 
+ """ + sections = [ + self._heading("1", "first"), + self._heading("1", "second"), + self._heading("1", "third"), + ] + _demote_repeated_low_confidence_numbers(sections) + assert all(s.confidence == Confidence.LOW for s in sections), ( + "demoted paragraphs currently retain their heading's LOW " + "confidence; update issues/pr9-review.md if this changes" + ) + + +class TestValidateNestingSiblingClamp: + """Same-font-size consecutive headings are treated as siblings.""" + + def _h(self, text, *, level, fs): + return _mk_section(text, font_size=fs, + kind=SectionKind.HEADING, + confidence=Confidence.HIGH, + heading_level=level) + + def test_sibling_run_stays_flat(self): + """A long run of same-font revision headings doesn't cascade.""" + sections = [ + self._h("## Changes", level=2, fs=14.0), + self._h("### Changes since P21", level=3, fs=12.0), + # Each of the following originally got level = prev_clamped + 1. + # Sibling logic pins them all to level 3. + self._h("#### Changes since P20", level=4, fs=12.0), + self._h("##### Changes since P19", level=5, fs=12.0), + self._h("###### Changes since P18", level=6, fs=12.0), + ] + _validate_nesting(sections) + levels = [s.heading_level for s in sections] + assert levels == [2, 3, 3, 3, 3], ( + f"expected runs of same-font siblings at level 3; got {levels}" + ) + + def test_sibling_downgrades_high_to_medium(self): + """When the sibling rule fires, HIGH confidence drops to MEDIUM.""" + sections = [ + self._h("## Root", level=2, fs=14.0), + self._h("### Child", level=3, fs=12.0), + self._h("#### Grandchild", level=4, fs=12.0), + ] + _validate_nesting(sections) + gc = sections[-1] + assert gc.heading_level == 3 + assert gc.confidence == Confidence.MEDIUM + + def test_different_font_allows_nesting(self): + """Truly different font sizes still pass through the skip-level rule.""" + sections = [ + self._h("## Root", level=2, fs=14.0), + self._h("### Child", level=3, fs=12.0), + self._h("#### Grandchild", level=4, fs=10.0), + ] + 
_validate_nesting(sections) + assert [s.heading_level for s in sections] == [2, 3, 4] + + def test_current_behavior_flattens_legit_nesting_same_font(self): + """Pins the risk documented in issues/pr9-review.md. + + Papers that express depth through section numbering while using + one font size for all sub-levels will have legitimate h4s clamped + to h3. This test encodes the CURRENT behavior so that any future + change (e.g. letting section-number depth veto the sibling rule) + is noticed in review. + """ + sections = [ + self._h("## 2 Motivation", level=2, fs=14.0), + self._h("### 2.1 Background", level=3, fs=12.0), + self._h("#### 2.1.1 History", level=4, fs=12.0), + ] + _validate_nesting(sections) + assert sections[-1].heading_level == 3, ( + "currently flattens 2.1.1 to h3; update issues/pr9-review.md " + "if section-number depth is made to veto the sibling rule" + ) + + def test_tight_font_tolerance_misses_fractional_variance(self): + """Pins the risk documented in issues/pr9-review.md. + + `_SIBLING_FONT_TOL = 0.1` rejects sibling status between 11.7 and + 12.0 (diff 0.3), which is within the fractional variance common + in LaTeX PDFs. Encodes CURRENT behavior. + """ + sections = [ + self._h("## Root", level=2, fs=14.0), + self._h("### Sibling A", level=3, fs=12.0), + # fs = 11.7 is > 0.1 away from 12.0, so NOT a sibling; + # skip-level rule clamps level 5 -> 4 (prev + 1). 
+ self._h("##### Sibling B", level=5, fs=11.7), + ] + _validate_nesting(sections) + assert sections[-1].heading_level == 4, ( + "current absolute 0.1 tolerance treats 11.7 and 12.0 as " + "different tiers; widening the tolerance would change this " + "assertion — see issues/pr9-review.md" + ) + + def test_close_fractional_is_sibling(self): + """Within the 0.1 tolerance, sizes like 11.95 vs 12.0 are siblings.""" + sections = [ + self._h("## Root", level=2, fs=14.0), + self._h("### Sibling A", level=3, fs=12.0), + self._h("#### Sibling B", level=4, fs=11.95), + ] + _validate_nesting(sections) + assert sections[-1].heading_level == 3