From 6727fcd44c784850ef8e49beb775fef4b0023077 Mon Sep 17 00:00:00 2001 From: supermario_leo Date: Fri, 8 May 2026 17:59:18 +0800 Subject: [PATCH] Defend against single-page TOC misclassification dropping all content MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a single-page document is passed to PageIndex and toc_detector_single_page returns "yes" (a known failure mode for pages of numbered policy / rule / statute content), find_toc_pages returns [0]. check_toc then enters the process_toc_with_page_numbers path with start_page_index = toc_page_list[-1] + 1 = 1, so the for-loop at process_toc_with_page_numbers iterates over range(1, min(1+toc_check_page_num, 1)) = range(1, 1) — empty — yielding main_content="" and silently dropping the entire document. Fix: in check_toc, after find_toc_pages, treat "the TOC covers the whole document" as a misclassification and fall through to the no-toc path so the page itself becomes content. A real TOC always points to content located elsewhere; if there is no content elsewhere, the detection was wrong by definition. Also tightened the toc_detector_single_page prompt to make the content-vs-directory distinction explicit, with examples of structured content (policies, regulations, rules, statutes, contracts, ordinances) that resemble TOCs but are not. Refs #203. --- pageindex/page_index.py | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 9004309fb..3547e2963 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -103,18 +103,33 @@ async def check_title_appearance_in_start_concurrent(structure, page_list, model def toc_detector_single_page(content, model=None): prompt = f""" - Your job is to detect if there is a table of content provided in the given text. + Your job is to detect whether the given text is a table of contents. + + A table of contents is a directory: a list of references that point to content + located ELSEWHERE in the document (typically section titles paired with page + or section numbers). The entries are pointers; the actual content they refer + to is on other pages. + + Pages that contain the document's substantive content with numbered sections — + such as policies, regulations, rules, statutes, contracts, ordinances, or + articles — are NOT tables of contents, even when their visual structure + (numbered headings, indented sub-items) resembles one. If each numbered item + is followed on this same page by its own substantive body text, the page is + content, not a TOC. + + When the input is a single self-contained page that reads as the document + itself rather than a directory pointing elsewhere, answer "no". Given text: {content} return the following JSON format: {{ - "thinking": + "thinking": "toc_detected": "", }} Directly return the final JSON structure. Do not output anything else. - Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents.""" + Please note: abstract, summary, notation list, figure list, table list, etc. are not tables of contents.""" response = llm_completion(model=model, prompt=prompt) # print('response', response) @@ -698,6 +713,16 @@ def check_toc(page_list, opt=None): if len(toc_page_list) == 0: print('no toc found') return {'toc_content': None, 'toc_page_list': [], 'page_index_given_in_toc': 'no'} + # A real table of contents points to content located beyond it. If + # find_toc_pages classified every available page (typically: a single-page + # document where toc_detector_single_page misfired on numbered policy / + # rule / statute content), then start_page_index = toc_page_list[-1] + 1 + # would be >= len(page_list) and process_toc_with_page_numbers would build + # main_content="" and silently drop the entire document. Fall back to the + # no-toc path so the page itself is processed as content. + if toc_page_list[-1] + 1 >= len(page_list): + print('toc covers the entire document (likely misclassification); falling back to no-toc') + return {'toc_content': None, 'toc_page_list': [], 'page_index_given_in_toc': 'no'} else: print('toc found') toc_json = toc_extractor(page_list, toc_page_list, opt.model) @@ -707,17 +732,17 @@ def check_toc(page_list, opt=None): return {'toc_content': toc_json['toc_content'], 'toc_page_list': toc_page_list, 'page_index_given_in_toc': 'yes'} else: current_start_index = toc_page_list[-1] + 1 - - while (toc_json['page_index_given_in_toc'] == 'no' and - current_start_index < len(page_list) and + + while (toc_json['page_index_given_in_toc'] == 'no' and + current_start_index < len(page_list) and current_start_index < opt.toc_check_page_num): - + additional_toc_pages = find_toc_pages( start_page_index=current_start_index, page_list=page_list, opt=opt ) - + if len(additional_toc_pages) == 0: break