diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 9004309fb..3547e2963 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -103,18 +103,33 @@ async def check_title_appearance_in_start_concurrent(structure, page_list, model def toc_detector_single_page(content, model=None): prompt = f""" - Your job is to detect if there is a table of content provided in the given text. + Your job is to detect whether the given text is a table of contents. + + A table of contents is a directory: a list of references that point to content + located ELSEWHERE in the document (typically section titles paired with page + or section numbers). The entries are pointers; the actual content they refer + to is on other pages. + + Pages that contain the document's substantive content with numbered sections — + such as policies, regulations, rules, statutes, contracts, ordinances, or + articles — are NOT tables of contents, even when their visual structure + (numbered headings, indented sub-items) resembles one. If each numbered item + is followed on this same page by its own substantive body text, the page is + content, not a TOC. + + When the input is a single self-contained page that reads as the document + itself rather than a directory pointing elsewhere, answer "no". Given text: {content} return the following JSON format: {{ - "thinking": + "thinking": "toc_detected": "", }} Directly return the final JSON structure. Do not output anything else. - Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents.""" + Please note: abstract, summary, notation list, figure list, table list, etc. are not tables of contents.""" response = llm_completion(model=model, prompt=prompt) # print('response', response) @@ -698,6 +713,16 @@ def check_toc(page_list, opt=None): if len(toc_page_list) == 0: print('no toc found') return {'toc_content': None, 'toc_page_list': [], 'page_index_given_in_toc': 'no'} + # A real table of contents points to content located beyond it. If + # find_toc_pages classified every available page (typically: a single-page + # document where toc_detector_single_page misfired on numbered policy / + # rule / statute content), then start_page_index = toc_page_list[-1] + 1 + # would be >= len(page_list) and process_toc_with_page_numbers would build + # main_content="" and silently drop the entire document. Fall back to the + # no-toc path so the page itself is processed as content. + if toc_page_list[-1] + 1 >= len(page_list): + print('toc covers the entire document (likely misclassification); falling back to no-toc') + return {'toc_content': None, 'toc_page_list': [], 'page_index_given_in_toc': 'no'} else: print('toc found') toc_json = toc_extractor(page_list, toc_page_list, opt.model) @@ -707,17 +732,17 @@ def check_toc(page_list, opt=None): return {'toc_content': toc_json['toc_content'], 'toc_page_list': toc_page_list, 'page_index_given_in_toc': 'yes'} else: current_start_index = toc_page_list[-1] + 1 - - while (toc_json['page_index_given_in_toc'] == 'no' and - current_start_index < len(page_list) and + + while (toc_json['page_index_given_in_toc'] == 'no' and + current_start_index < len(page_list) and current_start_index < opt.toc_check_page_num): - + additional_toc_pages = find_toc_pages( start_page_index=current_start_index, page_list=page_list, opt=opt ) - + if len(additional_toc_pages) == 0: break