diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 9004309fb..77c648067 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -118,8 +118,8 @@ def toc_detector_single_page(content, model=None): response = llm_completion(model=model, prompt=prompt) # print('response', response) - json_content = extract_json(response) - return json_content['toc_detected'] + json_content = extract_json(response) + return json_content.get('toc_detected', 'no') def check_if_toc_extraction_is_complete(content, toc, model=None): @@ -137,7 +137,7 @@ def check_if_toc_extraction_is_complete(content, toc, model=None): prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc response = llm_completion(model=model, prompt=prompt) json_content = extract_json(response) - return json_content['completed'] + return json_content.get('completed', 'no') def check_if_toc_transformation_is_complete(content, toc, model=None): @@ -155,7 +155,7 @@ def check_if_toc_transformation_is_complete(content, toc, model=None): prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc response = llm_completion(model=model, prompt=prompt) json_content = extract_json(response) - return json_content['completed'] + return json_content.get('completed', 'no') def extract_toc_content(content, model=None): prompt = f""" @@ -217,7 +217,7 @@ def detect_page_index(toc_content, model=None): response = llm_completion(model=model, prompt=prompt) json_content = extract_json(response) - return json_content['page_index_given_in_toc'] + return json_content.get('page_index_given_in_toc', 'no') def toc_extractor(page_list, toc_page_list, model): def transform_dots_to_colon(text): @@ -324,8 +324,8 @@ def toc_transformer(toc_content, model=None): new_complete, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True) if new_complete.startswith('```json'): - new_complete = get_json_content(new_complete) - last_complete = last_complete+new_complete + new_complete = get_json_content(new_complete) + last_complete = last_complete + new_complete if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model) @@ -683,8 +683,9 @@ def process_none_page_numbers(toc_items, page_list, start_index=1, model=None): item_copy = copy.deepcopy(item) del item_copy['page'] - result = add_page_number_to_toc(page_contents, item_copy, model) - if isinstance(result[0]['physical_index'], str) and result[0]['physical_index'].startswith('').strip()) del item['page'] diff --git a/pageindex/page_index_md.py b/pageindex/page_index_md.py index 5a5971690..5ec44872f 100644 --- a/pageindex/page_index_md.py +++ b/pageindex/page_index_md.py @@ -4,7 +4,7 @@ import os try: from .utils import * -except: +except ImportError: from utils import * async def get_node_summary(node, summary_token_threshold=200, model=None): diff --git a/pageindex/utils.py b/pageindex/utils.py index f00ccf3a7..ce32e7e5d 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -50,7 +50,8 @@ def llm_completion(model, prompt, chat_history=None, return_finish_reason=False) print('************* Retrying *************') logging.error(f"Error: {e}") if i < max_retries - 1: - time.sleep(1) + wait = min(2 ** i, 60) # exponential backoff, capped at 60s + time.sleep(wait) else: logging.error('Max retries reached for prompt: ' + prompt) if return_finish_reason: @@ -76,7 +77,8 @@ async def llm_acompletion(model, prompt): print('************* Retrying *************') logging.error(f"Error: {e}") if i < max_retries - 1: - await asyncio.sleep(1) + wait = min(2 ** i, 60) # exponential backoff, capped at 60s + await asyncio.sleep(wait) else: logging.error('Max retries reached for prompt: ' + prompt) return "" @@ -172,7 +174,7 @@ def structure_to_list(structure): def get_leaf_nodes(structure): if isinstance(structure, dict): - if not structure['nodes']: + if not structure.get('nodes'): structure_node = copy.deepcopy(structure) structure_node.pop('nodes', None) return [structure_node] @@ -284,23 +286,17 @@ class JsonLogger: def __init__(self, file_path): # Extract PDF name for logger name pdf_name = get_pdf_name(file_path) - + current_time = datetime.now().strftime("%Y%m%d_%H%M%S") - self.filename = f"{pdf_name}_{current_time}.json" + # Use .jsonl extension to reflect the newline-delimited format + self.filename = f"{pdf_name}_{current_time}.jsonl" os.makedirs("./logs", exist_ok=True) - # Initialize empty list to store all messages - self.log_data = [] def log(self, level, message, **kwargs): - if isinstance(message, dict): - self.log_data.append(message) - else: - self.log_data.append({'message': message}) - # Add new message to the log data - - # Write entire log data to file - with open(self._filepath(), "w") as f: - json.dump(self.log_data, f, indent=2) + entry = message if isinstance(message, dict) else {'message': message} + # Append a single JSON line — O(1) regardless of how many entries exist + with open(self._filepath(), "a", encoding="utf-8") as f: + f.write(json.dumps(entry, ensure_ascii=False) + "\n") def info(self, message, **kwargs): self.log("INFO", message, **kwargs) diff --git a/run_pageindex.py b/run_pageindex.py index 673439d89..f76043cf5 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -106,7 +106,7 @@ } # Load config with defaults from config.yaml - opt = config_loader.load(user_opt) + opt = config_loader.load({k: v for k, v in user_opt.items() if v is not None}) toc_with_page_number = asyncio.run(md_to_tree( md_path=args.md_path,