From 0b4ae8d4f611b891be1c530f84757ed468636be5 Mon Sep 17 00:00:00 2001 From: Joevenner Date: Sat, 14 Feb 2026 15:00:21 +0100 Subject: [PATCH 1/3] feat: Add multi-provider LLM support via LiteLLM integration Replace OpenAI-only implementation with LiteLLM to support 100+ LLM providers including Anthropic Claude, Google Gemini, Azure OpenAI, AWS Bedrock, Groq, and local Ollama models. Changes: - Add litellm>=1.0.0 dependency - Refactor ChatGPT_API functions to use litellm.completion() - Enhance count_tokens() for multi-provider token counting - Update config.yaml with provider-specific model examples - Update README.md with multi-provider setup instructions Backward compatible: Existing OPENAI_API_KEY and CHATGPT_API_KEY still work. Default model remains gpt-4o-2024-11-20. --- README.md | 49 ++++++++++++- pageindex/config.yaml | 35 +++++++++ pageindex/utils.py | 165 +++++++++++++++++++++++++++++++++++------- requirements.txt | 4 +- 4 files changed, 222 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index 7180efd5a..f4ffab311 100644 --- a/README.md +++ b/README.md @@ -147,14 +147,49 @@ You can follow these steps to generate a PageIndex tree from a PDF document. pip3 install --upgrade -r requirements.txt ``` -### 2. Set your OpenAI API key +### 2. Set your API key -Create a `.env` file in the root directory and add your API key: +PageIndex now supports multiple LLM providers via [LiteLLM](https://docs.litellm.ai/). Create a `.env` file in the root directory and add your API key: +**OpenAI (default):** ```bash +OPENAI_API_KEY=your_openai_key_here +# or CHATGPT_API_KEY=your_openai_key_here ``` +**Anthropic Claude:** +```bash +ANTHROPIC_API_KEY=your_anthropic_key_here +``` + +**Google Gemini:** +```bash +GEMINI_API_KEY=your_google_key_here +``` + +**Azure OpenAI:** +```bash +AZURE_API_KEY=your_azure_key_here +AZURE_API_BASE=your_azure_endpoint +AZURE_API_VERSION=2024-02-01 +``` + +**AWS Bedrock:** +```bash +AWS_ACCESS_KEY_ID=your_access_key +AWS_SECRET_ACCESS_KEY=your_secret_key +AWS_REGION_NAME=us-east-1 +``` + +**Groq:** +```bash +GROQ_API_KEY=your_groq_key_here +``` + +**Ollama (local):** +No API key needed. Just ensure Ollama is running locally. + ### 3. Run PageIndex on your PDF ```bash @@ -167,7 +202,15 @@ python3 run_pageindex.py --pdf_path /path/to/your/document.pdf You can customize the processing with additional optional arguments: ``` ---model OpenAI model to use (default: gpt-4o-2024-11-20) +--model LLM model to use (default: gpt-4o-2024-11-20) + Examples: + - OpenAI: gpt-4o, gpt-4-turbo + - Claude: claude-3-opus-20240229, claude-3-sonnet-20240229 + - Gemini: gemini/gemini-pro, gemini/gemini-1.5-pro + - Azure: azure/your-deployment-name + - Bedrock: bedrock/anthropic.claude-3-opus-20240229-v1:0 + - Groq: groq/llama-3.1-70b-versatile + - Ollama: ollama/llama3 --toc-check-pages Pages to check for table of contents (default: 20) --max-pages-per-node Max pages per node (default: 10) --max-tokens-per-node Max tokens per node (default: 20000) diff --git a/pageindex/config.yaml b/pageindex/config.yaml index aa60a1f91..0b44a1c18 100644 --- a/pageindex/config.yaml +++ b/pageindex/config.yaml @@ -1,3 +1,38 @@ +# PageIndex Configuration +# +# Model Configuration: +# PageIndex now supports multiple LLM providers via LiteLLM. +# Set the model string according to your preferred provider: +# +# OpenAI (default): +# model: "gpt-4o-2024-11-20" or "gpt-4o" or "gpt-4-turbo" +# Env var: OPENAI_API_KEY or CHATGPT_API_KEY +# +# Anthropic Claude: +# model: "claude-3-opus-20240229" or "claude-3-sonnet-20240229" +# Env var: ANTHROPIC_API_KEY +# +# Google Gemini: +# model: "gemini/gemini-pro" or "gemini/gemini-1.5-pro" +# Env var: GEMINI_API_KEY +# +# Azure OpenAI: +# model: "azure/your-deployment-name" +# Env vars: AZURE_API_KEY, AZURE_API_BASE, AZURE_API_VERSION +# +# AWS Bedrock: +# model: "bedrock/anthropic.claude-3-opus-20240229-v1:0" +# Env vars: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION_NAME +# +# Groq: +# model: "groq/llama-3.1-70b-versatile" +# Env var: GROQ_API_KEY +# +# Ollama (local): +# model: "ollama/llama3" +# +# For more providers, see: https://docs.litellm.ai/docs/providers + model: "gpt-4o-2024-11-20" # model: "anthropic/claude-sonnet-4-6" toc_check_page_num: 20 diff --git a/pageindex/utils.py b/pageindex/utils.py index 57b69c5b5..e9f59ffc9 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -1,3 +1,4 @@ +import tiktoken import litellm import logging import os @@ -16,65 +17,174 @@ from pathlib import Path from types import SimpleNamespace as config -# Backward compatibility: support CHATGPT_API_KEY as alias for OPENAI_API_KEY -if not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"): - os.environ["OPENAI_API_KEY"] = os.getenv("CHATGPT_API_KEY") +# Support multiple API key environment variables for different providers +CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY") or os.getenv("OPENAI_API_KEY") -litellm.drop_params = True +# Configure LiteLLM to use environment variables for different providers +# Users can set: OPENAI_API_KEY, ANTHROPIC_API_KEY, GEMINI_API_KEY, etc. +# See: https://docs.litellm.ai/docs/providers def count_tokens(text, model=None): + """ + Count tokens in text using the appropriate tokenizer for the model. + Uses tiktoken for OpenAI models and LiteLLM's token counter for other providers. + """ if not text: return 0 - return litellm.token_counter(model=model, text=text) - + + # Check if it's an OpenAI model (no prefix or openai/ prefix) + model_lower = model.lower() if model else "" + is_openai_model = ( + not "/" in model or + model_lower.startswith("openai/") or + model_lower.startswith("gpt-") or + model_lower.startswith("o1-") or + model_lower.startswith("o3-") + ) + + if is_openai_model: + # Use tiktoken for OpenAI models + try: + # Strip openai/ prefix if present + clean_model = model.replace("openai/", "") if model else "gpt-4o" + enc = tiktoken.encoding_for_model(clean_model) + tokens = enc.encode(text) + return len(tokens) + except KeyError: + # Fallback to cl100k_base encoding for unknown OpenAI models + enc = tiktoken.get_encoding("cl100k_base") + tokens = enc.encode(text) + return len(tokens) + else: + # Use LiteLLM's token counter for other providers + try: + return litellm.token_counter(model=model, text=text) + except Exception: + # Fallback to approximate counting (4 chars per token) + return len(text) // 4 -def llm_completion(model, prompt, chat_history=None, return_finish_reason=False): +def ChatGPT_API_with_finish_reason(model, prompt, api_key=None, chat_history=None): + """ + Synchronous chat completion API with finish reason tracking. + Uses LiteLLM to support multiple LLM providers. + + Args: + model: Model string (e.g., "gpt-4o", "claude-3-opus-20240229", "gemini/gemini-pro") + prompt: User prompt + api_key: API key (optional, uses environment variables if not provided) + chat_history: Previous conversation history + + Returns: + Tuple of (response_content, finish_reason) + """ max_retries = 10 - messages = list(chat_history) + [{"role": "user", "content": prompt}] if chat_history else [{"role": "user", "content": prompt}] + + # Build messages list + if chat_history: + messages = chat_history.copy() + messages.append({"role": "user", "content": prompt}) + else: + messages = [{"role": "user", "content": prompt}] + for i in range(max_retries): try: response = litellm.completion( model=model, messages=messages, temperature=0, + api_key=api_key, ) - content = response.choices[0].message.content - if return_finish_reason: - finish_reason = "max_output_reached" if response.choices[0].finish_reason == "length" else "finished" - return content, finish_reason - return content + if response.choices[0].finish_reason == "length": + return response.choices[0].message.content, "max_output_reached" + else: + return response.choices[0].message.content, "finished" + except Exception as e: print('************* Retrying *************') logging.error(f"Error: {e}") if i < max_retries - 1: - time.sleep(1) + time.sleep(1) # Wait for 1s before retrying else: logging.error('Max retries reached for prompt: ' + prompt) - if return_finish_reason: - return "", "error" - return "" + return "Error", "error" + +def ChatGPT_API(model, prompt, api_key=None, chat_history=None): + """ + Synchronous chat completion API. + Uses LiteLLM to support multiple LLM providers. + + Args: + model: Model string (e.g., "gpt-4o", "claude-3-opus-20240229", "gemini/gemini-pro") + prompt: User prompt + api_key: API key (optional, uses environment variables if not provided) + chat_history: Previous conversation history + + Returns: + Response content string + """ + max_retries = 10 + + # Build messages list + if chat_history: + messages = chat_history.copy() + messages.append({"role": "user", "content": prompt}) + else: + messages = [{"role": "user", "content": prompt}] + + for i in range(max_retries): + try: + response = litellm.completion( + model=model, + messages=messages, + temperature=0, + api_key=api_key, + ) + return response.choices[0].message.content + except Exception as e: + print('************* Retrying *************') + logging.error(f"Error: {e}") + if i < max_retries - 1: + time.sleep(1) # Wait for 1s before retrying + else: + logging.error('Max retries reached for prompt: ' + prompt) + return "Error" + -async def llm_acompletion(model, prompt): +async def ChatGPT_API_async(model, prompt, api_key=None): + """ + Asynchronous chat completion API. + Uses LiteLLM to support multiple LLM providers. + + Args: + model: Model string (e.g., "gpt-4o", "claude-3-opus-20240229", "gemini/gemini-pro") + prompt: User prompt + api_key: API key (optional, uses environment variables if not provided) + + Returns: + Response content string + """ max_retries = 10 messages = [{"role": "user", "content": prompt}] + for i in range(max_retries): try: response = await litellm.acompletion( model=model, messages=messages, temperature=0, + api_key=api_key, ) return response.choices[0].message.content except Exception as e: print('************* Retrying *************') logging.error(f"Error: {e}") if i < max_retries - 1: - await asyncio.sleep(1) + await asyncio.sleep(1) # Wait for 1s before retrying else: logging.error('Max retries reached for prompt: ' + prompt) - return "" + return "Error" def get_json_content(response): @@ -379,14 +489,15 @@ def add_preface_if_needed(data): -def get_page_tokens(pdf_path, model=None, pdf_parser="PyPDF2"): +def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"): + enc = tiktoken.encoding_for_model(model) if pdf_parser == "PyPDF2": pdf_reader = PyPDF2.PdfReader(pdf_path) page_list = [] for page_num in range(len(pdf_reader.pages)): page = pdf_reader.pages[page_num] page_text = page.extract_text() - token_length = litellm.token_counter(model=model, text=page_text) + token_length = len(enc.encode(page_text)) page_list.append((page_text, token_length)) return page_list elif pdf_parser == "PyMuPDF": @@ -398,7 +509,7 @@ def get_page_tokens(pdf_path, model=None, pdf_parser="PyPDF2"): page_list = [] for page in doc: page_text = page.get_text() - token_length = litellm.token_counter(model=model, text=page_text) + token_length = len(enc.encode(page_text)) page_list.append((page_text, token_length)) return page_list else: @@ -501,7 +612,7 @@ def remove_structure_text(data): def check_token_limit(structure, limit=110000): list = structure_to_list(structure) for node in list: - num_tokens = count_tokens(node['text'], model=None) + num_tokens = count_tokens(node['text'], model='gpt-4o') if num_tokens > limit: print(f"Node ID: {node['node_id']} has {num_tokens} tokens") print("Start Index:", node['start_index']) @@ -577,7 +688,7 @@ async def generate_node_summary(node, model=None): Directly return the description, do not include any other text. """ - response = await llm_acompletion(model, prompt) + response = await ChatGPT_API_async(model, prompt) return response @@ -622,7 +733,7 @@ def generate_doc_description(structure, model=None): Directly return the description, do not include any other text. """ - response = llm_completion(model, prompt) + response = ChatGPT_API(model, prompt) return response @@ -677,4 +788,4 @@ def load(self, user_opt=None) -> config: self._validate_keys(user_dict) merged = {**self._default_dict, **user_dict} - return config(**merged) + return config(**merged) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 3b82eda0b..d595e0fc9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,7 @@ -litellm==1.82.0 +litellm>=1.0.0 +openai>=1.0.0 pymupdf==1.26.4 PyPDF2==3.0.1 python-dotenv==1.1.0 +tiktoken==0.11.0 pyyaml==6.0.2 From 4c34e60c3bad944583ee5c9fcf3e84bdb30da0cb Mon Sep 17 00:00:00 2001 From: JoeVenner Date: Sun, 22 Mar 2026 14:16:49 +0100 Subject: [PATCH 2/3] Refactored token counting to use LiteLLM for multi-provider support, removing reliance on direct tiktoken and hardcoded models, while cleaning up tokenization logic to prevent crashes --- pageindex/page_index.py | 70 ++++++++++++++++++----------------------- pageindex/utils.py | 50 +++++++---------------------- 2 files changed, 42 insertions(+), 78 deletions(-) diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 719255463..5f9aaa8f2 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -36,7 +36,7 @@ async def check_title_appearance(item, page_list, start_index=1, model=None): }} Directly return the final JSON structure. Do not output anything else.""" - response = await llm_acompletion(model=model, prompt=prompt) + response = await ChatGPT_API_async(model=model, prompt=prompt) response = extract_json(response) if 'answer' in response: answer = response['answer'] @@ -64,7 +64,7 @@ async def check_title_appearance_in_start(title, page_text, model=None, logger=N }} Directly return the final JSON structure. Do not output anything else.""" - response = await llm_acompletion(model=model, prompt=prompt) + response = await ChatGPT_API_async(model=model, prompt=prompt) response = extract_json(response) if logger: logger.info(f"Response: {response}") @@ -116,7 +116,7 @@ def toc_detector_single_page(content, model=None): Directly return the final JSON structure. Do not output anything else. Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents.""" - response = llm_completion(model=model, prompt=prompt) + response = ChatGPT_API(model=model, prompt=prompt) # print('response', response) json_content = extract_json(response) return json_content['toc_detected'] @@ -135,7 +135,7 @@ def check_if_toc_extraction_is_complete(content, toc, model=None): Directly return the final JSON structure. Do not output anything else.""" prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc - response = llm_completion(model=model, prompt=prompt) + response = ChatGPT_API(model=model, prompt=prompt) json_content = extract_json(response) return json_content['completed'] @@ -153,7 +153,7 @@ def check_if_toc_transformation_is_complete(content, toc, model=None): Directly return the final JSON structure. Do not output anything else.""" prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc - response = llm_completion(model=model, prompt=prompt) + response = ChatGPT_API(model=model, prompt=prompt) json_content = extract_json(response) return json_content['completed'] @@ -165,7 +165,7 @@ def extract_toc_content(content, model=None): Directly return the full table of contents content. Do not output anything else.""" - response, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True) + response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt) if_complete = check_if_toc_transformation_is_complete(content, response, model) if if_complete == "yes" and finish_reason == "finished": @@ -176,26 +176,23 @@ def extract_toc_content(content, model=None): {"role": "assistant", "content": response}, ] prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure""" - new_response, finish_reason = llm_completion(model=model, prompt=prompt, chat_history=chat_history, return_finish_reason=True) + new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history) response = response + new_response if_complete = check_if_toc_transformation_is_complete(content, response, model) - attempt = 0 - max_attempts = 5 - while not (if_complete == "yes" and finish_reason == "finished"): - attempt += 1 - if attempt > max_attempts: - raise Exception('Failed to complete table of contents after maximum retries') - chat_history = [ - {"role": "user", "content": prompt}, - {"role": "assistant", "content": response}, + {"role": "user", "content": prompt}, + {"role": "assistant", "content": response}, ] prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure""" - new_response, finish_reason = llm_completion(model=model, prompt=prompt, chat_history=chat_history, return_finish_reason=True) + new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history) response = response + new_response if_complete = check_if_toc_transformation_is_complete(content, response, model) + + # Optional: Add a maximum retry limit to prevent infinite loops + if len(chat_history) > 5: # Arbitrary limit of 10 attempts + raise Exception('Failed to complete table of contents after maximum retries') return response @@ -215,7 +212,7 @@ def detect_page_index(toc_content, model=None): }} Directly return the final JSON structure. Do not output anything else.""" - response = llm_completion(model=model, prompt=prompt) + response = ChatGPT_API(model=model, prompt=prompt) json_content = extract_json(response) return json_content['page_index_given_in_toc'] @@ -242,7 +239,7 @@ def transform_dots_to_colon(text): def toc_index_extractor(toc, content, model=None): print('start toc_index_extractor') - toc_extractor_prompt = """ + tob_extractor_prompt = """ You are given a table of contents in a json format and several pages of a document, your job is to add the physical_index to the table of contents in the json format. The provided pages contains tags like and to indicate the physical location of the page X. @@ -263,8 +260,8 @@ def toc_index_extractor(toc, content, model=None): If the section is not in the provided pages, do not add the physical_index to it. Directly return the final JSON structure. Do not output anything else.""" - prompt = toc_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content - response = llm_completion(model=model, prompt=prompt) + prompt = tob_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content + response = ChatGPT_API(model=model, prompt=prompt) json_content = extract_json(response) return json_content @@ -292,7 +289,7 @@ def toc_transformer(toc_content, model=None): Directly return the final JSON structure, do not output anything else. """ prompt = init_prompt + '\n Given table of contents\n:' + toc_content - last_complete, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True) + last_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt) if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model) if if_complete == "yes" and finish_reason == "finished": last_complete = extract_json(last_complete) @@ -300,12 +297,7 @@ def toc_transformer(toc_content, model=None): return cleaned_response last_complete = get_json_content(last_complete) - attempt = 0 - max_attempts = 5 while not (if_complete == "yes" and finish_reason == "finished"): - attempt += 1 - if attempt > max_attempts: - raise Exception('Failed to complete toc transformation after maximum retries') position = last_complete.rfind('}') if position != -1: last_complete = last_complete[:position+2] @@ -321,7 +313,7 @@ def toc_transformer(toc_content, model=None): Please continue the json structure, directly output the remaining part of the json structure.""" - new_complete, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True) + new_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt) if new_complete.startswith('```json'): new_complete = get_json_content(new_complete) @@ -482,7 +474,7 @@ def add_page_number_to_toc(part, structure, model=None): Directly return the final JSON structure. Do not output anything else.""" prompt = fill_prompt_seq + f"\n\nCurrent Partial Document:\n{part}\n\nGiven Structure\n{json.dumps(structure, indent=2)}\n" - current_json_raw = llm_completion(model=model, prompt=prompt) + current_json_raw = ChatGPT_API(model=model, prompt=prompt) json_result = extract_json(current_json_raw) for item in json_result: @@ -532,7 +524,7 @@ def generate_toc_continue(toc_content, part, model=None): Directly return the additional part of the final JSON structure. Do not output anything else.""" prompt = prompt + '\nGiven text\n:' + part + '\nPrevious tree structure\n:' + json.dumps(toc_content, indent=2) - response, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True) + response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt) if finish_reason == 'finished': return extract_json(response) else: @@ -566,7 +558,7 @@ def generate_toc_init(part, model=None): Directly return the final JSON structure. Do not output anything else.""" prompt = prompt + '\nGiven text\n:' + part - response, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True) + response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt) if finish_reason == 'finished': return extract_json(response) @@ -737,8 +729,8 @@ def check_toc(page_list, opt=None): ################### fix incorrect toc ######################################################### -async def single_toc_item_index_fixer(section_title, content, model=None): - toc_extractor_prompt = """ +def single_toc_item_index_fixer(section_title, content, model=None): + tob_extractor_prompt = """ You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document. The provided pages contains tags like and to indicate the physical location of the page X. @@ -750,8 +742,8 @@ async def single_toc_item_index_fixer(section_title, content, model=None): } Directly return the final JSON structure. Do not output anything else.""" - prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content - response = await llm_acompletion(model=model, prompt=prompt) + prompt = tob_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content + response = ChatGPT_API(model=model, prompt=prompt) json_content = extract_json(response) return convert_physical_index_to_int(json_content['physical_index']) @@ -812,15 +804,15 @@ async def process_and_check_item(incorrect_item): page_contents=[] for page_index in range(prev_correct, next_correct+1): # Add bounds checking to prevent IndexError - page_list_idx = page_index - start_index - if page_list_idx >= 0 and page_list_idx < len(page_list): - page_text = f"\n{page_list[page_list_idx][0]}\n\n\n" + list_index = page_index - start_index + if list_index >= 0 and list_index < len(page_list): + page_text = f"\n{page_list[list_index][0]}\n\n\n" page_contents.append(page_text) else: continue content_range = ''.join(page_contents) - physical_index_int = await single_toc_item_index_fixer(incorrect_item['title'], content_range, model) + physical_index_int = single_toc_item_index_fixer(incorrect_item['title'], content_range, model) # Check if the result is correct check_item = incorrect_item.copy() diff --git a/pageindex/utils.py b/pageindex/utils.py index e9f59ffc9..e73e469a6 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -1,4 +1,3 @@ -import tiktoken import litellm import logging import os @@ -26,42 +25,16 @@ def count_tokens(text, model=None): """ - Count tokens in text using the appropriate tokenizer for the model. - Uses tiktoken for OpenAI models and LiteLLM's token counter for other providers. + Count tokens in text using LiteLLM's token counter, which automatically + selects the appropriate tokenizer for each provider. """ if not text: return 0 - - # Check if it's an OpenAI model (no prefix or openai/ prefix) - model_lower = model.lower() if model else "" - is_openai_model = ( - not "/" in model or - model_lower.startswith("openai/") or - model_lower.startswith("gpt-") or - model_lower.startswith("o1-") or - model_lower.startswith("o3-") - ) - - if is_openai_model: - # Use tiktoken for OpenAI models - try: - # Strip openai/ prefix if present - clean_model = model.replace("openai/", "") if model else "gpt-4o" - enc = tiktoken.encoding_for_model(clean_model) - tokens = enc.encode(text) - return len(tokens) - except KeyError: - # Fallback to cl100k_base encoding for unknown OpenAI models - enc = tiktoken.get_encoding("cl100k_base") - tokens = enc.encode(text) - return len(tokens) - else: - # Use LiteLLM's token counter for other providers - try: - return litellm.token_counter(model=model, text=text) - except Exception: - # Fallback to approximate counting (4 chars per token) - return len(text) // 4 + try: + return litellm.token_counter(model=model or "gpt-4o", text=text) + except Exception: + # Fallback to approximate counting (4 chars per token) + return len(text) // 4 def ChatGPT_API_with_finish_reason(model, prompt, api_key=None, chat_history=None): """ @@ -490,14 +463,13 @@ def add_preface_if_needed(data): def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"): - enc = tiktoken.encoding_for_model(model) if pdf_parser == "PyPDF2": pdf_reader = PyPDF2.PdfReader(pdf_path) page_list = [] for page_num in range(len(pdf_reader.pages)): page = pdf_reader.pages[page_num] page_text = page.extract_text() - token_length = len(enc.encode(page_text)) + token_length = count_tokens(page_text, model=model) page_list.append((page_text, token_length)) return page_list elif pdf_parser == "PyMuPDF": @@ -509,7 +481,7 @@ def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"): page_list = [] for page in doc: page_text = page.get_text() - token_length = len(enc.encode(page_text)) + token_length = count_tokens(page_text, model=model) page_list.append((page_text, token_length)) return page_list else: @@ -609,10 +581,10 @@ def remove_structure_text(data): return data -def check_token_limit(structure, limit=110000): +def check_token_limit(structure, limit=110000, model=None): list = structure_to_list(structure) for node in list: - num_tokens = count_tokens(node['text'], model='gpt-4o') + num_tokens = count_tokens(node['text'], model=model) if num_tokens > limit: print(f"Node ID: {node['node_id']} has {num_tokens} tokens") print("Start Index:", node['start_index']) From 7d632096afaf0eb690d75e3fa974855341cde442 Mon Sep 17 00:00:00 2001 From: JoeVenner Date: Sun, 22 Mar 2026 14:23:54 +0100 Subject: [PATCH 3/3] Resolve merge conflicts: accept upstream LiteLLM integration Accept upstream's LiteLLM implementation (llm_completion/llm_acompletion, pinned litellm==1.82.0, ConfigLoader, drop_params) which supersedes our parallel implementation. Co-Authored-By: Claude Opus 4.6 --- pageindex/page_index.py | 70 +++++++++++--------- pageindex/utils.py | 139 ++++++++-------------------------------- requirements.txt | 4 +- 3 files changed, 68 insertions(+), 145 deletions(-) diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 5f9aaa8f2..719255463 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -36,7 +36,7 @@ async def check_title_appearance(item, page_list, start_index=1, model=None): }} Directly return the final JSON structure. Do not output anything else.""" - response = await ChatGPT_API_async(model=model, prompt=prompt) + response = await llm_acompletion(model=model, prompt=prompt) response = extract_json(response) if 'answer' in response: answer = response['answer'] @@ -64,7 +64,7 @@ async def check_title_appearance_in_start(title, page_text, model=None, logger=N }} Directly return the final JSON structure. Do not output anything else.""" - response = await ChatGPT_API_async(model=model, prompt=prompt) + response = await llm_acompletion(model=model, prompt=prompt) response = extract_json(response) if logger: logger.info(f"Response: {response}") @@ -116,7 +116,7 @@ def toc_detector_single_page(content, model=None): Directly return the final JSON structure. Do not output anything else. Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents.""" - response = ChatGPT_API(model=model, prompt=prompt) + response = llm_completion(model=model, prompt=prompt) # print('response', response) json_content = extract_json(response) return json_content['toc_detected'] @@ -135,7 +135,7 @@ def check_if_toc_extraction_is_complete(content, toc, model=None): Directly return the final JSON structure. Do not output anything else.""" prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc - response = ChatGPT_API(model=model, prompt=prompt) + response = llm_completion(model=model, prompt=prompt) json_content = extract_json(response) return json_content['completed'] @@ -153,7 +153,7 @@ def check_if_toc_transformation_is_complete(content, toc, model=None): Directly return the final JSON structure. Do not output anything else.""" prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc - response = ChatGPT_API(model=model, prompt=prompt) + response = llm_completion(model=model, prompt=prompt) json_content = extract_json(response) return json_content['completed'] @@ -165,7 +165,7 @@ def extract_toc_content(content, model=None): Directly return the full table of contents content. Do not output anything else.""" - response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt) + response, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True) if_complete = check_if_toc_transformation_is_complete(content, response, model) if if_complete == "yes" and finish_reason == "finished": @@ -176,23 +176,26 @@ def extract_toc_content(content, model=None): {"role": "assistant", "content": response}, ] prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure""" - new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history) + new_response, finish_reason = llm_completion(model=model, prompt=prompt, chat_history=chat_history, return_finish_reason=True) response = response + new_response if_complete = check_if_toc_transformation_is_complete(content, response, model) + attempt = 0 + max_attempts = 5 + while not (if_complete == "yes" and finish_reason == "finished"): + attempt += 1 + if attempt > max_attempts: + raise Exception('Failed to complete table of contents after maximum retries') + chat_history = [ - {"role": "user", "content": prompt}, - {"role": "assistant", "content": response}, + {"role": "user", "content": prompt}, + {"role": "assistant", "content": response}, ] prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure""" - new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history) + new_response, finish_reason = llm_completion(model=model, prompt=prompt, chat_history=chat_history, return_finish_reason=True) response = response + new_response if_complete = check_if_toc_transformation_is_complete(content, response, model) - - # Optional: Add a maximum retry limit to prevent infinite loops - if len(chat_history) > 5: # Arbitrary limit of 10 attempts - raise Exception('Failed to complete table of contents after maximum retries') return response @@ -212,7 +215,7 @@ def detect_page_index(toc_content, model=None): }} Directly return the final JSON structure. Do not output anything else.""" - response = ChatGPT_API(model=model, prompt=prompt) + response = llm_completion(model=model, prompt=prompt) json_content = extract_json(response) return json_content['page_index_given_in_toc'] @@ -239,7 +242,7 @@ def transform_dots_to_colon(text): def toc_index_extractor(toc, content, model=None): print('start toc_index_extractor') - tob_extractor_prompt = """ + toc_extractor_prompt = """ You are given a table of contents in a json format and several pages of a document, your job is to add the physical_index to the table of contents in the json format. The provided pages contains tags like and to indicate the physical location of the page X. @@ -260,8 +263,8 @@ def toc_index_extractor(toc, content, model=None): If the section is not in the provided pages, do not add the physical_index to it. Directly return the final JSON structure. Do not output anything else.""" - prompt = tob_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content - response = ChatGPT_API(model=model, prompt=prompt) + prompt = toc_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content + response = llm_completion(model=model, prompt=prompt) json_content = extract_json(response) return json_content @@ -289,7 +292,7 @@ def toc_transformer(toc_content, model=None): Directly return the final JSON structure, do not output anything else. """ prompt = init_prompt + '\n Given table of contents\n:' + toc_content - last_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt) + last_complete, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True) if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model) if if_complete == "yes" and finish_reason == "finished": last_complete = extract_json(last_complete) @@ -297,7 +300,12 @@ def toc_transformer(toc_content, model=None): return cleaned_response last_complete = get_json_content(last_complete) + attempt = 0 + max_attempts = 5 while not (if_complete == "yes" and finish_reason == "finished"): + attempt += 1 + if attempt > max_attempts: + raise Exception('Failed to complete toc transformation after maximum retries') position = last_complete.rfind('}') if position != -1: last_complete = last_complete[:position+2] @@ -313,7 +321,7 @@ def toc_transformer(toc_content, model=None): Please continue the json structure, directly output the remaining part of the json structure.""" - new_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt) + new_complete, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True) if new_complete.startswith('```json'): new_complete = get_json_content(new_complete) @@ -474,7 +482,7 @@ def add_page_number_to_toc(part, structure, model=None): Directly return the final JSON structure. Do not output anything else.""" prompt = fill_prompt_seq + f"\n\nCurrent Partial Document:\n{part}\n\nGiven Structure\n{json.dumps(structure, indent=2)}\n" - current_json_raw = ChatGPT_API(model=model, prompt=prompt) + current_json_raw = llm_completion(model=model, prompt=prompt) json_result = extract_json(current_json_raw) for item in json_result: @@ -524,7 +532,7 @@ def generate_toc_continue(toc_content, part, model=None): Directly return the additional part of the final JSON structure. Do not output anything else.""" prompt = prompt + '\nGiven text\n:' + part + '\nPrevious tree structure\n:' + json.dumps(toc_content, indent=2) - response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt) + response, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True) if finish_reason == 'finished': return extract_json(response) else: @@ -558,7 +566,7 @@ def generate_toc_init(part, model=None): Directly return the final JSON structure. Do not output anything else.""" prompt = prompt + '\nGiven text\n:' + part - response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt) + response, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True) if finish_reason == 'finished': return extract_json(response) @@ -729,8 +737,8 @@ def check_toc(page_list, opt=None): ################### fix incorrect toc ######################################################### -def single_toc_item_index_fixer(section_title, content, model=None): - tob_extractor_prompt = """ +async def single_toc_item_index_fixer(section_title, content, model=None): + toc_extractor_prompt = """ You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document. The provided pages contains tags like and to indicate the physical location of the page X. @@ -742,8 +750,8 @@ def single_toc_item_index_fixer(section_title, content, model=None): } Directly return the final JSON structure. Do not output anything else.""" - prompt = tob_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content - response = ChatGPT_API(model=model, prompt=prompt) + prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content + response = await llm_acompletion(model=model, prompt=prompt) json_content = extract_json(response) return convert_physical_index_to_int(json_content['physical_index']) @@ -804,15 +812,15 @@ async def process_and_check_item(incorrect_item): page_contents=[] for page_index in range(prev_correct, next_correct+1): # Add bounds checking to prevent IndexError - list_index = page_index - start_index - if list_index >= 0 and list_index < len(page_list): - page_text = f"\n{page_list[list_index][0]}\n\n\n" + page_list_idx = page_index - start_index + if page_list_idx >= 0 and page_list_idx < len(page_list): + page_text = f"\n{page_list[page_list_idx][0]}\n\n\n" page_contents.append(page_text) else: continue content_range = ''.join(page_contents) - physical_index_int = single_toc_item_index_fixer(incorrect_item['title'], content_range, model) + physical_index_int = await single_toc_item_index_fixer(incorrect_item['title'], content_range, model) # Check if the result is correct check_item = incorrect_item.copy() diff --git a/pageindex/utils.py b/pageindex/utils.py index e73e469a6..57b69c5b5 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -16,148 +16,65 @@ from pathlib import Path from types import SimpleNamespace as config -# Support multiple API key environment variables for different providers -CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY") or os.getenv("OPENAI_API_KEY") +# Backward compatibility: support CHATGPT_API_KEY as alias for OPENAI_API_KEY +if not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"): + os.environ["OPENAI_API_KEY"] = os.getenv("CHATGPT_API_KEY") -# Configure LiteLLM to use environment variables for different providers -# Users can set: OPENAI_API_KEY, ANTHROPIC_API_KEY, GEMINI_API_KEY, etc. -# See: https://docs.litellm.ai/docs/providers +litellm.drop_params = True def count_tokens(text, model=None): - """ - Count tokens in text using LiteLLM's token counter, which automatically - selects the appropriate tokenizer for each provider. - """ if not text: return 0 - try: - return litellm.token_counter(model=model or "gpt-4o", text=text) - except Exception: - # Fallback to approximate counting (4 chars per token) - return len(text) // 4 + return litellm.token_counter(model=model, text=text) -def ChatGPT_API_with_finish_reason(model, prompt, api_key=None, chat_history=None): - """ - Synchronous chat completion API with finish reason tracking. - Uses LiteLLM to support multiple LLM providers. - - Args: - model: Model string (e.g., "gpt-4o", "claude-3-opus-20240229", "gemini/gemini-pro") - prompt: User prompt - api_key: API key (optional, uses environment variables if not provided) - chat_history: Previous conversation history - - Returns: - Tuple of (response_content, finish_reason) - """ + +def llm_completion(model, prompt, chat_history=None, return_finish_reason=False): max_retries = 10 - - # Build messages list - if chat_history: - messages = chat_history.copy() - messages.append({"role": "user", "content": prompt}) - else: - messages = [{"role": "user", "content": prompt}] - + messages = list(chat_history) + [{"role": "user", "content": prompt}] if chat_history else [{"role": "user", "content": prompt}] for i in range(max_retries): try: response = litellm.completion( model=model, messages=messages, temperature=0, - api_key=api_key, ) - if response.choices[0].finish_reason == "length": - return response.choices[0].message.content, "max_output_reached" - else: - return response.choices[0].message.content, "finished" - + content = response.choices[0].message.content + if return_finish_reason: + finish_reason = "max_output_reached" if response.choices[0].finish_reason == "length" else "finished" + return content, finish_reason + return content except Exception as e: print('************* Retrying *************') logging.error(f"Error: {e}") if i < max_retries - 1: - time.sleep(1) # Wait for 1s before retrying + time.sleep(1) else: logging.error('Max retries reached for prompt: ' + prompt) - return "Error", "error" - + if return_finish_reason: + return "", "error" + return "" -def ChatGPT_API(model, prompt, api_key=None, chat_history=None): - """ - Synchronous chat completion API. - Uses LiteLLM to support multiple LLM providers. - - Args: - model: Model string (e.g., "gpt-4o", "claude-3-opus-20240229", "gemini/gemini-pro") - prompt: User prompt - api_key: API key (optional, uses environment variables if not provided) - chat_history: Previous conversation history - - Returns: - Response content string - """ - max_retries = 10 - - # Build messages list - if chat_history: - messages = chat_history.copy() - messages.append({"role": "user", "content": prompt}) - else: - messages = [{"role": "user", "content": prompt}] - - for i in range(max_retries): - try: - response = litellm.completion( - model=model, - messages=messages, - temperature=0, - api_key=api_key, - ) - return response.choices[0].message.content - except Exception as e: - print('************* Retrying *************') - logging.error(f"Error: {e}") - if i < max_retries - 1: - time.sleep(1) # Wait for 1s before retrying - else: - logging.error('Max retries reached for prompt: ' + prompt) - return "Error" - -async def ChatGPT_API_async(model, prompt, api_key=None): - """ - Asynchronous chat completion API. - Uses LiteLLM to support multiple LLM providers. - - Args: - model: Model string (e.g., "gpt-4o", "claude-3-opus-20240229", "gemini/gemini-pro") - prompt: User prompt - api_key: API key (optional, uses environment variables if not provided) - - Returns: - Response content string - """ +async def llm_acompletion(model, prompt): max_retries = 10 messages = [{"role": "user", "content": prompt}] - for i in range(max_retries): try: response = await litellm.acompletion( model=model, messages=messages, temperature=0, - api_key=api_key, ) return response.choices[0].message.content except Exception as e: print('************* Retrying *************') logging.error(f"Error: {e}") if i < max_retries - 1: - await asyncio.sleep(1) # Wait for 1s before retrying + await asyncio.sleep(1) else: logging.error('Max retries reached for prompt: ' + prompt) - return "Error" + return "" def get_json_content(response): @@ -462,14 +379,14 @@ def add_preface_if_needed(data): -def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"): +def get_page_tokens(pdf_path, model=None, pdf_parser="PyPDF2"): if pdf_parser == "PyPDF2": pdf_reader = PyPDF2.PdfReader(pdf_path) page_list = [] for page_num in range(len(pdf_reader.pages)): page = pdf_reader.pages[page_num] page_text = page.extract_text() - token_length = count_tokens(page_text, model=model) + token_length = litellm.token_counter(model=model, text=page_text) page_list.append((page_text, token_length)) return page_list elif pdf_parser == "PyMuPDF": @@ -481,7 +398,7 @@ def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"): page_list = [] for page in doc: page_text = page.get_text() - token_length = count_tokens(page_text, model=model) + token_length = litellm.token_counter(model=model, text=page_text) page_list.append((page_text, token_length)) return page_list else: @@ -581,10 +498,10 @@ def remove_structure_text(data): return data -def check_token_limit(structure, limit=110000, model=None): +def check_token_limit(structure, limit=110000): list = structure_to_list(structure) for node in list: - num_tokens = count_tokens(node['text'], model=model) + num_tokens = count_tokens(node['text'], model=None) if num_tokens > limit: print(f"Node ID: {node['node_id']} has {num_tokens} tokens") print("Start Index:", node['start_index']) @@ -660,7 +577,7 @@ async def generate_node_summary(node, model=None): Directly return the description, do not include any other text. """ - response = await ChatGPT_API_async(model, prompt) + response = await llm_acompletion(model, prompt) return response @@ -705,7 +622,7 @@ def generate_doc_description(structure, model=None): Directly return the description, do not include any other text. """ - response = ChatGPT_API(model, prompt) + response = llm_completion(model, prompt) return response @@ -760,4 +677,4 @@ def load(self, user_opt=None) -> config: self._validate_keys(user_dict) merged = {**self._default_dict, **user_dict} - return config(**merged) \ No newline at end of file + return config(**merged) diff --git a/requirements.txt b/requirements.txt index d595e0fc9..3b82eda0b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,5 @@ -litellm>=1.0.0 -openai>=1.0.0 +litellm==1.82.0 pymupdf==1.26.4 PyPDF2==3.0.1 python-dotenv==1.1.0 -tiktoken==0.11.0 pyyaml==6.0.2