diff --git a/.env.example b/.env.example index 1a670126..68cf257a 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,57 @@ -SYNTHESIZER_MODEL= +# Tokenizer +TOKENIZER_MODEL= + +# LLM +# Support different backends: http_api, openai_api, ollama_api, ollama, huggingface, tgi, sglang, tensorrt + +# http_api / openai_api +SYNTHESIZER_BACKEND=openai_api +SYNTHESIZER_MODEL=gpt-4o-mini SYNTHESIZER_BASE_URL= SYNTHESIZER_API_KEY= -TRAINEE_MODEL= +TRAINEE_BACKEND=openai_api +TRAINEE_MODEL=gpt-4o-mini TRAINEE_BASE_URL= TRAINEE_API_KEY= + +# azure_openai_api +# SYNTHESIZER_BACKEND=azure_openai_api +# The following is the same as your "Deployment name" in Azure +# SYNTHESIZER_MODEL= +# SYNTHESIZER_BASE_URL=https://.openai.azure.com/openai/deployments//chat/completions +# SYNTHESIZER_API_KEY= +# SYNTHESIZER_API_VERSION= + +# # ollama_api +# SYNTHESIZER_BACKEND=ollama_api +# SYNTHESIZER_MODEL=gemma3 +# SYNTHESIZER_BASE_URL=http://localhost:11434 +# +# Note: TRAINEE with ollama_api backend is not supported yet as ollama_api does not support logprobs. + +# # huggingface +# SYNTHESIZER_BACKEND=huggingface +# SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct +# +# TRAINEE_BACKEND=huggingface +# TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct + +# # sglang +# SYNTHESIZER_BACKEND=sglang +# SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct +# SYNTHESIZER_TP_SIZE=1 +# SYNTHESIZER_NUM_GPUS=1 + +# TRAINEE_BACKEND=sglang +# TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct +# SYNTHESIZER_TP_SIZE=1 +# SYNTHESIZER_NUM_GPUS=1 + +# # vllm +# SYNTHESIZER_BACKEND=vllm +# SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct +# SYNTHESIZER_NUM_GPUS=1 + +# TRAINEE_BACKEND=vllm +# TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct +# TRAINEE_NUM_GPUS=1 diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..74514987 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,24 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 00000000..bbcbbe7d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/contributing.md b/.github/contributing.md new file mode 100644 index 00000000..f0b95d32 --- /dev/null +++ b/.github/contributing.md @@ -0,0 +1,34 @@ +## Contribution Guide +Here are the steps to contribute to this project: + +1. Star this repository. +2. Fork this repository. 
+ + Type the following command on Git bash console: + ```bash + git clone https://github.com/open-sciencelab/GraphGen.git + ``` + +3. Create a new branch + + Now before making changes to the files, go to your terminal under the repo you just cloned, and type the following: + + ```bash + git checkout -b add-my-name + ``` + + By running the above command, you just created a new branch called add-my-name and checked it out, what this does is that it creates a new branch with the commit history of the master branch or the branch that you were on previously. + +4. Make your changes and push your code. + + ``` + git add . + git commit -m "xxx" + git push + ``` + + This will create a new commit with the changes you made. + +5. Now create a pull request and add the title. + + Sit back and relax while your pull request is being reviewed and merged. diff --git a/.github/sync-config.yml b/.github/sync-config.yml new file mode 100644 index 00000000..417ed83b --- /dev/null +++ b/.github/sync-config.yml @@ -0,0 +1,17 @@ +sync: + - source: graphgen/ + dest: graphgen/ + - source: resources/nltk_data/ + dest: resources/nltk_data/ + - source: resources/examples/ + dest: resources/examples/ + - source: resources/images/logo.png + dest: resources/images/logo.png + - source: webui/ + dest: webui/ + - source: webui/app.py + dest: app.py + - source: requirements.txt + dest: requirements.txt + - source: LICENSE + dest: LICENSE diff --git a/.github/workflows/push-to-hf.yml b/.github/workflows/push-to-hf.yml new file mode 100644 index 00000000..81f23c04 --- /dev/null +++ b/.github/workflows/push-to-hf.yml @@ -0,0 +1,51 @@ +name: Push demo branch to Hugging Face + +on: + workflow_call: + inputs: + ref: + required: false + default: demo + type: string + secrets: + HF_TOKEN: + required: true + +jobs: + push-hf: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref }} + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Configure Git identity + run: | + git config --global user.email "actions@github.com" + git config --global user.name "github-actions[bot]" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install huggingface_hub + + - name: Push to Hugging Face + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_REPO_TYPE: spaces + HF_REPO_ID: chenzihong/GraphGen + run: | + git config --global credential.helper store + echo "https://user:${HF_TOKEN}@huggingface.co" > ~/.git-credentials + + [[ -d hf-repo ]] && rm -rf hf-repo + git clone https://huggingface.co/${HF_REPO_TYPE}/${HF_REPO_ID} hf-repo + + rsync -a --delete --exclude='.git' --exclude='hf-repo' --exclude='README.md' ./ hf-repo/ + + cd hf-repo + git add . 
+ git diff-index --quiet HEAD || \ + (git commit -m "Auto-sync from ${{ inputs.ref }} at $(date -u)" && git push) \ No newline at end of file diff --git a/.github/workflows/push-to-ms.yml b/.github/workflows/push-to-ms.yml new file mode 100644 index 00000000..139da12e --- /dev/null +++ b/.github/workflows/push-to-ms.yml @@ -0,0 +1,50 @@ +name: Push demo branch to ModelScope + +on: + workflow_call: + inputs: + ref: + required: false + default: demo + type: string + secrets: + MS_TOKEN: + required: true + +jobs: + push-ms: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref }} + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Configure Git identity + run: | + git config --global user.email "actions@github.com" + git config --global user.name "github-actions[bot]" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + # ModelScope official SDK (optional, install only if you need to call the platform API) + pip install modelscope + + - name: Push to ModelScope + env: + MS_TOKEN: ${{ secrets.MS_TOKEN }} + MS_REPO_TYPE: studios + MS_REPO_ID: chenzihong/GraphGen + run: | + [[ -d ms-repo ]] && rm -rf ms-repo + git clone https://oauth2:${MS_TOKEN}@www.modelscope.cn/${MS_REPO_TYPE}/${MS_REPO_ID}.git ms-repo + + rsync -a --delete --exclude='.git' --exclude='ms-repo' --exclude='README.md' ./ ms-repo/ + + cd ms-repo + git add . + git diff-index --quiet HEAD || \ + (git commit -m "Auto-sync from ${{ inputs.ref }} at $(date -u)" && \ + git push "https://oauth2:${MS_TOKEN}@www.modelscope.cn/${MS_REPO_TYPE}/${MS_REPO_ID}.git") diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index 1e00ab5b..0cc7dfe1 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.10", "3.11"] + python-version: ["3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/workflow.yml b/.github/workflows/pypi.yml similarity index 100% rename from .github/workflows/workflow.yml rename to .github/workflows/pypi.yml diff --git a/.github/workflows/sync-demo.yml b/.github/workflows/sync-demo.yml new file mode 100644 index 00000000..97ee1fed --- /dev/null +++ b/.github/workflows/sync-demo.yml @@ -0,0 +1,96 @@ +name: Sync Demo Branch + +on: + push: + branches: + - main + workflow_dispatch: + +jobs: + sync-demo: + runs-on: ubuntu-latest + + steps: + - name: Checkout main branch + uses: actions/checkout@v4 + with: + ref: main + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Create demo branch if it doesn't exist + run: | + if ! git ls-remote --heads origin demo | grep -q demo; then + echo "Creating demo branch..." + git checkout -b demo + git push origin demo + else + echo "Demo branch already exists" + fi + + - name: Checkout demo branch + uses: actions/checkout@v4 + with: + ref: demo + token: ${{ secrets.GITHUB_TOKEN }} + path: demo + + - name: Clean demo directory + run: | + cd demo + find . 
-mindepth 1 -path './.git' -prune -o -exec rm -rf {} + 2>/dev/null || true + + - name: Copy files using config + run: | + yq eval '.sync[] | .source + ":" + .dest' .github/sync-config.yml | while IFS=: read -r src dst; do + src=$(echo "$src" | xargs) + dst=$(echo "$dst" | xargs) + + [ -z "$src" ] && continue + + if [ -e "$src" ]; then + target_path="demo/$dst" + + # 处理目录(以/结尾或本身是目录) + if [[ "$dst" == */ ]] || [ -d "$src" ]; then + mkdir -p "$target_path" + # 复制目录里的内容,而不是目录本身 + cp -r "$src"/* "$target_path" + echo "Copied $src/* → $target_path" + else + mkdir -p "$(dirname "$target_path")" + cp "$src" "$target_path" + echo "Copied $src → $target_path" + fi + else + echo "Source not found: $src" + fi + done + + - name: Commit and push changes + run: | + cd demo + git config --global user.email "actions@github.com" + git config --global user.name "github-actions[bot]" + + # 检查是否有变化 + if [[ -n $(git status --porcelain) ]]; then + git add . + git commit -m "Auto-sync demo branch with main branch" + git push origin demo + echo "Changes pushed to demo branch" + else + echo "No changes to sync" + fi + + push-hf: + needs: sync-demo + uses: ./.github/workflows/push-to-hf.yml + secrets: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + push-ms: + needs: sync-demo + uses: ./.github/workflows/push-to-ms.yml + secrets: + MS_TOKEN: ${{ secrets.MS_TOKEN }} + with: + ref: demo diff --git a/.gitignore b/.gitignore index 678cdc50..c5ea805f 100644 --- a/.gitignore +++ b/.gitignore @@ -174,6 +174,36 @@ cython_debug/ .pypirc cache +cache_* +databases/ *.pyc *.html .gradio +*.sst +*.db/ +*.nhr +*.ndb +*.nin +*.njs +*.nog +*.nos +*.not +*.nsq +*.ntf +*.nto +*.phr +*.pin +*.pdb +*.pjs +*.pog +*.pos +*.psq +*.ptf +*.pto +*.wal +graph_kuzu* +resources/ + +# macOS +.DS_Store +**/.DS_Store diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..5f799fde --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,19 @@ +repos: + - repo: https://github.com/psf/black + rev: 22.10.0 + hooks: + - id: black + name: black + - repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + name: isort + - repo: https://github.com/PyCQA/pylint + rev: v2.15.5 + hooks: + - id: pylint + name: pylint + entry: pylint + language: system + types: [python] diff --git a/.pylintrc b/.pylintrc index 594a7f7f..094d8888 100644 --- a/.pylintrc +++ b/.pylintrc @@ -100,7 +100,7 @@ source-roots= # When enabled, pylint would attempt to guess common misconfiguration and emit # user-friendly hints instead of false-positive error messages. -suggestion-mode=yes +# suggestion-mode=yes # Allow loading of arbitrary C extensions. Extensions are imported into the # active Python interpreter and may run arbitrary code. @@ -308,7 +308,7 @@ max-public-methods=20 max-returns=6 # Maximum number of statements in function / method body. -max-statements=50 +max-statements=60 # Minimum number of public methods for a class (see R0903). min-public-methods=2 @@ -452,6 +452,7 @@ disable=raw-checker-failed, R0917, # Too many positional arguments (6/5) (too-many-positional-arguments) C0103, E0401, + W0718, # Catching too general exception Exception (broad-except) # Enable the message, report, category or checker with the given id(s). 
You can # either give multiple identifier separated by comma (,) or put this option diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..e7e6ecd1 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +chenzihong_gavin@foxmail.com. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. 
+ +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. 
diff --git a/MANIFEST.in b/MANIFEST.in index b7fd18c5..d1a9003b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ +include requirements.txt include graphgen/configs/*.yaml recursive-include graphgen/configs *.yaml diff --git a/README.md b/README.md index a30b4540..99288683 100644 --- a/README.md +++ b/README.md @@ -8,28 +8,34 @@ [![forks](https://img.shields.io/github/forks/open-sciencelab/GraphGen.svg)](https://github.com/open-sciencelab/GraphGen) [![open issues](https://img.shields.io/github/issues-raw/open-sciencelab/GraphGen)](https://github.com/open-sciencelab/GraphGen/issues) [![issue resolution](https://img.shields.io/github/issues-closed-raw/open-sciencelab/GraphGen)](https://github.com/open-sciencelab/GraphGen/issues) -[![documentation](https://img.shields.io/badge/docs-latest-blue)](https://graphgen-cookbook.readthedocs.io/en/latest/) +[![documentation](https://img.shields.io/badge/docs-latest-blue)](https://chenzihong.gitbook.io/graphgen-cookbook/) +[![pypi](https://img.shields.io/pypi/v/graphg.svg?style=flat&logo=pypi&logoColor=white)](https://pypi.org/project/graphg/) [![wechat](https://img.shields.io/badge/wechat-brightgreen?logo=wechat&logoColor=white)](https://cdn.vansin.top/internlm/dou.jpg) [![arXiv](https://img.shields.io/badge/Paper-arXiv-white)](https://arxiv.org/abs/2505.20416) -[![Hugging Face](https://img.shields.io/badge/Paper-on%20HF-blue?logo=huggingface&logoColor=yellow)](https://huggingface.co/papers/2505.20416) +[![Hugging Face](https://img.shields.io/badge/Paper-on%20HF-white?logo=huggingface&logoColor=yellow)](https://huggingface.co/papers/2505.20416) +[![Hugging Face](https://img.shields.io/badge/Demo-on%20HF-blue?logo=huggingface&logoColor=yellow)](https://huggingface.co/spaces/chenzihong/GraphGen) +[![Model Scope](https://img.shields.io/badge/%F0%9F%A4%96%20Demo-on%20MS-green)](https://modelscope.cn/studios/chenzihong/GraphGen) GraphGen: Enhancing Supervised Fine-Tuning for LLMs with Knowledge-Driven Synthetic Data Generation -
+[English](README.md) | [中文](README_zh.md) + +
📚 Table of Contents - 📝 [What is GraphGen?](#-what-is-graphgen) -- 🚀 [Quick Start](#-quick-start) - 📌 [Latest Updates](#-latest-updates) +- ⚙️ [Support List](#-support-list) +- 🚀 [Quick Start](#-quick-start) - 🏗️ [System Architecture](#-system-architecture) - 🍀 [Acknowledgements](#-acknowledgements) - 📚 [Citation](#-citation) - 📜 [License](#-license) +- 📅 [Star History](#-star-history) [//]: # (- 🌟 [Key Features](#-key-features)) -[//]: # (- 📅 [Roadmap](#-roadmap)) [//]: # (- 💰 [Cost Analysis](#-cost-analysis)) [//]: # (- ⚙️ [Configurations](#-configurations)) @@ -37,30 +43,139 @@ GraphGen: Enhancing Supervised Fine-Tuning for LLMs with Knowledge-Driven Synthe ## 📝 What is GraphGen? -GraphGen is a framework for synthetic data generation guided by knowledge graphs. Here is our [**paper**](https://github.com/open-sciencelab/GraphGen/tree/main/resources/GraphGen.pdf) and [best practice](https://github.com/open-sciencelab/GraphGen/issues/17). +GraphGen is a framework for synthetic data generation guided by knowledge graphs. Please check the [**paper**](https://arxiv.org/abs/2505.20416) and [best practice](https://github.com/open-sciencelab/GraphGen/issues/17). + +Here is post-training result which **over 50% SFT data** comes from GraphGen and our data clean pipeline. + +| Domain | Dataset | Ours | Qwen2.5-7B-Instruct (baseline) | +|:---------:|:---------------------------------------------------------:|:--------:|:------------------------------:| +| Plant | [SeedBench](https://github.com/open-sciencelab/SeedBench) | **65.9** | 51.5 | +| Common | CMMLU | 73.6 | **75.8** | +| Knowledge | GPQA-Diamond | **40.0** | 33.3 | +| Math | AIME24 | **20.6** | 16.7 | +| | AIME25 | **22.7** | 7.2 | It begins by constructing a fine-grained knowledge graph from the source text,then identifies knowledge gaps in LLMs using the expected calibration error metric, prioritizing the generation of QA pairs that target high-value, long-tail knowledge. Furthermore, GraphGen incorporates multi-hop neighborhood sampling to capture complex relational information and employs style-controlled generation to diversify the resulting QA data. +After data generation, you can use [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) and [xtuner](https://github.com/InternLM/xtuner) to finetune your LLMs. + +## 📌 Latest Updates + +- **2025.12.16**: Added [rocksdb](https://github.com/facebook/rocksdb) for key-value storage backend and [kuzudb](https://github.com/kuzudb/kuzu) for graph database backend support. +- **2025.12.16**: Added [vllm](https://github.com/vllm-project/vllm) for local inference backend support. +- **2025.12.16**: Refactored the data generation pipeline using [ray](https://github.com/ray-project/ray) to improve the efficiency of distributed execution and resource management. + +
+History + +- **2025.12.1**: Added search support for [NCBI](https://www.ncbi.nlm.nih.gov/) and [RNAcentral](https://rnacentral.org/) databases, enabling extraction of DNA and RNA data from these bioinformatics databases. +- **2025.10.30**: We support several new LLM clients and inference backends including [Ollama_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/ollama_client.py), [http_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/http_client.py), [HuggingFace Transformers](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/hf_wrapper.py) and [SGLang](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/sglang_wrapper.py). +- **2025.10.23**: We support VQA(Visual Question Answering) data generation now. Run script: `bash scripts/generate/generate_vqa.sh`. +- **2025.10.21**: We support PDF as input format for data generation now via [MinerU](https://github.com/opendatalab/MinerU). +- **2025.09.29**: We auto-update gradio demo on [Hugging Face](https://huggingface.co/spaces/chenzihong/GraphGen) and [ModelScope](https://modelscope.cn/studios/chenzihong/GraphGen). +- **2025.08.14**: We have added support for community detection in knowledge graphs using the Leiden algorithm, enabling the synthesis of Chain-of-Thought (CoT) data. +- **2025.07.31**: We have added Google, Bing, Wikipedia, and UniProt as search back-ends. +- **2025.04.21**: We have released the initial version of GraphGen. + +
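
To make the expected calibration error (ECE) mentioned in the overview above concrete, the sketch below bins the trainee model's answer confidences and averages the gap between confidence and accuracy. It is illustrative only: the bin count and the idea of deriving confidences from trainee logprobs are assumptions for the example, not GraphGen's actual comprehension-loss implementation.

```python
# Minimal ECE sketch (illustrative; not GraphGen's internal code).
import numpy as np


def expected_calibration_error(confidences, correctness, n_bins=10):
    """Bin predictions by confidence and average |accuracy - confidence| per bin."""
    confidences = np.asarray(confidences, dtype=float)
    correctness = np.asarray(correctness, dtype=float)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        in_bin = (confidences > lo) & (confidences <= hi)
        if in_bin.any():
            gap = abs(correctness[in_bin].mean() - confidences[in_bin].mean())
            ece += in_bin.mean() * gap  # weight each bin by its share of samples
    return ece


# Toy example: four trainee answers with confidences and 0/1 correctness labels.
print(expected_calibration_error([0.9, 0.8, 0.6, 0.3], [1, 1, 0, 0]))
```

A large gap between confidence and accuracy on statements drawn from a region of the knowledge graph is the kind of signal used to prioritize QA generation for that region.
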
+ + +## ⚙️ Support List + +We support various LLM inference servers, API servers, inference clients, input file formats, data modalities, output data formats, and output data types. +Users can flexibly configure according to the needs of synthetic data. + + +| Inference Server | Api Server | Inference Client | Data Source | Data Modal | Data Type | +|--------------------------------------------------------------------------|--------------------------------------------------------------------------------|------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|-------------------------------------------------| +| [![hf-icon]HF][hf]
[![sg-icon]SGLang][sg]
[![vllm-icon]vllm][vllm] | [![sif-icon]Silicon][sif]
[![oai-icon]OpenAI][oai]
[![az-icon]Azure][az] | HTTP
[![ol-icon]Ollama][ol]
[![oai-icon]OpenAI][oai] | Files(CSV, JSON, PDF, TXT, etc.)
Databases([![uniprot-icon]UniProt][uniprot], [![ncbi-icon]NCBI][ncbi], [![rnacentral-icon]RNAcentral][rnacentral])
Search Engines([![bing-icon]Bing][bing], [![google-icon]Google][google])
Knowledge Graphs([![wiki-icon]Wikipedia][wiki]) | TEXT
IMAGE | Aggregated
Atomic
CoT
Multi-hop
VQA | + + +[hf]: https://huggingface.co/docs/transformers/index +[sg]: https://docs.sglang.ai +[vllm]: https://github.com/vllm-project/vllm +[sif]: https://siliconflow.cn +[oai]: https://openai.com +[az]: https://azure.microsoft.com/en-us/services/cognitive-services/openai-service/ +[ol]: https://ollama.com +[uniprot]: https://www.uniprot.org/ +[ncbi]: https://www.ncbi.nlm.nih.gov/ +[rnacentral]: https://rnacentral.org/ +[wiki]: https://www.wikipedia.org/ +[bing]: https://www.bing.com/ +[google]: https://www.google.com + + + +[hf-icon]: https://www.google.com/s2/favicons?domain=https://huggingface.co +[sg-icon]: https://www.google.com/s2/favicons?domain=https://docs.sglang.ai +[vllm-icon]: https://www.google.com/s2/favicons?domain=https://docs.vllm.ai +[sif-icon]: https://www.google.com/s2/favicons?domain=siliconflow.com +[oai-icon]: https://www.google.com/s2/favicons?domain=https://openai.com +[az-icon]: https://www.google.com/s2/favicons?domain=https://azure.microsoft.com +[ol-icon]: https://www.google.com/s2/favicons?domain=https://ollama.com + +[uniprot-icon]: https://www.google.com/s2/favicons?domain=https://www.uniprot.org +[ncbi-icon]: https://www.google.com/s2/favicons?domain=https://www.ncbi.nlm.nih.gov/ +[rnacentral-icon]: https://www.google.com/s2/favicons?domain=https://rnacentral.org/ +[wiki-icon]: https://www.google.com/s2/favicons?domain=https://www.wikipedia.org/ +[bing-icon]: https://www.google.com/s2/favicons?domain=https://www.bing.com/ +[google-icon]: https://www.google.com/s2/favicons?domain=https://www.google.com + + ## 🚀 Quick Start -Experience GraphGen through [Web](https://g-app-center-000704-6802-aerppvq.openxlab.space) or [Backup Web Entrance](https://openxlab.org.cn/apps/detail/tpoisonooo/GraphGen) +Experience GraphGen Demo through [Huggingface](https://huggingface.co/spaces/chenzihong/GraphGen) or [Modelscope](https://modelscope.cn/studios/chenzihong/GraphGen). For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGen/issues/10), open new [issue](https://github.com/open-sciencelab/GraphGen/issues) or join our [wechat group](https://cdn.vansin.top/internlm/dou.jpg) and ask. -### Gradio Demo +### Preparation + +1. Install [uv](https://docs.astral.sh/uv/reference/installer/) + + ```bash + # You could try pipx or pip to install uv when meet network issues, refer the uv doc for more details + curl -LsSf https://astral.sh/uv/install.sh | sh + ``` +2. Clone the repository + + ```bash + git clone --depth=1 https://github.com/open-sciencelab/GraphGen + cd GraphGen + ``` + +3. Create a new uv environment + + ```bash + uv venv --python 3.10 + ``` + +4. Configure the dependencies + ```bash + uv pip install -r requirements.txt + ``` + +### Run Gradio Demo + + ```bash + python -m webui.app + ``` + + For hot-reload during development, run ```bash - python webui/app.py + PYTHONPATH=. gradio webui/app.py ``` + ![ui](https://github.com/user-attachments/assets/3024e9bc-5d45-45f8-a4e6-b57bd2350d84) ### Run from PyPI 1. Install GraphGen ```bash - pip install graphg + uv pip install graphg ``` 2. Run in CLI @@ -76,44 +191,115 @@ For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGe ### Run from Source -1. Install dependencies - ```bash - pip install -r requirements.txt - ``` -2. Configure the environment +1. 
Configure the environment - Create an `.env` file in the root directory ```bash cp .env.example .env ``` - Set the following environment variables: ```bash - # Synthesizer is the model used to construct KG and generate data - SYNTHESIZER_MODEL=your_synthesizer_model_name - SYNTHESIZER_BASE_URL=your_base_url_for_synthesizer_model - SYNTHESIZER_API_KEY=your_api_key_for_synthesizer_model - # Trainee is the model used to train with the generated data - TRAINEE_MODEL=your_trainee_model_name - TRAINEE_BASE_URL=your_base_url_for_trainee_model - TRAINEE_API_KEY=your_api_key_for_trainee_model + # Tokenizer + TOKENIZER_MODEL= + + # LLM + # Support different backends: http_api, openai_api, ollama_api, ollama, huggingface, tgi, sglang, tensorrt + # Synthesizer is the model used to construct KG and generate data + # Trainee is the model used to train with the generated data + + # http_api / openai_api + SYNTHESIZER_BACKEND=openai_api + SYNTHESIZER_MODEL=gpt-4o-mini + SYNTHESIZER_BASE_URL= + SYNTHESIZER_API_KEY= + TRAINEE_BACKEND=openai_api + TRAINEE_MODEL=gpt-4o-mini + TRAINEE_BASE_URL= + TRAINEE_API_KEY= + + # azure_openai_api + # SYNTHESIZER_BACKEND=azure_openai_api + # The following is the same as your "Deployment name" in Azure + # SYNTHESIZER_MODEL= + # SYNTHESIZER_BASE_URL=https://.openai.azure.com/openai/deployments//chat/completions + # SYNTHESIZER_API_KEY= + # SYNTHESIZER_API_VERSION= + + # # ollama_api + # SYNTHESIZER_BACKEND=ollama_api + # SYNTHESIZER_MODEL=gemma3 + # SYNTHESIZER_BASE_URL=http://localhost:11434 + # + # Note: TRAINEE with ollama_api backend is not supported yet as ollama_api does not support logprobs. + + # # huggingface + # SYNTHESIZER_BACKEND=huggingface + # SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # + # TRAINEE_BACKEND=huggingface + # TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct + + # # sglang + # SYNTHESIZER_BACKEND=sglang + # SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # SYNTHESIZER_TP_SIZE=1 + # SYNTHESIZER_NUM_GPUS=1 + + # TRAINEE_BACKEND=sglang + # TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # SYNTHESIZER_TP_SIZE=1 + # SYNTHESIZER_NUM_GPUS=1 + + # # vllm + # SYNTHESIZER_BACKEND=vllm + # SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # SYNTHESIZER_NUM_GPUS=1 + + # TRAINEE_BACKEND=vllm + # TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # TRAINEE_NUM_GPUS=1 ``` -3. (Optional) If you want to modify the default generated configuration, you can edit the content of the configs/graphgen_config.yaml file. +2. (Optional) Customize generation parameters in `config.yaml` . + + Edit the corresponding YAML file, e.g.: + ```yaml - # configs/graphgen_config.yaml - # Example configuration - data_type: "raw" - input_file: "resources/examples/raw_demo.jsonl" - # more configurations... + # examples/generate/generate_aggregated_qa/aggregated_config.yaml + global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + + nodes: + - id: read_files # id is unique in the pipeline, and can be referenced by other steps + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples + + # additional settings... ``` -4. Run the generation script - ```bash - bash scripts/generate.sh - ``` -5. Get the generated data + +3. 
Generate data + + Pick the desired format and run the matching script: + + | Format | Script to run | Notes | + | ------------ | ---------------------------------------------------------------------- | -------------------------------------------------------------------------- | + | `cot` | `bash examples/generate/generate_cot_qa/generate_cot.sh` | Chain-of-Thought Q\&A pairs | + | `atomic` | `bash examples/generate/generate_atomic_qa/generate_atomic.sh` | Atomic Q\&A pairs covering basic knowledge | + | `aggregated` | `bash examples/generate/generate_aggregated_qa/generate_aggregated.sh` | Aggregated Q\&A pairs incorporating complex, integrated knowledge | + | `multi-hop` | `examples/generate/generate_multi_hop_qa/generate_multi_hop.sh` | Multi-hop reasoning Q\&A pairs | + | `vqa` | `bash examples/generate/generate_vqa/generate_vqa.sh` | Visual Question Answering pairs combining visual and textual understanding | + + +4. Get the generated data ```bash - ls cache/data/graphgen + ls cache/output ``` -## Run with Docker +### Run with Docker 1. Build the Docker image ```bash docker build -t graphgen . @@ -124,10 +310,6 @@ For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGe ``` -## 📌 Latest Updates - -- **2025.04.21**: We have released the initial version of GraphGen. - ## 🏗️ System Architecture See [analysis](https://deepwiki.com/open-sciencelab/GraphGen) by deepwiki for a technical overview of the GraphGen system, its architecture, and core functionalities. @@ -138,9 +320,10 @@ See [analysis](https://deepwiki.com/open-sciencelab/GraphGen) by deepwiki for a ## 🍀 Acknowledgements -- [SiliconCloud](https://siliconflow.cn) Abundant LLM API, some models are free +- [SiliconFlow](https://siliconflow.cn) Abundant LLM API, some models are free - [LightRAG](https://github.com/HKUDS/LightRAG) Simple and efficient graph retrieval solution -- [ROGRAG](https://github.com/tpoisonooo/ROGRAG) ROGRAG: A Robustly Optimized GraphRAG Framework +- [ROGRAG](https://github.com/tpoisonooo/ROGRAG) A robustly optimized GraphRAG framework +- [DB-GPT](https://github.com/eosphoros-ai/DB-GPT) An AI native data app development framework ## 📚 Citation @@ -159,3 +342,7 @@ If you find this repository useful, please consider citing our work: ## 📜 License This project is licensed under the [Apache License 2.0](LICENSE). + +## 📅 Star History + +[![Star History Chart](https://api.star-history.com/svg?repos=InternScience/GraphGen&type=Date)](https://www.star-history.com/#InternScience/GraphGen&Date) diff --git a/README_zh.md b/README_zh.md new file mode 100644 index 00000000..f15f5523 --- /dev/null +++ b/README_zh.md @@ -0,0 +1,346 @@ +
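
As a companion to the `nodes` / `dependencies` pipeline config shown in the Quick Start above, here is a small, self-contained sketch of how such a declaration implies an execution order. It is illustrative only: GraphGen's actual ray-based scheduler is not part of this diff, and the two-node config embedded below (the `chunk_documents` step in particular) is invented for the example.

```python
# Illustrative only: order pipeline nodes by their declared dependencies.
# Mimics the aggregated_config.yaml structure shown in the Quick Start.
from graphlib import TopologicalSorter  # stdlib, Python 3.9+

import yaml  # PyYAML

CONFIG = """
nodes:
  - id: read_files
    op_name: read
    type: source
    dependencies: []
  - id: chunk_documents          # made-up follow-on node for demonstration
    op_name: chunk
    type: transform
    dependencies: [read_files]
"""

nodes = yaml.safe_load(CONFIG)["nodes"]
graph = {n["id"]: set(n["dependencies"]) for n in nodes}  # node -> predecessors
order = list(TopologicalSorter(graph).static_order())
print(order)  # ['read_files', 'chunk_documents']
```

Each node's `dependencies` list names the `id`s whose outputs it consumes, so a config like this can be treated as a DAG and ordered topologically before execution.
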

+ +

+ + + +[![stars](https://img.shields.io/github/stars/open-sciencelab/GraphGen.svg)](https://github.com/open-sciencelab/GraphGen) +[![forks](https://img.shields.io/github/forks/open-sciencelab/GraphGen.svg)](https://github.com/open-sciencelab/GraphGen) +[![open issues](https://img.shields.io/github/issues-raw/open-sciencelab/GraphGen)](https://github.com/open-sciencelab/GraphGen/issues) +[![issue resolution](https://img.shields.io/github/issues-closed-raw/open-sciencelab/GraphGen)](https://github.com/open-sciencelab/GraphGen/issues) +[![documentation](https://img.shields.io/badge/docs-latest-blue)](https://graphgen-cookbook.readthedocs.io/en/latest/) +[![pypi](https://img.shields.io/pypi/v/graphg.svg?style=flat&logo=pypi&logoColor=white)](https://pypi.org/project/graphg/) +[![wechat](https://img.shields.io/badge/wechat-brightgreen?logo=wechat&logoColor=white)](https://cdn.vansin.top/internlm/dou.jpg) +[![arXiv](https://img.shields.io/badge/Paper-arXiv-white)](https://arxiv.org/abs/2505.20416) +[![Hugging Face](https://img.shields.io/badge/Paper-on%20HF-white?logo=huggingface&logoColor=yellow)](https://huggingface.co/papers/2505.20416) + +[![Hugging Face](https://img.shields.io/badge/Demo-on%20HF-blue?logo=huggingface&logoColor=yellow)](https://huggingface.co/spaces/chenzihong/GraphGen) +[![Model Scope](https://img.shields.io/badge/%F0%9F%A4%96%20Demo-on%20MS-green)](https://modelscope.cn/studios/chenzihong/GraphGen) + +GraphGen: Enhancing Supervised Fine-Tuning for LLMs with Knowledge-Driven Synthetic Data Generation + +[English](README.md) | [中文](README_zh.md) + +
+📚 目录 + +- 📝 [什么是 GraphGen?](#-什么是-graphgen) +- 📌 [最新更新](#-最新更新) +- ⚙️ [支持列表](#-支持列表) +- 🚀 [快速开始](#-快速开始) +- 🏗️ [系统架构](#-系统架构) +- 🍀 [致谢](#-致谢) +- 📚 [引用](#-引用) +- 📜 [许可证](#-许可证) +- 📅 [星标历史](#-星标历史) + + +[//]: # (- 🌟 [主要特性](#主要特性)) +[//]: # (- 💰 [成本分析](#成本分析)) +[//]: # (- ⚙️ [配置说明](#配置说明)) + +
+ + +## 📝 什么是 GraphGen? + +GraphGen 是一个基于知识图谱的数据合成框架。请查看[**论文**](https://arxiv.org/abs/2505.20416)和[最佳实践](https://github.com/open-sciencelab/GraphGen/issues/17)。 + +以下是在超过 50 % 的 SFT 数据来自 GraphGen 及我们的数据清洗流程时的训练后结果: + +| 领域 | 数据集 | 我们的方案 | Qwen2.5-7B-Instruct(基线) | +|:--:|:---------------------------------------------------------:|:--------:|:-----------------------:| +| 植物 | [SeedBench](https://github.com/open-sciencelab/SeedBench) | **65.9** | 51.5 | +| 常识 | CMMLU | 73.6 | **75.8** | +| 知识 | GPQA-Diamond | **40.0** | 33.3 | +| 数学 | AIME24 | **20.6** | 16.7 | +| | AIME25 | **22.7** | 7.2 | + +GraphGen 首先根据源文本构建细粒度的知识图谱,然后利用期望校准误差指标识别大语言模型中的知识缺口,优先生成针对高价值长尾知识的问答对。 +此外,GraphGen 采用多跳邻域采样捕获复杂关系信息,并使用风格控制生成来丰富问答数据的多样性。 + +在数据生成后,您可以使用[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) 和 [xtuner](https://github.com/InternLM/xtuner)对大语言模型进行微调。 + +## 📌 最新更新 +- **2025.12.16**:新增 [rocksdb](https://github.com/facebook/rocksdb) 作为键值存储后端, [kuzudb](https://github.com/kuzudb/kuzu) 作为图数据库后端的支持。 +- **2025.12.16**:新增 [vllm](https://github.com/vllm-project/vllm) 作为本地推理后端的支持。 +- **2025.12.16**:使用 [ray](https://github.com/ray-project/ray) 重构了数据生成 pipeline,提升了分布式执行和资源管理的效率。 + +
+历史更新 + +- **2025.12.1**:新增对 [NCBI](https://www.ncbi.nlm.nih.gov/) 和 [RNAcentral](https://rnacentral.org/) 数据库的检索支持,现在可以从这些生物信息学数据库中提取DNA和RNA数据。 +- **2025.10.30**:我们支持多种新的 LLM 客户端和推理后端,包括 [Ollama_client]([Ollama_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/ollama_client.py), [http_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/http_client.py), [HuggingFace Transformers](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/hf_wrapper.py) 和 [SGLang](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/sglang_wrapper.py)。 +- **2025.10.23**:我们现在支持视觉问答(VQA)数据生成。运行脚本:`bash scripts/generate/generate_vqa.sh`。 +- **2025.10.21**:我们现在通过 [MinerU](https://github.com/opendatalab/MinerU) 支持 PDF 作为数据生成的输入格式。 +- **2025.09.29**:我们在 [Hugging Face](https://huggingface.co/spaces/chenzihong/GraphGen) 和 [ModelScope](https://modelscope.cn/studios/chenzihong/GraphGen) 上自动更新 Gradio 应用。 +- **2025.08.14**:支持利用 Leiden 社区发现算法对知识图谱进行社区划分,合成 CoT 数据。 +- **2025.07.31**:新增 Google、Bing、Wikipedia 和 UniProt 作为搜索后端,帮助填补数据缺口。 +- **2025.04.21**:发布 GraphGen 初始版本。 + +
+ +## ⚙️ 支持列表 + +我们支持多种 LLM 推理服务器、API 服务器、推理客户端、输入文件格式、数据模态、输出数据格式和输出数据类型。 +可以根据合成数据的需求进行灵活配置。 + +| 推理服务器 | API 服务器 | 推理客户端 | 输入文件格式 | 数据模态 | 输出数据类型 | +|--------------------------------------------------------------------------|--------------------------------------------------------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|-------------------------------------------------| +| [![hf-icon]HF][hf]
[![sg-icon]SGLang][sg]
[![vllm-icon]vllm][vllm] | [![sif-icon]Silicon][sif]
[![oai-icon]OpenAI][oai]
[![az-icon]Azure][az] | HTTP
[![ol-icon]Ollama][ol]
[![oai-icon]OpenAI][oai] | 文件(CSV, JSON, JSONL, PDF, TXT等)
数据库([![uniprot-icon]UniProt][uniprot], [![ncbi-icon]NCBI][ncbi], [![rnacentral-icon]RNAcentral][rnacentral])
搜索引擎([![bing-icon]Bing][bing], [![google-icon]Google][google])
知识图谱([![wiki-icon]Wikipedia][wiki]) | TEXT
IMAGE | Aggregated
Atomic
CoT
Multi-hop
VQA | + + +[hf]: https://huggingface.co/docs/transformers/index +[sg]: https://docs.sglang.ai +[vllm]: https://github.com/vllm-project/vllm +[sif]: https://siliconflow.cn +[oai]: https://openai.com +[az]: https://azure.microsoft.com/en-us/services/cognitive-services/openai-service/ +[ol]: https://ollama.com +[uniprot]: https://www.uniprot.org/ +[ncbi]: https://www.ncbi.nlm.nih.gov/ +[rnacentral]: https://rnacentral.org/ +[wiki]: https://www.wikipedia.org/ +[bing]: https://www.bing.com/ +[google]: https://www.google.com + + + +[hf-icon]: https://www.google.com/s2/favicons?domain=https://huggingface.co +[sg-icon]: https://www.google.com/s2/favicons?domain=https://docs.sglang.ai +[vllm-icon]: https://www.google.com/s2/favicons?domain=https://docs.vllm.ai +[sif-icon]: https://www.google.com/s2/favicons?domain=siliconflow.com +[oai-icon]: https://www.google.com/s2/favicons?domain=https://openai.com +[az-icon]: https://www.google.com/s2/favicons?domain=https://azure.microsoft.com +[ol-icon]: https://www.google.com/s2/favicons?domain=https://ollama.com + +[uniprot-icon]: https://www.google.com/s2/favicons?domain=https://www.uniprot.org +[ncbi-icon]: https://www.google.com/s2/favicons?domain=https://www.ncbi.nlm.nih.gov/ +[rnacentral-icon]: https://www.google.com/s2/favicons?domain=https://rnacentral.org/ +[wiki-icon]: https://www.google.com/s2/favicons?domain=https://www.wikipedia.org/ +[bing-icon]: https://www.google.com/s2/favicons?domain=https://www.bing.com/ +[google-icon]: https://www.google.com/s2/favicons?domain=https://www.google.com + + +## 🚀 快速开始 + +通过 [Huggingface](https://huggingface.co/spaces/chenzihong/GraphGen) 或 [Modelscope](https://modelscope.cn/studios/chenzihong/GraphGen) 体验 GraphGen。 + +如有任何问题,请查看 [FAQ](https://github.com/open-sciencelab/GraphGen/issues/10)、提交新的 [issue](https://github.com/open-sciencelab/GraphGen/issues) 或加入我们的[微信群](https://cdn.vansin.top/internlm/dou.jpg)咨询。 + +### 准备工作 + +1. 安装 [uv](https://docs.astral.sh/uv/reference/installer/) + + ```bash + # 若遇到网络问题,可尝试使用 pipx 或 pip 安装 uv,详见 uv 文档 + curl -LsSf https://astral.sh/uv/install.sh | sh + ``` +2. 克隆仓库 + + ```bash + git clone --depth=1 https://github.com/open-sciencelab/GraphGen + cd GraphGen + ``` +3. 创建新的 uv 环境 + + ```bash + uv venv --python 3.10 + ``` +4. 安装依赖 + + ```bash + uv pip install -r requirements.txt + ``` + +### 运行 Gradio 演示 + + ```bash + python -m webui.app + ``` + + 如果在开发过程中需要热重载,请运行 + + ```bash + PYTHONPATH=. gradio webui/app.py + ``` + + +![ui](https://github.com/user-attachments/assets/3024e9bc-5d45-45f8-a4e6-b57bd2350d84) + +### 从 PyPI 运行 + +1. 安装 GraphGen + ```bash + uv pip install graphg + ``` + +2. CLI 运行 + ```bash + SYNTHESIZER_MODEL=your_synthesizer_model_name \ + SYNTHESIZER_BASE_URL=your_base_url_for_synthesizer_model \ + SYNTHESIZER_API_KEY=your_api_key_for_synthesizer_model \ + TRAINEE_MODEL=your_trainee_model_name \ + TRAINEE_BASE_URL=your_base_url_for_trainee_model \ + TRAINEE_API_KEY=your_api_key_for_trainee_model \ + graphg --output_dir cache + ``` + +### 源码运行 + +1. 
配置环境 + - 在项目根目录创建 `.env` 文件 + ```bash + cp .env.example .env + ``` + - 设置以下环境变量: + ```bash + # Tokenizer + TOKENIZER_MODEL= + + # LLM + # 支持不同的后端:http_api、openai_api、ollama_api、ollama、huggingface、tgi、sglang、tensorrt + # Synthesizer 用于构建知识图谱并生成数据 + # Trainee 用于使用生成数据进行训练 + + # http_api / openai_api + SYNTHESIZER_BACKEND=openai_api + SYNTHESIZER_MODEL=gpt-4o-mini + SYNTHESIZER_BASE_URL= + SYNTHESIZER_API_KEY= + TRAINEE_BACKEND=openai_api + TRAINEE_MODEL=gpt-4o-mini + TRAINEE_BASE_URL= + TRAINEE_API_KEY= + + # azure_openai_api + # SYNTHESIZER_BACKEND=azure_openai_api + # The following is the same as your "Deployment name" in Azure + # SYNTHESIZER_MODEL= + # SYNTHESIZER_BASE_URL=https://.openai.azure.com/openai/deployments//chat/completions + # SYNTHESIZER_API_KEY= + # SYNTHESIZER_API_VERSION= + + # # ollama_api + # SYNTHESIZER_BACKEND=ollama_api + # SYNTHESIZER_MODEL=gemma3 + # SYNTHESIZER_BASE_URL=http://localhost:11434 + # + # Note: TRAINEE with ollama_api backend is not supported yet as ollama_api does not support logprobs. + + # # huggingface + # SYNTHESIZER_BACKEND=huggingface + # SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # + # TRAINEE_BACKEND=huggingface + # TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct + + # # sglang + # SYNTHESIZER_BACKEND=sglang + # SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # SYNTHESIZER_TP_SIZE=1 + # SYNTHESIZER_NUM_GPUS=1 + + # TRAINEE_BACKEND=sglang + # TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # SYNTHESIZER_TP_SIZE=1 + # SYNTHESIZER_NUM_GPUS=1 + + # # vllm + # SYNTHESIZER_BACKEND=vllm + # SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # SYNTHESIZER_NUM_GPUS=1 + + # TRAINEE_BACKEND=vllm + # TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # TRAINEE_NUM_GPUS=1 + ``` +2. (可选)如需修改默认生成配置,可编辑 `graphgen/configs/` 文件夹中的 YAML 文件. + + 例如: + + ```yaml + # examples/generate/generate_aggregated_qa/aggregated_config.yaml + global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + + nodes: + - id: read_files # id is unique in the pipeline, and can be referenced by other steps + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples + + # 其他设置... + ``` + +3. 生成数据 + + 选择所需格式并运行对应脚本: + + | 格式 | 运行脚本 | 说明 | + | ------------ | ---------------------------------------------------------------------- | --------------- | + | `cot` | `bash examples/generate/generate_cot_qa/generate_cot.sh` | 思维链问答对 | + | `atomic` | `bash examples/generate/generate_atomic_qa/generate_atomic.sh` | 覆盖基础知识的原子问答对 | + | `aggregated` | `bash examples/generate/generate_aggregated_qa/generate_aggregated.sh` | 整合复杂知识的聚合问答对 | + | `multi-hop` | `bash examples/generate/generate_multi_hop_qa/generate_multi_hop.sh` | 多跳推理问答对 | + | `vqa` | `bash examples/generate/generate_vqa/generate_vqa.sh` | 视觉问答对,结合视觉和文本理解 | + + + +4. 查看生成结果 + ```bash + ls cache/output + ``` + +### 使用 Docker 运行 +1. 构建镜像 + ```bash + docker build -t graphgen . + ``` +2. 
启动容器 + ```bash + docker run -p 7860:7860 graphgen + ``` + + +## 🏗️ 系统架构 +参阅 deepwiki 的[分析](https://deepwiki.com/open-sciencelab/GraphGen)了解 GraphGen 系统、架构与核心功能的技术概览。 + + +### 工作流程 +![workflow](resources/images/flow.png) + + +## 🍀 致谢 +- [SiliconFlow](https://siliconflow.cn) 提供丰富的 LLM API,部分模型免费 +- [LightRAG](https://github.com/HKUDS/LightRAG) 简单高效的图检索方案 +- [ROGRAG](https://github.com/tpoisonooo/ROGRAG) 鲁棒优化版 GraphRAG 框架 +- [DB-GPT](https://github.com/eosphoros-ai/DB-GPT) AI 原生数据应用开发框架 + + +## 📚 引用 +如果本项目对你有帮助,请考虑引用我们的工作: +```bibtex +@misc{chen2025graphgenenhancingsupervisedfinetuning, + title={GraphGen: Enhancing Supervised Fine-Tuning for LLMs with Knowledge-Driven Synthetic Data Generation}, + author={Zihong Chen and Wanli Jiang and Jinzhe Li and Zhonghang Yuan and Huanjun Kong and Wanli Ouyang and Nanqing Dong}, + year={2025}, + eprint={2505.20416}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2505.20416}, +} +``` + +## 📜 许可证 +本项目采用 [Apache License 2.0](LICENSE) 许可证。 + +## 📅 星标历史 + +[![Star History Chart](https://api.star-history.com/svg?repos=InternScience/GraphGen&type=Date)](https://www.star-history.com/#InternScience/GraphGen&Date) + diff --git a/baselines/BDS/README.md b/baselines/BDS/README.md new file mode 100644 index 00000000..9ef5abaa --- /dev/null +++ b/baselines/BDS/README.md @@ -0,0 +1,2 @@ +# Bottom-up Domain-specific Superintelligence: A Reliable Knowledge Graph is What We Need +https://arxiv.org/pdf/2507.13966 \ No newline at end of file diff --git a/graphgen/models/embed/__init__.py b/baselines/BDS/__init__.py similarity index 100% rename from graphgen/models/embed/__init__.py rename to baselines/BDS/__init__.py diff --git a/baselines/BDS/bds.py b/baselines/BDS/bds.py new file mode 100644 index 00000000..ea8bf45f --- /dev/null +++ b/baselines/BDS/bds.py @@ -0,0 +1,152 @@ +import argparse +import asyncio +import json +from typing import List + +import networkx as nx +from dotenv import load_dotenv +from tqdm.asyncio import tqdm as tqdm_async + +from graphgen.bases import BaseLLMWrapper +from graphgen.common import init_llm +from graphgen.models import NetworkXStorage +from graphgen.utils import create_event_loop + +QA_GENERATION_PROMPT = """ +Create an agriculture examination question for advanced agricultural students that tests the relationship between {src} and {tgt}. The relationship is: {path}. The question should: + 1. Be in multiple choice format (4 options) + 2. Require agriculture reasoning along the relationship + 3. Include a brief farm or field scenario + 4. Not directly mention the relationship in the question stem + 5. Have one clearly correct answer +Format: + + [Farm or Field Scenario] + + + A. [Option] + B. [Option] + C. [Option] + D. 
[Option] + + : + [Correct Option Letter] + +""" + + +def _post_process(text: str) -> dict: + try: + q = text.split("")[1].split("")[0].strip() + opts = text.split("")[1].split("")[0].strip().splitlines() + opts = [o.strip() for o in opts if o.strip()] + ans = text.split(":")[1].strip()[0].upper() + return { + "question": q, + "options": opts, + "answer": ord(ans) - ord("A"), + "raw": text, + } + except Exception as e: # pylint: disable=broad-except + print(f"Error in post-processing: {e}") + return {} + + +class BDS: + def __init__(self, llm_client: BaseLLMWrapper = None, max_concurrent: int = 1000): + self.llm_client: BaseLLMWrapper = llm_client or init_llm("synthesizer") + self.max_concurrent: int = max_concurrent + + def generate(self, tasks: List[dict]) -> List[dict]: + loop = create_event_loop() + return loop.run_until_complete(self._async_generate(tasks)) + + async def _async_generate(self, tasks: List[dict]) -> List[dict]: + sem = asyncio.Semaphore(self.max_concurrent) + + async def job(item): + async with sem: + path_str = " -> ".join([f"({h},{r},{t})" for h, r, t in item["path"]]) + prompt = QA_GENERATION_PROMPT.format( + src=item["src"], tgt=item["tgt"], path=path_str + ) + resp = await self.llm_client.generate_answer(prompt) + return _post_process(resp) + + tasks = [job(it) for it in tasks] + results = [] + for coro in tqdm_async(asyncio.as_completed(tasks), total=len(tasks)): + try: + if r := await coro: + results.append(r) + except Exception as e: # pylint: disable=broad-except + print("Error:", e) + return results + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--input_file", + help="GraphML input file path.", + default="resources/input_examples/graphml_demo.graphml", + type=str, + ) + parser.add_argument( + "--output_file", + help="Output file path.", + default="cache/data/bds_qa.jsonl", + type=str, + ) + args = parser.parse_args() + + load_dotenv() + + bds = BDS() + + graph = NetworkXStorage.load_nx_graph(args.input_file) + + MAX_PATH = 20000 + all_paths = [] + + G = graph.to_directed() if not graph.is_directed() else graph + print(G) + + source_nodes = [n for n in G.nodes if G.out_degree(n) > 0][:1000] + + for src in source_nodes: + for path in nx.all_simple_paths(G, source=src, target=list(G.nodes), cutoff=3): + if len(path) == 4: + all_paths.append(path) + if len(all_paths) >= MAX_PATH: + break + if len(all_paths) >= MAX_PATH: + break + if len(all_paths) >= MAX_PATH: + break + + print(f"Found {len(all_paths)} 4-node paths") + + items = [] + for path in all_paths: + path_edges = [] + for i in range(len(path) - 1): + edge_data = G.get_edge_data(path[i], path[i + 1]) + if edge_data is None: + edge_data = G.get_edge_data(path[i + 1], path[i]) + if edge_data is None: + print(f"Warning: No edge data between {path[i]} and {path[i+1]}") + relation = "related_to" + else: + relation = edge_data.get("relation", "related_to") + path_edges.append((path[i], relation, path[i + 1])) + items.append({"src": path[0], "tgt": path[-1], "path": path_edges}) + + print(f"Prepared {len(items)} items for question generation") + + qa_pairs = bds.generate(items) + print(f"Generated {len(qa_pairs)} QA pairs") + + # Save results + with open(args.output_file, "w", encoding="utf-8") as f: + json.dump(qa_pairs, f, indent=4, ensure_ascii=False) diff --git a/baselines/EntiGraph/README.md b/baselines/EntiGraph/README.md new file mode 100644 index 00000000..ae1a53b2 --- /dev/null +++ b/baselines/EntiGraph/README.md @@ -0,0 +1,3 @@ +# EntiGraph 
+https://arxiv.org/abs/2409.07431 +https://github.com/zitongyang/synthetic_continued_pretraining \ No newline at end of file diff --git a/baselines/EntiGraph/entigraph.py b/baselines/EntiGraph/entigraph.py index 01e22dbc..d04546ef 100644 --- a/baselines/EntiGraph/entigraph.py +++ b/baselines/EntiGraph/entigraph.py @@ -1,11 +1,8 @@ -# https://arxiv.org/abs/2409.07431 -# https://github.com/zitongyang/synthetic_continued_pretraining - -import os +import argparse +import asyncio import json +import os import random -import asyncio -import argparse from hashlib import md5 from tqdm.asyncio import tqdm as tqdm_async @@ -18,9 +15,9 @@ def compute_content_hash(content, prefix: str = ""): return prefix + md5(content.encode()).hexdigest() -async def generate_entities(document_content: str, - system_message: str, - openai_model: str): +async def generate_entities( + document_content: str, system_message: str, openai_model: str +): prompt = f""" ### Document Content: {document_content} @@ -30,23 +27,25 @@ async def generate_entities(document_content: str, max_tries = 5 while not can_read_entities and max_tries > 0: try: - completion = await gptqa(prompt, - openai_model, - system_message, - json_format=False) - completion = completion[completion.find("{"): completion.rfind("}") + 1] + completion = await gptqa( + prompt, openai_model, system_message, json_format=False + ) + completion = completion[completion.find("{") : completion.rfind("}") + 1] response = json.loads(completion) - can_read_entities = response['entities'] + can_read_entities = response["entities"] return response - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Failed to generate entities: {str(e)}") max_tries -= 1 -async def generate_two_entity_relations(document_content: str, - entity1: str, - entity2: str, - system_message: str, - openai_model: str): + +async def generate_two_entity_relations( + document_content: str, + entity1: str, + entity2: str, + system_message: str, + openai_model: str, +): prompt = f""" ### Document Content: {document_content} @@ -54,17 +53,18 @@ async def generate_two_entity_relations(document_content: str, - {entity1} - {entity2} """ - completion = await gptqa(prompt, - openai_model, - system_message) + completion = await gptqa(prompt, openai_model, system_message) return completion -async def generate_three_entity_relations(document_content: str, - entity1: str, - entity2: str, - entity3: str, - system_message: str, - openai_model: str): + +async def generate_three_entity_relations( + document_content: str, + entity1: str, + entity2: str, + entity3: str, + system_message: str, + openai_model: str, +): prompt = f""" ### Document Content: {document_content} @@ -73,11 +73,10 @@ async def generate_three_entity_relations(document_content: str, - {entity2} - {entity3} """ - completion = await gptqa(prompt, - openai_model, - system_message) + completion = await gptqa(prompt, openai_model, system_message) return completion + def _post_process_synthetic_data(data): block = data.split("\n\n") qas = {} @@ -87,7 +86,7 @@ def _post_process_synthetic_data(data): answer = line.split("Answer: ")[1] qas[compute_content_hash(question)] = { "question": question, - "answer": answer + "answer": answer, } break return qas @@ -105,25 +104,26 @@ async def generate_document_entities(doc): async with semaphore: try: entities = await generate_entities( - doc.text, - task.openai_system_generate_entities, - model_name) + doc.text, 
task.openai_system_generate_entities, model_name + ) if not entities: return None return { - 'document': doc.text, - 'entities': entities['entities'], - 'summary': entities['summary'] + "document": doc.text, + "entities": entities["entities"], + "summary": entities["summary"], } - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return None entities_list = [] for result in tqdm_async( - asyncio.as_completed([generate_document_entities(doc) for doc in task.documents]), - total=len(task.documents), - desc="Generating entities" + asyncio.as_completed( + [generate_document_entities(doc) for doc in task.documents] + ), + total=len(task.documents), + desc="Generating entities", ): result = await result if result: @@ -132,38 +132,42 @@ async def generate_document_entities(doc): # iterate over triples of entities and generate relations pair_list = [] for doc in entities_list: - entities = doc['entities'] + entities = doc["entities"] temp = [] for i, entity_i in enumerate(entities): if i == len(entities) - 1: break for j in range(i + 1, len(entities)): entity_j = entities[j] - pair = (doc['document'], entity_i, entity_j) + pair = (doc["document"], entity_i, entity_j) temp.append(pair) # Compute all possible combinations of entities is impractical, so we randomly sample 10 pairs pair_list.extend(random.sample(temp, min(len(temp), 10))) - async def process_two_entity_relations(pair): async with semaphore: try: document, entity1, entity2 = pair response = await generate_two_entity_relations( - document, entity1, entity2, + document, + entity1, + entity2, task.openai_system_generate_two_entity_relations, - model_name) + model_name, + ) return response - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return None - corpus= [] + corpus = [] for result in tqdm_async( - asyncio.as_completed([process_two_entity_relations(pair) for pair in pair_list]), - total=len(pair_list), - desc="Generating two entity relations" + asyncio.as_completed( + [process_two_entity_relations(pair) for pair in pair_list] + ), + total=len(pair_list), + desc="Generating two entity relations", ): result = await result if result: @@ -194,51 +198,60 @@ async def process_two_entity_relations(pair): # ): # corpus.append(await result) - corpus = [doc['summary'] for doc in entities_list] + corpus + corpus = [doc["summary"] for doc in entities_list] + corpus qa_sft_results = {} async def generate_qa_sft(content): async with semaphore: - completion = await gptqa(content, model_name, task.openai_system_quality_qa_sft) + completion = await gptqa( + content, model_name, task.openai_system_quality_qa_sft + ) return completion - for result in tqdm_async( - asyncio.as_completed([generate_qa_sft(content) for content in corpus]), - total=len(corpus), - desc="Generating QA SFT" + asyncio.as_completed([generate_qa_sft(content) for content in corpus]), + total=len(corpus), + desc="Generating QA SFT", ): try: result = await result if result: qa_sft_results.update(_post_process_synthetic_data(result)) - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return qa_sft_results -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--input_file', - help='Raw context jsonl path.', - default='resources/examples/chunked_demo.json', - type=str) - 
parser.add_argument('--data_type', - help='Data type of input file. (Raw context or chunked context)', - choices=['raw', 'chunked'], - default='raw', - type=str) - parser.add_argument('--output_file', - help='Output file path.', - default='cache/data/entigraph.json', - type=str) + parser.add_argument( + "--input_file", + help="Raw context jsonl path.", + default="resources/input_examples/json_demo.json", + type=str, + ) + parser.add_argument( + "--data_type", + help="Data type of input file. (Raw context or chunked context)", + choices=["raw", "chunked"], + default="raw", + type=str, + ) + parser.add_argument( + "--output_file", + help="Output file path.", + default="cache/data/entigraph.json", + type=str, + ) args = parser.parse_args() - results = asyncio.run(generate_synthetic_data_for_document(args.input_file, args.data_type)) + results = asyncio.run( + generate_synthetic_data_for_document(args.input_file, args.data_type) + ) # Save results - with open(args.output_file, "w", encoding='utf-8') as f: + with open(args.output_file, "w", encoding="utf-8") as f: json.dump(results, f, indent=4, ensure_ascii=False) diff --git a/baselines/Genie/README.md b/baselines/Genie/README.md new file mode 100644 index 00000000..b28f6428 --- /dev/null +++ b/baselines/Genie/README.md @@ -0,0 +1,2 @@ +# Genie +https://arxiv.org/pdf/2401.14367 \ No newline at end of file diff --git a/baselines/Genie/genie.py b/baselines/Genie/genie.py index 4e740251..972b20fe 100644 --- a/baselines/Genie/genie.py +++ b/baselines/Genie/genie.py @@ -1,18 +1,17 @@ -# https://arxiv.org/pdf/2401.14367 - -import os -import json import argparse import asyncio -from typing import List +import json +import os from dataclasses import dataclass -from tqdm.asyncio import tqdm as tqdm_async +from typing import List + from dotenv import load_dotenv +from tqdm.asyncio import tqdm as tqdm_async -from graphgen.models import OpenAIModel -from graphgen.utils import create_event_loop, compute_content_hash +from graphgen.models import OpenAIClient +from graphgen.utils import compute_content_hash, create_event_loop -PROMPT_TEMPLATE = '''Instruction: Given the next [document], create a [question] and [answer] pair that are grounded \ +PROMPT_TEMPLATE = """Instruction: Given the next [document], create a [question] and [answer] pair that are grounded \ in the main point of the document, don't add any additional information that is not in the document. The [question] is \ by an information-seeking user and the [answer] is provided by a helping AI Agent. 
@@ -45,20 +44,20 @@ [document]: {doc} -### Response:''' +### Response:""" def _post_process(content: str) -> tuple: if "[question]:" in content and "[answer]:" in content: - question = content.split('[question]: ')[1].split('[answer]: ')[0] - answer = content.split('[answer]: ')[1] + question = content.split("[question]: ")[1].split("[answer]: ")[0] + answer = content.split("[answer]: ")[1] return question, answer return None, None @dataclass class Genie: - llm_client: OpenAIModel = None + llm_client: OpenAIClient = None max_concurrent: int = 1000 def generate(self, docs: List[List[dict]]) -> List[dict]: @@ -77,58 +76,67 @@ async def process_chunk(content: str): tasks = [] for doc in docs: for chunk in doc: - tasks.append(process_chunk(chunk['content'])) + tasks.append(process_chunk(chunk["content"])) - for result in tqdm_async(asyncio.as_completed(tasks), total=len(tasks), desc="Generating using Genie"): + for result in tqdm_async( + asyncio.as_completed(tasks), total=len(tasks), desc="Generating using Genie" + ): try: question, answer = _post_process(await result) if question and answer: final_results[compute_content_hash(question)] = { - 'question': question, - 'answer': answer + "question": question, + "answer": answer, } - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return final_results + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--input_file', - help='Raw context jsonl path.', - default='resources/examples/chunked_demo.json', - type=str) - parser.add_argument('--data_type', - help='Data type of input file. (Raw context or chunked context)', - choices=['raw', 'chunked'], - default='raw', - type=str) - parser.add_argument('--output_file', - help='Output file path.', - default='cache/data/genie.json', - type=str) + parser.add_argument( + "--input_file", + help="Raw context jsonl path.", + default="resources/input_examples/json_demo.json", + type=str, + ) + parser.add_argument( + "--data_type", + help="Data type of input file. 
(Raw context or chunked context)", + choices=["raw", "chunked"], + default="raw", + type=str, + ) + parser.add_argument( + "--output_file", + help="Output file path.", + default="cache/data/genie.json", + type=str, + ) args = parser.parse_args() load_dotenv() - llm_client = OpenAIModel( - model_name=os.getenv("SYNTHESIZER_MODEL"), + llm_client = OpenAIClient( + model=os.getenv("SYNTHESIZER_MODEL"), api_key=os.getenv("SYNTHESIZER_API_KEY"), - base_url=os.getenv("SYNTHESIZER_BASE_URL") + base_url=os.getenv("SYNTHESIZER_BASE_URL"), ) genie = Genie(llm_client=llm_client) - if args.data_type == 'raw': - with open(args.input_file, "r", encoding='utf-8') as f: + if args.data_type == "raw": + with open(args.input_file, "r", encoding="utf-8") as f: data = [json.loads(line) for line in f] data = [[chunk] for chunk in data] - elif args.data_type == 'chunked': - with open(args.input_file, "r", encoding='utf-8') as f: + elif args.data_type == "chunked": + with open(args.input_file, "r", encoding="utf-8") as f: data = json.load(f) results = genie.generate(data) # Save results - with open(args.output_file, "w", encoding='utf-8') as f: + with open(args.output_file, "w", encoding="utf-8") as f: json.dump(results, f, indent=4, ensure_ascii=False) diff --git a/baselines/LongForm/README.md b/baselines/LongForm/README.md new file mode 100644 index 00000000..3bc7bbf0 --- /dev/null +++ b/baselines/LongForm/README.md @@ -0,0 +1,3 @@ +# LongForm +https://arxiv.org/pdf/2304.08460 +https://github.com/akoksal/LongForm/tree/main \ No newline at end of file diff --git a/baselines/LongForm/longform.py b/baselines/LongForm/longform.py index c37f1e8f..abf4fd3e 100644 --- a/baselines/LongForm/longform.py +++ b/baselines/LongForm/longform.py @@ -1,28 +1,26 @@ -# https://arxiv.org/pdf/2304.08460 -# https://github.com/akoksal/LongForm/tree/main - -import os -import json -from dataclasses import dataclass import argparse import asyncio +import json +import os +from dataclasses import dataclass from typing import List -from tqdm.asyncio import tqdm as tqdm_async -from dotenv import load_dotenv -from graphgen.models import OpenAIModel -from graphgen.utils import create_event_loop, compute_content_hash +from dotenv import load_dotenv +from tqdm.asyncio import tqdm as tqdm_async +from graphgen.models import OpenAIClient +from graphgen.utils import compute_content_hash, create_event_loop -PROMPT_TEMPLATE = '''Instruction: X +PROMPT_TEMPLATE = """Instruction: X Output:{doc} What kind of instruction could this be the answer to? 
-X:''' +X:""" + @dataclass class LongForm: - llm_client: OpenAIModel = None + llm_client: OpenAIClient = None max_concurrent: int = 1000 def generate(self, docs: List[List[dict]]) -> List[dict]: @@ -38,62 +36,73 @@ async def process_chunk(content: str): question = await self.llm_client.generate_answer(content) return { compute_content_hash(question): { - 'question': question, - 'answer': content + "question": question, + "answer": content, } } tasks = [] for doc in docs: for chunk in doc: - tasks.append(process_chunk(chunk['content'])) + tasks.append(process_chunk(chunk["content"])) - for result in tqdm_async(asyncio.as_completed(tasks), total=len(tasks), desc="Generating using LongForm"): + for result in tqdm_async( + asyncio.as_completed(tasks), + total=len(tasks), + desc="Generating using LongForm", + ): try: qa = await result final_results.update(qa) - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return final_results + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--input_file', - help='Raw context jsonl path.', - default='resources/examples/chunked_demo.json', - type=str) - parser.add_argument('--data_type', - help='Data type of input file. (Raw context or chunked context)', - choices=['raw', 'chunked'], - default='raw', - type=str) - parser.add_argument('--output_file', - help='Output file path.', - default='cache/data/longform.json', - type=str) + parser.add_argument( + "--input_file", + help="Raw context jsonl path.", + default="resources/input_examples/json_demo.json", + type=str, + ) + parser.add_argument( + "--data_type", + help="Data type of input file. (Raw context or chunked context)", + choices=["raw", "chunked"], + default="raw", + type=str, + ) + parser.add_argument( + "--output_file", + help="Output file path.", + default="cache/data/longform.json", + type=str, + ) args = parser.parse_args() load_dotenv() - llm_client = OpenAIModel( - model_name=os.getenv("SYNTHESIZER_MODEL"), + llm_client = OpenAIClient( + model=os.getenv("SYNTHESIZER_MODEL"), api_key=os.getenv("SYNTHESIZER_API_KEY"), - base_url=os.getenv("SYNTHESIZER_BASE_URL") + base_url=os.getenv("SYNTHESIZER_BASE_URL"), ) longform = LongForm(llm_client=llm_client) - if args.data_type == 'raw': - with open(args.input_file, "r", encoding='utf-8') as f: + if args.data_type == "raw": + with open(args.input_file, "r", encoding="utf-8") as f: data = [json.loads(line) for line in f] data = [[chunk] for chunk in data] - elif args.data_type == 'chunked': - with open(args.input_file, "r", encoding='utf-8') as f: + elif args.data_type == "chunked": + with open(args.input_file, "r", encoding="utf-8") as f: data = json.load(f) results = longform.generate(data) # Save results - with open(args.output_file, "w", encoding='utf-8') as f: + with open(args.output_file, "w", encoding="utf-8") as f: json.dump(results, f, indent=4, ensure_ascii=False) diff --git a/baselines/SELF-QA/README.md b/baselines/SELF-QA/README.md new file mode 100644 index 00000000..d8966888 --- /dev/null +++ b/baselines/SELF-QA/README.md @@ -0,0 +1,2 @@ +# SELF-QA +https://arxiv.org/abs/2305.11952 \ No newline at end of file diff --git a/baselines/SELF-QA/self-qa.py b/baselines/SELF-QA/self-qa.py index ba16b15c..b222d970 100644 --- a/baselines/SELF-QA/self-qa.py +++ b/baselines/SELF-QA/self-qa.py @@ -1,18 +1,17 @@ -# https://arxiv.org/abs/2305.11952 - -import os +import argparse +import asyncio import json +import os from dataclasses import 
dataclass from typing import List -import argparse -import asyncio -from tqdm.asyncio import tqdm as tqdm_async + from dotenv import load_dotenv +from tqdm.asyncio import tqdm as tqdm_async -from graphgen.models import OpenAIModel -from graphgen.utils import create_event_loop, compute_content_hash +from graphgen.models import OpenAIClient +from graphgen.utils import compute_content_hash, create_event_loop -INSTRUCTION_GENERATION_PROMPT = '''The background knowledge is: +INSTRUCTION_GENERATION_PROMPT = """The background knowledge is: {doc} Please generate ten instruction questions as diverse as possible based on the content of the above article. @@ -22,9 +21,9 @@ Please generate questions in the following format: 1. Question: ... 2. Question: ... -''' +""" -READING_COMPREHENSION_PROMPT = '''The background knowledge is: +READING_COMPREHENSION_PROMPT = """The background knowledge is: {doc} Please answer the following question based on the content of the article above: {question} @@ -34,27 +33,30 @@ Please generate the corresponding answer in the following format: Question: ... Answer: ... -''' +""" + def _post_process_instructions(content: str) -> list: - lines = content.split('\n') + lines = content.split("\n") questions = [] for line in lines: if "Question:" in line: - question = line.split('Question:')[1].strip() + question = line.split("Question:")[1].strip() questions.append(question) return questions + def _post_process_answers(content: str) -> tuple: if "Question:" in content and "Answer:" in content: - question = content.split('Question:')[1].split('Answer:')[0].strip() - answer = content.split('Answer:')[1].strip() + question = content.split("Question:")[1].split("Answer:")[0].strip() + answer = content.split("Answer:")[1].strip() return question, answer return None, None + @dataclass class SelfQA: - llm_client: OpenAIModel = None + llm_client: OpenAIClient = None max_concurrent: int = 100 def generate(self, docs: List[List[dict]]) -> List[dict]: @@ -73,81 +75,102 @@ async def process_chunk(content: str): instruction_questions = _post_process_instructions(response) qas = [] - for qa in tqdm_async(asyncio.as_completed([ - self.llm_client.generate_answer(READING_COMPREHENSION_PROMPT.format( - doc=content, - question=question - )) for question in instruction_questions]), - total=len(instruction_questions), desc="Generating QAs"): + for qa in tqdm_async( + asyncio.as_completed( + [ + self.llm_client.generate_answer( + READING_COMPREHENSION_PROMPT.format( + doc=content, question=question + ) + ) + for question in instruction_questions + ] + ), + total=len(instruction_questions), + desc="Generating QAs", + ): try: question, answer = _post_process_answers(await qa) if question and answer: - qas.append({ - compute_content_hash(question): { - 'question': question, - 'answer': answer + qas.append( + { + compute_content_hash(question): { + "question": question, + "answer": answer, + } } - }) - except Exception as e: # pylint: disable=broad-except + ) + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") continue return qas - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return [] tasks = [] for doc in docs: for chunk in doc: - tasks.append(process_chunk(chunk['content'])) + tasks.append(process_chunk(chunk["content"])) - for result in tqdm_async(asyncio.as_completed(tasks), total=len(tasks), desc="Generating using SelfQA"): + for result in tqdm_async( + asyncio.as_completed(tasks), + 
total=len(tasks), + desc="Generating using SelfQA", + ): try: qas = await result for qa in qas: final_results.update(qa) - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return final_results + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--input_file', - help='Raw context jsonl path.', - default='resources/examples/chunked_demo.json', - type=str) - parser.add_argument('--data_type', - help='Data type of input file. (Raw context or chunked context)', - choices=['raw', 'chunked'], - default='raw', - type=str) - parser.add_argument('--output_file', - help='Output file path.', - default='cache/data/self-qa.json', - type=str) + parser.add_argument( + "--input_file", + help="Raw context jsonl path.", + default="resources/input_examples/json_demo.json", + type=str, + ) + parser.add_argument( + "--data_type", + help="Data type of input file. (Raw context or chunked context)", + choices=["raw", "chunked"], + default="raw", + type=str, + ) + parser.add_argument( + "--output_file", + help="Output file path.", + default="cache/data/self-qa.json", + type=str, + ) args = parser.parse_args() load_dotenv() - llm_client = OpenAIModel( - model_name=os.getenv("SYNTHESIZER_MODEL"), + llm_client = OpenAIClient( + model=os.getenv("SYNTHESIZER_MODEL"), api_key=os.getenv("SYNTHESIZER_API_KEY"), - base_url=os.getenv("SYNTHESIZER_BASE_URL") + base_url=os.getenv("SYNTHESIZER_BASE_URL"), ) self_qa = SelfQA(llm_client=llm_client) - if args.data_type == 'raw': - with open(args.input_file, "r", encoding='utf-8') as f: + if args.data_type == "raw": + with open(args.input_file, "r", encoding="utf-8") as f: data = [json.loads(line) for line in f] data = [[chunk] for chunk in data] - elif args.data_type == 'chunked': - with open(args.input_file, "r", encoding='utf-8') as f: + elif args.data_type == "chunked": + with open(args.input_file, "r", encoding="utf-8") as f: data = json.load(f) results = self_qa.generate(data) # Save results - with open(args.output_file, "w", encoding='utf-8') as f: + with open(args.output_file, "w", encoding="utf-8") as f: json.dump(results, f, indent=4, ensure_ascii=False) diff --git a/baselines/Wrap/README.md b/baselines/Wrap/README.md new file mode 100644 index 00000000..f2002627 --- /dev/null +++ b/baselines/Wrap/README.md @@ -0,0 +1,2 @@ +# Wrap +https://arxiv.org/abs/2401.16380 \ No newline at end of file diff --git a/baselines/Wrap/wrap.py b/baselines/Wrap/wrap.py index 1bfdcf4a..90ce2a46 100644 --- a/baselines/Wrap/wrap.py +++ b/baselines/Wrap/wrap.py @@ -1,19 +1,17 @@ -# https://arxiv.org/abs/2401.16380 - -import os -import json import argparse import asyncio +import json +import os from dataclasses import dataclass from typing import List + from dotenv import load_dotenv from tqdm.asyncio import tqdm as tqdm_async -from graphgen.models import OpenAIModel -from graphgen.utils import create_event_loop, compute_content_hash +from graphgen.models import OpenAIClient +from graphgen.utils import compute_content_hash, create_event_loop - -PROMPT_TEMPLATE = '''A chat between a curious user and an artificial intelligence assistant. +PROMPT_TEMPLATE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the questions. USER: Convert the following paragraph into a conversational format with multiple tags of "Question:" followed by "Answer:":{doc}. 
@@ -26,18 +24,19 @@ --- Question: What was the revenue drop in the first quarter compared to the same period last year? Answer: The revenue dropped 15 percent. --- -''' +""" + def _post_process(content: str) -> list: - raw_qas = content.split('---') + raw_qas = content.split("---") qas = [] for item in raw_qas: try: if "Question:" in item and "Answer:" in item: - question = item.split('Question:')[1].split('Answer:')[0].strip() - answer = item.split('Answer:')[1].strip() + question = item.split("Question:")[1].split("Answer:")[0].strip() + answer = item.split("Answer:")[1].strip() qas.append((question, answer)) - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") continue return qas @@ -45,7 +44,7 @@ def _post_process(content: str) -> list: @dataclass class Wrap: - llm_client: OpenAIModel = None + llm_client: OpenAIClient = None max_concurrent: int = 1000 def generate(self, docs: List[List[dict]]) -> List[dict]: @@ -64,59 +63,67 @@ async def process_chunk(content: str): tasks = [] for doc in docs: for chunk in doc: - tasks.append(process_chunk(chunk['content'])) + tasks.append(process_chunk(chunk["content"])) - for result in tqdm_async(asyncio.as_completed(tasks), total=len(tasks), desc="Generating using Wrap"): + for result in tqdm_async( + asyncio.as_completed(tasks), total=len(tasks), desc="Generating using Wrap" + ): try: qas = _post_process(await result) for qa in qas: final_results[compute_content_hash(qa[0])] = { - 'question': qa[0], - 'answer': qa[1] + "question": qa[0], + "answer": qa[1], } - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return final_results if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--input_file', - help='Raw context jsonl path.', - default='resources/examples/chunked_demo.json', - type=str) - parser.add_argument('--data_type', - help='Data type of input file. (Raw context or chunked context)', - choices=['raw', 'chunked'], - default='raw', - type=str) - parser.add_argument('--output_file', - help='Output file path.', - default='cache/data/wrap.json', - type=str) + parser.add_argument( + "--input_file", + help="Raw context jsonl path.", + default="resources/input_examples/json_demo.json", + type=str, + ) + parser.add_argument( + "--data_type", + help="Data type of input file. 
(Raw context or chunked context)", + choices=["raw", "chunked"], + default="raw", + type=str, + ) + parser.add_argument( + "--output_file", + help="Output file path.", + default="cache/data/wrap.json", + type=str, + ) args = parser.parse_args() load_dotenv() - llm_client = OpenAIModel( - model_name=os.getenv("SYNTHESIZER_MODEL"), + llm_client = OpenAIClient( + model=os.getenv("SYNTHESIZER_MODEL"), api_key=os.getenv("SYNTHESIZER_API_KEY"), - base_url=os.getenv("SYNTHESIZER_BASE_URL") + base_url=os.getenv("SYNTHESIZER_BASE_URL"), ) wrap = Wrap(llm_client=llm_client) - if args.data_type == 'raw': - with open(args.input_file, "r", encoding='utf-8') as f: + if args.data_type == "raw": + with open(args.input_file, "r", encoding="utf-8") as f: data = [json.loads(line) for line in f] data = [[chunk] for chunk in data] - elif args.data_type == 'chunked': - with open(args.input_file, "r", encoding='utf-8') as f: + elif args.data_type == "chunked": + with open(args.input_file, "r", encoding="utf-8") as f: data = json.load(f) results = wrap.generate(data) # Save results - with open(args.output_file, "w", encoding='utf-8') as f: + with open(args.output_file, "w", encoding="utf-8") as f: json.dump(results, f, indent=4, ensure_ascii=False) diff --git a/scripts/baselines/generate_all_baselines.sh b/examples/baselines/generate_all_baselines.sh similarity index 100% rename from scripts/baselines/generate_all_baselines.sh rename to examples/baselines/generate_all_baselines.sh diff --git a/examples/baselines/generate_bds.sh b/examples/baselines/generate_bds.sh new file mode 100644 index 00000000..d4bd9e8c --- /dev/null +++ b/examples/baselines/generate_bds.sh @@ -0,0 +1,2 @@ +python3 -m baselines.BDS.bds --input_file resources/input_examples/graphml_demo.graphml \ + --output_file cache/data/bds.json \ diff --git a/scripts/baselines/generate_entigraph.sh b/examples/baselines/generate_entigraph.sh similarity index 85% rename from scripts/baselines/generate_entigraph.sh rename to examples/baselines/generate_entigraph.sh index ce9cc991..8474c96f 100644 --- a/scripts/baselines/generate_entigraph.sh +++ b/examples/baselines/generate_entigraph.sh @@ -1,3 +1,3 @@ -python3 -m baselines.EntiGraph.entigraph --input_file resources/examples/raw_demo.jsonl \ +python3 -m baselines.EntiGraph.entigraph --input_file resources/input_examples/raw_demo.jsonl \ --data_type raw \ --output_file cache/data/entigraph.json \ diff --git a/scripts/baselines/generate_genie.sh b/examples/baselines/generate_genie.sh similarity index 57% rename from scripts/baselines/generate_genie.sh rename to examples/baselines/generate_genie.sh index 0119930d..3a06de10 100644 --- a/scripts/baselines/generate_genie.sh +++ b/examples/baselines/generate_genie.sh @@ -1,3 +1,3 @@ -python3 -m baselines.Genie.genie --input_file resources/examples/raw_demo.jsonl \ +python3 -m baselines.Genie.genie --input_file resources/input_examples/raw_demo.jsonl \ --data_type raw \ --output_file cache/data/genie.json \ diff --git a/scripts/baselines/generate_longform.sh b/examples/baselines/generate_longform.sh similarity index 56% rename from scripts/baselines/generate_longform.sh rename to examples/baselines/generate_longform.sh index d7ed70c5..62de848c 100644 --- a/scripts/baselines/generate_longform.sh +++ b/examples/baselines/generate_longform.sh @@ -1,3 +1,3 @@ -python3 -m baselines.LongForm.longform --input_file resources/examples/raw_demo.jsonl \ +python3 -m baselines.LongForm.longform --input_file resources/input_examples/raw_demo.jsonl \ --data_type raw \ 
--output_file cache/data/longform.json \ diff --git a/scripts/baselines/generate_selfqa.sh b/examples/baselines/generate_selfqa.sh similarity index 57% rename from scripts/baselines/generate_selfqa.sh rename to examples/baselines/generate_selfqa.sh index 18eb7b1f..ef13e721 100644 --- a/scripts/baselines/generate_selfqa.sh +++ b/examples/baselines/generate_selfqa.sh @@ -1,3 +1,3 @@ -python3 -m baselines.SELF-QA.self-qa --input_file resources/examples/raw_demo.jsonl \ +python3 -m baselines.SELF-QA.self-qa --input_file resources/input_examples/raw_demo.jsonl \ --data_type raw \ --output_file cache/data/self-qa.json \ diff --git a/scripts/baselines/generate_wrap.sh b/examples/baselines/generate_wrap.sh similarity index 58% rename from scripts/baselines/generate_wrap.sh rename to examples/baselines/generate_wrap.sh index f10857a5..fcaf3933 100644 --- a/scripts/baselines/generate_wrap.sh +++ b/examples/baselines/generate_wrap.sh @@ -1,3 +1,3 @@ -python3 -m baselines.Wrap.wrap --input_file resources/examples/raw_demo.jsonl \ +python3 -m baselines.Wrap.wrap --input_file resources/input_examples/raw_demo.jsonl \ --data_type raw \ --output_file cache/data/wrap.json \ diff --git a/scripts/evaluate.sh b/examples/evaluate/evaluate.sh similarity index 82% rename from scripts/evaluate.sh rename to examples/evaluate/evaluate.sh index 25706d48..2b352669 100644 --- a/scripts/evaluate.sh +++ b/examples/evaluate/evaluate.sh @@ -1,4 +1,3 @@ python3 -m graphgen.evaluate --folder cache/data \ - --output cache/output \ --reward "OpenAssistant/reward-model-deberta-v3-large-v2,BAAI/IndustryCorpus2_DataRater" \ --uni MingZhong/unieval-sum \ diff --git a/examples/extract/extract_schema_guided/README.md b/examples/extract/extract_schema_guided/README.md new file mode 100644 index 00000000..ab117c0f --- /dev/null +++ b/examples/extract/extract_schema_guided/README.md @@ -0,0 +1 @@ +# Extract Schema-Guided Information from Documents diff --git a/examples/extract/extract_schema_guided/extract_schema_guided.sh b/examples/extract/extract_schema_guided/extract_schema_guided.sh new file mode 100644 index 00000000..d9890567 --- /dev/null +++ b/examples/extract/extract_schema_guided/extract_schema_guided.sh @@ -0,0 +1,2 @@ +python3 -m graphgen.run \ +--config_file examples/extract/extract_schema_guided/schema_guided_extraction_config.yaml diff --git a/examples/extract/extract_schema_guided/schema_guided_extraction_config.yaml b/examples/extract/extract_schema_guided/schema_guided_extraction_config.yaml new file mode 100644 index 00000000..1a25e196 --- /dev/null +++ b/examples/extract/extract_schema_guided/schema_guided_extraction_config.yaml @@ -0,0 +1,35 @@ +global_params: + working_dir: cache + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/extract_demo.txt + + - id: chunk + op_name: chunk + type: map_batch + dependencies: + - read + execution_params: + replicas: 4 + params: + chunk_size: 20480 # larger chunk size for better context + chunk_overlap: 2000 + + - id: extract + op_name: extract + type: map_batch + dependencies: + - chunk + execution_params: + replicas: 1 + batch_size: 128 + params: + method: schema_guided + schema_path: graphgen/templates/extraction/schemas/legal_contract.json diff --git a/examples/generate/generate_aggregated_qa/README.md b/examples/generate/generate_aggregated_qa/README.md new file mode 100644 index 00000000..ab08693b --- /dev/null +++ 
b/examples/generate/generate_aggregated_qa/README.md @@ -0,0 +1,3 @@ +# Generate Aggregated QAs + +Aggregated mode is one of three question-answering scenarios in GraphGen (alongside atomic and multi-hop) designed to generate synthetic training data that incorporates complex, integrated knowledge from multiple sources. \ No newline at end of file diff --git a/examples/generate/generate_aggregated_qa/aggregated_config.yaml b/examples/generate/generate_aggregated_qa/aggregated_config.yaml new file mode 100644 index 00000000..5957dff0 --- /dev/null +++ b/examples/generate/generate_aggregated_qa/aggregated_config.yaml @@ -0,0 +1,79 @@ +global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read_files # id is unique in the pipeline, and can be referenced by other steps + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples + + - id: chunk_documents + op_name: chunk + type: map_batch + dependencies: + - read_files + execution_params: + replicas: 4 + params: + chunk_size: 1024 # chunk size for text splitting + chunk_overlap: 100 # chunk overlap for text splitting + + - id: build_kg + op_name: build_kg + type: map_batch + dependencies: + - chunk_documents + execution_params: + replicas: 1 + batch_size: 128 + + - id: quiz + op_name: quiz + type: aggregate + dependencies: + - build_kg + execution_params: + replicas: 1 + batch_size: 128 + params: + quiz_samples: 2 # number of quiz samples to generate + concurrency_limit: 200 + + - id: judge + op_name: judge + type: map_batch + dependencies: + - quiz + execution_params: + replicas: 1 + batch_size: 128 + + - id: partition + op_name: partition + type: aggregate + dependencies: + - judge + params: + method: ece # ece is a custom partition method based on comprehension loss + method_params: + max_units_per_community: 20 # max nodes and edges per community + min_units_per_community: 5 # min nodes and edges per community + max_tokens_per_community: 10240 # max tokens per community + unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss + + - id: generate + op_name: generate + type: map_batch + dependencies: + - partition + execution_params: + replicas: 1 + batch_size: 128 + params: + method: aggregated # atomic, aggregated, multi_hop, cot, vqa + data_format: ChatML # Alpaca, Sharegpt, ChatML diff --git a/examples/generate/generate_aggregated_qa/generate_aggregated.sh b/examples/generate/generate_aggregated_qa/generate_aggregated.sh new file mode 100644 index 00000000..9169896b --- /dev/null +++ b/examples/generate/generate_aggregated_qa/generate_aggregated.sh @@ -0,0 +1,2 @@ +python3 -m graphgen.run \ +--config_file examples/generate/generate_aggregated_qa/aggregated_config.yaml diff --git a/examples/generate/generate_atomic_qa/README.md b/examples/generate/generate_atomic_qa/README.md new file mode 100644 index 00000000..e979b182 --- /dev/null +++ b/examples/generate/generate_atomic_qa/README.md @@ -0,0 +1,3 @@ +# Generate Atomic QAs + +Atomic mode generates question-answer pairs that test basic, isolated knowledge from individual facts or relationships in the knowledge graph. 
\ No newline at end of file diff --git a/examples/generate/generate_atomic_qa/atomic_config.yaml b/examples/generate/generate_atomic_qa/atomic_config.yaml new file mode 100644 index 00000000..826302d9 --- /dev/null +++ b/examples/generate/generate_atomic_qa/atomic_config.yaml @@ -0,0 +1,55 @@ +global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/json_demo.json + + - id: chunk + op_name: chunk + type: map_batch + dependencies: + - read + execution_params: + replicas: 4 + params: + chunk_size: 1024 + chunk_overlap: 100 + + - id: build_kg + op_name: build_kg + type: map_batch + execution_params: + replicas: 1 + batch_size: 128 + dependencies: + - chunk + + - id: partition + op_name: partition + type: aggregate + dependencies: + - build_kg + params: + method: dfs + method_params: + max_units_per_community: 1 + + - id: generate + op_name: generate + type: map_batch + dependencies: + - partition + execution_params: + replicas: 1 + batch_size: 128 + params: + method: atomic + data_format: Alpaca diff --git a/examples/generate/generate_atomic_qa/generate_atomic.sh b/examples/generate/generate_atomic_qa/generate_atomic.sh new file mode 100644 index 00000000..7f5339f5 --- /dev/null +++ b/examples/generate/generate_atomic_qa/generate_atomic.sh @@ -0,0 +1,2 @@ +python3 -m graphgen.run \ +--config_file examples/generate/generate_atomic_qa/atomic_config.yaml diff --git a/examples/generate/generate_cot_qa/README.md b/examples/generate/generate_cot_qa/README.md new file mode 100644 index 00000000..37afe9c7 --- /dev/null +++ b/examples/generate/generate_cot_qa/README.md @@ -0,0 +1 @@ +# Generate CoT QAs diff --git a/examples/generate/generate_cot_qa/cot_config.yaml b/examples/generate/generate_cot_qa/cot_config.yaml new file mode 100644 index 00000000..bb9b49c7 --- /dev/null +++ b/examples/generate/generate_cot_qa/cot_config.yaml @@ -0,0 +1,57 @@ +global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/txt_demo.txt + + - id: chunk + op_name: chunk + type: map_batch + dependencies: + - read + execution_params: + replicas: 4 + params: + chunk_size: 1024 + chunk_overlap: 100 + + - id: build_kg + op_name: build_kg + type: map_batch + execution_params: + replicas: 1 + batch_size: 128 + dependencies: + - chunk + + - id: partition + op_name: partition + type: aggregate + dependencies: + - build_kg + params: + method: leiden + method_params: + max_size: 20 + use_lcc: false + random_seed: 42 + + - id: generate + op_name: generate + type: map_batch + dependencies: + - partition + execution_params: + replicas: 1 + batch_size: 128 + params: + method: cot + data_format: Sharegpt diff --git a/examples/generate/generate_cot_qa/generate_cot.sh b/examples/generate/generate_cot_qa/generate_cot.sh new file mode 100644 index 00000000..78d17491 --- /dev/null +++ b/examples/generate/generate_cot_qa/generate_cot.sh @@ -0,0 +1,2 @@ +python3 -m graphgen.run \ +--config_file examples/generate/generate_cot_qa/cot_config.yaml diff --git a/examples/generate/generate_multi_hop_qa/README.md 
b/examples/generate/generate_multi_hop_qa/README.md new file mode 100644 index 00000000..dcee73be --- /dev/null +++ b/examples/generate/generate_multi_hop_qa/README.md @@ -0,0 +1 @@ +# Generate Multi-hop QAs diff --git a/examples/generate/generate_multi_hop_qa/generate_multi_hop.sh b/examples/generate/generate_multi_hop_qa/generate_multi_hop.sh new file mode 100644 index 00000000..3730ce9e --- /dev/null +++ b/examples/generate/generate_multi_hop_qa/generate_multi_hop.sh @@ -0,0 +1,2 @@ +python3 -m graphgen.run \ +--config_file examples/generate/generate_multi_hop_qa/multi_hop_config.yaml diff --git a/examples/generate/generate_multi_hop_qa/multi_hop_config.yaml b/examples/generate/generate_multi_hop_qa/multi_hop_config.yaml new file mode 100644 index 00000000..a5f42b40 --- /dev/null +++ b/examples/generate/generate_multi_hop_qa/multi_hop_config.yaml @@ -0,0 +1,58 @@ +global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/csv_demo.csv + + - id: chunk + op_name: chunk + type: map_batch + dependencies: + - read + execution_params: + replicas: 4 + params: + chunk_size: 1024 + chunk_overlap: 100 + + - id: build_kg + op_name: build_kg + type: map_batch + dependencies: + - chunk + execution_params: + replicas: 1 + batch_size: 128 + + - id: partition + op_name: partition + type: aggregate + dependencies: + - build_kg + params: + method: ece + method_params: + max_units_per_community: 3 + min_units_per_community: 3 + max_tokens_per_community: 10240 + unit_sampling: random + + - id: generate + op_name: generate + type: map_batch + dependencies: + - partition + execution_params: + replicas: 1 + batch_size: 128 + params: + method: multi_hop + data_format: ChatML diff --git a/examples/generate/generate_omics_qa/README.md b/examples/generate/generate_omics_qa/README.md new file mode 100644 index 00000000..9aca9561 --- /dev/null +++ b/examples/generate/generate_omics_qa/README.md @@ -0,0 +1,216 @@ +# Multi-omics Knowledge Graph QA Generation + +This example demonstrates how to build knowledge graphs from multi-omics data (DNA, RNA, protein) and generate question-answer pairs using the unified `omics_qa` method. + +## Pipeline Overview + +The pipeline includes the following steps: + +1. **read**: Read input files (JSON/JSONL format with sequence queries or protein data) +2. **search**: Search biological databases (NCBI for DNA, RNAcentral for RNA, UniProt for protein) - *optional if input already contains search results* +3. **chunk**: Chunk sequences and metadata +4. **build_kg**: Extract entities and relationships to build knowledge graph +5. **partition**: Partition the knowledge graph into communities using anchor-based BFS +6. **generate**: Generate QA pairs from partitioned communities with automatic molecule caption extraction + +## Key Features + +- **Unified QA Generation**: Single `omics_qa` method supports DNA, RNA, and Protein +- **Automatic Caption Extraction**: Automatically extracts and attaches molecule-specific information (dna/rna/protein captions) to each QA pair +- **Flexible Configuration**: Easy to switch between DNA, RNA, and Protein by changing input file and data source +- **Anchor-based Partitioning**: Uses molecule type as anchor for BFS partitioning (dna/rna/protein) + +## Quick Start + +### 1. 
Configure Input Data + +Edit `omics_qa_config.yaml` to set the input file path: + +**For DNA:** +```yaml +input_path: + - examples/input_examples/search_dna_demo.jsonl +``` + +**For RNA:** +```yaml +input_path: + - examples/input_examples/search_rna_demo.jsonl +``` + +**For Protein:** +```yaml +input_path: + - examples/input_examples/search_protein_demo.jsonl +``` + +### 2. Configure Data Source + +Set the appropriate data source and parameters in the `search_data` node: + +**For DNA (NCBI):** +```yaml +data_sources: [ncbi] +ncbi_params: + email: your_email@example.com # Required! + tool: GraphGen + use_local_blast: true + local_blast_db: refseq_release/refseq_release + blast_num_threads: 2 + max_concurrent: 5 +``` + +**For RNA (RNAcentral):** +```yaml +data_sources: [rnacentral] +rnacentral_params: + use_local_blast: true + local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD + blast_num_threads: 2 + max_concurrent: 5 +``` + +**For Protein (UniProt):** +```yaml +data_sources: [uniprot] +uniprot_params: + use_local_blast: true + local_blast_db: ${RELEASE}/uniprot_sprot + blast_num_threads: 2 + max_concurrent: 5 +``` + +### 3. Configure Anchor Type + +Set the `anchor_type` in the `partition` node to match your molecule type: + +```yaml +partition: + params: + method: anchor_bfs + method_params: + anchor_type: protein # Change to "dna" or "rna" as needed + max_units_per_community: 10 +``` + +### 4. Run the Pipeline + +```bash +./generate_omics_qa.sh +``` + +Or run directly with Python: + +```bash +python3 -m graphgen.run \ + --config_file examples/generate/generate_omics_qa/omics_qa_config.yaml \ + --output_dir cache/ +``` + +## Input Format + +### For DNA/RNA (JSONL format): +```jsonl +{"type": "text", "content": "BRCA1"} +{"type": "text", "content": ">query\nATGCGATCG..."} +{"type": "text", "content": "ATGCGATCG..."} +``` + +### For Protein (JSONL format): +```jsonl +{"type": "text", "content": "P01308"} +{"type": "text", "content": "insulin"} +{"type": "text", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} +``` + +## Output Format + +The `omics_qa` method automatically extracts and attaches molecule-specific captions to QA pairs: + +### Alpaca Format: +```json +{ + "instruction": "What is the function of this protein?", + "input": "", + "output": "The protein functions as...", + "dna": {...}, # DNA caption (if molecule_type is DNA) + "rna": {...}, # RNA caption (if molecule_type is RNA) + "protein": {...} # Protein caption (if molecule_type is protein) +} +``` + +### ChatML Format: +```json +{ + "messages": [ + { + "role": "user", + "content": [ + { + "text": "What is the function of this protein?", + "dna": {...}, + "rna": {...}, + "protein": {...} + } + ] + }, + { + "role": "assistant", + "content": "The protein functions as..." + } + ] +} +``` + +## Caption Information + +The generator automatically extracts relevant caption information based on molecule type: + +- **DNA**: gene_name, gene_description, organism, chromosome, genomic_location, function, gene_type, etc. +- **RNA**: rna_type, description, organism, related_genes, gene_name, so_term, modifications, etc. +- **Protein**: protein_name, gene_names, organism, function, sequence, entry_name, etc. 
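+
+For illustration only, a minimal sketch of what an attached protein caption might look like is shown below. The field names come from the list above; the values are assumptions drawn from the public UniProt entry P01308 (human insulin), which is also used in the protein demo input, and the exact contents will depend on your search results:
+
+```json
+{
+  "protein": {
+    "protein_name": "Insulin",
+    "gene_names": "INS",
+    "organism": "Homo sapiens",
+    "entry_name": "INS_HUMAN",
+    "function": "Regulates blood glucose by promoting cellular glucose uptake"
+  }
+}
+```
+
+In ChatML format the same object is placed alongside the `text` field inside the user message content, as shown in the Output Format section above.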
+ +## Configuration Options + +### Chunking Parameters +- `chunk_size`: Size for text metadata chunks (default: 1024) +- `chunk_overlap`: Overlap for text chunks (default: 100) +- `sequence_chunk_size`: Size for sequence chunks (default: 1000) +- `sequence_chunk_overlap`: Overlap for sequence chunks (default: 100) + +### Partition Parameters +- `method`: `anchor_bfs` (recommended for omics data) +- `anchor_type`: `dna`, `rna`, or `protein` (must match your data type) +- `max_units_per_community`: Maximum nodes and edges per community (default: 10) + +### Generation Parameters +- `method`: `omics_qa` (unified method for DNA/RNA/Protein) +- `data_format`: `Alpaca`, `ChatML`, or `Sharegpt` + +## Notes + +- **NCBI requires an email address** - Make sure to set `email` in `ncbi_params` +- **Anchor type must match molecule type** - Set `anchor_type` to match your data (dna/rna/protein) +- **Local BLAST** can be enabled if you have local databases set up (see `examples/search/build_db/`) +- **Caption extraction** is automatic - The generator detects molecule type and extracts relevant caption information +- Adjust `max_concurrent` based on your system resources and API rate limits + +## Examples + +### Generate QA for Protein Data +1. Set `input_path` to `examples/input_examples/search_protein_demo.jsonl` +2. Set `data_sources: [uniprot]` +3. Set `anchor_type: protein` +4. Run `./generate_omics_qa.sh` + +### Generate QA for DNA Data +1. Set `input_path` to `examples/input_examples/search_dna_demo.jsonl` +2. Set `data_sources: [ncbi]` +3. Set `anchor_type: dna` +4. Run `./generate_omics_qa.sh` + +### Generate QA for RNA Data +1. Set `input_path` to `examples/input_examples/search_rna_demo.jsonl` +2. Set `data_sources: [rnacentral]` +3. Set `anchor_type: rna` +4. 
Run `./generate_omics_qa.sh` diff --git a/examples/generate/generate_omics_qa/generate_omics_qa.sh b/examples/generate/generate_omics_qa/generate_omics_qa.sh new file mode 100755 index 00000000..3afb129e --- /dev/null +++ b/examples/generate/generate_omics_qa/generate_omics_qa.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.run \ + --config_file examples/generate/generate_omics_qa/omics_qa_config.yaml \ + --output_dir cache/ diff --git a/examples/generate/generate_omics_qa/generate_omics_qa_searched.sh b/examples/generate/generate_omics_qa/generate_omics_qa_searched.sh new file mode 100755 index 00000000..20b0b533 --- /dev/null +++ b/examples/generate/generate_omics_qa/generate_omics_qa_searched.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.run \ + --config_file examples/generate/generate_omics_qa/omics_qa_config_searched.yaml \ + --output_dir cache/ diff --git a/examples/generate/generate_omics_qa/omics_qa_config.yaml b/examples/generate/generate_omics_qa/omics_qa_config.yaml new file mode 100644 index 00000000..22b9b26c --- /dev/null +++ b/examples/generate/generate_omics_qa/omics_qa_config.yaml @@ -0,0 +1,93 @@ +global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read_files + op_name: read + type: source + dependencies: [] + params: + input_path: + # three input files to generate DNA, RNA, and Protein data together + - examples/input_examples/search_dna_demo.jsonl + - examples/input_examples/search_rna_demo.jsonl + - examples/input_examples/search_protein_demo.jsonl + + - id: search_data + op_name: search + type: map_batch + dependencies: + - read_files + execution_params: + replicas: 1 + batch_size: 10 + params: + data_sources: [ncbi, rnacentral, uniprot] # Multi-omics: use all three data sources + # DNA search parameters + ncbi_params: + email: your_email@example.com # Required for NCBI + tool: GraphGen + use_local_blast: true + local_blast_db: databases/refseq_232_old/refseq_232 + blast_num_threads: 2 + max_concurrent: 5 + # RNA search parameters + rnacentral_params: + use_local_blast: true + local_blast_db: databases/rnacentral_merged_20251213/rnacentral_merged_20251213 + blast_num_threads: 2 + max_concurrent: 5 + # Protein search parameters + uniprot_params: + use_local_blast: true + # local_blast_db: ${RELEASE}/uniprot_sprot + local_blast_db: databases/2025_04/uniprot_sprot + blast_num_threads: 2 + max_concurrent: 5 + + - id: chunk_documents + op_name: chunk + type: map_batch + dependencies: + - search_data + execution_params: + replicas: 4 + params: + chunk_size: 1024 # chunk size for text splitting + chunk_overlap: 100 # chunk overlap for text splitting + sequence_chunk_size: 1000 # For sequence chunks (bp for DNA/RNA, aa for protein) + sequence_chunk_overlap: 100 + + - id: build_kg + op_name: build_kg + type: map_batch + dependencies: + - chunk_documents + execution_params: + replicas: 1 + batch_size: 128 + + - id: partition + op_name: partition + type: aggregate + dependencies: + - build_kg + params: + method: anchor_bfs # partition method + method_params: + anchor_type: [dna, rna, protein] # Multi-omics: support multiple anchor types (list or single string) + max_units_per_community: 10 # max nodes and edges per community + + - id: generate + op_name: generate + type: map_batch + dependencies: + - partition + execution_params: + replicas: 1 + batch_size: 128 + params: + method: omics_qa # unified QA generation method for DNA/RNA/Protein + 
data_format: ChatML # Alpaca, Sharegpt, ChatML diff --git a/examples/generate/generate_omics_qa/omics_qa_config_searched.yaml b/examples/generate/generate_omics_qa/omics_qa_config_searched.yaml new file mode 100644 index 00000000..cf01bc65 --- /dev/null +++ b/examples/generate/generate_omics_qa/omics_qa_config_searched.yaml @@ -0,0 +1,73 @@ +global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read_files + op_name: read + type: source + dependencies: [] + params: + input_path: + # Use pre-searched data files (skip search step) + # The search_service will automatically detect and skip search if data already contains search results + - examples/input_examples/searched_dna_demo.jsonl + - examples/input_examples/searched_rna_demo.jsonl + - examples/input_examples/searched_protein_demo.jsonl + + - id: search_data + op_name: search + type: map_batch + dependencies: + - read_files + execution_params: + replicas: 1 + batch_size: 10 + # Note: search_service will automatically detect pre-searched data and skip search, + # but it will still normalize the data format (ensure _doc_id, content, data_source fields exist) + + - id: chunk_documents + op_name: chunk + type: map_batch + dependencies: + - search_data + execution_params: + replicas: 4 + params: + chunk_size: 1024 # chunk size for text splitting + chunk_overlap: 100 # chunk overlap for text splitting + sequence_chunk_size: 1000 # For sequence chunks (bp for DNA/RNA, aa for protein) + sequence_chunk_overlap: 100 + + - id: build_kg + op_name: build_kg + type: map_batch + dependencies: + - chunk_documents + execution_params: + replicas: 1 + batch_size: 128 + + - id: partition + op_name: partition + type: aggregate + dependencies: + - build_kg + params: + method: anchor_bfs # partition method + method_params: + anchor_type: [dna, rna, protein] # Multi-omics: support multiple anchor types (list or single string) + max_units_per_community: 10 # max nodes and edges per community + + - id: generate + op_name: generate + type: map_batch + dependencies: + - partition + execution_params: + replicas: 1 + batch_size: 128 + params: + method: omics_qa # unified QA generation method for DNA/RNA/Protein + data_format: ChatML # Alpaca, Sharegpt, ChatML diff --git a/examples/generate/generate_vqa/README.md b/examples/generate/generate_vqa/README.md new file mode 100644 index 00000000..42b13865 --- /dev/null +++ b/examples/generate/generate_vqa/README.md @@ -0,0 +1 @@ +# Generate VQAs \ No newline at end of file diff --git a/examples/generate/generate_vqa/generate_vqa.sh b/examples/generate/generate_vqa/generate_vqa.sh new file mode 100644 index 00000000..9cbb3d40 --- /dev/null +++ b/examples/generate/generate_vqa/generate_vqa.sh @@ -0,0 +1,2 @@ +python3 -m graphgen.run \ +--config_file examples/generate/generate_vqa/vqa_config.yaml diff --git a/examples/generate/generate_vqa/vqa_config.yaml b/examples/generate/generate_vqa/vqa_config.yaml new file mode 100644 index 00000000..7a869fe5 --- /dev/null +++ b/examples/generate/generate_vqa/vqa_config.yaml @@ -0,0 +1,59 @@ +global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/vqa_demo.json + modalities: + - text + - image + 
+ - id: chunk + op_name: chunk + type: map_batch + dependencies: + - read + execution_params: + replicas: 4 + params: + chunk_size: 1024 + chunk_overlap: 100 + + - id: build_kg + op_name: build_kg + type: map_batch + dependencies: + - chunk + execution_params: + replicas: 1 + batch_size: 128 + + - id: partition + op_name: partition + type: aggregate + dependencies: + - build_kg + params: + method: anchor_bfs + method_params: + anchor_type: image + max_units_per_community: 10 + + - id: generate + op_name: generate + type: map_batch + dependencies: + - partition + execution_params: + replicas: 1 + batch_size: 128 + params: + method: vqa + data_format: ChatML \ No newline at end of file diff --git a/examples/input_examples/csv_demo.csv b/examples/input_examples/csv_demo.csv new file mode 100644 index 00000000..e6c1b521 --- /dev/null +++ b/examples/input_examples/csv_demo.csv @@ -0,0 +1,5 @@ +type,content +text,云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。 +text,隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。 +text,"Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity." +text,"Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. 
Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture." diff --git a/examples/input_examples/extract_demo.txt b/examples/input_examples/extract_demo.txt new file mode 100644 index 00000000..7d399227 --- /dev/null +++ b/examples/input_examples/extract_demo.txt @@ -0,0 +1,16 @@ +LEASE AGREEMENT +This Lease Agreement (“Agreement”) is entered into on 15 March 2024 (“Signing Date”) by and between: +Lessor: +Riverside Commercial LLC, a Delaware limited-liability company with principal offices at 1800 Riverfront Drive, Wilmington, Delaware 19801, USA. +Lessee: +BrightWave Technologies Inc., a California corporation having its registered address at 2500 Innovation Way, San Jose, California 95110, USA. +Leased Space: +Approximately 5,200 rentable square feet on the entire third (3rd) floor of the Building known as Riverfront Plaza, 1800 Riverfront Drive, Wilmington, Delaware 19801, consisting of open-plan office areas, three private offices, one conference room, a kitchenette, and associated common-area restrooms. +Designated Use: +General office, research & development, and ancillary administrative purposes related to software and hardware technology businesses, and for no other purpose without prior written consent of Lessor. +Term: +The lease shall commence on 1 May 2024 (“Start Date”) and shall expire at 11:59 p.m. on 30 April 2029 (This is “End Date” or “Expiration Date of Lease”), unless sooner terminated in accordance with the terms hereof. +Term of Payment: +Base rent in the amount of Twenty-Five Thousand and 00/100 United States Dollars (US $25,000.00) per calendar month, due in advance on the first (1st) day of each month without demand, deduction, or set-off. Rent shall be paid by ACH transfer to the account designated in writing by Lessor. In addition, Lessee shall pay its proportionate share of Operating Expenses and Real Estate Taxes as defined in Section 5 of this Agreement. +Extension Period: +Provided Lessee is not in default beyond any applicable cure period, Lessee may extend this Lease for one (1) additional period of five (5) years by giving Lessor written notice at least twelve (12) months but not more than fifteen (15) months prior to the Expiration Date of Lease (i.e., 30 April 2029). The rent during the extension period shall be adjusted to the then-prevailing market rate as determined under Section 18.2 hereof. 
\ No newline at end of file diff --git a/examples/input_examples/graphml_demo.graphml b/examples/input_examples/graphml_demo.graphml new file mode 100644 index 00000000..9d90c84f --- /dev/null +++ b/examples/input_examples/graphml_demo.graphml @@ -0,0 +1,362 @@ + + + + + + + + + + + + "TECHNOLOGY" + "农业技术是指用于提高作物产量和质量的科学方法,云粳26号的推广依赖于相关的农业技术。" + chunk-59c892b0b8d999d3057866424ac38c5f + 3.277478752446756e-07 + 50 + + + "LOCATION" + "云南省是中国的一个省份,云粳26号在该省的农业研究和推广中发挥了重要作用。" + chunk-59c892b0b8d999d3057866424ac38c5f + 3.128163225886591e-07 + 44 + + + "ORGANIZATION" + "云南省农业科学院粮食作物研究所是一个科研机构,负责育成和研究粮食作物,包括早熟品种云粳26号。" + chunk-59c892b0b8d999d3057866424ac38c5f + 8.336807513843113e-05 + 69 + + + "MISSION" + "农业推广是指将新技术和新品种引入生产实践的过程,云粳26号被列为主导品种以促进其推广。" + chunk-59c892b0b8d999d3057866424ac38c5f + 7.628287841247848e-05 + 51 + + + "CONCEPT" + "食用价值是指食品在营养和口感方面的综合评价,云粳26号因其食味品质好而具有较高的食用价值。" + chunk-59c892b0b8d999d3057866424ac38c5f + 2.997547934514262 + 58 + + + "ORGANIZATION" + "农业部是中国的一个政府机构,负责农业政策的制定和推广,云粳26号被其列为主导品种。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.08804169791274698 + 48 + + + "CONCEPT" + "推广主导品种是指在特定区域内被推荐种植的主要作物品种,云粳26号在西南稻区被列为此类品种。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.24357749228919026 + 55 + + + "LOCATION" + "中海拔稻区是指海拔在1500至1800米之间的稻田区域,适合云粳26号的生长。" + chunk-59c892b0b8d999d3057866424ac38c5f + 5.926949838261465 + 45 + + + "KEYWORD" + "谷壳黄色是云粳26号的外观特征之一,表明其成熟状态。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.4537467972212248 + 35 + + + "KEYWORD" + "高抗稻瘟病是云粳26号的抗病性特征,增强了其种植的可行性。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.010406265234302301 + 45 + + + "KEYWORD" + "落粒性适中是云粳26号的特性之一,影响其收割和加工过程。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.39751120253379707 + 36 + + + "KEYWORD" + "有香味是云粳26号的食味品质之一,增加了其市场吸引力。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.1572458000758473 + 36 + + + "NATURE" + "稻瘟病是一种影响水稻的病害,云粳26号具有高抗稻瘟病的特性,增强了其种植的可行性。" + chunk-59c892b0b8d999d3057866424ac38c5f + 2.880133332981227e-07 + 63 + + + "CONCEPT" + "外观特点是指云粳26号的视觉特征,包括颖尖、谷壳颜色等,影响其市场接受度。" + chunk-59c892b0b8d999d3057866424ac38c5f + 1.668849735265512 + 51 + + + "KEYWORD" + "食味品质好是云粳26号的一个重要特性,影响消费者的选择。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.11790494848439569 + 33 + + + "WORK" + "云粳26号是由云南省农业科学院粮食作物研究所于2005年育成的早熟稻品种,具有良好的外观和食味品质。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.4054990190854595 + 68 + + + "NATURE" + "香味是指米粒在烹饪后散发的气味,云粳26号的米粒具有良好的香味,增加了其市场吸引力。" + chunk-59c892b0b8d999d3057866424ac38c5f + 1.704282376131091 + 60 + + + "DATE" + "2012年是云粳26号被农业部列为西南稻区农业推广主导品种的年份。" + chunk-59c892b0b8d999d3057866424ac38c5f + 1.6109373199087365 + 38 + + + "DATE" + "2005年是云粳26号品种育成的年份,标志着该品种的研发开始。" + chunk-59c892b0b8d999d3057866424ac38c5f + 2.5795454440481884 + 35 + + + "KEYWORD" + "米粒大是云粳26号的一个重要特征,通常与食用价值相关。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.8984304647517591 + 31 + + + "NATURE" + "落粒性是指稻谷在成熟后从稻穗上脱落的特性,云粳26号的落粒性适中,影响其收割和加工的便利性。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.32474459712132514 + 66 + + + "CONCEPT" + "早熟品种是指生长周期较短的作物品种,云粳26号属于这一类,适合在特定气候条件下种植。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.0018348240895041633 + 55 + + + "LOCATION" + "云南中海拔 1 500∼1 800 m 稻区是云粳26号适宜种植的区域,提供了特定的生长环境。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.03589417758929121 + 55 + + + "KEYWORD" + "颖尖无色、无芒是云粳26号的外观特点之一,影响其市场接受度。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.7739797890026089 + 39 + + + "NATURE" + "米粒是稻谷经过加工后得到的食用部分,云粳26号的米粒大且有香味,提升了其食用价值。" + chunk-59c892b0b8d999d3057866424ac38c5f + 7.555004360466345e-05 + 54 + + + "CONCEPT" + "适宜种植区域是指云粳26号推荐的种植环境,主要为云南中海拔 1 500∼1 800 m 稻区。" + 
chunk-59c892b0b8d999d3057866424ac38c5f + 0.4086719057062533 + 58 + + + "LOCATION" + "西南稻区是指中国西南地区,云粳26号在此区域被推广。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.004633279566562749 + 31 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号的推广依赖于相关的农业技术,以提高其种植效率和产量。" + 4.469515500104365e-07 + 39 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号是云南省育成的品种,适合在该省的特定环境中种植。" + 0.0005762292913028245 + 38 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云南省农业科学院粮食作物研究所位于云南省,专注于该省的粮食作物研究和育种工作。" + 0.0004066592991875774 + 55 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号是由云南省农业科学院粮食作物研究所育成的早熟稻品种,体现了该机构的科研成果。" + 8.201402306440362e-05 + 60 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号被列为主导品种,旨在通过农业推广提高其种植和消费。" + 0.0016328476835250948 + 37 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号因其食味品质好而具有较高的食用价值,适合市场需求。" + 9.018137544091494e-05 + 39 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号在2012年被农业部列为西南稻区农业推广主导品种,显示了其在农业政策中的重要性。" + 0.0660654296723789 + 51 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号在西南稻区被列为推广主导品种,显示其在该地区的重要性。" + 0.004850300392718063 + 36 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "中海拔稻区是云粳26号推荐的种植区域,提供了适合其生长的气候条件。" + 0.4154816437684379 + 44 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "谷壳黄色是云粳26号的外观特征之一,影响其消费者的接受度。" + 0.055662130155681604 + 39 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "高抗稻瘟病是云粳26号的抗病性特征,增强了其种植的可行性。" + 0.007417909388497926 + 45 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "落粒性适中是云粳26号的特性之一,影响其收割和加工的便利性。" + 0.04371501902711867 + 39 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "有香味是云粳26号的食味品质之一,增加了其市场吸引力。" + 0.238045756594794 + 36 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号具有高抗稻瘟病的特性,表明其在种植过程中能够抵御这一病害。" + 0.006100528949900764 + 51 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号的外观特点包括颖尖无色、无芒等,影响其市场表现和消费者选择。" + 0.4906831717572373 + 43 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "食味品质好是云粳26号的一个重要特性,影响消费者的选择。" + 0.013361831643390068 + 33 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号适宜在云南中海拔 1 500∼1 800 m 稻区种植,表明其对环境的适应性。" + 0.39592594481115617 + 52 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号适宜在云南中海拔 1 500∼1 800 m 稻区种植,表明其生长环境的要求。" + 0.5030086991360689 + 51 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "米粒大是云粳26号的重要特征,通常与其食用价值相关。" + 0.14715079462621358 + 31 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号是一种早熟品种,适合在特定的气候条件下种植,具有较短的生长周期。" + 0.008795430480896508 + 49 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号于2005年育成,标志着该品种的研发历程。" + 2.4413040461122195 + 31 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号的香味是其食用品质的重要组成部分,吸引了消费者的关注。" + 2.379013142252285e-06 + 40 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "颖尖无色、无芒是云粳26号的外观特点,影响其市场表现。" + 2.96823331740096 + 35 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号的落粒性适中,影响其在收割时的表现和加工效率。" + 0.11254188650424042 + 36 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "2012年云粳26号被农业部列为西南稻区农业推广主导品种,标志着其推广的重要性。" + 0.26297679025 + 46 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号的米粒大且有香味,提升了其在市场上的竞争力。" + 0.0008942391715159215 + 37 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "2012年云粳26号在西南稻区被推广,表明其在该地区的适应性和重要性。" + 0.001021465373999705 + 39 + + diff --git a/examples/input_examples/images/0f25783fdfa99042db274ba9f6b3064cf17c5435814edfbee42ae6b19aac37d2.jpg b/examples/input_examples/images/0f25783fdfa99042db274ba9f6b3064cf17c5435814edfbee42ae6b19aac37d2.jpg new file mode 100644 index 00000000..2d9e1e8e Binary files /dev/null and b/examples/input_examples/images/0f25783fdfa99042db274ba9f6b3064cf17c5435814edfbee42ae6b19aac37d2.jpg 
differ diff --git a/examples/input_examples/images/390516e39e77030092027ded523ee99e96ffa8b6df4476c9b12d7bb1dd20d635.jpg b/examples/input_examples/images/390516e39e77030092027ded523ee99e96ffa8b6df4476c9b12d7bb1dd20d635.jpg new file mode 100644 index 00000000..d1872ed4 Binary files /dev/null and b/examples/input_examples/images/390516e39e77030092027ded523ee99e96ffa8b6df4476c9b12d7bb1dd20d635.jpg differ diff --git a/examples/input_examples/images/4abc534d1dea2b706e44aaac26fe2ae309fee014082db00bc2d87187a6bb5dca.jpg b/examples/input_examples/images/4abc534d1dea2b706e44aaac26fe2ae309fee014082db00bc2d87187a6bb5dca.jpg new file mode 100644 index 00000000..66e790df Binary files /dev/null and b/examples/input_examples/images/4abc534d1dea2b706e44aaac26fe2ae309fee014082db00bc2d87187a6bb5dca.jpg differ diff --git a/examples/input_examples/images/8fb93cfc0d6b0ebb3e5d5aaae237df02964c9c3da8d8e9567ea19240b14cc742.jpg b/examples/input_examples/images/8fb93cfc0d6b0ebb3e5d5aaae237df02964c9c3da8d8e9567ea19240b14cc742.jpg new file mode 100644 index 00000000..755a0bd7 Binary files /dev/null and b/examples/input_examples/images/8fb93cfc0d6b0ebb3e5d5aaae237df02964c9c3da8d8e9567ea19240b14cc742.jpg differ diff --git a/examples/input_examples/images/cc5b36e3c972b210d8b56d34fc7ffe56f793f287b3399345aea31cd20eed2824.jpg b/examples/input_examples/images/cc5b36e3c972b210d8b56d34fc7ffe56f793f287b3399345aea31cd20eed2824.jpg new file mode 100644 index 00000000..59c133a6 Binary files /dev/null and b/examples/input_examples/images/cc5b36e3c972b210d8b56d34fc7ffe56f793f287b3399345aea31cd20eed2824.jpg differ diff --git a/examples/input_examples/images/eda01885ec54011f15e7a4a56bea0129a0475b2ab5b920a4cff20a4fb623517d.jpg b/examples/input_examples/images/eda01885ec54011f15e7a4a56bea0129a0475b2ab5b920a4cff20a4fb623517d.jpg new file mode 100644 index 00000000..f2de8c08 Binary files /dev/null and b/examples/input_examples/images/eda01885ec54011f15e7a4a56bea0129a0475b2ab5b920a4cff20a4fb623517d.jpg differ diff --git a/examples/input_examples/json_demo.json b/examples/input_examples/json_demo.json new file mode 100644 index 00000000..a53101da --- /dev/null +++ b/examples/input_examples/json_demo.json @@ -0,0 +1,6 @@ +[ + {"type": "text", "content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"}, + {"type": "text", "content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"}, + {"type": "text", "content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. 
Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."}, + {"type": "text", "content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} +] diff --git a/examples/input_examples/jsonl_demo.jsonl b/examples/input_examples/jsonl_demo.jsonl new file mode 100644 index 00000000..2b743578 --- /dev/null +++ b/examples/input_examples/jsonl_demo.jsonl @@ -0,0 +1,4 @@ +{"type": "text", "content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"} +{"type": "text", "content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"} +{"type": "text", "content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. 
Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."} +{"type": "text", "content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} diff --git a/examples/input_examples/pdf_demo.pdf b/examples/input_examples/pdf_demo.pdf new file mode 100644 index 00000000..5a328e4c Binary files /dev/null and b/examples/input_examples/pdf_demo.pdf differ diff --git a/examples/input_examples/search_dna_demo.jsonl b/examples/input_examples/search_dna_demo.jsonl new file mode 100644 index 00000000..f423e1c1 --- /dev/null +++ b/examples/input_examples/search_dna_demo.jsonl @@ -0,0 +1,4 @@ +{"type": "text", "content": "NG_033923"} +{"type": "text", "content": "NG_056118"} +{"type": "text", "content": ">query\nACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"} +{"type": "text", "content": "ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"} diff --git a/examples/input_examples/search_protein_demo.jsonl b/examples/input_examples/search_protein_demo.jsonl new file mode 100644 index 00000000..e119cec8 --- /dev/null +++ b/examples/input_examples/search_protein_demo.jsonl @@ -0,0 +1,14 @@ +{"type": "text", "content": "P01308"} +{"type": "text", "content": "P68871"} +{"type": "text", "content": "P02768"} +{"type": "text", "content": "P04637"} +{"type": "text", "content": "insulin"} +{"type": "text", "content": "hemoglobin"} +{"type": "text", "content": "p53"} +{"type": "text", "content": "BRCA1"} +{"type": "text", "content": "albumin"} +{"type": "text", "content": 
"MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} +{"type": "text", "content": "MGSSHHHHHHSQDLENLYFQGSMNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRTKRVITTFRTGTWDAYKNLRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV"} +{"type": "text", "content": "MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDAAAVEATWIAKIKAAKSKYEAEAIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKERGVKVLHLQA"} +{"type": "text", "content": "MARVTVQDAVEKIGNRFDLVLVAARRARQMQVGGKDPLVPEENDKTTVIALREIEEGLINNQILDVRERQEQQEQEAAELQAVTAIAEGRR"} +{"type": "text", "content": "GSHMLCAISGKVPRRPVLSPKSRTIFEKSLLEQYVKDTGNDPITNEPLSIEEIVEIVPSAQ"} diff --git a/examples/input_examples/search_rna_demo.jsonl b/examples/input_examples/search_rna_demo.jsonl new file mode 100644 index 00000000..896473e2 --- /dev/null +++ b/examples/input_examples/search_rna_demo.jsonl @@ -0,0 +1,8 @@ +{"type": "text", "content": "hsa-let-7a-1"} +{"type": "text", "content": "XIST regulator"} +{"type": "text", "content": "URS0000123456"} +{"type": "text", "content": "URS0000000001"} +{"type": "text", "content": "URS0000000787"} +{"type": "text", "content": "GCAGTTCTCAGCCATGACAGATGGGAGTTTCGGCCCAATTGACCAGTATTCCTTACTGATAAGAGACACTGACCATGGAGTGGTTCTGGTGAGATGACATGACCCTCGTGAAGGGGCCTGAAGCTTCATTGTGTTTGTGTATGTTTCTCTCTTCAAAAATATTCATGACTTCTCCTGTAGCTTGATAAATATGTATATTTACACACTGCA"} +{"type": "text", "content": ">query\nCUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} +{"type": "text", "content": "CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} diff --git a/examples/input_examples/searched_dna_demo.jsonl b/examples/input_examples/searched_dna_demo.jsonl new file mode 100644 index 00000000..05778743 --- /dev/null +++ b/examples/input_examples/searched_dna_demo.jsonl @@ -0,0 +1,3 @@ +{"_doc_id":"doc-NG_011079","type":"dna","content":"Title: Homo sapiens ribosomal protein L35a pseudogene 6 (RPL35AP6) on chromosome 1\nSequence: ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG","data_source":"ncbi","molecule_type":"DNA","database":"NCBI","id":"NG_011079","gene_name":"RPL35AP6","gene_description":"ribosomal 
protein L35a pseudogene 6","organism":"Homo sapiens","url":"https:\/\/www.ncbi.nlm.nih.gov\/nuccore\/NG_011079","gene_synonyms":["RPL35A_3_191"],"gene_type":"other","chromosome":"1","genomic_location":"1-522","function":null,"title":"Homo sapiens ribosomal protein L35a pseudogene 6 (RPL35AP6) on chromosome 1","sequence":"ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG","sequence_length":522,"gene_id":"100271312","molecule_type_detail":"genomic region","_search_query":"ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"} +{"_doc_id":"doc-NG_033923","type":"dna","content":"Title: Callithrix jacchus immunity-related GTPase family, M, pseudogene (IRGMP) on chromosome 2\nSequence: GAACTCCTGACCTCAGGTGATCCACCTGCTTTGGCCTCCCAAAATGCCAGGATTACAGGTATGAGCCACCACGCCCAGCCAGCATTGGGGTATATCGAAGGCAGAGGTCATGAATGTTGAGAGAGCCTCAGCAGATGGGGACTTGCCAGAGGTGGTCTCTGCCATCAAGGAGAGTTTGAAGATAGTGTTCAGGACACCAGTCAACATCGCTATGGCAGGGGACTCTGGCAATAGCATATCCACCTTCATCAGTGCACTTCAAATCGCAGGGCATGAGGCGAAGGCCTCACCTCCTACTGGGCTGGTAAAAGCTACCCAAAGATGTGCCTCCTATTTCTCTTCCCGCTTTCCAAATGTGGTGCTGTGGGATCTGCCTGGAGCAGGGTCTGCCACCAAAACTCTGGAGAACTACCTGATGGAAATGTAGTTCAACCAATATGACTTCATCATGGTTGCATCTGCACAATTCAGCATGAATCATGTGATCCTTGCCAAAACCATTGAGGACATGGGAAAGAAGTTCTACATTGTCTGGACCAAGCTGGACATGGATCTCAGCACAGGTGCCCTCCCAGAAGTGCAGCTACTGTAAATCAGAGAAAATGTCCTGGAAAGTCTCCAGAGGGAGCAGGTATGTGAACTCCCCATATTTATGGCCTCCAGCCTTGAACCTTTATTGCATGACTTCCCAAAGCTTAGAGACACATTGCAAAAGACTCATCCAAATTAGGTGCCATGGCCCTCTTCAAAACCTGTCCCACACCTGTGAGATGATCACGAATGACAAAGCAATCTCCCTGCAGAAGAAAACAACCATACAGTCTTTCCAG","data_source":"ncbi","molecule_type":"DNA","database":"NCBI","id":"NG_033923","gene_name":"IRGMP","gene_description":"immunity-related GTPase family, M, pseudogene","organism":"Callithrix jacchus","url":"https:\/\/www.ncbi.nlm.nih.gov\/nuccore\/NG_033923","gene_synonyms":null,"gene_type":"other","chromosome":"2","genomic_location":"1-830","function":null,"title":"Callithrix jacchus immunity-related GTPase family, M, pseudogene (IRGMP) on chromosome 
2","sequence":"GAACTCCTGACCTCAGGTGATCCACCTGCTTTGGCCTCCCAAAATGCCAGGATTACAGGTATGAGCCACCACGCCCAGCCAGCATTGGGGTATATCGAAGGCAGAGGTCATGAATGTTGAGAGAGCCTCAGCAGATGGGGACTTGCCAGAGGTGGTCTCTGCCATCAAGGAGAGTTTGAAGATAGTGTTCAGGACACCAGTCAACATCGCTATGGCAGGGGACTCTGGCAATAGCATATCCACCTTCATCAGTGCACTTCAAATCGCAGGGCATGAGGCGAAGGCCTCACCTCCTACTGGGCTGGTAAAAGCTACCCAAAGATGTGCCTCCTATTTCTCTTCCCGCTTTCCAAATGTGGTGCTGTGGGATCTGCCTGGAGCAGGGTCTGCCACCAAAACTCTGGAGAACTACCTGATGGAAATGTAGTTCAACCAATATGACTTCATCATGGTTGCATCTGCACAATTCAGCATGAATCATGTGATCCTTGCCAAAACCATTGAGGACATGGGAAAGAAGTTCTACATTGTCTGGACCAAGCTGGACATGGATCTCAGCACAGGTGCCCTCCCAGAAGTGCAGCTACTGTAAATCAGAGAAAATGTCCTGGAAAGTCTCCAGAGGGAGCAGGTATGTGAACTCCCCATATTTATGGCCTCCAGCCTTGAACCTTTATTGCATGACTTCCCAAAGCTTAGAGACACATTGCAAAAGACTCATCCAAATTAGGTGCCATGGCCCTCTTCAAAACCTGTCCCACACCTGTGAGATGATCACGAATGACAAAGCAATCTCCCTGCAGAAGAAAACAACCATACAGTCTTTCCAG","sequence_length":830,"gene_id":"100409682","molecule_type_detail":"genomic region","_search_query":"NG_033923"} +{"_doc_id":"doc-NG_056118","type":"dna","content":"Title: Homo sapiens major histocompatibility complex, class II, DR beta 8 (pseudogene) (HLA-DRB8) on chromosome 6\nSequence: GCCAGAGCCTAGGTTTACAGAGAAGCAGACAAACAAAACAGCCAAACAAGGAGACTTACTCTGTCTTCATGACTCATTCCCTCTACATTTTTTCTTCTAGTCCATCCTAAGGTGACTGTGTATCCTTTAAAGACCCAGCCCCTGCAGCACCACAACCTCCTGGTCTGCTCTGTGAGTGGTTTCTGTCCAGCCAGCATTGAAGTCAGGTGGTTCCGGAACGGCCAGGAAGAGAAGGCTGGGGTGGTGTCCACAGGCCTGATCCAGAATGGAGACTGGACCTTCCAGACACTGATGATGCTGGAAACAGTTCCTCAGAGTGGAGAGGTTTACACCTGCCAAGTGGAGCATCCAAGCATGATGAGCCCTCTCACGGTGCAATGGAGTTAGCAGCTTTCTGACTTCATAAATTTTTCACCCAGTAAGTACAGGACTGTGCTAATCCCTGAGTGTCAGGTTTCTCCTCTCCCACATCCTATTTTCATTTGCTCCATATTCTCATCTCCATCAGCACAGGTCACTGGGGATAGCCCTGTAATCATTTCTAAAAGCACCTGTACCCCATGGTAAAGCAGTCATGCCTGCCAGGCGGGAGAGGCTGTCTCTCTTTTGAACCTCCCCATGATGGCACAGGTCAGGGTCACCCACTCTCCCTGGCTCCAGGCCCTGCCTCTGGGTCTGAGATTGTATTTCTGCTGCTGTTGCTCTGGGTTGTTTGTTGTGATCTGAGAAGAGGAGAACTGTAGGGGTCTTCCTGGCATGAGGGGAGTCCAATCCCAGCTCTGCCTTTTATTAGCTCTGTCACTCTAGACAAACTACTAAACCTCTTTGAGTCTCAGGATTTCTGTGGATCAGATGTCAAAGTCATGCCTTACATCAAGGCTGTAATATTTGAATGAGTTTGAGGCCTAACCTTGTAACTGTTCAGTGTGATCTGAAAACCTTTTTTCCCCAGAAATAGCTAGTTATTTTAGTTCTTGCAGGGCAGCCTTCTTCCCCATTTTCAAAGCTCTGAATCTCAGTATCTCAATTACAGAGGTTCAATTTGGGATAAAAATCACTAAACCTGGCTTCCACTCTCAGGAGCATGGTCTGAATCTGCACAGAGCAAGATGCTGAGTGGAGTCGGGGGCTTTGTGCTGGGCCTGCTCTTCCTTGGGGCCGGGCTGTTTCTCTACTTCAGGAATCAGAAAGGTGAGGAACCTTTCGTAGCTGGCTCTCTCCATAGACTTTTCTGGAGGAGGAAATATGGCTTTGCAGAGGTTAGTTCTCAGTATATGAGTGGCCCTGGATAAAGCCTTTCTTTCCCAAAACGACCTCCAATGTCCCGCTAATCCAGAAATCATCAGTGCATGGTTACTATGTCAAAGCATAATAGCTTATGGCCTGCAGAGAGAAAAGAAAGGCTAACAAGTAGGGATCCTTTGGTTGGAGATCCTGGAGCAAATTAAGGAAGAGCCACTAAGGTTAATACAATTACACTGGATCCTATGACAGACACTTCACGCTTCAGGGGTCACGTGGTGAGTTTCTGCTCCTCTCTGCCCTGGTTCATGTAAGTTGTGGTGTTAGAGAAATCTCAGGTGGGAGATCTGGGGCTGGGATATTGTGTTGGAGGACAGATTTGCTTCCATATCTTTTTTCTTTTTTCTTTTTTTTGAGACGGAGTCTCGCTCTGTCCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAACCTCCTTCTCCCGGATTCAAGTGATTCTCCTGCCTCAACCTCCCGAGTAGCTGGGACTATAGGCACCTGCCACCACGCCCAGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCATGTTGGCCAAGATGGTCTCGATCTCTTGACCTTGTGATCCACCCAACTTGGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACCGCACCCGGCCTGCTTCCATATCTTTTAAATGTGTATCTTTTCCCCTTTTTCCCAGGACACTCTGGACTTCAGCCAACAGGTAATACCTTTTCATTCTCTTTTAGAAACAGATTCGCTTTCCTAGAATGATGGTAGAGGTGATAAGGGATGAGACAGAAATAATAGGAAAGACTTTGGATCCAAATTTCTGATCAGGCAATTTACGCCAAAACTCCTCTCTACTTAGAAAAGGCCTGTGCTTGGCCAGGCGCAGTAGCTCATGCCTGTAATCTCAGCACTTTGGGAGGCTGAGGCGGGTGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGACCAACAAGGAGAAACCTTGTCTCTACTAAAAATACAAAAAAAATTAGCCATGCGTGGTGGCGCATGCCTGTAATTCCAGCTACTGAGGAGGCTGAGGTAGGAGAATGGTTTGAAGCTGGGAGGCAGAGGTTGTGGTAAGCGCACCACTGCACTCCAGCC
TGGGCAACAAGAGTGAAACTCCATCTGAAAAAATGAATAAATAAAAAATAAAAGGCCAGTGCTCTGCAGTAGTATTGGCTCAGGGAGACTTAGCAACTTGTTTTTCTTCTTCCTGTACTGCTTTCATCTGAGTCCCTGAAAGAGGGGGAAAGAAGCTGTTAGTAGAGCCATGTCTGAAAACAACACTCTCCTGTGTCTTCTGCAGGACTCCTGAACTGAAGTGAAGATGACCACATTCAAGGAGGAAACTTCTGCCCCAGCTTTGCAGGAGGAAAAGCTTTTCCGCTTGGCTCTTTTTTTTTTTTTTAGTTTTATTTAT","data_source":"ncbi","molecule_type":"DNA","database":"NCBI","id":"NG_056118","gene_name":"HLA-DRB8","gene_description":"major histocompatibility complex, class II, DR beta 8 (pseudogene)","organism":"Homo sapiens","url":"https:\/\/www.ncbi.nlm.nih.gov\/nuccore\/NG_056118","gene_synonyms":null,"gene_type":"other","chromosome":"6","genomic_location":"1-2737","function":null,"title":"Homo sapiens major histocompatibility complex, class II, DR beta 8 (pseudogene) (HLA-DRB8) on chromosome 6","sequence":"GCCAGAGCCTAGGTTTACAGAGAAGCAGACAAACAAAACAGCCAAACAAGGAGACTTACTCTGTCTTCATGACTCATTCCCTCTACATTTTTTCTTCTAGTCCATCCTAAGGTGACTGTGTATCCTTTAAAGACCCAGCCCCTGCAGCACCACAACCTCCTGGTCTGCTCTGTGAGTGGTTTCTGTCCAGCCAGCATTGAAGTCAGGTGGTTCCGGAACGGCCAGGAAGAGAAGGCTGGGGTGGTGTCCACAGGCCTGATCCAGAATGGAGACTGGACCTTCCAGACACTGATGATGCTGGAAACAGTTCCTCAGAGTGGAGAGGTTTACACCTGCCAAGTGGAGCATCCAAGCATGATGAGCCCTCTCACGGTGCAATGGAGTTAGCAGCTTTCTGACTTCATAAATTTTTCACCCAGTAAGTACAGGACTGTGCTAATCCCTGAGTGTCAGGTTTCTCCTCTCCCACATCCTATTTTCATTTGCTCCATATTCTCATCTCCATCAGCACAGGTCACTGGGGATAGCCCTGTAATCATTTCTAAAAGCACCTGTACCCCATGGTAAAGCAGTCATGCCTGCCAGGCGGGAGAGGCTGTCTCTCTTTTGAACCTCCCCATGATGGCACAGGTCAGGGTCACCCACTCTCCCTGGCTCCAGGCCCTGCCTCTGGGTCTGAGATTGTATTTCTGCTGCTGTTGCTCTGGGTTGTTTGTTGTGATCTGAGAAGAGGAGAACTGTAGGGGTCTTCCTGGCATGAGGGGAGTCCAATCCCAGCTCTGCCTTTTATTAGCTCTGTCACTCTAGACAAACTACTAAACCTCTTTGAGTCTCAGGATTTCTGTGGATCAGATGTCAAAGTCATGCCTTACATCAAGGCTGTAATATTTGAATGAGTTTGAGGCCTAACCTTGTAACTGTTCAGTGTGATCTGAAAACCTTTTTTCCCCAGAAATAGCTAGTTATTTTAGTTCTTGCAGGGCAGCCTTCTTCCCCATTTTCAAAGCTCTGAATCTCAGTATCTCAATTACAGAGGTTCAATTTGGGATAAAAATCACTAAACCTGGCTTCCACTCTCAGGAGCATGGTCTGAATCTGCACAGAGCAAGATGCTGAGTGGAGTCGGGGGCTTTGTGCTGGGCCTGCTCTTCCTTGGGGCCGGGCTGTTTCTCTACTTCAGGAATCAGAAAGGTGAGGAACCTTTCGTAGCTGGCTCTCTCCATAGACTTTTCTGGAGGAGGAAATATGGCTTTGCAGAGGTTAGTTCTCAGTATATGAGTGGCCCTGGATAAAGCCTTTCTTTCCCAAAACGACCTCCAATGTCCCGCTAATCCAGAAATCATCAGTGCATGGTTACTATGTCAAAGCATAATAGCTTATGGCCTGCAGAGAGAAAAGAAAGGCTAACAAGTAGGGATCCTTTGGTTGGAGATCCTGGAGCAAATTAAGGAAGAGCCACTAAGGTTAATACAATTACACTGGATCCTATGACAGACACTTCACGCTTCAGGGGTCACGTGGTGAGTTTCTGCTCCTCTCTGCCCTGGTTCATGTAAGTTGTGGTGTTAGAGAAATCTCAGGTGGGAGATCTGGGGCTGGGATATTGTGTTGGAGGACAGATTTGCTTCCATATCTTTTTTCTTTTTTCTTTTTTTTGAGACGGAGTCTCGCTCTGTCCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAACCTCCTTCTCCCGGATTCAAGTGATTCTCCTGCCTCAACCTCCCGAGTAGCTGGGACTATAGGCACCTGCCACCACGCCCAGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCATGTTGGCCAAGATGGTCTCGATCTCTTGACCTTGTGATCCACCCAACTTGGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACCGCACCCGGCCTGCTTCCATATCTTTTAAATGTGTATCTTTTCCCCTTTTTCCCAGGACACTCTGGACTTCAGCCAACAGGTAATACCTTTTCATTCTCTTTTAGAAACAGATTCGCTTTCCTAGAATGATGGTAGAGGTGATAAGGGATGAGACAGAAATAATAGGAAAGACTTTGGATCCAAATTTCTGATCAGGCAATTTACGCCAAAACTCCTCTCTACTTAGAAAAGGCCTGTGCTTGGCCAGGCGCAGTAGCTCATGCCTGTAATCTCAGCACTTTGGGAGGCTGAGGCGGGTGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGACCAACAAGGAGAAACCTTGTCTCTACTAAAAATACAAAAAAAATTAGCCATGCGTGGTGGCGCATGCCTGTAATTCCAGCTACTGAGGAGGCTGAGGTAGGAGAATGGTTTGAAGCTGGGAGGCAGAGGTTGTGGTAAGCGCACCACTGCACTCCAGCCTGGGCAACAAGAGTGAAACTCCATCTGAAAAAATGAATAAATAAAAAATAAAAGGCCAGTGCTCTGCAGTAGTATTGGCTCAGGGAGACTTAGCAACTTGTTTTTCTTCTTCCTGTACTGCTTTCATCTGAGTCCCTGAAAGAGGGGGAAAGAAGCTGTTAGTAGAGCCATGTCTGAAAACAACACTCTCCTGTGTCTTCTGCAGGACTCCTGAACTGAAGTGAAGATGACCACATTCAAGGAGGAAACTTCTGCCCCAGCTTTGCAGGAGGAAAAGCTTTTCCGCTTGGCTCTTTTTTTTTTTTTTAGTT
TTATTTAT","sequence_length":2737,"gene_id":"3130","molecule_type_detail":"genomic region","_search_query":"NG_056118"} diff --git a/examples/input_examples/searched_protein_demo.jsonl b/examples/input_examples/searched_protein_demo.jsonl new file mode 100644 index 00000000..47ab02ad --- /dev/null +++ b/examples/input_examples/searched_protein_demo.jsonl @@ -0,0 +1,8 @@ +{"_doc_id":"doc-P01308","type":"protein","content":"Function: ['Insulin decreases blood glucose concentration. It increases cell permeability to monosaccharides, amino acids and fatty acids. It accelerates glycolysis, the pentose phosphate cycle, and glycogen synthesis in liver.']\nSequence: MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P01308","entry_name":"INS_HUMAN","gene_names":[{"Name":"INS"}],"protein_name":"Insulin","organism":"Homo sapiens","sequence":"MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN","function":["Insulin decreases blood glucose concentration. It increases cell permeability to monosaccharides, amino acids and fatty acids. It accelerates glycolysis, the pentose phosphate cycle, and glycogen synthesis in liver."],"url":"https:\/\/www.uniprot.org\/uniprot\/P01308","_search_query":"P01308"} +{"_doc_id":"doc-Q6UWZ7","type":"protein","content":"Function: [\"Involved in DNA damage response and double-strand break (DSB) repair. Component of the BRCA1-A complex, acting as a central scaffold protein that assembles the various components of the complex and mediates the recruitment of BRCA1. The BRCA1-A complex specifically recognizes 'Lys-63'-linked ubiquitinated histones H2A and H2AX at DNA lesion sites, leading to target the BRCA1-BARD1 heterodimer to sites of DNA damage at DSBs. This complex also possesses deubiquitinase activity that specifically removes 'Lys-63'-linked ubiquitin on histones H2A and H2AX. 
{ECO:0000269|PubMed:17525340, ECO:0000269|PubMed:17643121, ECO:0000269|PubMed:17643122, ECO:0000269|PubMed:18077395, ECO:0000269|PubMed:19261748, ECO:0000269|PubMed:22357538, ECO:0000269|PubMed:26778126}.\"]\nSequence: MEGESTSAVLSGFVLGALAFQHLNTDSDTEGFLLGEVKGEAKNSITDSQMDDVEVVYTIDIQKYIPCYQLFSFYNSSGEVNEQALKKILSNVKKNVVGWYKFRRHSDQIMTFRERLLHKNLQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPLVVANLGMSEQLGYKTVSGSCMSTGFSRAVQTHSSKFFEEDGSLKEVHKINEMYASLQEELKSICKKVEDSEQAVDKLVKDVNRLKREIEKRRGAQIQAAREKNIQKDPQENIFLCQALRTFFPNSEFLHSCVMSLKNRHVSKSSCNYNHHLDVVDNLTLMVEHTDIPEASPASTPQIIKHKALDLDDRWQFKRSRLLDTQDKRSKADTGSSNQDKASKMSSPETDEEIEKMKGFGEYSRSPTF","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"Q6UWZ7","entry_name":"ABRX1_HUMAN","gene_names":[{"Name":"ABRAXAS1 {ECO:0000312|HGNC:HGNC:25829}","Synonyms":["ABRA1 {ECO:0000312|HGNC:HGNC:25829}","CCDC98","FAM175A {ECO:0000312|HGNC:HGNC:25829}"],"ORFNames":["UNQ496\/PRO1013"]}],"protein_name":"BRCA1-A complex subunit Abraxas 1 {ECO:0000312|HGNC:HGNC:25829}","organism":"Homo sapiens","sequence":"MEGESTSAVLSGFVLGALAFQHLNTDSDTEGFLLGEVKGEAKNSITDSQMDDVEVVYTIDIQKYIPCYQLFSFYNSSGEVNEQALKKILSNVKKNVVGWYKFRRHSDQIMTFRERLLHKNLQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPLVVANLGMSEQLGYKTVSGSCMSTGFSRAVQTHSSKFFEEDGSLKEVHKINEMYASLQEELKSICKKVEDSEQAVDKLVKDVNRLKREIEKRRGAQIQAAREKNIQKDPQENIFLCQALRTFFPNSEFLHSCVMSLKNRHVSKSSCNYNHHLDVVDNLTLMVEHTDIPEASPASTPQIIKHKALDLDDRWQFKRSRLLDTQDKRSKADTGSSNQDKASKMSSPETDEEIEKMKGFGEYSRSPTF","function":["Involved in DNA damage response and double-strand break (DSB) repair. Component of the BRCA1-A complex, acting as a central scaffold protein that assembles the various components of the complex and mediates the recruitment of BRCA1. The BRCA1-A complex specifically recognizes 'Lys-63'-linked ubiquitinated histones H2A and H2AX at DNA lesion sites, leading to target the BRCA1-BARD1 heterodimer to sites of DNA damage at DSBs. This complex also possesses deubiquitinase activity that specifically removes 'Lys-63'-linked ubiquitin on histones H2A and H2AX. {ECO:0000269|PubMed:17525340, ECO:0000269|PubMed:17643121, ECO:0000269|PubMed:17643122, ECO:0000269|PubMed:18077395, ECO:0000269|PubMed:19261748, ECO:0000269|PubMed:22357538, ECO:0000269|PubMed:26778126}."],"url":"https:\/\/www.uniprot.org\/uniprot\/Q6UWZ7","_search_query":"BRCA1"} +{"_doc_id":"doc-P27355","type":"protein","content":"Function: ['Responsible for the initial oxygenation of methane to methanol in methanotrophs. It also catalyzes the monohydroxylation of a variety of unactivated alkenes, alicyclic, aromatic and heterocyclic compounds.']\nSequence: MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDRAAVEATWIAKIKAAKSKYEADGIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKEPGVKVLHLQA","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P27355","entry_name":"MEMG_METTR","gene_names":[{"Name":"mmoZ"}],"protein_name":"Methane monooxygenase component A gamma chain","organism":"Methylosinus trichosporium.","sequence":"MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDRAAVEATWIAKIKAAKSKYEADGIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKEPGVKVLHLQA","function":["Responsible for the initial oxygenation of methane to methanol in methanotrophs. 
It also catalyzes the monohydroxylation of a variety of unactivated alkenes, alicyclic, aromatic and heterocyclic compounds."],"url":"https:\/\/www.uniprot.org\/uniprot\/P27355","_search_query":"MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDAAAVEATWIAKIKAAKSKYEAEAIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKERGVKVLHLQA"} +{"_doc_id":"doc-Q96GG9","type":"protein","content":"Function: ['Part of an E3 ubiquitin ligase complex for neddylation (PubMed:18826954). Promotes neddylation of cullin components of E3 cullin-RING ubiquitin ligase complexes (PubMed:19617556, PubMed:23201271, PubMed:23401859, PubMed:26906416). Acts by binding to cullin-RBX1 complexes in the cytoplasm and promoting their nuclear translocation, enhancing recruitment of E2-NEDD8 (UBE2M-NEDD8) thioester to the complex, and optimizing the orientation of proteins in the complex to allow efficient transfer of NEDD8 from the E2 to the cullin substrates. Involved in the release of inhibitory effets of CAND1 on cullin-RING ligase E3 complex assembly and activity (PubMed:25349211, PubMed:28581483). Also acts as an oncogene facilitating malignant transformation and carcinogenic progression (By similarity). {ECO:0000250|UniProtKB:Q9QZ73, ECO:0000269|PubMed:18826954, ECO:0000269|PubMed:19617556, ECO:0000269|PubMed:23201271, ECO:0000269|PubMed:23401859, ECO:0000269|PubMed:25349211, ECO:0000269|PubMed:26906416, ECO:0000269|PubMed:28581483}.']\nSequence: MNKLKSSQKDKVRQFMIFTQSSEKTAVSCLSQNDWKLDVATDNFFQNPELYIRESVKGSLDRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"Q96GG9","entry_name":"DCNL1_HUMAN","gene_names":[{"Name":"DCUN1D1 {ECO:0000312|HGNC:HGNC:18184}","Synonyms":["DCN1 {ECO:0000303|PubMed:28581483}","DCUN1L1","RP42","SCCRO"]}],"protein_name":"DCN1-like protein 1 {ECO:0000305}","organism":"Homo sapiens","sequence":"MNKLKSSQKDKVRQFMIFTQSSEKTAVSCLSQNDWKLDVATDNFFQNPELYIRESVKGSLDRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV","function":["Part of an E3 ubiquitin ligase complex for neddylation (PubMed:18826954). Promotes neddylation of cullin components of E3 cullin-RING ubiquitin ligase complexes (PubMed:19617556, PubMed:23201271, PubMed:23401859, PubMed:26906416). Acts by binding to cullin-RBX1 complexes in the cytoplasm and promoting their nuclear translocation, enhancing recruitment of E2-NEDD8 (UBE2M-NEDD8) thioester to the complex, and optimizing the orientation of proteins in the complex to allow efficient transfer of NEDD8 from the E2 to the cullin substrates. Involved in the release of inhibitory effets of CAND1 on cullin-RING ligase E3 complex assembly and activity (PubMed:25349211, PubMed:28581483). Also acts as an oncogene facilitating malignant transformation and carcinogenic progression (By similarity). 
{ECO:0000250|UniProtKB:Q9QZ73, ECO:0000269|PubMed:18826954, ECO:0000269|PubMed:19617556, ECO:0000269|PubMed:23201271, ECO:0000269|PubMed:23401859, ECO:0000269|PubMed:25349211, ECO:0000269|PubMed:26906416, ECO:0000269|PubMed:28581483}."],"url":"https:\/\/www.uniprot.org\/uniprot\/Q96GG9","_search_query":"MGSSHHHHHHSQDLENLYFQGSMNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRTKRVITTFRTGTWDAYKNLRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV"} +{"_doc_id":"doc-P68871","type":"protein","content":"Function: ['Involved in oxygen transport from the lung to the various peripheral tissues. {ECO:0000269|PubMed:28066926}.', 'LVV-hemorphin-7 potentiates the activity of bradykinin, causing a decrease in blood pressure.', '[Spinorphin]: Functions as an endogenous inhibitor of enkephalin-degrading enzymes such as DPP3, and as a selective antagonist of the P2RX3 receptor which is involved in pain signaling, these properties implicate it as a regulator of pain and inflammation.']\nSequence: MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P68871","entry_name":"HBB_HUMAN","gene_names":[{"Name":"HBB"}],"protein_name":"Hemoglobin subunit beta","organism":"Homo sapiens","sequence":"MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH","function":["Involved in oxygen transport from the lung to the various peripheral tissues. 
{ECO:0000269|PubMed:28066926}.","LVV-hemorphin-7 potentiates the activity of bradykinin, causing a decrease in blood pressure.","[Spinorphin]: Functions as an endogenous inhibitor of enkephalin-degrading enzymes such as DPP3, and as a selective antagonist of the P2RX3 receptor which is involved in pain signaling, these properties implicate it as a regulator of pain and inflammation."],"url":"https:\/\/www.uniprot.org\/uniprot\/P68871","_search_query":"P68871"} +{"_doc_id":"doc-P22939","type":"protein","content":"Sequence: MDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDADMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQSLKQLAEQSLDTSALEALADYIIQRNK","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P22939","entry_name":"ISPA_ECOLI","gene_names":[{"Name":"ispA","OrderedLocusNames":["b0421","JW0411"]}],"protein_name":"Farnesyl diphosphate synthase","organism":"Escherichia coli","sequence":"MDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDADMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQSLKQLAEQSLDTSALEALADYIIQRNK","function":[],"url":"https:\/\/www.uniprot.org\/uniprot\/P22939","_search_query":"MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} +{"_doc_id":"doc-Q8I8V0","type":"protein","content":"Function: ['Component of several Gcn5-containing histone acetyltransferase complexes that regulate nucleosome organization; involved in acetylation of histone H3, particularly on Lys-10 (H3K9ac) and Lys-15 (H3K14ac) (PubMed:12482983, PubMed:12697829, PubMed:15340070, PubMed:19740772, PubMed:22796493). Regulates the transcription of a subset of genes during development; affects recruitment of RNA polymerase II (PubMed:19740772, PubMed:23336284). May be involved in the function of some acidic activation domains, which activate transcription at distant sites (PubMed:12697829). Involved in the p53-dependent apoptosis pathway response to DNA damage by genotoxic agents (PubMed:15340070, PubMed:16135810). {ECO:0000269|PubMed:12482983, ECO:0000269|PubMed:12697829, ECO:0000269|PubMed:15340070, ECO:0000269|PubMed:16135810, ECO:0000269|PubMed:19740772, ECO:0000269|PubMed:22796493, ECO:0000269|PubMed:23336284}.', '[Isoform B]: Component of the SAGA histone acetyltransferase complex, which predominantly acetylates histone H3. {ECO:0000269|PubMed:30559249}.', '[Isoform A]: Component of the CHAT histone acetyltransferase complex, which predominantly acetylates histone H3. 
{ECO:0000269|PubMed:30559249}.']\nSequence: MTTIADLFTKYNCTNCQDDIQGIRVHCAECENFDLCLQCFAAGAEIGAHQNNHSYQFMDTGTSILSVFRGKGAWTAREEIRLLDAIEQYGFGNWEDISKHIETKSAEDAKEEYVNKFVNGTIGRATWTPAQSQRPRLIDHTGDDDAGPLGTNALSTLPPLEINSDEAMQLGYMPNRDSFEREYDPTAEQLISNISLSSEDTEVDVMLKLAHVDIYTRRLRERARRKRMVRDYQLVSNFFRNRNYAQQQGLTKEQREFRDRFRVYAQFYTCNEYERLLGSLEREKELRIRQSELYRYRYNGLTKIAECTHFEQHAATATHRSTGPYGHGKTDHTHTSNGSHRPPSSSLHSPQPNLRKVEMSSGGEASSNSIAPRNTLHIADPTCSGALLPSKNYLDSCRGSSAATMLQTTGMVMGVTVDSGATTGVTSTATTMANLPTNSAKGSQQHLQPLQQHPQLLQSGNQHKMQNEAAGGGSDQVPSMSLKLRTQLEELKHLPQPPGSELLSHNELDLCKKHNITPTTYLSVKTVCLSGAPSLGSPMETSLRKFFIKCGWLSH","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"Q8I8V0","entry_name":"TAD2B_DROME","gene_names":[{"Name":"Ada2b {ECO:0000312|FlyBase:FBgn0037555}","Synonyms":["Ada2S {ECO:0000303|PubMed:12697829}"],"ORFNames":["CG9638 {ECO:0000312|FlyBase:FBgn0037555}"]}],"protein_name":"Transcriptional adapter 2b {ECO:0000312|FlyBase:FBgn0037555}","organism":"Drosophila melanogaster","sequence":"MTTIADLFTKYNCTNCQDDIQGIRVHCAECENFDLCLQCFAAGAEIGAHQNNHSYQFMDTGTSILSVFRGKGAWTAREEIRLLDAIEQYGFGNWEDISKHIETKSAEDAKEEYVNKFVNGTIGRATWTPAQSQRPRLIDHTGDDDAGPLGTNALSTLPPLEINSDEAMQLGYMPNRDSFEREYDPTAEQLISNISLSSEDTEVDVMLKLAHVDIYTRRLRERARRKRMVRDYQLVSNFFRNRNYAQQQGLTKEQREFRDRFRVYAQFYTCNEYERLLGSLEREKELRIRQSELYRYRYNGLTKIAECTHFEQHAATATHRSTGPYGHGKTDHTHTSNGSHRPPSSSLHSPQPNLRKVEMSSGGEASSNSIAPRNTLHIADPTCSGALLPSKNYLDSCRGSSAATMLQTTGMVMGVTVDSGATTGVTSTATTMANLPTNSAKGSQQHLQPLQQHPQLLQSGNQHKMQNEAAGGGSDQVPSMSLKLRTQLEELKHLPQPPGSELLSHNELDLCKKHNITPTTYLSVKTVCLSGAPSLGSPMETSLRKFFIKCGWLSH","function":["Component of several Gcn5-containing histone acetyltransferase complexes that regulate nucleosome organization; involved in acetylation of histone H3, particularly on Lys-10 (H3K9ac) and Lys-15 (H3K14ac) (PubMed:12482983, PubMed:12697829, PubMed:15340070, PubMed:19740772, PubMed:22796493). Regulates the transcription of a subset of genes during development; affects recruitment of RNA polymerase II (PubMed:19740772, PubMed:23336284). May be involved in the function of some acidic activation domains, which activate transcription at distant sites (PubMed:12697829). Involved in the p53-dependent apoptosis pathway response to DNA damage by genotoxic agents (PubMed:15340070, PubMed:16135810). {ECO:0000269|PubMed:12482983, ECO:0000269|PubMed:12697829, ECO:0000269|PubMed:15340070, ECO:0000269|PubMed:16135810, ECO:0000269|PubMed:19740772, ECO:0000269|PubMed:22796493, ECO:0000269|PubMed:23336284}.","[Isoform B]: Component of the SAGA histone acetyltransferase complex, which predominantly acetylates histone H3. {ECO:0000269|PubMed:30559249}.","[Isoform A]: Component of the CHAT histone acetyltransferase complex, which predominantly acetylates histone H3. {ECO:0000269|PubMed:30559249}."],"url":"https:\/\/www.uniprot.org\/uniprot\/Q8I8V0","_search_query":"p53"} +{"_doc_id":"doc-P04637","type":"protein","content":"Function: ['Multifunctional transcription factor that induces cell cycle arrest, DNA repair or apoptosis upon binding to its target DNA sequence (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:35618207, PubMed:36634798, PubMed:38653238, PubMed:9840937). 
Acts as a tumor suppressor in many tumor types; induces growth arrest or apoptosis depending on the physiological circumstances and cell type (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17189187, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:38653238, PubMed:9840937). Negatively regulates cell division by controlling expression of a set of genes required for this process (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:9840937). One of the activated genes is an inhibitor of cyclin-dependent kinases. Apoptosis induction seems to be mediated either by stimulation of BAX and FAS antigen expression, or by repression of Bcl-2 expression (PubMed:12524540, PubMed:17189187). Its pro-apoptotic activity is activated via its interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 (PubMed:12524540). However, this activity is inhibited when the interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 is displaced by PPP1R13L\/iASPP (PubMed:12524540). In cooperation with mitochondrial PPIF is involved in activating oxidative stress-induced necrosis; the function is largely independent of transcription. Induces the transcription of long intergenic non-coding RNA p21 (lincRNA-p21) and lincRNA-Mkln1. LincRNA-p21 participates in TP53-dependent transcriptional repression leading to apoptosis and seems to have an effect on cell-cycle regulation. Implicated in Notch signaling cross-over. Prevents CDK7 kinase activity when associated to CAK complex in response to DNA damage, thus stopping cell cycle progression. Isoform 2 enhances the transactivation activity of isoform 1 from some but not all TP53-inducible promoters. Isoform 4 suppresses transactivation activity and impairs growth suppression mediated by isoform 1. Isoform 7 inhibits isoform 1-mediated apoptosis. Regulates the circadian clock by repressing CLOCK-BMAL1-mediated transcriptional activation of PER2 (PubMed:24051492). 
{ECO:0000269|PubMed:11025664, ECO:0000269|PubMed:12524540, ECO:0000269|PubMed:12810724, ECO:0000269|PubMed:15186775, ECO:0000269|PubMed:15340061, ECO:0000269|PubMed:17189187, ECO:0000269|PubMed:17317671, ECO:0000269|PubMed:17349958, ECO:0000269|PubMed:19556538, ECO:0000269|PubMed:20673990, ECO:0000269|PubMed:20959462, ECO:0000269|PubMed:22726440, ECO:0000269|PubMed:24051492, ECO:0000269|PubMed:24652652, ECO:0000269|PubMed:35618207, ECO:0000269|PubMed:36634798, ECO:0000269|PubMed:38653238, ECO:0000269|PubMed:9840937}.']\nSequence: MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P04637","entry_name":"P53_HUMAN","gene_names":[{"Name":"TP53","Synonyms":["P53"]}],"protein_name":"Cellular tumor antigen p53","organism":"Homo sapiens","sequence":"MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD","function":["Multifunctional transcription factor that induces cell cycle arrest, DNA repair or apoptosis upon binding to its target DNA sequence (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:35618207, PubMed:36634798, PubMed:38653238, PubMed:9840937). Acts as a tumor suppressor in many tumor types; induces growth arrest or apoptosis depending on the physiological circumstances and cell type (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17189187, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:38653238, PubMed:9840937). Negatively regulates cell division by controlling expression of a set of genes required for this process (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:9840937). One of the activated genes is an inhibitor of cyclin-dependent kinases. Apoptosis induction seems to be mediated either by stimulation of BAX and FAS antigen expression, or by repression of Bcl-2 expression (PubMed:12524540, PubMed:17189187). Its pro-apoptotic activity is activated via its interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 (PubMed:12524540). However, this activity is inhibited when the interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 is displaced by PPP1R13L\/iASPP (PubMed:12524540). In cooperation with mitochondrial PPIF is involved in activating oxidative stress-induced necrosis; the function is largely independent of transcription. Induces the transcription of long intergenic non-coding RNA p21 (lincRNA-p21) and lincRNA-Mkln1. 
LincRNA-p21 participates in TP53-dependent transcriptional repression leading to apoptosis and seems to have an effect on cell-cycle regulation. Implicated in Notch signaling cross-over. Prevents CDK7 kinase activity when associated to CAK complex in response to DNA damage, thus stopping cell cycle progression. Isoform 2 enhances the transactivation activity of isoform 1 from some but not all TP53-inducible promoters. Isoform 4 suppresses transactivation activity and impairs growth suppression mediated by isoform 1. Isoform 7 inhibits isoform 1-mediated apoptosis. Regulates the circadian clock by repressing CLOCK-BMAL1-mediated transcriptional activation of PER2 (PubMed:24051492). {ECO:0000269|PubMed:11025664, ECO:0000269|PubMed:12524540, ECO:0000269|PubMed:12810724, ECO:0000269|PubMed:15186775, ECO:0000269|PubMed:15340061, ECO:0000269|PubMed:17189187, ECO:0000269|PubMed:17317671, ECO:0000269|PubMed:17349958, ECO:0000269|PubMed:19556538, ECO:0000269|PubMed:20673990, ECO:0000269|PubMed:20959462, ECO:0000269|PubMed:22726440, ECO:0000269|PubMed:24051492, ECO:0000269|PubMed:24652652, ECO:0000269|PubMed:35618207, ECO:0000269|PubMed:36634798, ECO:0000269|PubMed:38653238, ECO:0000269|PubMed:9840937}."],"url":"https:\/\/www.uniprot.org\/uniprot\/P04637","_search_query":"P04637"} diff --git a/examples/input_examples/searched_rna_demo.jsonl b/examples/input_examples/searched_rna_demo.jsonl new file mode 100644 index 00000000..9ad088c0 --- /dev/null +++ b/examples/input_examples/searched_rna_demo.jsonl @@ -0,0 +1,6 @@ +{"_doc_id":"doc-URS0000123456","type":"rna","content":"Description: rRNA from 1 species\nSequence: CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000123456","rnacentral_id":"URS0000123456","sequence":"CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","sequence_length":282,"rna_type":"rRNA","description":"rRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000123456","organism":"uncultured Staphylococcus sp.","related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"URS0000123456"} +{"_doc_id":"doc-URS00000088CC","type":"rna","content":"Description: lncRNA from 1 species\nSequence: GCAGUUCUCAGCCAUGACAGAUGGGAGUUUCGGCCCAAUUGACCAGUAUUCCUUACUGAUAAGAGACACUGACCAUGGAGUGGUUCUGGUGAGAUGACAUGACCCUCGUGAAGGGGCCUGAAGCUUCAUUGUGUUUGUGUAUGUUUCUCUCUUCAAAAAUAUUCAUGACUUCUCCUGUAGCUUGAUAAAUAUGUAUAUUUACACACUGCA","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS00000088CC","rnacentral_id":"URS00000088CC","sequence":"GCAGUUCUCAGCCAUGACAGAUGGGAGUUUCGGCCCAAUUGACCAGUAUUCCUUACUGAUAAGAGACACUGACCAUGGAGUGGUUCUGGUGAGAUGACAUGACCCUCGUGAAGGGGCCUGAAGCUUCAUUGUGUUUGUGUAUGUUUCUCUCUUCAAAAAUAUUCAUGACUUCUCCUGUAGCUUGAUAAAUAUGUAUAUUUACACACUGCA","sequence_length":210,"rna_type":"lncRNA","description":"lncRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS00000088CC","organism":"Homo 
sapiens","related_genes":["ENSG00000265458.1","lnc-C17orf62-1","ENSG00000265458","NONHSAG023099","HSALNG0119438","NONHSAG023099.2","ENSG00000265458.4","RP13-20L14.6","NARF-AS2"],"gene_name":"ENSG00000265458, ENSG00000265458.1, ENSG00000265458.4, HSALNG0119438, NARF-AS2, NONHSAG023099, NONHSAG023099.2, RP13-20L14.6, lnc-C17orf62-1","so_term":"antisense, ncRNA","modifications":null,"_search_query":"GCAGTTCTCAGCCATGACAGATGGGAGTTTCGGCCCAATTGACCAGTATTCCTTACTGATAAGAGACACTGACCATGGAGTGGTTCTGGTGAGATGACATGACCCTCGTGAAGGGGCCTGAAGCTTCATTGTGTTTGTGTATGTTTCTCTCTTCAAAAATATTCATGACTTCTCCTGTAGCTTGATAAATATGTATATTTACACACTGCA"} +{"_doc_id":"doc-URS000342178E","type":"rna","content":"Description: None misc RNA\nSequence: GGUUUUCGUAUAUCCUUAAUGAUAUGGUUUAAGGGCAAUACAUAGAAACCACAAAUUUCUUACUGCGAAAAUC","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS000342178E","rnacentral_id":"URS000342178E","sequence":"GGUUUUCGUAUAUCCUUAAUGAUAUGGUUUAAGGGCAAUACAUAGAAACCACAAAUUUCUUACUGCGAAAAUC","sequence_length":73,"rna_type":"misc_RNA","description":"None misc RNA","url":"https:\/\/rnacentral.org\/rna\/URS000342178E","organism":null,"related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"XIST regulator"} +{"_doc_id":"doc-URS0000123456","type":"rna","content":"Description: rRNA from 1 species\nSequence: CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000123456","rnacentral_id":"URS0000123456","sequence":"CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","sequence_length":282,"rna_type":"rRNA","description":"rRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000123456","organism":"uncultured Staphylococcus sp.","related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} +{"_doc_id":"doc-URS0000000787","type":"rna","content":"Description: lncRNA from 1 species\nSequence: 
AGGGAUCUUCUGCCCUUGGUCCUAAGUGCCACUAUCUGUGCUGAGUUUUUCAAAGGUCAGAGCAGAUUGAACCAUUGUGGUUUCAUUUUCCCUGAUUUUGAUUUUUCUUAUGGGGAACCUGUGUGGCUGCAUUCAAGGUGACUCGAAGAAGCCUUCCAAAAAGCAUGUGAAAAGGAAGCCCUACUCUACUACCAAGGUGACUUCAGGGAGCACAUUCAAUGGUACGUAUUCUGGAAUCACUCACUGGUUGUUAGAAAAGGAUUCUACAGGAAAUCUGGAGCUUAACUGCUGGCUUUUGUCUGGAGAGCCUCCAUGAUCCAAGACAUCUGGUGGGAAUGAGGAUGUAGGGUAUAGUAAAAGAAACUGGUUUUCCUGGUGACAUACUCUUUUUAUCUAUGUAUAGUUUCUGGGAACAUGUUCACAUUAGGUUGUGUGUGGGUAUGUGUGUAUUAGGGCGGGGGUGGGGUGAGGUGGUCUGUGUGCAAGUCUGCAUGAUUUGCUUGUGAAUGUGUGUCUAUGUGUGUUUCCCCUAGGAAAAAAAUGUUGUGUUUACCCAGCACAACUCUCAGUGCCAUU","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000000787","rnacentral_id":"URS0000000787","sequence":"AGGGAUCUUCUGCCCUUGGUCCUAAGUGCCACUAUCUGUGCUGAGUUUUUCAAAGGUCAGAGCAGAUUGAACCAUUGUGGUUUCAUUUUCCCUGAUUUUGAUUUUUCUUAUGGGGAACCUGUGUGGCUGCAUUCAAGGUGACUCGAAGAAGCCUUCCAAAAAGCAUGUGAAAAGGAAGCCCUACUCUACUACCAAGGUGACUUCAGGGAGCACAUUCAAUGGUACGUAUUCUGGAAUCACUCACUGGUUGUUAGAAAAGGAUUCUACAGGAAAUCUGGAGCUUAACUGCUGGCUUUUGUCUGGAGAGCCUCCAUGAUCCAAGACAUCUGGUGGGAAUGAGGAUGUAGGGUAUAGUAAAAGAAACUGGUUUUCCUGGUGACAUACUCUUUUUAUCUAUGUAUAGUUUCUGGGAACAUGUUCACAUUAGGUUGUGUGUGGGUAUGUGUGUAUUAGGGCGGGGGUGGGGUGAGGUGGUCUGUGUGCAAGUCUGCAUGAUUUGCUUGUGAAUGUGUGUCUAUGUGUGUUUCCCCUAGGAAAAAAAUGUUGUGUUUACCCAGCACAACUCUCAGUGCCAUU","sequence_length":576,"rna_type":"lncRNA","description":"lncRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000000787","organism":"Homo sapiens","related_genes":["KB-1183D5.13","lnc-GGT2-26","ENSG00000206142.10","ENSG00000206142.9","NONHSAG033362.2","FAM230H","NONHSAG033362","lnc-GGT2-4","ENSG00000206142","lnc-GGT2-2","HSALNG0134219"],"gene_name":"ENSG00000206142, ENSG00000206142.10, ENSG00000206142.9, FAM230H, HSALNG0134219, KB-1183D5.13, NONHSAG033362, NONHSAG033362.2, lnc-GGT2-2, lnc-GGT2-26, lnc-GGT2-4","so_term":"lincRNA, ncRNA","modifications":null,"_search_query":"URS0000000787"} +{"_doc_id":"doc-URS0000000001","type":"rna","content":"Description: rRNA from 1 species\nSequence: AUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGAGCGGUAGAGAGAAGCUUGCUUCUCUUGAGAGCGGCGGACGGGUGAGUAAUGCCUAGGAAUCUGCCUGGUAGUGGGGGAUAACGCUCGGAAACGGACGCUAAUACCGCAUACGUCCUACGGGAGAAAGCAGGGGACCUUCGGGCCUUGCGCUAUCAGAUGAGC","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000000001","rnacentral_id":"URS0000000001","sequence":"AUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGAGCGGUAGAGAGAAGCUUGCUUCUCUUGAGAGCGGCGGACGGGUGAGUAAUGCCUAGGAAUCUGCCUGGUAGUGGGGGAUAACGCUCGGAAACGGACGCUAAUACCGCAUACGUCCUACGGGAGAAAGCAGGGGACCUUCGGGCCUUGCGCUAUCAGAUGAGC","sequence_length":200,"rna_type":"rRNA","description":"rRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000000001","organism":"uncultured bacterium","related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"URS0000000001"} diff --git a/resources/examples/txt_demo.txt b/examples/input_examples/txt_demo.txt similarity index 100% rename from resources/examples/txt_demo.txt rename to examples/input_examples/txt_demo.txt diff --git a/examples/input_examples/vqa_demo.json b/examples/input_examples/vqa_demo.json new file mode 100644 index 00000000..d3aed723 --- /dev/null +++ b/examples/input_examples/vqa_demo.json @@ -0,0 +1,93 @@ +[ + { + "type": "text", + "content": "The $4 4 - \\mathbf { k D }$ protein, named harpin, was electroeluted from a preparative SDS-polyacrylamide gel (12). 
At concentrations ${ \\ge } 5 0 0 \\mathbf { n } \\mathbf { M }$ $( \\geq 2 5 ~ | \\mathbf { \\mu } \\mathbf { g } / \\mathbf { m l } )$ , harpin elicited HR in leaves of tobacco (Fig. 2, sectors 6 and " + }, + { + "type": "text", + "content": "Because supernatants from E. amylovora Ea321(pCPP430) or E. coli DH5α (pCPP430) did not elicit HR, we postulated that harpin was not secreted but rather was present in or on the bacteria. Whole bacteria treated with protease failed to elicit HR, whereas bacteria incubated with protease together with $0 . 5 ~ \\mathrm { m M }$ phenylmethylsulfonyl fluoride (PMSF, a protease inhibitor) did (Table 1). Treatment of bacteria with increasing amounts of protease resulted in a decreased ability to elicit HR that correlated with the disappearance of harpin detectable in SDS-polyacrylamide gels (Table 1). After centrifugation of CFEP at $_ { 1 0 5 , 0 0 0 g }$ for 1 hour, most HR-eliciting activity was found in the supernatant. However, when the cell suspension was brought to $3 0 \\mathrm { \\ m M \\ M g C l } _ { 2 }$ ,before sonication, most activity was associated with the sedimented membrane fraction. Gel-permeation chromatography of unheated CFEP also indicated association of the elicitor with a high molecular weight $( > 1 0 ^ { 6 }$ daltons) fraction, probably membrane vesicles (14). Only the membrane fraction of E. amylovora Ea321(pCPP430) reacted with an antiserum raised in response to harpin (15), further supporting the cell-envelope location of harpin (Fig. 4). " + }, + { + "type": "image", + "content":{ + "img_path": "examples/input_examples/images/8fb93cfc0d6b0ebb3e5d5aaae237df02964c9c3da8d8e9567ea19240b14cc742.jpg", + "image_caption": [ + "Fig. 1. (A) Physical map of the hrp gene cluster of E. amylovora (4, 18, 29), showing restriction sites: B, Bam HI; E, Eco RI; H, Hind II. Gene hrpN, encoding harpin, is contained in the 1.3 kb Hind II fragment indicated by the solid bar. The shaded region (including hrpN) contains that part of the hrp gene cluster in which most transposon insertions, exemplified by K49, a Tn10 mini-kan (30) insertion, abolish the HR and pathogenicity phenotypes. Most " + ] + } + }, + { + "type": "text", + "content": "HR-eliciting activity, harpin was not detected. However, when the protease inhibitor PMSF $( 0 . 5 \\mathrm { \\ m M } )$ was included, the bacteria retained HR-eliciting activity and possessed detectable harpin for more than 2 hours. More protease was required per cell to destroy harpin produced by E. coli $\\mathsf { D H S } \\alpha ( \\mathsf { p C P P } 4 3 0 )$ than by Ea321(pCPP430), suggesting that E. coli $\\mathsf { D H } 5 \\alpha ( \\mathsf { p C P P } 4 3 0 )$ produces more harpin or degrades it more slowly, or both. " + }, + { + "type": "text", + "content": "The ability of bacterial strains to elicit the HR in intact tobacco leaves is related genetically to their ability to elicit a $\\mathbf { K } ^ { + } / \\mathbf { H } ^ { + }$ exchange reaction (XR) in tobacco cell suspension cultures (TCSCs) (16); both reactions require the hrp gene cluster (17). " + }, + { + "type": "image", + "content": { + "img_path": "examples/input_examples/images/cc5b36e3c972b210d8b56d34fc7ffe56f793f287b3399345aea31cd20eed2824.jpg", + "image_caption": [ + "Fig. 2. Tobacco leaf showing responses 24 hours after infitration of sectors (7) with the following preparations: 1,, living E. coli DH5α (pCPP9) $( 1 \\times 1 0 ^ { 8 } / \\mathrm { m l } )$ ; 2, E. 
coli DH5α (pCPP430) $( 1 \\ \\times \\ 1 0 ^ { 8 } / \\mathrm { m l } )$ ; 3, E. coli DH5α (pCPP430K49) $( 1 \\times 1 0 ^ { 8 } / \\mathrm { m } )$ ; 4, E. amylovora Ea321 $( 1 \\times 1 0 ^ { 8 } / \\mathsf { m l } )$ ; 5, Ea321K49, an hrp mutant $( 1 \\times 1 0 ^ { 8 } / \\mathsf { m } )$ , 8, heat-treated CFEP from $\\pmb { \\varepsilon }$ coli ${ \\mathsf { D } } { \\mathsf { H } } { \\mathsf { S } } { \\mathsf { { \\alpha } } } ( { \\mathsf { P } } { \\mathsf { C } } { \\mathsf { P } } { \\mathsf { P } } { \\mathsf { 9 } } )$ ; 9,heat-treated CFEP from E. coli DH5α(pCPP430); 10, heat-treated CFEP from E. coli DH5α(pCPP430K49); 11, heattreated CFEP from $\\boldsymbol { \\varepsilon }$ amylovora Ea321; 12, heat-treated CFEP from Ea321K49; 6, harpin $( 1 . 1 \\mu M )$ from E. coli DH5α(pCPP430) eluted from SDS-polyacrylamide gel; 7, same preparation as 6, but protease treated for 2 hours then heated for io min to inactivate protease; 13, harpin $( 1 \\pmb { \\mu } \\pmb { M } )$ from E. amylovora Ea321 eluted from SDS-polyacrylamide gel; 14, same preparation as 13 but with protease treatment as sample 7. Harpin solutions $< - 0 . 3 \\mu \\mathsf { m }$ do not cause collapse of infitrated tissue; spotty and incomplete collapse is caused by harpin between 0.3 and $0 . 5 ~ { \\mu } \\mathsf { m }$ . " + ] + } + }, + { + "type": "text", + "content": "We tested the ability of harpin to raise the pH of TCSC bathing solution, an indicator of the XR (Fig. 5). Cells of E. amylovora, grown in rich medium and added to TCSCs caused an increase in pH of the bathing solution after 2 to 3 hours. Addition of purified harpin caused an increase in pH within 1 hour. Erwinia amylovora mutant Ea321K49, which did not produce harpin in culture, and strains of E. coli containing mutated hrp gene clusters failed to elicit the XR. " + }, + { + "type": "text", + "content": "Table 1. Protease sensitivity of the HR-eliciting activity of whole cells of E. amylovora Ea321(pCPP430). Cells were grown in LB medium, harvested'by centrifugation, and resuspended in 0.1 volume of $5 m M$ potassium phosphate $( \\mathsf { p H } \\thinspace 6 . 5 )$ containing tetracycline (40 $\\mu { \\sf g } / { \\sf m } 1 )$ . After incubation with protease (Sigma P5147), as indicated, at $\\mathfrak { s } 7 ^ { \\circ } \\mathfrak { C }$ for 5 min, $1 0 0 ~ \\mu !$ of each cell suspension was infiltrated into tobacco leaves. Leaf sector collapse was assayed at 24 hours. At the time of infiltration, portions of protease-treated cell mixtures were iysed, held'in boiling water for 10 min, centrifuged for 10 min at $1 2 . 0 0 0 g .$ and electrophoresed on a $10 \\%$ SDS-polyacrylamide gel to detect harpin. Electrophoresis was done for 2 hours at $1 5 m \\mathsf { A }$ followed by staining with Coomassie blue R-250. Cell-free supernatant, produced from the LB culture, was filter-sterilized and then concentrated with the Centriprep-10 (Amicon, Danvers, Massachusetts). " + }, + { + "type": "table", + "content": { + "img_path": "examples/input_examples/images/0f25783fdfa99042db274ba9f6b3064cf17c5435814edfbee42ae6b19aac37d2.jpg", + "table_caption": [], + "table_footnote": [], + "table_body": "
<table><tr><td>Protease per milliter</td><td>Tissue collapse</td><td>Harpin detected</td></tr><tr><td>0</td><td>+</td><td>+</td></tr><tr><td>5μg</td><td>+</td><td>+</td></tr><tr><td>10μg</td><td>+</td><td>+</td></tr><tr><td>20 μg</td><td>Weak</td><td>+</td></tr><tr><td>40 μg</td><td>-</td><td></td></tr><tr><td>80μg</td><td></td><td></td></tr><tr><td>80μg + 0.5 mM PMSF</td><td>+</td><td>+</td></tr><tr><td>Cell-free supernatant</td><td></td><td></td></tr></table>
" + } + }, + { + "type": "text", + "content": "expressed fom pCPP1084 in the T7RNA (20). Insertions of Tn5tac1 in hrpN (21) (Fig. 1) abolished the ability of E. coli $\\mathsf { D H } 5 \\alpha ( \\mathsf { p C P P } 4 3 0 )$ to elicit HR on tobacco or produce harpin detectable on Western blots. Ea321T5, a derivative of E. amylo" + }, + { + "type": "image", + "content": { + "img_path": "examples/input_examples/images/4abc534d1dea2b706e44aaac26fe2ae309fee014082db00bc2d87187a6bb5dca.jpg", + "image_caption": [ + "Fig. 3. SDS-polyacrylamide gel electrophoresis of CFEPs and purified harpin. Lanes: 1, purified harpin $( 1 . 5 \\ \\mathsf { \\pmb { \\mu } } \\mathsf { \\pmb { \\mathsf { g } } } )$ from E. coli $\\mathsf { D M } 5 \\alpha ( \\mathsf { p C P } 4 3 0 )$ incubated with protease (9) for 1 hour; 2, purified harpin $( 1 . 5 \\mu \\mathfrak { g } )$ from E. amylovora Ea321 incubated with protease for 1 hour; 3, same as 1, but without treatment with protease; 4, same as 2, but without treatment with protease; 5, CFEP (5 ${ \\pmb { \\mu } } ( { \\pmb q } )$ from E. coli DH5α(pCPP9) treated at $1 0 0 ^ { \\circ } \\mathbb { C }$ for 10'min; 6, CFEP $( 5 \\ \\pmb { \\mu } \\pmb { \\mu } )$ from E. coli DH5a(pCPP430K49) treated at $\\pmb { 1 0 0 } \\pmb { \\circ } \\pmb { \\subset }$ for 10 min; 7, CFEP $( 5 ~ \\mu 9 )$ from E. amylovora Ea321 treated " + ] + } + }, + { + "type": "text", + "content": "at $\\pmb { 1 0 0 ^ { \\circ } } \\pmb { \\mathbb { C } }$ for 10 min; 8, CFEP $( 5 ~ \\mu 9 )$ from E. coli DH5a(pCPP430) treated at $1 0 0 ^ { \\circ } \\mathsf { C }$ for 10 min; 9, CFEP $( 5 ~ \\mu 9 )$ from E. amylovora Ea321K49 treated at $_ { 1 0 0 ^ { \\circ } \\mathbb { C } }$ for 10 min. Samples from the preparations in lanes 3, 4, 7, and 8 elicited HR in tobacco leaves. Samples were prepared as described (8) and brought to 125 mM tris-HCI $( \\mathsf { p H } 6 . 8 )$ $4 \\%$ SDS, $20 \\%$ glycerol, boiled for 3 min, then electrophoresed through a $10 \\%$ (w/v) polyacrylamide gel with $0 . 1 \\%$ SDS at $1 5 m A$ for 2 hours in a Mighty Small apparatus according to instructions (Hoefer Scientific Instruments, San Francisco, California). The gel was stained with $0 . 0 2 5 \\%$ Coomassie Blue R-250. Low-range prestained molecular weight standards (Bio-Rad 161-0305) were used and calibrated with an unstained protein marker (Bio-Rad 161-0304). Arrow indicates region corresponding to $4 4 \\ k \\mathsf$ " + }, + { + "type": "image", + "content": { + "img_path": "examples/input_examples/images/390516e39e77030092027ded523ee99e96ffa8b6df4476c9b12d7bb1dd20d635.jpg", + "image_caption": [ + "Fig. 4. Subcellular location of elicitor protein. Logphase cells $( 1 . 5 m )$ of strain Ea321(pCPP430) were fractionated (31). Proteins from each fraction were electrophoresed and transferred to Immobilon-P membrane (Millipore, Bedford, Massachusetts). The Amplified Alkaline Phosphatase Immuno-Blot Assay Kit (170-6412, Bio-Rad Richmond, California) was ", + "used in a Western blot to detect the elicitor protein with an antiserum raised in rabbit in response to harpin (15). (A) Fractions in lanes: 1, periplasm; 2, membrane; 3, whole cells; 4, supernatant; 5, cytoplasm. (B) Harpin purified by high-performance liquid chromatography (19) hybridized with antiserum. Arrows indicates $4 4 \\ k \\mathsf { D }$ based on the molecular weight markers used in Fig. 3. (C) Normal serum control. CFEP from E. coli DH5a(pCPP430) hybridized with pre-immune serum. 
" + ] + } + }, + { + "type": "text", + "content": "DNA sequence data from the $1 . 3 – \\mathbf { k } \\mathbf { b }$ Hind II fragment revealed that hrpN is 1155 base pairs long, and it encodes a 385–amino acid protein (Fig. 1). The 15 $\\mathrm { N H } _ { 2 }$ -terminal residues revealed by amino acid sequencing corresponded to those deduced from the DNA' sequence (Fig. 1). The deduced amino acid sequence of harpin (Fig. 1), which corresponded closely with the analyzed amino acid composition, reveals a glycine-rich protein with a high degree of hydrophilicity. It appears to have an open structure, which may explain its heat stability and sensitivity to proteases. A FASTA search (23) of GenBank for similar proteins revealed similarity only with other glycine-rich proteins, such as several plant cell wall proteins and keratins. " + }, + { + "type": "image", + "content": { + "img_path": "examples/input_examples/images/eda01885ec54011f15e7a4a56bea0129a0475b2ab5b920a4cff20a4fb623517d.jpg", + "image_caption": [ + "Fig. 5. Changes in pH of bathing solution of tobacco cell-suspension cultures (TCSC). Control values (no additive) were subtracted. Open squares, harpin (60 nM); open circles, cells of E. coli $\\mathsf { D H } 5 \\alpha ( \\mathsf { p C P P } 4 3 0 )$ $( 5 ~ \\times ~ 1 0 ^ { 7 }$ cells per milliliter); filled squares, cells of E. amylovora Ea321 $( 5 \\times 1 0 ^ { 7 }$ cells per milliiter); triangles, cells of E. coli DH5α(pCPP430K49) $( 5 \\times 1 0 ^ { 7 }$ cells per milliter); diamonds, cells of $\\boldsymbol { \\varepsilon }$ amylovora Ea321K49 $( 5 ~ \\times ~ 1 0 ^ { 7 }$ cells per milliter); filled circles, cells of $\\boldsymbol { E } .$ coli DH5α(pCPP9) $( 5 \\times$ $\\pmb { 1 0 ^ { 6 } }$ cells per mililiter). TCSCs were shaken at room temperature with the indicated preparations. The pH was measured at the intervals indicated. All preparations that elicited HR in tobacco leaves (Fig. 2) also caused a pH increase in the TCSC medium. " + ] + } + } +] \ No newline at end of file diff --git a/examples/output_examples/aggregated_chatml.json b/examples/output_examples/aggregated_chatml.json new file mode 100644 index 00000000..1561a6d0 --- /dev/null +++ b/examples/output_examples/aggregated_chatml.json @@ -0,0 +1,98 @@ +[ + { + "messages": [ + { + "role": "user", + "content": "What laboratory technique was used to study the localization of TAC4 in NB epidermal cells?" + }, + { + "role": "assistant", + "content": "The transient assay, a laboratory technique utilized to introduce genetic material into cells for a limited duration, was conducted in NB epidermal cells. This approach aimed to investigate the localization of TAC4 within these cells." + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "What was the focus of the study regarding TAC4-RFP fusion proteins, and how did researchers visualize their localization?" + }, + { + "role": "assistant", + "content": "The study focused on the localization of TAC4-RFP fusion proteins, which were primarily found in the cell wall, the rigid outer layer of plant cells that provides structural support and protection. To visualize the localization of these proteins, RFP signals, emitted by the Red Fluorescent Protein, were employed. This use of RFP signals allowed researchers to effectively track the presence of TAC4-RFP fusion proteins within the cell wall, highlighting their functional role in that specific area." + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "云粳26号是什么品种,它的育成背景和适应性如何?" 
+ }, + { + "role": "assistant", + "content": "云粳26号是一种由云南省农业科学院粮食作物研究所于2005年育成的早熟稻品种。该品种不仅具有良好的外观,还在食味品质上表现出色,因而其食用价值较高,吸引了众多消费者的关注。\n\n首先,云粳26号适宜在云南中海拔稻区种植,这表明它对该环境具有良好的适应性。稻区是指适合水稻种植的区域,而云粳26号的成功种植进一步体现了云南省农业科学院在水稻研究和品种选育方面的重要作用。因此,云粳26号的推广不仅有助于提升当地的农业生产水平,也为消费者提供了高品质的稻米选择。" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "云粳26号在西南稻区的农业推广中扮演了什么样的角色?" + }, + { + "role": "assistant", + "content": "西南稻区是一个重要的农业区域,其中云粳26号被列为农业推广的主导品种。2012年,这一品种正式被农业部认定为西南稻区的主导品种,这一决定突显了云粳26号在该区域的重要性。主导品种是指在特定区域内被广泛推广和种植的农作物品种,因此,云粳26号作为西南稻区的主导品种,反映了其良好的适应性以及市场的需求。这一系列的推广和认定,表明了云粳26号在西南稻区农业发展中的关键角色。" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "What is the role of the TAC4 gene in rice plant architecture and how does its function impact the tiller angle and overall growth?" + }, + { + "role": "assistant", + "content": "The loss of TAC4 function results in a notable increase in the tiller angle of rice plants, which significantly impacts their overall architecture. TAC4 is a plant-specific gene that plays a crucial role in regulating rice shoot gravitropism, a vital process for proper plant orientation and growth. Sequence analysis of TAC4 has revealed that it underwent a genetic bottleneck during the domestication of indica cultivars, leading to a reduction in its genetic diversity.\n\nFurthermore, TAC4 influences the content of indole acetic acid, a type of auxin that is essential for regulating various aspects of plant growth and development, including gravitropism. The function of TAC4 is critical for the regulation of tiller angle, as it encodes a nuclear protein that is integral to this process. By affecting the distribution of auxin, TAC4 contributes to the regulatory mechanisms that govern plant architecture.\n\nResearch on TAC4 not only enhances our understanding of these regulatory mechanisms but also provides insights that could facilitate the improvement of rice plant architecture. The highly conserved nature of the TAC4 gene underscores its importance across different plant species. Additionally, the BG1 gene, which is influenced by auxin, plays a role in gravitropism and subsequently affects the growth and orientation of rice grains. Ultimately, the tiller angle is a key factor that directly influences the grain yield of rice, highlighting the interconnectedness of these genetic and physiological processes." + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "云粳26号是什么品种,它的主要特点和适应性如何?" 
+ }, + { + "role": "assistant", + "content": "云粳26号是一种早熟稻品种,由云南省农业科学院粮食作物研究所于2005年育成。这一品种的研发标志着其在水稻研究领域的重要起始时间。云粳26号具有多项外观特点,其中包括颖尖无色、无芒和谷壳黄色,这些特征不仅体现了其独特性,还直接影响了其市场表现。\n\n此外,云粳26号的米粒较大,食味品质优良,并且具有香味,这些特性显著提升了其食用价值和市场吸引力。值得一提的是,云粳26号还具备高抗稻瘟病的特性,使其在稻区种植时能够有效减少病害损失,适合在云南中海拔1,500至1,800米的稻区种植。\n\n在2012年,云粳26号被农业部列为西南稻区农业推广的主导品种,这一认可进一步强调了其在农业政策中的重要性。云粳26号的落粒性适中,适应性强,能够在中海拔地区良好生长,反映了其对特定气候和土壤条件的适应能力。因此,云粳26号不仅是一个优质的早熟品种,也是云南省在水稻研究和推广方面的重要成果。" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "隆两优1212水稻品种的引入和试种情况如何?请详细描述其生长特性、产量以及推广情况。" + }, + { + "role": "assistant", + "content": "隆两优1212是一种水稻品种,于2017年在福建省龙岩市长汀县引入进行试种。该品种在圣丰家庭农场的河田镇南塘村进行试种,表现出优良的生长特性和高产量。隆两优1212的成熟时间为10月21日,标志着可以收获。其产量达到8.78 t/hm^2,显示出其高产特性。\n\n隆两优1212具有多项优良特性,包括抗性好、后期转色好、分蘖力强、适应性好、生育期适中、抽穗整齐等。这些特性使其在种植过程中能够有效抵抗病虫害,减少损失,并在市场上更具吸引力。此外,隆两优1212的适应性好使其能够在不同的生长环境中良好生长,而生育期适中则使其适合当地的气候条件。\n\n在种植过程中,隆两优1212采用了机插和机播技术,这不仅提高了插秧的质量,也提升了种植效率。隆两优1212作为烟后稻进行种植,符合当地的种植习惯。2018年和2019年,隆两优1212的示范面积分别扩大至4.00 hm^2和13.50 hm^2,显示出其推广潜力。在2018年,隆两优1212在润丰优质稻专业合作社进行示范种植,而在2019年则在绿丰优质稻专业合作社进行示范种植,进一步推广该品种。\n\n总的来说,隆两优1212在长汀县的试种和示范种植中,凭借其优良的生长特性和高产量,展现了良好的市场前景。" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "What is the role of the BIG GRAIN1 (BG1) gene in rice, and how does its manipulation affect grain development and yield?" + }, + { + "role": "assistant", + "content": "The gene known as BIG GRAIN1 (BG1) plays a crucial role in regulating auxin transport and response, acting as a positive regulator that enhances the plant's ability to respond to this important hormone. BG1 is preferentially expressed in vascular tissue, which is essential for nutrient transport and grain development. In rice (Oryza sativa L.), manipulation of the BG1 gene can lead to significant improvements in plant biomass, seed weight, and overall yield.\n\nSpecifically, the expression of BG1 is linked to increased seed weight, highlighting its importance in grain development. The dominant mutant known as BIG GRAIN1 (BG1-D) is associated with the BG1 gene and is characterized by an extra-large grain phenotype, which affects both grain size and gravitropism. Furthermore, the regulation of BG1 is critical for enhancing yield, making it a key target for genetic improvements aimed at boosting crop productivity.\n\nWhen BG1 expression is reduced through a genetic technique known as knockdown, the result is smaller grains and decreased sensitivities to auxin, underscoring BG1's vital role in regulating grain size. Additionally, the sensitivity of the BG1-D mutant to the auxin transport inhibitor N-1-naphthylphthalamic acid indicates a significant connection between BG1 and auxin transport mechanisms.\n\nOverexpression of BG1 has been shown to lead to larger grain sizes, further demonstrating its influence on grain development. The expression of BG1 in the culms, or stems, of the rice plant suggests its involvement in overall growth and structural integrity. Moreover, BG1 is integral to the regulation of auxin transport, which is essential for proper plant growth and the determination of grain size.\n\nThe BG1 gene also affects the observable characteristics of grain phenotype, particularly in terms of size and response to environmental signals. Its expression in young panicles, the flowering parts of the rice plant, indicates its critical role during the early stages of grain development. 
Additionally, BG1 influences the plant's sensitivities to auxin and auxin transport inhibitors.\n\nResearch on BG1 has been conducted in both rice and the model plant Arabidopsis, emphasizing its importance in plant development. By manipulating BG1, researchers can achieve increased plant biomass, further illustrating its significance in overall plant growth. The big grain1 (BG1-D) mutant, identified from a T-DNA insertion population, exemplifies the application of genetic tools in plant research and showcases the potential for altering grain size and gravitropism in rice." + } + ] + } +] \ No newline at end of file diff --git a/examples/output_examples/atomic_alpaca.json b/examples/output_examples/atomic_alpaca.json new file mode 100644 index 00000000..de4bca21 --- /dev/null +++ b/examples/output_examples/atomic_alpaca.json @@ -0,0 +1,52 @@ +[ + { + "instruction": "What do regulatory mechanisms control in plants?", + "input": "", + "output": "Regulatory mechanisms control the development and function of plant traits, such as tiller angle." + }, + { + "instruction": "What is the primary focus of genetic studies on rice?", + "input": "", + "output": "The primary focus of genetic studies on rice is to improve grain yield and size." + }, + { + "instruction": "What is the function of the cell wall in plant cells?", + "input": "", + "output": "The cell wall provides structural support and protection to plant cells." + }, + { + "instruction": "What is the result of overexpressing a gene like BG1?", + "input": "", + "output": "Overexpressing a gene like BG1 leads to enhanced traits such as larger grain size." + }, + { + "instruction": "云粳26号是由哪个机构育成的?", + "input": "", + "output": "云粳26号是由云南省农业科学院育成的。" + }, + { + "instruction": "农业推广的定义是什么?", + "input": "", + "output": "农业推广是指通过政策和技术手段,促进优质农作物的种植和应用。" + }, + { + "instruction": "What is the purpose of the Red Fluorescent Protein (RFP) in cellular studies?", + "input": "", + "output": "The Red Fluorescent Protein (RFP) is used to visualize protein localization in cells by emitting fluorescent signals." + }, + { + "instruction": "隆两优1212的产量是多少?", + "input": "", + "output": "隆两优1212的产量为8.78 t/hm^2。" + }, + { + "instruction": "云粳26号适宜种植的海拔范围是多少?", + "input": "", + "output": "云粳26号适宜种植的海拔范围是1 500∼1 800 m。" + }, + { + "instruction": "云粳26号的食味品质中有哪些特点?", + "input": "", + "output": "有香味是云粳26号的食味品质之一,增加了其市场吸引力。" + } +] \ No newline at end of file diff --git a/examples/output_examples/cot_sharegpt.json b/examples/output_examples/cot_sharegpt.json new file mode 100644 index 00000000..1ae8cd95 --- /dev/null +++ b/examples/output_examples/cot_sharegpt.json @@ -0,0 +1,122 @@ +[ + { + "conversations": [ + { + "from": "human", + "value": "What is the localization of TAC4-RFP fusion proteins in relation to the cell wall?" + }, + { + "from": "gpt", + "value": "The entity \"TAC4-RFP FUSION PROTEINS\" is identified as the subject of interest regarding localization. The relationship that indicates the localization of \"TAC4-RFP FUSION PROTEINS\" to \"CELL-WALL\" is established, showing that these proteins are primarily found in that area. This localization suggests a functional role for \"TAC4-RFP FUSION PROTEINS\" within the \"CELL-WALL\". Therefore, the findings indicate that \"TAC4-RFP FUSION PROTEINS\" are localized to the cell wall, highlighting their importance in that specific cellular context." + } + ] + }, + { + "conversations": [ + { + "from": "human", + "value": "云粳26号在西南稻区被列为主导品种的年份是什么?" 
+ }, + { + "from": "gpt", + "value": "云粳26号是一个在西南稻区被广泛推广的农作物品种。为了找到云粳26号在西南稻区被列为主导品种的年份,首先需要确认云粳26号与西南稻区的关系。根据信息,云粳26号在西南稻区被列为农业推广主导品种。接下来,查找与西南稻区相关的年份信息。根据提供的数据,2012年是云粳26号被列为西南稻区农业推广主导品种的年份。最后,验证2012年与云粳26号的关系,确认2012年确实是云粳26号在西南稻区被列为主导品种的年份。因此,答案是2012年。" + } + ] + }, + { + "conversations": [ + { + "from": "human", + "value": "What is the purpose of performing a transient assay in NB epidermal cells?" + }, + { + "from": "gpt", + "value": "The entity \"TRANSIENT ASSAY\" is identified, which is a laboratory technique used to introduce genetic material into cells for a short period to study gene expression and protein function. The entity \"NB EPIDERMAL CELLS\" is also identified, which indicates that the transient assay was performed in these cells to study the localization of TAC4. The relationship between \"TRANSIENT ASSAY\" and \"NB EPIDERMAL CELLS\" reveals that the purpose of the assay is to investigate the localization of TAC4 within the NB epidermal cells. The specific purpose extracted from this relationship is to study how TAC4 is localized in these cells. Therefore, the answer is that the purpose of performing a transient assay in NB epidermal cells is to study the localization of TAC4." + } + ] + }, + { + "conversations": [ + { + "from": "human", + "value": "How does the expression of the BG1 gene influence the grain size and development in rice plants?" + }, + { + "from": "gpt", + "value": "The BG1 gene plays a crucial role in regulating grain size in rice plants. It is involved in the process of auxin transport, which is essential for proper plant growth and development. The expression of BG1 is linked to increased seed weight, indicating that it directly influences the mass of grains produced. Additionally, BG1 affects the sensitivities of the plant to auxin and auxin transport inhibitors, suggesting that its expression levels can alter how rice plants respond to hormonal signals that regulate growth.\n\nIn young panicles, BG1 is preferentially expressed, highlighting its critical role during the early stages of grain development. This expression is vital for ensuring that the grains develop properly and achieve optimal size. Furthermore, overexpression of BG1 has been shown to lead to increased grain size, demonstrating its significant impact on grain development.\n\nFinally, the BG1 gene influences the grain phenotype, particularly in terms of size and how the grains respond to environmental signals. This multifaceted role of BG1 underscores its importance in enhancing grain size and overall yield in rice plants." + } + ] + }, + { + "conversations": [ + { + "from": "human", + "value": "How does the tiller angle influence the grain yield of rice?" + }, + { + "from": "gpt", + "value": "Recognize entities: \"GRAIN YIELD\" refers to the amount of rice produced per unit area, while \"TILLER ANGLE\" is an important aspect of plant architecture that significantly affects this yield. Identify relationships: The relationship indicates that the tiller angle is a critical factor that directly affects the grain yield of rice. Associate \"GRAIN YIELD\" with its description, noting that it is influenced by various factors, and \"TILLER ANGLE\" with its description, emphasizing its role in plant architecture. Establish the connection between \"TILLER ANGLE\" and \"GRAIN YIELD\" based on the identified relationship, which highlights the direct impact of the tiller angle on the yield. 
Formulate the question to explore how the tiller angle influences the grain yield of rice, leading to an understanding of the significance of the tiller angle in maximizing rice production." + } + ] + }, + { + "conversations": [ + { + "from": "human", + "value": "What is the relationship between the big grain1 (Bg1-D) mutant and its identification from the T-DNA insertion population in rice?" + }, + { + "from": "gpt", + "value": "The entity \"BIG GRAIN1 (BG1-D)\" is a dominant mutant in rice that exhibits an extra-large grain phenotype. This mutant is significant in the study of rice genetics due to its impact on grain size and gravitropism. The entity \"T-DNA INSERTION POPULATION\" is a genetic tool used in plant research to create mutations, including the big grain1 (Bg1-D) mutant. The relationship between \"BIG GRAIN1 (BG1-D)\" and \"T-DNA INSERTION POPULATION\" indicates that the Bg1-D mutant was specifically identified from this population, showcasing the application of genetic tools in plant research. Additionally, the connection between \"BIG GRAIN1 (BG1-D)\" and \"RICE\" highlights that this mutant variant is part of the rice species, further contextualizing its relevance in genetic studies aimed at improving rice yield and size. The question regarding the relationship between the big grain1 (Bg1-D) mutant and its identification from the T-DNA insertion population can be answered by confirming that the Bg1-D mutant was indeed identified through the use of the T-DNA insertion population in rice genetics research." + } + ] + }, + { + "conversations": [ + { + "from": "human", + "value": "What role does TAC4 play in the regulation of tiller angle and how does its function relate to the improvement of rice plant architecture?" + }, + { + "from": "gpt", + "value": "TAC4 is a gene that plays a crucial role in the regulation of tiller angle in rice plants. It encodes a nuclear protein essential for this regulation, indicating its importance in determining plant architecture. The relationships involving TAC4 reveal that it significantly influences tiller angle regulation, which is critical for optimizing plant structure and yield. \n\nResearch on TAC4 also highlights its potential in improving rice plant architecture. By understanding how TAC4 functions, insights can be gained that may lead to enhancements in the structural traits of rice plants, ultimately increasing their adaptability and yield. \n\nFurthermore, TAC4 affects the distribution of auxin, a plant hormone that regulates various aspects of growth and development, including gravitropism. Specifically, TAC4 regulates rice shoot gravitropism by increasing the content of indole acetic acid, a type of auxin. This connection underscores the role of TAC4 in facilitating proper plant orientation and growth.\n\nIn summary, TAC4 is integral to the regulation of tiller angle, which directly impacts rice plant architecture. Its function not only influences the plant's structural traits but also provides a pathway for potential improvements in rice cultivation practices." + } + ] + }, + { + "conversations": [ + { + "from": "human", + "value": "哪些农作物适合在中海拔地区种植?" + }, + { + "from": "gpt", + "value": "中海拔地区的特征主要是气候和土壤条件适合特定农作物的生长。接下来,需要查找适宜种植区域与中海拔的关系,以确定哪些农作物能够在此生长。通过识别适合在中海拔种植的农作物品种,可以进一步了解这些作物的特性。分析这些农作物的外观特点,有助于了解它们在市场上的接受度。最后,确定早熟品种在中海拔地区的适应性,可以为农民提供更好的种植选择。" + } + ] + }, + { + "conversations": [ + { + "from": "human", + "value": "隆两优1212在长汀县的试种表现出哪些优良特性?" 
+ }, + { + "from": "gpt", + "value": "隆两优1212在长汀县进行试种,表现出优良的生长特性。首先,隆两优1212的分蘖力强,使其能够产生较多的分蘖,适合在长汀县推广种植。其次,隆两优1212的抗性好,能够有效抵抗病虫害,减少种植过程中的损失。此外,隆两优1212的生育期适中,符合当地的气候条件,促进了其健康生长。最后,隆两优1212的抽穗整齐和后期转色好,进一步提高了其市场竞争力和吸引力。因此,隆两优1212在长汀县的试种表现出分蘖力强、抗性好、生育期适中、抽穗整齐和后期转色好的优良特性。" + } + ] + }, + { + "conversations": [ + { + "from": "human", + "value": "云粳26号的外观特点有哪些?" + }, + { + "from": "gpt", + "value": "云粳26号的外观特点有哪些? \n首先,我需要查询云粳26号的外观特点。根据信息,云粳26号的外观特点包括“颖尖无色、无芒”和“谷壳黄色”。接下来,我验证“颖尖无色、无芒”是否为云粳26号的外观特点。确实,颖尖无色、无芒是云粳26号的外观特点之一,体现了其独特性。然后,我再验证“谷壳黄色”是否为云粳26号的外观特点。谷壳黄色也是云粳26号的外观特征,影响其市场表现。因此,云粳26号的外观特点包括颖尖无色、无芒和谷壳黄色。" + } + ] + } +] \ No newline at end of file diff --git a/examples/output_examples/multi-hop_chatml.json b/examples/output_examples/multi-hop_chatml.json new file mode 100644 index 00000000..9e651ae3 --- /dev/null +++ b/examples/output_examples/multi-hop_chatml.json @@ -0,0 +1,122 @@ +[ + { + "messages": [ + { + "role": "user", + "content": "What architectural feature of rice plants is crucial for maximizing the amount of rice produced per unit area?" + }, + { + "role": "assistant", + "content": "Tiller angle" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "How does the manipulation of the BG1 gene in Arabidopsis affect the overall growth of the plant, particularly in terms of biomass?" + }, + { + "role": "assistant", + "content": "It leads to increased plant biomass, indicating its importance in overall plant growth." + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "隆两优1212在长汀县表现出哪些特性,使其适合在绿丰优质稻专业合作社进行示范种植?" + }, + { + "role": "assistant", + "content": "分蘖力强和优良的生长特性。" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "In which part of the plant cells were TAC4-RFP fusion proteins primarily localized, and what method was used to visualize their location?" + }, + { + "role": "assistant", + "content": "The TAC4-RFP fusion proteins were primarily localized to the cell wall, and RFP signals were used to visualize their location." + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "云粳26号的外观特点如何影响其市场接受度和种植选择?" + }, + { + "role": "assistant", + "content": "云粳26号的外观特点包括颖尖无色、无芒,这些特征影响其市场接受度和种植选择。" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "云粳26号在2012年被列为西南稻区的农业推广主导品种,这一决定反映了什么特征?" + }, + { + "role": "assistant", + "content": "云粳26号的适应性和市场需求。" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "What genetic factor in rice is responsible for the extra-large grain phenotype observed in the big grain1 (Bg1-D) mutant, and how does it influence grain size?" + }, + { + "role": "assistant", + "content": "The BG1 gene is responsible for the extra-large grain phenotype in the big grain1 (Bg1-D) mutant, as it regulates grain size and is involved in auxin transport." + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "云粳26号的高食用价值是由哪个机构的研究成果所体现的?" + }, + { + "role": "assistant", + "content": "云南省农业科学院" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "隆两优1212在种植过程中采用了哪些技术,这些技术如何影响其市场竞争力?" + }, + { + "role": "assistant", + "content": "隆两优1212在种植过程中采用了机插和机播技术,这些技术确保了插秧的质量和提高了种植效率,从而提升了其抽穗整齐特性,进而提高了市场竞争力。" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "How does the overexpression of the BG1 gene influence the growth of rice plants, particularly in relation to grain size and auxin transport?" 
+ }, + { + "role": "assistant", + "content": "The overexpression of the BG1 gene leads to increased grain size by enhancing auxin transport, which is crucial for regulating growth and development in rice plants. Additionally, BG1 expression in the culms indicates its role in the overall growth and structural integrity of the plant." + } + ] + } +] \ No newline at end of file diff --git a/examples/search/build_db/build_dna_blast_db.sh b/examples/search/build_db/build_dna_blast_db.sh new file mode 100755 index 00000000..f06bf11b --- /dev/null +++ b/examples/search/build_db/build_dna_blast_db.sh @@ -0,0 +1,340 @@ +#!/bin/bash + +set -e + +# Downloads NCBI RefSeq nucleotide sequences and creates BLAST databases. +# +# RefSeq 目录结构说明(按生物分类组织): +# - vertebrate_mammalian (哺乳动物) +# - vertebrate_other (其他脊椎动物) +# - bacteria (细菌) +# - archaea (古菌) +# - fungi (真菌) +# - invertebrate (无脊椎动物) +# - plant (植物) +# - viral (病毒) +# - protozoa (原生动物) +# - mitochondrion (线粒体) +# - plastid (质体) +# - plasmid (质粒) +# - other (其他) +# - complete/ (完整基因组,包含所有分类) +# +# 每个分类目录下包含: +# - {category}.{number}.genomic.fna.gz (基因组序列) +# - {category}.{number}.rna.fna.gz (RNA序列) +# +# Usage: ./build_dna_blast_db.sh [human_mouse_drosophila_yeast|representative|complete|all] +# human_mouse_drosophila_yeast: Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae sequences (minimal, smallest) +# representative: Download genomic sequences from major categories (recommended, smaller) +# Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi +# complete: Download all complete genomic sequences from complete/ directory (very large) +# all: Download all genomic sequences from all categories (very large) +# +# We need makeblastdb on our PATH +# For Ubuntu/Debian: sudo apt install ncbi-blast+ +# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ +# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ + +DOWNLOAD_TYPE=${1:-human_mouse_drosophila_yeast} + +# Better to use a stable DOWNLOAD_TMP name to support resuming downloads +DOWNLOAD_TMP=_downloading_dna +mkdir -p ${DOWNLOAD_TMP} +cd ${DOWNLOAD_TMP} + +# Download RefSeq release information +echo "Downloading RefSeq release information..." 
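+# RELEASE_NUMBER is a small text file on the NCBI FTP site holding the current
+# RefSeq release number; it is only used to label the output directory and the
+# BLAST database title, so the date fallback below is safe if the download fails.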
+wget -c "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER" || { + echo "Warning: Could not download RELEASE_NUMBER, using current date as release identifier" + RELEASE=$(date +%Y%m%d) +} + +if [ -f "RELEASE_NUMBER" ]; then + RELEASE=$(cat RELEASE_NUMBER | tr -d '\n') + echo "RefSeq release: ${RELEASE}" +else + RELEASE=$(date +%Y%m%d) + echo "Using date as release identifier: ${RELEASE}" +fi + +# Function to check if a file is already downloaded and complete +check_file_downloaded() { + local filename=$1 + # Since we're already in ${DOWNLOAD_TMP} directory, use filename directly + local local_file="${filename}" + + # Check if compressed file (.fna.gz) exists + if [ -f "${local_file}" ] && [ -s "${local_file}" ]; then + # Try to verify it's a valid gzip file by attempting to decompress a small portion + if gunzip -t "${local_file}" 2>/dev/null; then + return 0 # Compressed file exists and is valid + else + # File exists but is corrupted, remove it + rm -f "${local_file}" + return 1 + fi + fi + + # Check if decompressed file (.fna) exists (may have been extracted previously) + local decompressed_file="${local_file%.gz}" + if [ -f "${decompressed_file}" ] && [ -s "${decompressed_file}" ]; then + # Check if it's a valid FASTA file (starts with >) + if head -1 "${decompressed_file}" 2>/dev/null | grep -q "^>"; then + return 0 # Decompressed file exists and appears valid + else + # File exists but doesn't look like a valid FASTA file + return 1 + fi + fi + + return 1 # Neither compressed nor decompressed file exists +} + +# Function to check if a file contains target species +check_file_for_species() { + local url=$1 + local filename=$2 + local temp_file="/tmp/check_${filename//\//_}" + + # First check if file is already downloaded locally + if check_file_downloaded "${filename}"; then + # File already exists, check if it contains target species + # Check both compressed and decompressed versions + local decompressed_file="${filename%.gz}" + if [ -f "${filename}" ]; then + # Compressed file exists + if gunzip -c "${filename}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then + return 0 # Contains target species + else + return 1 # Does not contain target species + fi + elif [ -f "${decompressed_file}" ]; then + # Decompressed file exists + if head -2000 "${decompressed_file}" 2>/dev/null | grep -qE "(Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then + return 0 # Contains target species + else + return 1 # Does not contain target species + fi + fi + fi + + # File not downloaded yet, download first 500KB to check + # Download first 500KB (enough to get many sequence headers) + # This should be sufficient to identify the species in most cases + if curl -s --max-time 30 --range 0-512000 "${url}" -o "${temp_file}" 2>/dev/null && [ -s "${temp_file}" ]; then + # Try to decompress and check for species names + # Check for: Homo sapiens (人), Mus musculus (小鼠), Drosophila melanogaster (果蝇), Saccharomyces cerevisiae (酵母) + if gunzip -c "${temp_file}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then + rm -f "${temp_file}" + return 0 # Contains target species + else + rm -f "${temp_file}" + return 1 # Does not contain target species + fi + else + # If partial download fails, skip this file (don't download it) + rm -f "${temp_file}" + return 1 + fi +} + +# Download based on type +case ${DOWNLOAD_TYPE} in + 
human_mouse_drosophila_yeast) + echo "Downloading RefSeq sequences for Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae (minimal size)..." + echo "This will check each file to see if it contains target species sequences..." + + # Check multiple categories: vertebrate_mammalian (人、小鼠), invertebrate (果蝇), fungi (酵母) + categories="vertebrate_mammalian invertebrate fungi" + total_file_count=0 + total_download_count=0 + + for category in ${categories}; do + echo "Checking files in ${category} category..." + + # Get list of files and save to temp file to avoid subshell issues + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt + + file_count=0 + download_count=0 + + while read filename; do + file_count=$((file_count + 1)) + total_file_count=$((total_file_count + 1)) + url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" + echo -n "[${total_file_count}] Checking ${category}/${filename}... " + + if check_file_for_species "${url}" "${filename}"; then + # Check if file is already downloaded + if check_file_downloaded "${filename}"; then + echo "✓ already downloaded (contains target species)" + download_count=$((download_count + 1)) + total_download_count=$((total_download_count + 1)) + else + echo "✓ contains target species, downloading..." + download_count=$((download_count + 1)) + total_download_count=$((total_download_count + 1)) + wget -c -q --show-progress "${url}" || { + echo "Warning: Failed to download ${filename}" + } + fi + else + echo "✗ skipping (no target species data)" + fi + done < /tmp/refseq_files_${category}.txt + + rm -f /tmp/refseq_files_${category}.txt + echo " ${category}: Checked ${file_count} files, downloaded ${download_count} files." + done + + echo "" + echo "Summary: Checked ${total_file_count} files total, downloaded ${total_download_count} files containing target species (human, mouse, fruit fly, yeast)." + ;; + representative) + echo "Downloading RefSeq representative sequences (recommended, smaller size)..." + # Download major categories for representative coverage + # Note: You can modify this list based on your specific requirements + for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi; do + echo "Downloading ${category} sequences..." + # Get list of files and save to temp file to avoid subshell issues + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt + + while read filename; do + if check_file_downloaded "${filename}"; then + echo " ✓ ${filename} already downloaded, skipping..." + else + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + fi + done < /tmp/refseq_files_${category}.txt + + rm -f /tmp/refseq_files_${category}.txt + done + ;; + complete) + echo "Downloading RefSeq complete genomic sequences (WARNING: very large, may take hours)..." 
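+        # The complete/ directory already aggregates genomic sequences from every
+        # taxonomic category, so a single file listing is enough here (no per-category loop).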
+ # Get list of files and save to temp file to avoid subshell issues + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/" | \ + grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_complete.txt + + while read filename; do + if check_file_downloaded "${filename}"; then + echo " ✓ ${filename} already downloaded, skipping..." + else + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + fi + done < /tmp/refseq_files_complete.txt + + rm -f /tmp/refseq_files_complete.txt + ;; + all) + echo "Downloading all RefSeq genomic sequences from all categories (WARNING: extremely large, may take many hours)..." + # Download genomic sequences from all categories + for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral protozoa mitochondrion plastid plasmid other; do + echo "Downloading ${category} genomic sequences..." + # Get list of files and save to temp file to avoid subshell issues + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt + + while read filename; do + if check_file_downloaded "${filename}"; then + echo " ✓ ${filename} already downloaded, skipping..." + else + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + fi + done < /tmp/refseq_files_${category}.txt + + rm -f /tmp/refseq_files_${category}.txt + done + ;; + *) + echo "Error: Unknown download type '${DOWNLOAD_TYPE}'" + echo "Usage: $0 [human_mouse_drosophila_yeast|representative|complete|all]" + echo " human_mouse_drosophila_yeast: Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae (minimal)" + echo " representative: Download major categories (recommended)" + echo " complete: Download all complete genomic sequences (very large)" + echo " all: Download all genomic sequences (extremely large)" + echo "Note: For RNA sequences, use build_rna_blast_db.sh instead" + exit 1 + ;; +esac + +cd .. + +# Create release directory +mkdir -p refseq_${RELEASE} +mv ${DOWNLOAD_TMP}/* refseq_${RELEASE}/ 2>/dev/null || true +rmdir ${DOWNLOAD_TMP} 2>/dev/null || true + +cd refseq_${RELEASE} + +# Extract and combine sequences +echo "Extracting and combining sequences..." + +# Extract all downloaded genomic sequences +if [ $(find . -name "*.genomic.fna.gz" -type f | wc -l) -gt 0 ]; then + echo "Extracting genomic sequences..." + find . -name "*.genomic.fna.gz" -type f -exec gunzip {} \; +fi + +# Combine all FASTA files into one +echo "Combining all FASTA files..." +FASTA_FILES=$(find . -name "*.fna" -type f) +if [ -z "$FASTA_FILES" ]; then + FASTA_FILES=$(find . -name "*.fa" -type f) +fi + +if [ -z "$FASTA_FILES" ]; then + echo "Error: No FASTA files found to combine" + exit 1 +fi + +echo "$FASTA_FILES" | while read -r file; do + if [ -f "$file" ]; then + cat "$file" >> refseq_${RELEASE}.fasta + fi +done + +# Check if we have sequences +if [ ! -s "refseq_${RELEASE}.fasta" ]; then + echo "Error: Combined FASTA file is empty" + exit 1 +fi + +echo "Creating BLAST database..." 
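+# Once built, the database can be queried with blastn, for example:
+#   blastn -db refseq_${RELEASE} -query my_query.fna -outfmt 6 -max_target_seqs 5
+# (my_query.fna is a placeholder for your own nucleotide FASTA file.)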
+# Create BLAST database for DNA sequences (use -dbtype nucl for nucleotide) +makeblastdb -in refseq_${RELEASE}.fasta \ + -out refseq_${RELEASE} \ + -dbtype nucl \ + -parse_seqids \ + -title "RefSeq_${RELEASE}" + +echo "BLAST database created successfully!" +echo "Database location: $(pwd)/refseq_${RELEASE}" +echo "" +echo "To use this database, set in your config:" +echo " local_blast_db: $(pwd)/refseq_${RELEASE}" +echo "" +echo "Note: The database files are:" +ls -lh refseq_${RELEASE}.* + +cd .. + diff --git a/examples/search/build_db/build_protein_blast_db.sh b/examples/search/build_db/build_protein_blast_db.sh new file mode 100755 index 00000000..a9169959 --- /dev/null +++ b/examples/search/build_db/build_protein_blast_db.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +set -e + +# Downloads the latest release of UniProt, putting it in a release-specific directory. +# Creates associated BLAST databases. +# We need makeblastdb on our PATH +# For Ubuntu/Debian: sudo apt install ncbi-blast+ +# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ +# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ + +# NOTE: UniProt mirror +# Available mirrors: +# - UK/EBI: ftp://ftp.ebi.ac.uk/pub/databases/uniprot (current, recommended) +# - US: ftp://ftp.uniprot.org/pub/databases/uniprot +# - CH: ftp://ftp.expasy.org/databases/uniprot +UNIPROT_BASE="ftp://ftp.ebi.ac.uk/pub/databases/uniprot" + +# Parse command line arguments +DOWNLOAD_MODE="sprot" # sprot (Swiss-Prot) or full (sprot + trembl) + +usage() { + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Options:" + echo " -s, --sprot-only Download only Swiss-Prot database (recommended, high quality)" + echo " -f, --full Download full release (Swiss-Prot + TrEMBL, merged as uniprot_\${RELEASE})" + echo " -h, --help Show this help message" + echo "" + echo "Examples:" + echo " $0 --sprot-only # Download only uniprot_sprot" + echo " $0 --full # Download uniprot_\${RELEASE} (Swiss-Prot + TrEMBL)" +} + +while [[ $# -gt 0 ]]; do + case $1 in + -s|--sprot-only) + DOWNLOAD_MODE="sprot" + shift + ;; + -f|--full) + DOWNLOAD_MODE="full" + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" + usage + exit 1 + ;; + esac +done + +echo "Download mode: ${DOWNLOAD_MODE}" +if [ "${DOWNLOAD_MODE}" = "sprot" ]; then + echo " - Will download: uniprot_sprot only" +else + echo " - Will download: uniprot_\${RELEASE} (Swiss-Prot + TrEMBL merged)" +fi +echo "Using mirror: ${UNIPROT_BASE} (EBI/UK - fast for Asia/Europe)" +echo "" + +# Better to use a stable DOWNLOAD_TMP name to support resuming downloads +DOWNLOAD_TMP=_downloading +mkdir -p ${DOWNLOAD_TMP} +cd ${DOWNLOAD_TMP} + +echo "Downloading RELEASE.metalink..." +wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/RELEASE.metalink" + +# Extract the release name (like 2017_10 or 2017_1) +# Use sed for cross-platform compatibility (works on both macOS and Linux) +RELEASE=$(sed -n 's/.*\([0-9]\{4\}_[0-9]\{1,2\}\)<\/version>.*/\1/p' RELEASE.metalink | head -1) + +echo "UniProt release: ${RELEASE}" +echo "" + +# Download Swiss-Prot (always needed) +echo "Downloading uniprot_sprot.fasta.gz..." +wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" + +# Download TrEMBL only if full mode +if [ "${DOWNLOAD_MODE}" = "full" ]; then + echo "Downloading uniprot_trembl.fasta.gz..." + wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" +fi + +# Download metadata files +echo "Downloading metadata files..." 
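+# reldate.txt records the release date; README and LICENSE describe the contents
+# and terms of use. They are kept alongside the FASTA files for provenance.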
+wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/reldate.txt" +wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/README" +wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/LICENSE" + +cd .. + +mkdir -p ${RELEASE} +mv ${DOWNLOAD_TMP}/* ${RELEASE} +rmdir ${DOWNLOAD_TMP} + +cd ${RELEASE} + +echo "" +echo "Extracting files..." +gunzip uniprot_sprot.fasta.gz + +if [ "${DOWNLOAD_MODE}" = "full" ]; then + gunzip uniprot_trembl.fasta.gz + echo "Merging Swiss-Prot and TrEMBL..." + cat uniprot_sprot.fasta uniprot_trembl.fasta >uniprot_${RELEASE}.fasta +fi + +echo "" +echo "Building BLAST databases..." + +# Always build Swiss-Prot database +makeblastdb -in uniprot_sprot.fasta -out uniprot_sprot -dbtype prot -parse_seqids -title uniprot_sprot + +# Build full release database only if in full mode +if [ "${DOWNLOAD_MODE}" = "full" ]; then + makeblastdb -in uniprot_${RELEASE}.fasta -out uniprot_${RELEASE} -dbtype prot -parse_seqids -title uniprot_${RELEASE} + makeblastdb -in uniprot_trembl.fasta -out uniprot_trembl -dbtype prot -parse_seqids -title uniprot_trembl +fi + +cd .. + +echo "" +echo "BLAST databases created successfully!" +echo "Database locations:" +if [ "${DOWNLOAD_MODE}" = "sprot" ]; then + echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot" + echo "" + echo "To use this database, set in your config:" + echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot" +else + echo " - Combined: $(pwd)/${RELEASE}/uniprot_${RELEASE}" + echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot" + echo " - TrEMBL: $(pwd)/${RELEASE}/uniprot_trembl" + echo "" + echo "To use these databases, set in your config:" + echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot # or uniprot_${RELEASE} or uniprot_trembl" +fi + diff --git a/examples/search/build_db/build_rna_blast_db.sh b/examples/search/build_db/build_rna_blast_db.sh new file mode 100755 index 00000000..af688ac1 --- /dev/null +++ b/examples/search/build_db/build_rna_blast_db.sh @@ -0,0 +1,433 @@ +#!/bin/bash + +set -e + +# Downloads RNAcentral sequences and creates BLAST databases. +# This script downloads the RNAcentral active database, which is the same +# data source used for online RNAcentral searches, ensuring consistency +# between local and online search results. +# +# RNAcentral is a comprehensive database of non-coding RNA sequences that +# integrates data from multiple expert databases including RefSeq, Rfam, etc. +# +# Usage: ./build_rna_blast_db.sh [all|list|selected|database_name...] +# all (default): Download complete active database (~8.4G compressed) +# list: List all available database subsets +# selected: Download predefined database subsets (ensembl_gencode, mirbase, gtrnadb, refseq, lncbase) +# database_name: Download specific database subset (e.g., refseq, rfam, mirbase) +# database_name1 database_name2 ...: Download multiple database subsets +# +# Available database subsets (examples): +# - refseq.fasta (~98M): RefSeq RNA sequences +# - rfam.fasta (~1.5G): Rfam RNA families +# - mirbase.fasta (~10M): microRNA sequences +# - ensembl_gencode.fasta (~337M): Ensembl/GENCODE annotations (human) +# - gtrnadb.fasta (~38M): tRNA sequences +# - lncbase.fasta (~106K): Human lncRNA database +# - See "list" option for complete list +# +# The complete "active" database contains all sequences from all expert databases. +# Using a specific database subset provides a smaller, focused database. 
+# +# We need makeblastdb on our PATH +# For Ubuntu/Debian: sudo apt install ncbi-blast+ +# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ +# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ + +# RNAcentral base URL (using EBI HTTPS) +# NOTE: RNAcentral only has one official mirror at EBI +RNACENTRAL_BASE="https://ftp.ebi.ac.uk/pub/databases/RNAcentral" +RNACENTRAL_RELEASE_URL="${RNACENTRAL_BASE}/current_release" +RNACENTRAL_SEQUENCES_URL="${RNACENTRAL_RELEASE_URL}/sequences" +RNACENTRAL_BY_DB_URL="${RNACENTRAL_SEQUENCES_URL}/by-database" + +# Parse command line arguments +DB_SELECTION=${1:-selected} + +# Predefined database list for "selected" option +SELECTED_DATABASES=("ensembl_gencode" "mirbase" "gtrnadb" "refseq" "lncbase" "rfam") + +# List available databases if requested +if [ "${DB_SELECTION}" = "list" ]; then + echo "Available RNAcentral database subsets:" + echo "" + echo "Fetching list from RNAcentral..." + listing=$(curl -s "${RNACENTRAL_BY_DB_URL}/") + echo "${listing}" | \ + grep -oE '' | \ + sed 's///' | \ + sort | \ + while read db; do + size=$(echo "${listing}" | grep -A 1 "${db}" | grep -oE '[0-9.]+[GMK]' | head -1 || echo "unknown") + echo " - ${db%.fasta}: ${size}" + done + echo "" + echo "Usage: $0 [all|list|selected|database_name...]" + echo " Example: $0 refseq # Download only RefSeq sequences (~98M)" + echo " Example: $0 rfam # Download only Rfam sequences (~1.5G)" + echo " Example: $0 selected # Download predefined databases (ensembl_gencode, mirbase, gtrnadb, refseq, lncbase, rfam)" + echo " Example: $0 refseq mirbase # Download multiple databases" + echo " Example: $0 all # Download complete active database (~8.4G)" + exit 0 +fi + +# Determine which databases to download +if [ "${DB_SELECTION}" = "selected" ]; then + # Use predefined database list + DATABASES=("${SELECTED_DATABASES[@]}") + echo "Downloading selected databases: ${DATABASES[*]}" +elif [ "${DB_SELECTION}" = "all" ]; then + # Single database mode (all) + DATABASES=("all") +else + # Multiple databases provided as arguments + DATABASES=("$@") +fi + +# Get RNAcentral release version from release notes (once for all databases) +echo "Getting RNAcentral release information..." 
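+# The release identifier parsed below is only used to name output directories and
+# database titles; if release_notes.txt cannot be fetched or parsed, the current
+# date is used instead.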
+RELEASE_NOTES_URL="${RNACENTRAL_RELEASE_URL}/release_notes.txt" +RELEASE_NOTES_TMP=$(mktemp) +wget -q "${RELEASE_NOTES_URL}" -O "${RELEASE_NOTES_TMP}" 2>/dev/null || { + echo "Warning: Could not download release notes, using current date as release identifier" + RELEASE=$(date +%Y%m%d) +} + +if [ -f "${RELEASE_NOTES_TMP}" ] && [ -s "${RELEASE_NOTES_TMP}" ]; then + # Try to extract version from release notes (first line usually contains version info) + RELEASE=$(head -1 "${RELEASE_NOTES_TMP}" | grep -oE '[0-9]+\.[0-9]+' | head -1 | tr -d '.') + rm -f "${RELEASE_NOTES_TMP}" +fi + +if [ -z "${RELEASE}" ]; then + RELEASE=$(date +%Y%m%d) + echo "Using date as release identifier: ${RELEASE}" +else + echo "RNAcentral release: ${RELEASE}" +fi + +# Process each database +DB_COUNT=${#DATABASES[@]} +DB_INDEX=0 + +for DB_SELECTION in "${DATABASES[@]}"; do + DB_INDEX=$((DB_INDEX + 1)) + echo "" + echo "==========================================" + echo "Processing database ${DB_INDEX}/${DB_COUNT}: ${DB_SELECTION}" + echo "==========================================" + echo "" + + # Check if database already exists and is complete + # First check with current release version + if [ "${DB_SELECTION}" = "all" ]; then + OUTPUT_DIR="rnacentral_${RELEASE}" + DB_NAME="rnacentral" + DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}" + else + OUTPUT_DIR="rnacentral_${DB_SELECTION}_${RELEASE}" + DB_NAME="${DB_SELECTION}" + DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}" + fi + + # Check if BLAST database already exists with current release + if [ -d "${OUTPUT_DIR}" ] && [ -f "${OUTPUT_DIR}/${DB_OUTPUT_NAME}.nhr" ] && [ -f "${OUTPUT_DIR}/${DB_OUTPUT_NAME}.nin" ]; then + echo "✓ Database ${DB_SELECTION} already exists and appears complete: ${OUTPUT_DIR}/" + echo " BLAST database: ${OUTPUT_DIR}/${DB_OUTPUT_NAME}" + echo " Skipping download and database creation..." + continue + fi + + # Also check for any existing version of this database (e.g., different release dates) + EXISTING_DIR=$(ls -d rnacentral_${DB_SELECTION}_* 2>/dev/null | head -1) + if [ -n "${EXISTING_DIR}" ] && [ "${DB_SELECTION}" != "all" ]; then + EXISTING_DB_NAME=$(basename "${EXISTING_DIR}" | sed "s/rnacentral_${DB_SELECTION}_//") + if [ -f "${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_DB_NAME}.nhr" ] && [ -f "${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_DB_NAME}.nin" ]; then + echo "✓ Database ${DB_SELECTION} already exists (version ${EXISTING_DB_NAME}): ${EXISTING_DIR}/" + echo " BLAST database: ${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_DB_NAME}" + echo " Skipping download and database creation..." + echo " Note: Using existing version ${EXISTING_DB_NAME} instead of ${RELEASE}" + continue + fi + fi + + # Better to use a stable DOWNLOAD_TMP name to support resuming downloads + DOWNLOAD_TMP="_downloading_rnacentral_${DB_SELECTION}" + mkdir -p ${DOWNLOAD_TMP} + cd ${DOWNLOAD_TMP} + + # Download RNAcentral FASTA file + if [ "${DB_SELECTION}" = "all" ]; then + # Download complete active database + FASTA_FILE="rnacentral_active.fasta.gz" + DB_NAME="rnacentral" + echo "Downloading RNAcentral active sequences (~8.4G)..." 
+ echo " Contains sequences currently present in at least one expert database" + echo " Uses standard URS IDs (e.g., URS000149A9AF)" + echo " ⭐ MATCHES the online RNAcentral API database - ensures consistency" + FASTA_URL="${RNACENTRAL_SEQUENCES_URL}/${FASTA_FILE}" + IS_COMPRESSED=true + else + # Download specific database subset + DB_NAME="${DB_SELECTION}" + FASTA_FILE="${DB_SELECTION}.fasta" + echo "Downloading RNAcentral database subset: ${DB_SELECTION}" + echo " This is a subset of the active database from a specific expert database" + echo " File: ${FASTA_FILE}" + FASTA_URL="${RNACENTRAL_BY_DB_URL}/${FASTA_FILE}" + IS_COMPRESSED=false + + # Check if database exists (use HTTP status code check for HTTPS) + HTTP_CODE=$(curl -s --max-time 10 -o /dev/null -w "%{http_code}" "${FASTA_URL}" 2>/dev/null | tail -1 || echo "000") + if ! echo "${HTTP_CODE}" | grep -q "^200$"; then + echo "Error: Database '${DB_SELECTION}' not found (HTTP code: ${HTTP_CODE})" + echo "Run '$0 list' to see available databases" + cd .. + rm -rf ${DOWNLOAD_TMP} + exit 1 + fi + fi + + echo "Downloading from: ${FASTA_URL}" + echo "This may take a while depending on your internet connection..." + if [ "${DB_SELECTION}" = "all" ]; then + echo "File size is approximately 8-9GB, please be patient..." + else + echo "Downloading database subset..." + fi + + wget -c "${FASTA_URL}" || { + echo "Error: Failed to download RNAcentral FASTA file" + echo "Please check your internet connection and try again" + echo "URL: ${FASTA_URL}" + cd .. + rm -rf ${DOWNLOAD_TMP} + exit 1 + } + + if [ ! -f "${FASTA_FILE}" ]; then + echo "Error: Downloaded file not found" + cd .. + rm -rf ${DOWNLOAD_TMP} + exit 1 + fi + + cd .. + + # Create release directory + if [ "${DB_SELECTION}" = "all" ]; then + OUTPUT_DIR="rnacentral_${RELEASE}" + else + OUTPUT_DIR="rnacentral_${DB_NAME}_${RELEASE}" + fi + mkdir -p ${OUTPUT_DIR} + mv ${DOWNLOAD_TMP}/* ${OUTPUT_DIR}/ 2>/dev/null || true + rmdir ${DOWNLOAD_TMP} 2>/dev/null || true + + cd ${OUTPUT_DIR} + + # Extract FASTA file if compressed + echo "Preparing RNAcentral sequences..." + if [ -f "${FASTA_FILE}" ]; then + if [ "${IS_COMPRESSED}" = "true" ]; then + echo "Decompressing ${FASTA_FILE}..." + OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta" + gunzip -c "${FASTA_FILE}" > "${OUTPUT_FASTA}" || { + echo "Error: Failed to decompress FASTA file" + cd .. + exit 1 + } + # Optionally remove the compressed file to save space + # rm "${FASTA_FILE}" + else + # File is not compressed, just copy/rename + OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta" + cp "${FASTA_FILE}" "${OUTPUT_FASTA}" || { + echo "Error: Failed to copy FASTA file" + cd .. + exit 1 + } + fi + else + echo "Error: FASTA file not found" + cd .. + exit 1 + fi + + # Check if we have sequences + if [ ! -s "${OUTPUT_FASTA}" ]; then + echo "Error: FASTA file is empty" + cd .. + exit 1 + fi + + # Get file size for user information + FILE_SIZE=$(du -h "${OUTPUT_FASTA}" | cut -f1) + echo "FASTA file size: ${FILE_SIZE}" + + echo "Creating BLAST database..." + # Create BLAST database for RNA sequences (use -dbtype nucl for nucleotide) + # Note: RNAcentral uses RNAcentral IDs (URS...) as sequence identifiers, + # which matches the format expected by the RNACentralSearch class + DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}" + makeblastdb -in "${OUTPUT_FASTA}" \ + -out "${DB_OUTPUT_NAME}" \ + -dbtype nucl \ + -parse_seqids \ + -title "RNAcentral_${DB_NAME}_${RELEASE}" + + echo "" + echo "BLAST database created successfully!" 
+ echo "Database location: $(pwd)/${DB_OUTPUT_NAME}" + echo "" + echo "To use this database, set in your config (search_rna_config.yaml):" + echo " rnacentral_params:" + echo " use_local_blast: true" + echo " local_blast_db: $(pwd)/${DB_OUTPUT_NAME}" + echo "" + echo "Note: The database files are:" + ls -lh ${DB_OUTPUT_NAME}.* | head -5 + echo "" + if [ "${DB_SELECTION}" = "all" ]; then + echo "This database uses RNAcentral IDs (URS...), which matches the online" + echo "RNAcentral search API, ensuring consistent results between local and online searches." + else + echo "This is a subset database from ${DB_SELECTION} expert database." + echo "For full coverage matching online API, use 'all' option." + fi + + cd .. +done + +echo "" +echo "==========================================" +echo "All databases processed successfully!" +echo "==========================================" +echo "" + +# If multiple databases were downloaded, offer to merge them +if [ ${#DATABASES[@]} -gt 1 ] && [ "${DATABASES[0]}" != "all" ]; then + echo "Multiple databases downloaded. Creating merged database for unified search..." + MERGED_DIR="rnacentral_merged_${RELEASE}" + mkdir -p ${MERGED_DIR} + cd ${MERGED_DIR} + + MERGED_FASTA="rnacentral_merged_${RELEASE}.fasta" + MERGED_FASTA_TMP="${MERGED_FASTA}.tmp" + echo "Combining FASTA files from all databases..." + echo " Note: Duplicate sequence IDs will be removed (keeping first occurrence)..." + + # Combine all FASTA files into a temporary file + # Find actual database directories (may have different release versions) + FOUND_ANY=false + for DB_SELECTION in "${DATABASES[@]}"; do + [ "${DB_SELECTION}" = "all" ] && continue + + # Try current release version first, then search for any existing version + OUTPUT_FASTA="../rnacentral_${DB_SELECTION}_${RELEASE}/${DB_SELECTION}_${RELEASE}.fasta" + [ ! -f "${OUTPUT_FASTA}" ] && { + EXISTING_DIR=$(ls -d ../rnacentral_${DB_SELECTION}_* 2>/dev/null | head -1) + [ -n "${EXISTING_DIR}" ] && { + EXISTING_VERSION=$(basename "${EXISTING_DIR}" | sed "s/rnacentral_${DB_SELECTION}_//") + OUTPUT_FASTA="${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_VERSION}.fasta" + } + } + + if [ -f "${OUTPUT_FASTA}" ]; then + echo " Adding ${DB_SELECTION} sequences..." + cat "${OUTPUT_FASTA}" >> "${MERGED_FASTA_TMP}" + FOUND_ANY=true + else + echo " Warning: Could not find FASTA file for ${DB_SELECTION}" + fi + done + + # Validate that we have files to merge + if [ "${FOUND_ANY}" = "false" ] || [ ! -s "${MERGED_FASTA_TMP}" ]; then + echo "Error: No FASTA files found to merge" + cd .. + rm -rf ${MERGED_DIR} + exit 1 + fi + + # Remove duplicates based on sequence ID (keeping first occurrence) + echo " Removing duplicate sequence IDs..." + awk ' + /^>/ { + # Process previous sequence if we have one + if (current_id != "" && !seen[current_id]) { + print current_header ORS current_seq + seen[current_id] = 1 + } + # Start new sequence + current_header = $0 + current_id = substr($0, 2) + sub(/[ \t].*/, "", current_id) # Extract ID up to first space/tab + current_seq = "" + next + } + { + # Accumulate sequence data by concatenating lines + current_seq = current_seq $0 + } + END { + # Process last sequence + if (current_id != "" && !seen[current_id]) { + print current_header ORS current_seq + } + } + ' "${MERGED_FASTA_TMP}" > "${MERGED_FASTA}" + rm -f "${MERGED_FASTA_TMP}" + + # Check if merged file was created and has content + if [ ! -s "${MERGED_FASTA}" ]; then + echo "Warning: Merged FASTA file is empty or not created" + cd .. 
+ rm -rf ${MERGED_DIR} + else + FILE_SIZE=$(du -h "${MERGED_FASTA}" | cut -f1) + echo "Merged FASTA file size: ${FILE_SIZE}" + + echo "Creating merged BLAST database..." + MERGED_DB_NAME="rnacentral_merged_${RELEASE}" + makeblastdb -in "${MERGED_FASTA}" \ + -out "${MERGED_DB_NAME}" \ + -dbtype nucl \ + -parse_seqids \ + -title "RNAcentral_Merged_${RELEASE}" + + echo "" + echo "✓ Merged BLAST database created successfully!" + echo "Database location: $(pwd)/${MERGED_DB_NAME}" + echo "" + echo "To use the merged database, set in your config (search_rna_config.yaml):" + echo " rnacentral_params:" + echo " use_local_blast: true" + echo " local_blast_db: $(pwd)/${MERGED_DB_NAME}" + echo "" + echo "Note: The merged database includes: ${DATABASES[*]}" + cd .. + fi +fi + +echo "" +echo "Summary of downloaded databases:" +for DB_SELECTION in "${DATABASES[@]}"; do + if [ "${DB_SELECTION}" = "all" ]; then + OUTPUT_DIR="rnacentral_${RELEASE}" + DB_NAME="rnacentral" + else + OUTPUT_DIR="rnacentral_${DB_SELECTION}_${RELEASE}" + DB_NAME="${DB_SELECTION}" + fi + if [ -d "${OUTPUT_DIR}" ]; then + echo " - ${DB_NAME}: ${OUTPUT_DIR}/" + fi +done + +if [ -d "rnacentral_merged_${RELEASE}" ]; then + echo " - merged (all databases): rnacentral_merged_${RELEASE}/" + echo "" + echo "💡 Recommendation: Use the merged database for searching across all databases." +fi + diff --git a/examples/search/search_dna.sh b/examples/search/search_dna.sh new file mode 100644 index 00000000..e05ab751 --- /dev/null +++ b/examples/search/search_dna.sh @@ -0,0 +1,4 @@ +python3 -m graphgen.run \ +--config_file examples/search/search_dna_config.yaml \ +--output_dir cache/ + diff --git a/examples/search/search_dna_config.yaml b/examples/search/search_dna_config.yaml new file mode 100644 index 00000000..9a469aff --- /dev/null +++ b/examples/search/search_dna_config.yaml @@ -0,0 +1,30 @@ +global_params: + working_dir: cache + +nodes: + - id: read_step + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/search_dna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples + + - id: search_step + op_name: search + type: map_batch + dependencies: + - read_step # search_step depends on read_step + execution_params: + replicas: 1 + batch_size: 10 + params: + data_sources: [ncbi] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral + ncbi_params: + email: test@example.com # NCBI requires an email address + tool: GraphGen # tool name for NCBI API + use_local_blast: true # whether to use local blast for DNA search + local_blast_db: refseq_release/refseq_release # path to local BLAST database (without .nhr extension) + blast_num_threads: 2 # number of threads for BLAST search (reduce to save memory) + max_concurrent: 5 # maximum number of concurrent search tasks (reduce to prevent OOM, default: unlimited) + diff --git a/examples/search/search_protein_config.yaml b/examples/search/search_protein_config.yaml new file mode 100644 index 00000000..c33e5096 --- /dev/null +++ b/examples/search/search_protein_config.yaml @@ -0,0 +1,28 @@ +global_params: + working_dir: cache + +nodes: + - id: read_step + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/search_protein_demo.jsonl # input file path, support json, jsonl, txt, pdf. 
See examples/input_examples for examples + + - id: search_step + op_name: search + type: map_batch + dependencies: + - read_step # search_step depends on read_step + execution_params: + replicas: 1 + batch_size: 10 + params: + data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot + uniprot_params: + use_local_blast: true # whether to use local blast for uniprot search + local_blast_db: /your_path/2024_01/uniprot_sprot # format: /path/to/${RELEASE}/uniprot_sprot + # options: uniprot_sprot (recommended, high quality), uniprot_trembl, or uniprot_${RELEASE} (merged database) + blast_num_threads: 2 # number of threads for BLAST search (reduce to save memory) + max_concurrent: 5 # maximum number of concurrent search tasks (reduce to prevent OOM, default: unlimited) diff --git a/examples/search/search_rna.sh b/examples/search/search_rna.sh new file mode 100644 index 00000000..2bf5a406 --- /dev/null +++ b/examples/search/search_rna.sh @@ -0,0 +1,4 @@ +python3 -m graphgen.run \ +--config_file examples/search/search_rna_config.yaml \ +--output_dir cache/ + diff --git a/examples/search/search_rna_config.yaml b/examples/search/search_rna_config.yaml new file mode 100644 index 00000000..ba6ba3f3 --- /dev/null +++ b/examples/search/search_rna_config.yaml @@ -0,0 +1,27 @@ +global_params: + working_dir: cache + +nodes: + - id: read_step + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/search_rna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples + + - id: search_step + op_name: search + type: map_batch + dependencies: + - read_step # search_step depends on read_step + execution_params: + replicas: 1 + batch_size: 10 + params: + data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral + rnacentral_params: + use_local_blast: true # whether to use local blast for RNA search + local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD # path to local BLAST database (without .nhr extension) + blast_num_threads: 2 # number of threads for BLAST search (reduce to save memory) + max_concurrent: 5 # maximum number of concurrent search tasks (reduce to prevent OOM, default: unlimited) diff --git a/examples/search/search_uniprot.sh b/examples/search/search_uniprot.sh new file mode 100644 index 00000000..fbb33d70 --- /dev/null +++ b/examples/search/search_uniprot.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.run \ +--config_file examples/search/search_protein_config.yaml \ +--output_dir cache/ diff --git a/graphgen/version.py b/graphgen/_version.py similarity index 72% rename from graphgen/version.py rename to graphgen/_version.py index 73315e64..a7ae27a2 100644 --- a/graphgen/version.py +++ b/graphgen/_version.py @@ -1,7 +1,6 @@ - from typing import Tuple -__version__ = '20250416' +__version__ = "0.1.0.post20250930" short_version = __version__ @@ -15,13 +14,13 @@ def parse_version_info(version_str: str) -> Tuple: tuple: A sequence of integer and string represents version. 
""" _version_info = [] - for x in version_str.split('.'): + for x in version_str.split("."): if x.isdigit(): _version_info.append(int(x)) - elif x.find('rc') != -1: - patch_version = x.split('rc') + elif x.find("rc") != -1: + patch_version = x.split("rc") _version_info.append(int(patch_version[0])) - _version_info.append(f'rc{patch_version[1]}') + _version_info.append(f"rc{patch_version[1]}") return tuple(_version_info) diff --git a/graphgen/bases/__init__.py b/graphgen/bases/__init__.py new file mode 100644 index 00000000..41136974 --- /dev/null +++ b/graphgen/bases/__init__.py @@ -0,0 +1,12 @@ +from .base_extractor import BaseExtractor +from .base_generator import BaseGenerator +from .base_kg_builder import BaseKGBuilder +from .base_llm_wrapper import BaseLLMWrapper +from .base_operator import BaseOperator +from .base_partitioner import BasePartitioner +from .base_reader import BaseReader +from .base_searcher import BaseSearcher +from .base_splitter import BaseSplitter +from .base_storage import BaseGraphStorage, BaseKVStorage, StorageNameSpace +from .base_tokenizer import BaseTokenizer +from .datatypes import Chunk, Config, Node, QAPair, Token diff --git a/graphgen/bases/base_extractor.py b/graphgen/bases/base_extractor.py new file mode 100644 index 00000000..054e6e3a --- /dev/null +++ b/graphgen/bases/base_extractor.py @@ -0,0 +1,22 @@ +from abc import ABC, abstractmethod +from typing import Any + +from graphgen.bases.base_llm_wrapper import BaseLLMWrapper + + +class BaseExtractor(ABC): + """ + Extract information from given text. + + """ + + def __init__(self, llm_client: BaseLLMWrapper): + self.llm_client = llm_client + + @abstractmethod + async def extract(self, chunk: dict) -> Any: + """Extract information from the given text""" + + @abstractmethod + def build_prompt(self, text: str) -> str: + """Build prompt for LLM based on the given text""" diff --git a/graphgen/bases/base_generator.py b/graphgen/bases/base_generator.py new file mode 100644 index 00000000..85de5877 --- /dev/null +++ b/graphgen/bases/base_generator.py @@ -0,0 +1,83 @@ +from abc import ABC, abstractmethod +from typing import Any + +from graphgen.bases.base_llm_wrapper import BaseLLMWrapper + + +class BaseGenerator(ABC): + """ + Generate QAs based on given prompts. + """ + + def __init__(self, llm_client: BaseLLMWrapper): + self.llm_client = llm_client + + @staticmethod + @abstractmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + """Build prompt for LLM based on the given batch""" + + @staticmethod + @abstractmethod + def parse_response(response: str) -> Any: + """Parse the LLM response and return the generated QAs""" + + async def generate( + self, + batch: tuple[ + list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]] + ], + ) -> dict[str, Any]: + """ + Generate QAs based on a given batch. 
+ :param batch + :return: QA pairs + """ + result = {} + prompt = self.build_prompt(batch) + response = await self.llm_client.generate_answer(prompt) + qa_pairs = self.parse_response(response) # generate one or more QA pairs + result.update(qa_pairs) + return result + + @staticmethod + def format_generation_results( + results: list[dict], output_data_format: str + ) -> list[dict[str, Any]]: + if output_data_format == "Alpaca": + results = [ + { + "instruction": v["question"], + "input": "", + "output": v["answer"], + } + for item in results + for k, v in item.items() + ] + elif output_data_format == "Sharegpt": + results = [ + { + "conversations": [ + {"from": "human", "value": v["question"]}, + {"from": "gpt", "value": v["answer"]}, + ] + } + for item in results + for k, v in item.items() + ] + elif output_data_format == "ChatML": + results = [ + { + "messages": [ + {"role": "user", "content": v["question"]}, + {"role": "assistant", "content": v["answer"]}, + ] + } + for item in results + for k, v in item.items() + ] + else: + raise ValueError(f"Unknown output data format: {output_data_format}") + return results diff --git a/graphgen/bases/base_kg_builder.py b/graphgen/bases/base_kg_builder.py new file mode 100644 index 00000000..d8a5d66a --- /dev/null +++ b/graphgen/bases/base_kg_builder.py @@ -0,0 +1,39 @@ +from abc import ABC, abstractmethod +from collections import defaultdict +from typing import Dict, List, Tuple + +from graphgen.bases.base_llm_wrapper import BaseLLMWrapper +from graphgen.bases.base_storage import BaseGraphStorage +from graphgen.bases.datatypes import Chunk + + +class BaseKGBuilder(ABC): + def __init__(self, llm_client: BaseLLMWrapper): + self.llm_client = llm_client + self._nodes: Dict[str, List[dict]] = defaultdict(list) + self._edges: Dict[Tuple[str, str], List[dict]] = defaultdict(list) + + @abstractmethod + async def extract( + self, chunk: Chunk + ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]: + """Extract nodes and edges from a single chunk.""" + raise NotImplementedError + + @abstractmethod + async def merge_nodes( + self, + node_data: tuple[str, List[dict]], + kg_instance: BaseGraphStorage, + ) -> None: + """Merge extracted nodes into the knowledge graph.""" + raise NotImplementedError + + @abstractmethod + async def merge_edges( + self, + edges_data: tuple[Tuple[str, str], List[dict]], + kg_instance: BaseGraphStorage, + ) -> None: + """Merge extracted edges into the knowledge graph.""" + raise NotImplementedError diff --git a/graphgen/bases/base_llm_wrapper.py b/graphgen/bases/base_llm_wrapper.py new file mode 100644 index 00000000..8b6dbec7 --- /dev/null +++ b/graphgen/bases/base_llm_wrapper.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +import abc +import re +from typing import Any, List, Optional + +from graphgen.bases.base_tokenizer import BaseTokenizer +from graphgen.bases.datatypes import Token + + +class BaseLLMWrapper(abc.ABC): + """ + LLM client base class, agnostic to specific backends (OpenAI / Ollama / ...). 
+    """
+
+    def __init__(
+        self,
+        *,
+        system_prompt: str = "",
+        temperature: float = 0.0,
+        max_tokens: int = 4096,
+        repetition_penalty: float = 1.05,
+        top_p: float = 0.95,
+        top_k: int = 50,
+        tokenizer: Optional[BaseTokenizer] = None,
+        **kwargs: Any,
+    ):
+        self.system_prompt = system_prompt
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.repetition_penalty = repetition_penalty
+        self.top_p = top_p
+        self.top_k = top_k
+        self.tokenizer = tokenizer
+
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+
+    @abc.abstractmethod
+    async def generate_answer(
+        self, text: str, history: Optional[List[str]] = None, **extra: Any
+    ) -> str:
+        """Generate answer from the model."""
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    async def generate_topk_per_token(
+        self, text: str, history: Optional[List[str]] = None, **extra: Any
+    ) -> List[Token]:
+        """Generate top-k tokens for the next token prediction."""
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    async def generate_inputs_prob(
+        self, text: str, history: Optional[List[str]] = None, **extra: Any
+    ) -> List[Token]:
+        """Generate probabilities for each token in the input."""
+        raise NotImplementedError
+
+    @staticmethod
+    def filter_think_tags(text: str, think_tag: str = "think") -> str:
+        """
+        Remove <think> tags from the text.
+        - If the text contains <think> and </think>, it removes everything between them and the tags themselves.
+        - If the text contains only </think>, it removes content before the tag.
+        """
+        paired_pattern = re.compile(rf"<{think_tag}>.*?</{think_tag}>", re.DOTALL)
+        filtered = paired_pattern.sub("", text)
+
+        orphan_pattern = re.compile(rf"^.*?</{think_tag}>", re.DOTALL)
+        filtered = orphan_pattern.sub("", filtered)
+
+        filtered = filtered.strip()
+        return filtered if filtered else text.strip()
diff --git a/graphgen/bases/base_operator.py b/graphgen/bases/base_operator.py
new file mode 100644
index 00000000..8d0cddb5
--- /dev/null
+++ b/graphgen/bases/base_operator.py
@@ -0,0 +1,59 @@
+import inspect
+import logging
+import os
+from abc import ABC, abstractmethod
+from typing import Iterable, Union
+
+import pandas as pd
+import ray
+
+from graphgen.utils.log import CURRENT_LOGGER_VAR, set_logger
+
+
+class BaseOperator(ABC):
+    def __init__(self, working_dir: str = "cache", op_name: str = None):
+        log_dir = os.path.join(working_dir, "logs")
+        self.op_name = op_name or self.__class__.__name__
+
+        try:
+            ctx = ray.get_runtime_context()
+            worker_id = ctx.get_actor_id() or ctx.get_worker_id()
+            worker_id_short = worker_id[-6:] if worker_id else "driver"
+        except Exception as e:
+            print(
+                "Warning: Could not get Ray worker ID, defaulting to 'local'. Exception:",
+                e,
+            )
+            worker_id_short = "local"
+
+        # e.g. 
cache/logs/ChunkService_a1b2c3.log + log_file = os.path.join(log_dir, f"{self.op_name}_{worker_id_short}.log") + + self.logger = set_logger( + log_file=log_file, name=f"{self.op_name}.{worker_id_short}", + console_level=logging.ERROR, force=True + ) + + self.logger.info( + "[%s] Operator initialized on Worker %s", self.op_name, worker_id_short + ) + + def __call__( + self, batch: pd.DataFrame + ) -> Union[pd.DataFrame, Iterable[pd.DataFrame]]: + logger_token = CURRENT_LOGGER_VAR.set(self.logger) + try: + result = self.process(batch) + if inspect.isgenerator(result): + yield from result + else: + yield result + finally: + CURRENT_LOGGER_VAR.reset(logger_token) + + @abstractmethod + def process(self, batch): + raise NotImplementedError("Subclasses must implement the process method.") + + def get_logger(self): + return self.logger diff --git a/graphgen/bases/base_partitioner.py b/graphgen/bases/base_partitioner.py new file mode 100644 index 00000000..d948e3a7 --- /dev/null +++ b/graphgen/bases/base_partitioner.py @@ -0,0 +1,69 @@ +from abc import ABC, abstractmethod +from typing import Any, List + +from graphgen.bases.base_storage import BaseGraphStorage +from graphgen.bases.datatypes import Community + + +class BasePartitioner(ABC): + @abstractmethod + def partition( + self, + g: BaseGraphStorage, + **kwargs: Any, + ) -> List[Community]: + """ + Graph -> Communities + :param g: Graph storage instance + :param kwargs: Additional parameters for partitioning + :return: List of communities + """ + + @staticmethod + def community2batch( + comm: Community, g: BaseGraphStorage + ) -> tuple[ + list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]] + ]: + """ + Convert communities to batches of nodes and edges. + :param comm: Community + :param g: Graph storage instance + :return: List of batches, each batch is a tuple of (nodes, edges) + """ + nodes = comm.nodes + edges = comm.edges + nodes_data = [] + for node in nodes: + node_data = g.get_node(node) + if node_data: + nodes_data.append((node, node_data)) + edges_data = [] + for u, v in edges: + edge_data = g.get_edge(u, v) + if edge_data: + edges_data.append((u, v, edge_data)) + else: + edge_data = g.get_edge(v, u) + if edge_data: + edges_data.append((v, u, edge_data)) + return nodes_data, edges_data + + @staticmethod + def _build_adjacency_list( + nodes: List[tuple[str, dict]], edges: List[tuple[str, str, dict]] + ) -> tuple[dict[str, List[str]], set[tuple[str, str]]]: + """ + Build adjacency list and edge set from nodes and edges. + :param nodes + :param edges + :return: adjacency list, edge set + """ + adj: dict[str, List[str]] = {n[0]: [] for n in nodes} + edge_set: set[tuple[str, str]] = set() + for e in edges: + adj[e[0]].append(e[1]) + adj[e[1]].append(e[0]) + edge_set.add((e[0], e[1])) + edge_set.add((e[1], e[0])) + return adj, edge_set diff --git a/graphgen/bases/base_reader.py b/graphgen/bases/base_reader.py new file mode 100644 index 00000000..ba72f410 --- /dev/null +++ b/graphgen/bases/base_reader.py @@ -0,0 +1,83 @@ +import os +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Union + +import pandas as pd +import requests +from ray.data import Dataset + + +class BaseReader(ABC): + """ + Abstract base class for reading and processing data. 
+ """ + + def __init__(self, text_column: str = "content", modalities: list = None): + self.text_column = text_column + self.modalities = modalities if modalities is not None else ["text"] + + @abstractmethod + def read(self, input_path: Union[str, List[str]]) -> Dataset: + """ + Read data from the specified file path. + + :param input_path: Path to the input file or list of file paths. + :return: Ray Dataset containing the read data. + """ + + def _should_keep_item(self, item: Dict[str, Any]) -> bool: + """ + Determine whether to keep the given item based on the text column. + + :param item: Dictionary representing a data entry. + :return: True if the item should be kept, False otherwise. + """ + item_type = item.get("type") + assert item_type in [ + "text", + "image", + "table", + "equation", + "protein", + "dna", + "rna", + ], f"Unsupported item type: {item_type}" + if item_type == "text": + content = item.get(self.text_column, "").strip() + return bool(content) + return True + + def _validate_batch(self, batch: pd.DataFrame) -> pd.DataFrame: + """ + Validate data format. + """ + if "type" not in batch.columns: + raise ValueError(f"Missing 'type' column. Found: {list(batch.columns)}") + + if "text" in batch["type"].values: + if self.text_column not in batch.columns: + raise ValueError( + f"Missing '{self.text_column}' column for text documents" + ) + + return batch + + @staticmethod + def _image_exists(path_or_url: str, timeout: int = 3) -> bool: + """ + Check if an image exists at the given local path or URL. + :param path_or_url: Local file path or remote URL of the image. + :param timeout: Timeout for remote URL requests in seconds. + :return: True if the image exists, False otherwise. + """ + if not path_or_url: + return False + if not path_or_url.startswith(("http://", "https://", "ftp://")): + path = path_or_url.replace("file://", "", 1) + path = os.path.abspath(path) + return os.path.isfile(path) + try: + resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout) + return resp.status_code == 200 + except requests.RequestException: + return False diff --git a/graphgen/bases/base_searcher.py b/graphgen/bases/base_searcher.py new file mode 100644 index 00000000..cba0315e --- /dev/null +++ b/graphgen/bases/base_searcher.py @@ -0,0 +1,47 @@ +import logging +import os +from abc import ABC, abstractmethod +from typing import Any, Dict, List + +from graphgen.utils.log import set_logger + + +class BaseSearcher(ABC): + """ + Abstract base class for searching and retrieving data. + """ + + def __init__(self, working_dir: str = "cache"): + """ + Initialize the base searcher with a logger. + + :param working_dir: Working directory for log files. + """ + log_dir = os.path.join(working_dir, "logs") + searcher_name = self.__class__.__name__ + + # e.g. cache/logs/NCBISearch.log + log_file = os.path.join(log_dir, f"{searcher_name}.log") + + self.logger = set_logger( + log_file=log_file, name=searcher_name, + console_level=logging.ERROR, force=True + ) + + self.logger.info( + "[%s] Searcher initialized", searcher_name + ) + + @abstractmethod + async def search(self, query: str, **kwargs) -> List[Dict[str, Any]]: + """ + Search for data based on the given query. + + :param query: The searcher query. + :param kwargs: Additional keyword arguments for the searcher. + :return: List of dictionaries containing the searcher results. 
+ """ + + def get_logger(self): + """Get the logger instance.""" + return self.logger diff --git a/graphgen/bases/base_splitter.py b/graphgen/bases/base_splitter.py new file mode 100644 index 00000000..f77be6e4 --- /dev/null +++ b/graphgen/bases/base_splitter.py @@ -0,0 +1,142 @@ +import copy +import re +from abc import ABC, abstractmethod +from typing import Callable, Iterable, List, Literal, Optional, Union + +from graphgen.bases.datatypes import Chunk +from graphgen.utils.log import logger + + +class BaseSplitter(ABC): + """ + Abstract base class for splitting text into smaller chunks. + """ + + def __init__( + self, + chunk_size: int = 1024, + chunk_overlap: int = 100, + length_function: Callable[[str], int] = len, + keep_separator: bool = False, + add_start_index: bool = False, + strip_whitespace: bool = True, + ): + self.chunk_size = chunk_size + self.chunk_overlap = chunk_overlap + self.length_function = length_function + self.keep_separator = keep_separator + self.add_start_index = add_start_index + self.strip_whitespace = strip_whitespace + + @abstractmethod + def split_text(self, text: str) -> List[str]: + """ + Split the input text into smaller chunks. + + :param text: The input text to be chunk. + :return: A list of text chunks. + """ + + def create_chunks( + self, texts: List[str], metadatas: Optional[List[dict]] = None + ) -> List[Chunk]: + """Create chunks from a list of texts.""" + _metadatas = metadatas or [{}] * len(texts) + chunks = [] + for i, text in enumerate(texts): + index = 0 + previous_chunk_len = 0 + for chunk in self.split_text(text): + metadata = copy.deepcopy(_metadatas[i]) + if self.add_start_index: + offset = index + previous_chunk_len - self.chunk_overlap + index = text.find(chunk, max(0, offset)) + metadata["start_index"] = index + previous_chunk_len = len(chunk) + new_chunk = Chunk(content=chunk, metadata=metadata) + chunks.append(new_chunk) + return chunks + + def _join_chunks(self, chunks: List[str], separator: str) -> Optional[str]: + text = separator.join(chunks) + if self.strip_whitespace: + text = text.strip() + if text == "": + return None + return text + + def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]: + # We now want to combine these smaller pieces into medium size chunks to send to the LLM. 
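+        # Illustrative example (assuming the default length_function=len):
+        # with chunk_size=10, chunk_overlap=5 and separator=" ", the splits
+        # ["aaaa", "bbbb", "cccc"] are merged into ["aaaa bbbb", "bbbb cccc"].
+        # Older splits are dropped from the front of the window until the
+        # retained length no longer exceeds chunk_overlap and the next split
+        # fits, so "bbbb" carries over as the overlap into the next chunk.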
+ separator_len = self.length_function(separator) + + chunks = [] + current_chunk: List[str] = [] + total = 0 + for d in splits: + _len = self.length_function(d) + if ( + total + _len + (separator_len if len(current_chunk) > 0 else 0) + > self.chunk_size + ): + if total > self.chunk_size: + logger.warning( + "Created a chunk of size %s, which is longer than the specified %s", + total, + self.chunk_size, + ) + if len(current_chunk) > 0: + chunk = self._join_chunks(current_chunk, separator) + if chunk is not None: + chunks.append(chunk) + # Keep on popping if: + # - we have a larger chunk than in the chunk overlap + # - or if we still have any chunks and the length is long + while total > self.chunk_overlap or ( + total + _len + (separator_len if len(current_chunk) > 0 else 0) + > self.chunk_size + and total > 0 + ): + total -= self.length_function(current_chunk[0]) + ( + separator_len if len(current_chunk) > 1 else 0 + ) + current_chunk = current_chunk[1:] + current_chunk.append(d) + total += _len + (separator_len if len(current_chunk) > 1 else 0) + chunk = self._join_chunks(current_chunk, separator) + if chunk is not None: + chunks.append(chunk) + return chunks + + @staticmethod + def _split_text_with_regex( + text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]] + ) -> List[str]: + # Now that we have the separator, chunk the text + if separator: + if keep_separator: + # The parentheses in the pattern keep the delimiters in the result. + _splits = re.split(f"({separator})", text) + splits = ( + ( + [ + _splits[i] + _splits[i + 1] + for i in range(0, len(_splits) - 1, 2) + ] + ) + if keep_separator == "end" + else ( + [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)] + ) + ) + if len(_splits) % 2 == 0: + splits += _splits[-1:] + splits = ( + (splits + [_splits[-1]]) + if keep_separator == "end" + else ([_splits[0]] + splits) + ) + else: + splits = re.split(separator, text) + else: + splits = list(text) + return [s for s in splits if s != ""] diff --git a/graphgen/bases/base_storage.py b/graphgen/bases/base_storage.py new file mode 100644 index 00000000..ff7d2d1a --- /dev/null +++ b/graphgen/bases/base_storage.py @@ -0,0 +1,96 @@ +from dataclasses import dataclass +from typing import Generic, TypeVar, Union + +T = TypeVar("T") + + +@dataclass +class StorageNameSpace: + working_dir: str = None + namespace: str = None + + def index_done_callback(self): + """commit the storage operations after indexing""" + + def query_done_callback(self): + """commit the storage operations after querying""" + + +class BaseKVStorage(Generic[T], StorageNameSpace): + def all_keys(self) -> list[str]: + raise NotImplementedError + + def get_by_id(self, id: str) -> Union[T, None]: + raise NotImplementedError + + def get_by_ids( + self, ids: list[str], fields: Union[set[str], None] = None + ) -> list[Union[T, None]]: + raise NotImplementedError + + def get_all(self) -> dict[str, T]: + raise NotImplementedError + + def filter_keys(self, data: list[str]) -> set[str]: + """return un-exist keys""" + raise NotImplementedError + + def upsert(self, data: dict[str, T]): + raise NotImplementedError + + def drop(self): + raise NotImplementedError + + def reload(self): + raise NotImplementedError + + +class BaseGraphStorage(StorageNameSpace): + def has_node(self, node_id: str) -> bool: + raise NotImplementedError + + def has_edge(self, source_node_id: str, target_node_id: str) -> bool: + raise NotImplementedError + + def node_degree(self, node_id: str) -> int: + raise NotImplementedError 
+ + def edge_degree(self, src_id: str, tgt_id: str) -> int: + raise NotImplementedError + + def get_node(self, node_id: str) -> Union[dict, None]: + raise NotImplementedError + + def update_node(self, node_id: str, node_data: dict[str, str]): + raise NotImplementedError + + def get_all_nodes(self) -> Union[list[tuple[str, dict]], None]: + raise NotImplementedError + + def get_edge(self, source_node_id: str, target_node_id: str) -> Union[dict, None]: + raise NotImplementedError + + def update_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + raise NotImplementedError + + def get_all_edges(self) -> Union[list[tuple[str, str, dict]], None]: + raise NotImplementedError + + def get_node_edges(self, source_node_id: str) -> Union[list[tuple[str, str]], None]: + raise NotImplementedError + + def upsert_node(self, node_id: str, node_data: dict[str, str]): + raise NotImplementedError + + def upsert_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + raise NotImplementedError + + def delete_node(self, node_id: str): + raise NotImplementedError + + def reload(self): + raise NotImplementedError diff --git a/graphgen/bases/base_tokenizer.py b/graphgen/bases/base_tokenizer.py new file mode 100644 index 00000000..346d500b --- /dev/null +++ b/graphgen/bases/base_tokenizer.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import List + + +class BaseTokenizer(ABC): + def __init__(self, model_name: str = "cl100k_base"): + self.model_name = model_name + + @abstractmethod + def encode(self, text: str) -> List[int]: + """Encode text -> token ids.""" + raise NotImplementedError + + @abstractmethod + def decode(self, token_ids: List[int]) -> str: + """Decode token ids -> text.""" + raise NotImplementedError + + def count_tokens(self, text: str) -> int: + return len(self.encode(text)) + + def chunk_by_token_size( + self, + content: str, + *, + overlap_token_size: int = 128, + max_token_size: int = 1024, + ) -> List[dict]: + tokens = self.encode(content) + results = [] + step = max_token_size - overlap_token_size + for index, start in enumerate(range(0, len(tokens), step)): + chunk_ids = tokens[start : start + max_token_size] + results.append( + { + "tokens": len(chunk_ids), + "content": self.decode(chunk_ids).strip(), + "chunk_order_index": index, + } + ) + return results diff --git a/graphgen/bases/datatypes.py b/graphgen/bases/datatypes.py new file mode 100644 index 00000000..df719fdf --- /dev/null +++ b/graphgen/bases/datatypes.py @@ -0,0 +1,94 @@ +import math +from dataclasses import dataclass, field +from typing import List, Union + +from pydantic import BaseModel, Field, field_validator + + +@dataclass +class Chunk: + id: str + content: str + type: str + metadata: dict = field(default_factory=dict) + + @staticmethod + def from_dict(key: str, data: dict) -> "Chunk": + return Chunk( + id=key, + content=data.get("content", ""), + type=data.get("type", "text"), + metadata={k: v for k, v in data.items() if k != "content"}, + ) + + +@dataclass +class QAPair: + """ + A pair of question and answer. 
+ """ + + question: str + answer: str + + +@dataclass +class Token: + text: str + prob: float + top_candidates: List = field(default_factory=list) + ppl: Union[float, None] = field(default=None) + + @property + def logprob(self) -> float: + return math.log(self.prob) + + +@dataclass +class Community: + id: Union[int, str] + nodes: List[str] = field(default_factory=list) + edges: List[tuple] = field(default_factory=list) + metadata: dict = field(default_factory=dict) + + +class Node(BaseModel): + id: str = Field(..., description="unique node id") + op_name: str = Field(..., description="operator name") + type: str = Field( + ..., description="task type, e.g., map, filter, flatmap, aggregate, map_batch" + ) + params: dict = Field(default_factory=dict, description="operator parameters") + dependencies: List[str] = Field( + default_factory=list, description="list of dependent node ids" + ) + execution_params: dict = Field( + default_factory=dict, description="execution parameters like replicas, batch_size" + ) + + @classmethod + @field_validator("type") + def validate_type(cls, v: str) -> str: + valid_types = {"map", "filter", "flatmap", "aggregate", "map_batch"} + if v not in valid_types: + raise ValueError(f"Invalid node type: {v}. Must be one of {valid_types}.") + return v + + +class Config(BaseModel): + global_params: dict = Field( + default_factory=dict, description="global context for the computation graph" + ) + + nodes: List[Node] = Field( + ..., min_length=1, description="list of nodes in the computation graph" + ) + + @classmethod + @field_validator("nodes") + def validate_unique_ids(cls, v: List[Node]) -> List[Node]: + ids = [node.id for node in v] + if len(ids) != len(set(ids)): + duplicates = {id_ for id_ in ids if ids.count(id_) > 1} + raise ValueError(f"Duplicate node ids found: {duplicates}") + return v diff --git a/graphgen/common/__init__.py b/graphgen/common/__init__.py new file mode 100644 index 00000000..deb99459 --- /dev/null +++ b/graphgen/common/__init__.py @@ -0,0 +1,2 @@ +from .init_llm import init_llm +from .init_storage import init_storage diff --git a/graphgen/common/init_llm.py b/graphgen/common/init_llm.py new file mode 100644 index 00000000..af53709a --- /dev/null +++ b/graphgen/common/init_llm.py @@ -0,0 +1,177 @@ +import os +from typing import Any, Dict, Optional + +import ray + +from graphgen.bases import BaseLLMWrapper +from graphgen.common.init_storage import get_actor_handle +from graphgen.models import Tokenizer + + +class LLMServiceActor: + """ + A Ray actor class to wrap LLM wrapper instances for distributed usage. 
+ """ + + def __init__(self, backend: str, config: Dict[str, Any]): + self.backend = backend + tokenizer_model = os.environ.get("TOKENIZER_MODEL", "cl100k_base") + tokenizer = Tokenizer(model_name=tokenizer_model) + config["tokenizer"] = tokenizer + + if backend == "http_api": + from graphgen.models.llm.api.http_client import HTTPClient + + self.llm_instance = HTTPClient(**config) + elif backend in ("openai_api", "azure_openai_api"): + from graphgen.models.llm.api.openai_client import OpenAIClient + + # pass in concrete backend to the OpenAIClient so that internally we can distinguish + # between OpenAI and Azure OpenAI + self.llm_instance = OpenAIClient(**config, backend=backend) + elif backend == "ollama_api": + from graphgen.models.llm.api.ollama_client import OllamaClient + + self.llm_instance = OllamaClient(**config) + elif backend == "huggingface": + from graphgen.models.llm.local.hf_wrapper import HuggingFaceWrapper + + self.llm_instance = HuggingFaceWrapper(**config) + elif backend == "sglang": + from graphgen.models.llm.local.sglang_wrapper import SGLangWrapper + + self.llm_instance = SGLangWrapper(**config) + + elif backend == "vllm": + from graphgen.models.llm.local.vllm_wrapper import VLLMWrapper + + self.llm_instance = VLLMWrapper(**config) + else: + raise NotImplementedError(f"Backend {backend} is not implemented yet.") + + async def generate_answer( + self, text: str, history: Optional[list[str]] = None, **extra: Any + ) -> str: + return await self.llm_instance.generate_answer(text, history, **extra) + + async def generate_topk_per_token( + self, text: str, history: Optional[list[str]] = None, **extra: Any + ) -> list: + return await self.llm_instance.generate_topk_per_token(text, history, **extra) + + async def generate_inputs_prob( + self, text: str, history: Optional[list[str]] = None, **extra: Any + ) -> list: + return await self.llm_instance.generate_inputs_prob(text, history, **extra) + + def ready(self) -> bool: + """A simple method to check if the actor is ready.""" + return True + + +class LLMServiceProxy(BaseLLMWrapper): + """ + A proxy class to interact with the LLMServiceActor for distributed LLM operations. + """ + + def __init__(self, actor_name: str): + super().__init__() + self.actor_handle = get_actor_handle(actor_name) + self._create_local_tokenizer() + + async def generate_answer( + self, text: str, history: Optional[list[str]] = None, **extra: Any + ) -> str: + object_ref = self.actor_handle.generate_answer.remote(text, history, **extra) + return await object_ref + + async def generate_topk_per_token( + self, text: str, history: Optional[list[str]] = None, **extra: Any + ) -> list: + object_ref = self.actor_handle.generate_topk_per_token.remote( + text, history, **extra + ) + return await object_ref + + async def generate_inputs_prob( + self, text: str, history: Optional[list[str]] = None, **extra: Any + ) -> list: + object_ref = self.actor_handle.generate_inputs_prob.remote( + text, history, **extra + ) + return await object_ref + + def _create_local_tokenizer(self): + tokenizer_model = os.environ.get("TOKENIZER_MODEL", "cl100k_base") + self.tokenizer = Tokenizer(model_name=tokenizer_model) + + +class LLMFactory: + """ + A factory class to create LLM wrapper instances based on the specified backend. 
+ Supported backends include: + - http_api: HTTPClient + - openai_api: OpenAIClient + - ollama_api: OllamaClient + - huggingface: HuggingFaceWrapper + - sglang: SGLangWrapper + """ + + @staticmethod + def create_llm( + model_type: str, backend: str, config: Dict[str, Any] + ) -> BaseLLMWrapper: + if not config: + raise ValueError( + f"No configuration provided for LLM {model_type} with backend {backend}." + ) + + actor_name = f"Actor_LLM_{model_type}" + try: + ray.get_actor(actor_name) + except ValueError: + print(f"Creating Ray actor for LLM {model_type} with backend {backend}.") + num_gpus = float(config.pop("num_gpus", 0)) + actor = ( + ray.remote(LLMServiceActor) + .options( + name=actor_name, + num_gpus=num_gpus, + lifetime="detached", + get_if_exists=True, + ) + .remote(backend, config) + ) + + # wait for actor to be ready + ray.get(actor.ready.remote()) + + return LLMServiceProxy(actor_name) + + +def _load_env_group(prefix: str) -> Dict[str, Any]: + """ + Collect environment variables with the given prefix into a dictionary, + stripping the prefix from the keys. + """ + return { + k[len(prefix) :].lower(): v + for k, v in os.environ.items() + if k.startswith(prefix) + } + + +def init_llm(model_type: str) -> Optional[BaseLLMWrapper]: + if model_type == "synthesizer": + prefix = "SYNTHESIZER_" + elif model_type == "trainee": + prefix = "TRAINEE_" + else: + raise NotImplementedError(f"Model type {model_type} is not implemented yet.") + config = _load_env_group(prefix) + # if config is empty, return None + if not config: + return None + backend = config.pop("backend") + llm_wrapper = LLMFactory.create_llm(model_type, backend, config) + return llm_wrapper diff --git a/graphgen/common/init_storage.py b/graphgen/common/init_storage.py new file mode 100644 index 00000000..b9358485 --- /dev/null +++ b/graphgen/common/init_storage.py @@ -0,0 +1,262 @@ +from typing import Any, Dict, Union + +import ray + +from graphgen.bases.base_storage import BaseGraphStorage, BaseKVStorage + + +class KVStorageActor: + def __init__(self, backend: str, working_dir: str, namespace: str): + if backend == "json_kv": + from graphgen.models import JsonKVStorage + + self.kv = JsonKVStorage(working_dir, namespace) + elif backend == "rocksdb": + from graphgen.models import RocksDBKVStorage + + self.kv = RocksDBKVStorage(working_dir, namespace) + else: + raise ValueError(f"Unknown KV backend: {backend}") + + def data(self) -> Dict[str, Dict]: + return self.kv.data + + def all_keys(self) -> list[str]: + return self.kv.all_keys() + + def index_done_callback(self): + return self.kv.index_done_callback() + + def get_by_id(self, id: str) -> Dict: + return self.kv.get_by_id(id) + + def get_by_ids(self, ids: list[str], fields=None) -> list: + return self.kv.get_by_ids(ids, fields) + + def get_all(self) -> Dict[str, Dict]: + return self.kv.get_all() + + def filter_keys(self, data: list[str]) -> set[str]: + return self.kv.filter_keys(data) + + def upsert(self, data: dict) -> dict: + return self.kv.upsert(data) + + def drop(self): + return self.kv.drop() + + def reload(self): + return self.kv.reload() + + +class GraphStorageActor: + def __init__(self, backend: str, working_dir: str, namespace: str): + if backend == "networkx": + from graphgen.models import NetworkXStorage + + self.graph = NetworkXStorage(working_dir, namespace) + elif backend == "kuzu": + from graphgen.models import KuzuStorage + + self.graph = KuzuStorage(working_dir, namespace) + else: + raise ValueError(f"Unknown Graph backend: {backend}") + + def 
index_done_callback(self): + return self.graph.index_done_callback() + + def has_node(self, node_id: str) -> bool: + return self.graph.has_node(node_id) + + def has_edge(self, source_node_id: str, target_node_id: str): + return self.graph.has_edge(source_node_id, target_node_id) + + def node_degree(self, node_id: str) -> int: + return self.graph.node_degree(node_id) + + def edge_degree(self, src_id: str, tgt_id: str) -> int: + return self.graph.edge_degree(src_id, tgt_id) + + def get_node(self, node_id: str) -> Any: + return self.graph.get_node(node_id) + + def update_node(self, node_id: str, node_data: dict[str, str]): + return self.graph.update_node(node_id, node_data) + + def get_all_nodes(self) -> Any: + return self.graph.get_all_nodes() + + def get_edge(self, source_node_id: str, target_node_id: str): + return self.graph.get_edge(source_node_id, target_node_id) + + def update_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + return self.graph.update_edge(source_node_id, target_node_id, edge_data) + + def get_all_edges(self) -> Any: + return self.graph.get_all_edges() + + def get_node_edges(self, source_node_id: str) -> Any: + return self.graph.get_node_edges(source_node_id) + + def upsert_node(self, node_id: str, node_data: dict[str, str]): + return self.graph.upsert_node(node_id, node_data) + + def upsert_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + return self.graph.upsert_edge(source_node_id, target_node_id, edge_data) + + def delete_node(self, node_id: str): + return self.graph.delete_node(node_id) + + def reload(self): + return self.graph.reload() + + +def get_actor_handle(name: str): + try: + return ray.get_actor(name) + except ValueError as exc: + raise RuntimeError( + f"Actor {name} not found. Make sure it is created before accessing." 
+ ) from exc + + +class RemoteKVStorageProxy(BaseKVStorage): + def __init__(self, namespace: str): + super().__init__() + self.namespace = namespace + self.actor_name = f"Actor_KV_{namespace}" + self.actor = get_actor_handle(self.actor_name) + + def data(self) -> Dict[str, Any]: + return ray.get(self.actor.data.remote()) + + def all_keys(self) -> list[str]: + return ray.get(self.actor.all_keys.remote()) + + def index_done_callback(self): + return ray.get(self.actor.index_done_callback.remote()) + + def get_by_id(self, id: str) -> Union[Any, None]: + return ray.get(self.actor.get_by_id.remote(id)) + + def get_by_ids(self, ids: list[str], fields=None) -> list[Any]: + return ray.get(self.actor.get_by_ids.remote(ids, fields)) + + def get_all(self) -> Dict[str, Any]: + return ray.get(self.actor.get_all.remote()) + + def filter_keys(self, data: list[str]) -> set[str]: + return ray.get(self.actor.filter_keys.remote(data)) + + def upsert(self, data: Dict[str, Any]): + return ray.get(self.actor.upsert.remote(data)) + + def drop(self): + return ray.get(self.actor.drop.remote()) + + def reload(self): + return ray.get(self.actor.reload.remote()) + + +class RemoteGraphStorageProxy(BaseGraphStorage): + def __init__(self, namespace: str): + super().__init__() + self.namespace = namespace + self.actor_name = f"Actor_Graph_{namespace}" + self.actor = get_actor_handle(self.actor_name) + + def index_done_callback(self): + return ray.get(self.actor.index_done_callback.remote()) + + def has_node(self, node_id: str) -> bool: + return ray.get(self.actor.has_node.remote(node_id)) + + def has_edge(self, source_node_id: str, target_node_id: str): + return ray.get(self.actor.has_edge.remote(source_node_id, target_node_id)) + + def node_degree(self, node_id: str) -> int: + return ray.get(self.actor.node_degree.remote(node_id)) + + def edge_degree(self, src_id: str, tgt_id: str) -> int: + return ray.get(self.actor.edge_degree.remote(src_id, tgt_id)) + + def get_node(self, node_id: str) -> Any: + return ray.get(self.actor.get_node.remote(node_id)) + + def update_node(self, node_id: str, node_data: dict[str, str]): + return ray.get(self.actor.update_node.remote(node_id, node_data)) + + def get_all_nodes(self) -> Any: + return ray.get(self.actor.get_all_nodes.remote()) + + def get_edge(self, source_node_id: str, target_node_id: str): + return ray.get(self.actor.get_edge.remote(source_node_id, target_node_id)) + + def update_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + return ray.get( + self.actor.update_edge.remote(source_node_id, target_node_id, edge_data) + ) + + def get_all_edges(self) -> Any: + return ray.get(self.actor.get_all_edges.remote()) + + def get_node_edges(self, source_node_id: str) -> Any: + return ray.get(self.actor.get_node_edges.remote(source_node_id)) + + def upsert_node(self, node_id: str, node_data: dict[str, str]): + return ray.get(self.actor.upsert_node.remote(node_id, node_data)) + + def upsert_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + return ray.get( + self.actor.upsert_edge.remote(source_node_id, target_node_id, edge_data) + ) + + def delete_node(self, node_id: str): + return ray.get(self.actor.delete_node.remote(node_id)) + + def reload(self): + return ray.get(self.actor.reload.remote()) + + +class StorageFactory: + """ + Factory class to create storage instances based on backend. 
+ """ + + @staticmethod + def create_storage(backend: str, working_dir: str, namespace: str): + if backend in ["json_kv", "rocksdb"]: + actor_name = f"Actor_KV_{namespace}" + try: + ray.get_actor(actor_name) + except ValueError: + ray.remote(KVStorageActor).options( + name=actor_name, + lifetime="detached", + get_if_exists=True, + ).remote(backend, working_dir, namespace) + return RemoteKVStorageProxy(namespace) + if backend in ["networkx", "kuzu"]: + actor_name = f"Actor_Graph_{namespace}" + try: + ray.get_actor(actor_name) + except ValueError: + ray.remote(GraphStorageActor).options( + name=actor_name, + lifetime="detached", + get_if_exists=True, + ).remote(backend, working_dir, namespace) + return RemoteGraphStorageProxy(namespace) + raise ValueError(f"Unknown storage backend: {backend}") + + +def init_storage(backend: str, working_dir: str, namespace: str): + return StorageFactory.create_storage(backend, working_dir, namespace) diff --git a/graphgen/configs/config.yaml.example b/graphgen/configs/config.yaml.example deleted file mode 100644 index eeb804af..00000000 --- a/graphgen/configs/config.yaml.example +++ /dev/null @@ -1,16 +0,0 @@ -data_type: raw -input_file: resources/examples/raw_demo.jsonl -tokenizer: cl100k_base -quiz_samples: 2 -traverse_strategy: - qa_form: atomic - bidirectional: true - edge_sampling: max_loss - expand_method: max_tokens - isolated_node_strategy: add - max_depth: 2 - max_extra_edges: 5 - max_tokens: 256 - loss_strategy: only_edge -web_search: false -re_judge: false diff --git a/graphgen/configs/graphgen_config.yaml b/graphgen/configs/graphgen_config.yaml deleted file mode 100644 index 4ddb66c7..00000000 --- a/graphgen/configs/graphgen_config.yaml +++ /dev/null @@ -1,16 +0,0 @@ -data_type: raw -input_file: resources/examples/raw_demo.jsonl -tokenizer: cl100k_base -quiz_samples: 2 -traverse_strategy: - qa_form: aggregated - bidirectional: true - edge_sampling: max_loss - expand_method: max_width - isolated_node_strategy: ignore - max_depth: 1 - max_extra_edges: 2 - max_tokens: 256 - loss_strategy: only_edge -web_search: false -re_judge: false diff --git a/graphgen/engine.py b/graphgen/engine.py new file mode 100644 index 00000000..63f35689 --- /dev/null +++ b/graphgen/engine.py @@ -0,0 +1,225 @@ +import inspect +import logging +import os +from collections import defaultdict, deque +from functools import wraps +from typing import Any, Callable, Dict, List, Set + +import ray +import ray.data + +from graphgen.bases import Config, Node +from graphgen.utils import logger + + +class Engine: + def __init__( + self, config: Dict[str, Any], functions: Dict[str, Callable], **ray_init_kwargs + ): + self.config = Config(**config) + self.global_params = self.config.global_params + self.functions = functions + self.datasets: Dict[str, ray.data.Dataset] = {} + + # Disable Ray Data progress bars and verbose output + os.environ.setdefault("RAY_DATA_DISABLE_PROGRESS_BARS", "1") + # Disable metrics exporter to avoid RpcError + os.environ.setdefault("RAY_DISABLE_IMPORTANT_WARNING", "1") + try: + from ray.data import DataContext + ctx = DataContext.get_current() + ctx.enable_rich_progress_bars = False + ctx.use_ray_tqdm = False + except Exception: + pass # Ray Data context might not be available + + if not ray.is_initialized(): + # Disable metrics exporter to avoid RpcError + ray_init_kwargs.setdefault("_metrics_export_port", 0) + context = ray.init( + ignore_reinit_error=True, + logging_level=logging.ERROR, + log_to_driver=False, # Disable Ray logs to driver + 
**ray_init_kwargs, + ) + logger.info("Ray Dashboard URL: %s", context.dashboard_url) + + @staticmethod + def _topo_sort(nodes: List[Node]) -> List[Node]: + id_to_node: Dict[str, Node] = {} + for n in nodes: + id_to_node[n.id] = n + + indeg: Dict[str, int] = {nid: 0 for nid in id_to_node} + adj: Dict[str, List[str]] = defaultdict(list) + + for n in nodes: + nid = n.id + deps: List[str] = n.dependencies + uniq_deps: Set[str] = set(deps) + for d in uniq_deps: + if d not in id_to_node: + raise ValueError( + f"The dependency node id {d} of node {nid} is not defined in the configuration." + ) + indeg[nid] += 1 + adj[d].append(nid) + + zero_deg: deque = deque( + [id_to_node[nid] for nid, deg in indeg.items() if deg == 0] + ) + sorted_nodes: List[Node] = [] + + while zero_deg: + cur = zero_deg.popleft() + sorted_nodes.append(cur) + cur_id = cur.id + for nb_id in adj.get(cur_id, []): + indeg[nb_id] -= 1 + if indeg[nb_id] == 0: + zero_deg.append(id_to_node[nb_id]) + + if len(sorted_nodes) != len(nodes): + remaining = [nid for nid, deg in indeg.items() if deg > 0] + raise ValueError( + f"The configuration contains cycles, unable to execute. Remaining nodes with indegree > 0: {remaining}" + ) + + return sorted_nodes + + def _get_input_dataset( + self, node: Node, initial_ds: ray.data.Dataset + ) -> ray.data.Dataset: + deps = node.dependencies + + if not deps: + return initial_ds + + if len(deps) == 1: + return self.datasets[deps[0]] + + main_ds = self.datasets[deps[0]] + other_dss = [self.datasets[d] for d in deps[1:]] + return main_ds.union(*other_dss) + + def _execute_node(self, node: Node, initial_ds: ray.data.Dataset): + def _filter_kwargs( + func_or_class: Callable, + global_params: Dict[str, Any], + func_params: Dict[str, Any], + ) -> Dict[str, Any]: + """ + 1. global_params: only when specified in function signature, will be passed + 2. 
func_params: pass specified params first, then **kwargs if exists + """ + try: + sig = inspect.signature(func_or_class) + except ValueError: + return {} + + params = sig.parameters + final_kwargs = {} + + has_var_keywords = any( + p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values() + ) + valid_keys = set(params.keys()) + for k, v in global_params.items(): + if k in valid_keys: + final_kwargs[k] = v + + for k, v in func_params.items(): + if k in valid_keys or has_var_keywords: + final_kwargs[k] = v + return final_kwargs + + if node.op_name not in self.functions: + raise ValueError(f"Operator {node.op_name} not found for node {node.id}") + + op_handler = self.functions[node.op_name] + node_params = _filter_kwargs(op_handler, self.global_params, node.params or {}) + + if node.type == "source": + self.datasets[node.id] = op_handler(**node_params) + return + + input_ds = self._get_input_dataset(node, initial_ds) + + if inspect.isclass(op_handler): + execution_params = node.execution_params or {} + replicas = execution_params.get("replicas", 1) + batch_size = ( + int(execution_params.get("batch_size")) + if "batch_size" in execution_params + else "default" + ) + compute_resources = execution_params.get("compute_resources", {}) + + if node.type == "aggregate": + self.datasets[node.id] = input_ds.repartition(1).map_batches( + op_handler, + compute=ray.data.ActorPoolStrategy(min_size=1, max_size=1), + batch_size=None, # aggregate processes the whole dataset at once + num_gpus=compute_resources.get("num_gpus", 0) + if compute_resources + else 0, + fn_constructor_kwargs=node_params, + batch_format="pandas", + ) + else: + # others like map, filter, flatmap, map_batch let actors process data inside batches + self.datasets[node.id] = input_ds.map_batches( + op_handler, + compute=ray.data.ActorPoolStrategy(min_size=1, max_size=replicas), + batch_size=batch_size, + num_gpus=compute_resources.get("num_gpus", 0) + if compute_resources + else 0, + fn_constructor_kwargs=node_params, + batch_format="pandas", + ) + + else: + + @wraps(op_handler) + def func_wrapper(row_or_batch: Dict[str, Any]) -> Dict[str, Any]: + return op_handler(row_or_batch, **node_params) + + if node.type == "map": + self.datasets[node.id] = input_ds.map(func_wrapper) + elif node.type == "filter": + self.datasets[node.id] = input_ds.filter(func_wrapper) + elif node.type == "flatmap": + self.datasets[node.id] = input_ds.flat_map(func_wrapper) + elif node.type == "aggregate": + self.datasets[node.id] = input_ds.repartition(1).map_batches( + func_wrapper, batch_format="default" + ) + elif node.type == "map_batch": + self.datasets[node.id] = input_ds.map_batches(func_wrapper) + else: + raise ValueError( + f"Unsupported node type {node.type} for node {node.id}" + ) + + @staticmethod + def _find_leaf_nodes(nodes: List[Node]) -> Set[str]: + all_ids = {n.id for n in nodes} + deps_set = set() + for n in nodes: + deps_set.update(n.dependencies) + return all_ids - deps_set + + def execute(self, initial_ds: ray.data.Dataset) -> Dict[str, ray.data.Dataset]: + sorted_nodes = self._topo_sort(self.config.nodes) + + for node in sorted_nodes: + self._execute_node(node, initial_ds) + + leaf_nodes = self._find_leaf_nodes(sorted_nodes) + + @ray.remote + def _fetch_result(ds: ray.data.Dataset) -> List[Any]: + return ds.take_all() + + return {node_id: self.datasets[node_id] for node_id in leaf_nodes} diff --git a/graphgen/evaluate.py b/graphgen/evaluate.py deleted file mode 100644 index da74a308..00000000 --- a/graphgen/evaluate.py +++ /dev/null @@ 
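The new `graphgen/engine.py` above is easiest to follow with a small end-to-end sketch. The following is a minimal, illustrative example and is not part of this diff; it assumes `Config` and `Node` from `graphgen.bases` accept the fields the engine reads (`global_params`, `nodes`, and per-node `id`, `op_name`, `type`, `dependencies`, `params`), and the operator name `double_value` is made up for illustration.

```python
import ray
import ray.data

from graphgen.engine import Engine


# Illustrative operator for a "map" node: plain functions receive one row plus
# any node params that match their signature (see _filter_kwargs above).
def double_value(row, factor=2):
    row["value"] = row["value"] * factor
    return row


config = {
    "global_params": {},  # forwarded to operators that declare matching parameters
    "nodes": [
        {
            "id": "double",
            "op_name": "double_value",
            "type": "map",       # map / filter / flatmap / aggregate / map_batch
            "dependencies": [],  # no dependencies -> receives the initial dataset
            "params": {"factor": 3},
        }
    ],
}

engine = Engine(config, functions={"double_value": double_value})
initial_ds = ray.data.from_items([{"value": i} for i in range(5)])

# execute() returns only the leaf nodes of the DAG, as a dict of node id -> Ray Dataset.
outputs = engine.execute(initial_ds)
print(outputs["double"].take_all())
```

Class-based operators take a different path: they are run through `map_batches` with an `ActorPoolStrategy`, sized by each node's `execution_params` (`replicas`, `batch_size`, `compute_resources`).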
-1,142 +0,0 @@ -"""Evaluate the quality of the generated text using various metrics""" - -import os -import json -import argparse -import pandas as pd -from dotenv import load_dotenv -from .models import LengthEvaluator, MTLDEvaluator, RewardEvaluator, TextPair, UniEvaluator -from .utils import logger, set_logger - -sys_path = os.path.abspath(os.path.dirname(__file__)) -set_logger(os.path.join(sys_path, "cache", "logs", "evaluate.log")) - -load_dotenv() - -def evaluate_length(corpus, tokenizer_name): - length_evaluator = LengthEvaluator( - tokenizer_name=tokenizer_name - ) - logger.info("Length evaluator loaded") - scores = length_evaluator.get_average_score(corpus) - logger.info("Length scores: %s", scores) - return scores - -def evaluate_mtld(corpus): - mtld_evaluator = MTLDEvaluator() - logger.info("MTLD evaluator loaded") - scores = mtld_evaluator.get_average_score(corpus) - logger.info("MTLD scores: %s", scores) - min_max_scores = mtld_evaluator.get_min_max_score(corpus) - logger.info("MTLD min max scores: %s", min_max_scores) - return scores, min_max_scores - -def evaluate_reward(corpus, reward_model_names): - scores = [] - for reward_name in reward_model_names: - reward_evaluator = RewardEvaluator( - reward_name=reward_name - ) - logger.info("Loaded reward model: %s", reward_name) - average_score = reward_evaluator.get_average_score(corpus) - logger.info("%s scores: %s", reward_name, average_score) - min_max_scores = reward_evaluator.get_min_max_score(corpus) - logger.info("%s min max scores: %s", reward_name, min_max_scores) - scores.append({ - 'reward_name': reward_name.split('/')[-1], - 'score': average_score, - 'min_max_scores': min_max_scores - }) - del reward_evaluator - clean_gpu_cache() - return scores - -def evaluate_uni(corpus, uni_model_name): - uni_evaluator = UniEvaluator( - model_name=uni_model_name - ) - logger.info("Uni evaluator loaded with model %s", uni_model_name) - uni_scores = uni_evaluator.get_average_score(corpus) - for key, value in uni_scores.items(): - logger.info("Uni %s scores: %s", key, value) - min_max_scores = uni_evaluator.get_min_max_score(corpus) - for key, value in min_max_scores.items(): - logger.info("Uni %s min max scores: %s", key, value) - del uni_evaluator - clean_gpu_cache() - return (uni_scores['naturalness'], uni_scores['coherence'], uni_scores['understandability'], - min_max_scores['naturalness'], min_max_scores['coherence'], min_max_scores['understandability']) - - -def clean_gpu_cache(): - import torch - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - -if __name__ == '__main__': - import torch.multiprocessing as mp - parser = argparse.ArgumentParser() - - parser.add_argument('--folder', type=str, default='cache/data', help='folder to load data') - parser.add_argument('--output', type=str, default='cache/output', help='path to save output') - - parser.add_argument('--tokenizer', type=str, default='cl100k_base', help='tokenizer name') - parser.add_argument('--reward', type=str, default='OpenAssistant/reward-model-deberta-v3-large-v2', - help='Comma-separated list of reward models') - parser.add_argument('--uni', type=str, default='MingZhong/unieval-sum', help='uni model name') - - args = parser.parse_args() - - if not os.path.exists(args.folder): - raise ValueError(f"Folder {args.folder} does not exist") - - if not os.path.exists(args.output): - os.makedirs(args.output) - - reward_models = args.reward.split(',') - - - results = [] - - logger.info("Data loaded from %s", args.folder) - mp.set_start_method('spawn') - - for file 
in os.listdir(args.folder): - if file.endswith('.json'): - logger.info("Processing %s", file) - with open(os.path.join(args.folder, file), 'r', encoding='utf-8') as f: - data = json.load(f) - data = [TextPair( - question=data[key]['question'], - answer=data[key]['answer'] - ) for key in data] - - length_scores = evaluate_length(data, args.tokenizer) - mtld_scores, min_max_mtld_scores = evaluate_mtld(data) - reward_scores = evaluate_reward(data, reward_models) - uni_naturalness_scores, uni_coherence_scores, uni_understandability_scores, \ - min_max_uni_naturalness_scores, min_max_uni_coherence_scores, min_max_uni_understandability_scores \ - = evaluate_uni(data, args.uni) - - result = { - 'file': file, - 'number': len(data), - 'length': length_scores, - 'mtld': mtld_scores, - 'mtld_min_max': min_max_mtld_scores, - 'uni_naturalness': uni_naturalness_scores, - 'uni_coherence': uni_coherence_scores, - 'uni_understandability': uni_understandability_scores, - 'uni_naturalness_min_max': min_max_uni_naturalness_scores, - 'uni_coherence_min_max': min_max_uni_coherence_scores, - 'uni_understandability_min_max': min_max_uni_understandability_scores - } - for reward_score in reward_scores: - result[reward_score['reward_name']] = reward_score['score'] - result[f"{reward_score['reward_name']}_min_max"] = reward_score['min_max_scores'] - - results.append(result) - - results = pd.DataFrame(results) - results.to_csv(os.path.join(args.output, 'evaluation.csv'), index=False) diff --git a/graphgen/generate.py b/graphgen/generate.py deleted file mode 100644 index 14693471..00000000 --- a/graphgen/generate.py +++ /dev/null @@ -1,101 +0,0 @@ -import os -import json -import time -import argparse -from importlib.resources import files -import yaml -from dotenv import load_dotenv - -from .graphgen import GraphGen -from .models import OpenAIModel, Tokenizer, TraverseStrategy -from .utils import set_logger - -sys_path = os.path.abspath(os.path.dirname(__file__)) - -load_dotenv() - -def set_working_dir(folder): - os.makedirs(folder, exist_ok=True) - os.makedirs(os.path.join(folder, "data", "graphgen"), exist_ok=True) - os.makedirs(os.path.join(folder, "logs"), exist_ok=True) - -def save_config(config_path, global_config): - if not os.path.exists(os.path.dirname(config_path)): - os.makedirs(os.path.dirname(config_path)) - with open(config_path, "w", encoding='utf-8') as config_file: - yaml.dump(global_config, config_file, default_flow_style=False, allow_unicode=True) - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--config_file', - help='Config parameters for GraphGen.', - # default=os.path.join(sys_path, "configs", "graphgen_config.yaml"), - default=files('graphgen').joinpath("configs", "graphgen_config.yaml"), - type=str) - parser.add_argument('--output_dir', - help='Output directory for GraphGen.', - default=sys_path, - required=True, - type=str) - - args = parser.parse_args() - - working_dir = args.output_dir - set_working_dir(working_dir) - unique_id = int(time.time()) - set_logger(os.path.join(working_dir, "logs", f"graphgen_{unique_id}.log"), if_stream=False) - - with open(args.config_file, "r", encoding='utf-8') as f: - config = yaml.load(f, Loader=yaml.FullLoader) - - input_file = config['input_file'] - - if config['data_type'] == 'raw': - with open(input_file, "r", encoding='utf-8') as f: - data = [json.loads(line) for line in f] - elif config['data_type'] == 'chunked': - with open(input_file, "r", encoding='utf-8') as f: - data = json.load(f) - else: - raise ValueError(f"Invalid 
data type: {config['data_type']}") - - synthesizer_llm_client = OpenAIModel( - model_name=os.getenv("SYNTHESIZER_MODEL"), - api_key=os.getenv("SYNTHESIZER_API_KEY"), - base_url=os.getenv("SYNTHESIZER_BASE_URL") - ) - trainee_llm_client = OpenAIModel( - model_name=os.getenv("TRAINEE_MODEL"), - api_key=os.getenv("TRAINEE_API_KEY"), - base_url=os.getenv("TRAINEE_BASE_URL") - ) - - traverse_strategy = TraverseStrategy( - **config['traverse_strategy'] - ) - - graph_gen = GraphGen( - working_dir=working_dir, - unique_id=unique_id, - synthesizer_llm_client=synthesizer_llm_client, - trainee_llm_client=trainee_llm_client, - if_web_search=config['web_search'], - tokenizer_instance=Tokenizer( - model_name=config['tokenizer'] - ), - traverse_strategy=traverse_strategy - ) - - graph_gen.insert(data, config['data_type']) - - graph_gen.quiz(max_samples=config['quiz_samples']) - - graph_gen.judge(re_judge=config["re_judge"]) - - graph_gen.traverse() - - path = os.path.join(working_dir, "data", "graphgen", str(unique_id), f"config-{unique_id}.yaml") - save_config(path, config) - -if __name__ == '__main__': - main() diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py deleted file mode 100644 index 265d32a9..00000000 --- a/graphgen/graphgen.py +++ /dev/null @@ -1,260 +0,0 @@ -# Adapt from https://github.com/HKUDS/LightRAG - -import asyncio -import os -import time -from dataclasses import dataclass, field -from typing import List, Union, cast - -import gradio as gr -from tqdm.asyncio import tqdm as tqdm_async - -from .models import ( - Chunk, - JsonKVStorage, - NetworkXStorage, - OpenAIModel, - Tokenizer, - TraverseStrategy, - WikiSearch, -) -from .models.storage.base_storage import StorageNameSpace -from .operators import ( - extract_kg, - judge_statement, - quiz, - search_wikipedia, - skip_judge_statement, - traverse_graph_atomically, - traverse_graph_by_edge, - traverse_graph_for_multi_hop, -) -from .utils import compute_content_hash, create_event_loop, logger - -sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) - -@dataclass -class GraphGen: - unique_id: int = int(time.time()) - working_dir: str = os.path.join(sys_path, "cache") - - # text chunking - chunk_size: int = 1024 - chunk_overlap_size: int = 100 - - # llm - synthesizer_llm_client: OpenAIModel = None - trainee_llm_client: OpenAIModel = None - tokenizer_instance: Tokenizer = None - - # web search - if_web_search: bool = False - wiki_client: WikiSearch = field(default_factory=WikiSearch) - - # traverse strategy - traverse_strategy: TraverseStrategy = field(default_factory=TraverseStrategy) - - # webui - progress_bar: gr.Progress = None - - def __post_init__(self): - self.full_docs_storage: JsonKVStorage = JsonKVStorage( - self.working_dir, namespace="full_docs" - ) - self.text_chunks_storage: JsonKVStorage = JsonKVStorage( - self.working_dir, namespace="text_chunks" - ) - self.wiki_storage: JsonKVStorage = JsonKVStorage( - self.working_dir, namespace="wiki" - ) - self.graph_storage: NetworkXStorage = NetworkXStorage( - self.working_dir, namespace="graph" - ) - self.rephrase_storage: JsonKVStorage = JsonKVStorage( - self.working_dir, namespace="rephrase" - ) - self.qa_storage: JsonKVStorage = JsonKVStorage( - os.path.join(self.working_dir, "data", "graphgen", str(self.unique_id)), namespace=f"qa-{self.unique_id}" - ) - - async def async_split_chunks(self, data: Union[List[list], List[dict]], data_type: str) -> dict: - # TODO: 是否进行指代消解 - if len(data) == 0: - return {} - - new_docs = {} - inserting_chunks = {} - if 
data_type == "raw": - assert isinstance(data, list) and isinstance(data[0], dict) - # compute hash for each document - new_docs = { - compute_content_hash(doc['content'], prefix="doc-"): {'content': doc['content']} for doc in data - } - _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys())) - new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} - if len(new_docs) == 0: - logger.warning("All docs are already in the storage") - return {} - logger.info("[New Docs] inserting %d docs", len(new_docs)) - - cur_index = 1 - doc_number = len(new_docs) - async for doc_key, doc in tqdm_async( - new_docs.items(), desc="[1/4]Chunking documents", unit="doc" - ): - chunks = { - compute_content_hash(dp["content"], prefix="chunk-"): { - **dp, - 'full_doc_id': doc_key - } for dp in self.tokenizer_instance.chunk_by_token_size(doc["content"], - self.chunk_overlap_size, self.chunk_size) - } - inserting_chunks.update(chunks) - - if self.progress_bar is not None: - self.progress_bar( - cur_index / doc_number, f"Chunking {doc_key}" - ) - cur_index += 1 - - _add_chunk_keys = await self.text_chunks_storage.filter_keys(list(inserting_chunks.keys())) - inserting_chunks = {k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys} - elif data_type == "chunked": - assert isinstance(data, list) and isinstance(data[0], list) - new_docs = { - compute_content_hash("".join(chunk['content']), prefix="doc-"): {'content': "".join(chunk['content'])} - for doc in data for chunk in doc - } - _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys())) - new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} - if len(new_docs) == 0: - logger.warning("All docs are already in the storage") - return {} - logger.info("[New Docs] inserting %d docs", len(new_docs)) - async for doc in tqdm_async(data, desc="[1/4]Chunking documents", unit="doc"): - doc_str = "".join([chunk['content'] for chunk in doc]) - for chunk in doc: - chunk_key = compute_content_hash(chunk['content'], prefix="chunk-") - inserting_chunks[chunk_key] = { - **chunk, - 'full_doc_id': compute_content_hash(doc_str, prefix="doc-") - } - _add_chunk_keys = await self.text_chunks_storage.filter_keys(list(inserting_chunks.keys())) - inserting_chunks = {k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys} - - await self.full_docs_storage.upsert(new_docs) - await self.text_chunks_storage.upsert(inserting_chunks) - - return inserting_chunks - - def insert(self, data: Union[List[list], List[dict]], data_type: str): - loop = create_event_loop() - loop.run_until_complete(self.async_insert(data, data_type)) - - async def async_insert(self, data: Union[List[list], List[dict]], data_type: str): - """ - - insert chunks into the graph - """ - - inserting_chunks = await self.async_split_chunks(data, data_type) - - if len(inserting_chunks) == 0: - logger.warning("All chunks are already in the storage") - return - logger.info("[New Chunks] inserting %d chunks", len(inserting_chunks)) - - logger.info("[Entity and Relation Extraction]...") - _add_entities_and_relations = await extract_kg( - llm_client=self.synthesizer_llm_client, - kg_instance=self.graph_storage, - tokenizer_instance=self.tokenizer_instance, - chunks=[Chunk(id=k, content=v['content']) for k, v in inserting_chunks.items()], - progress_bar = self.progress_bar, - ) - if not _add_entities_and_relations: - logger.warning("No entities or relations extracted") - return - - logger.info("[Wiki Search] is %s", 'enabled' if self.if_web_search 
else 'disabled') - if self.if_web_search: - logger.info("[Wiki Search]...") - _add_wiki_data = await search_wikipedia( - llm_client= self.synthesizer_llm_client, - wiki_search_client=self.wiki_client, - knowledge_graph_instance=_add_entities_and_relations - ) - await self.wiki_storage.upsert(_add_wiki_data) - - await self._insert_done() - - async def _insert_done(self): - tasks = [] - for storage_instance in [self.full_docs_storage, self.text_chunks_storage, - self.graph_storage, self.wiki_storage]: - if storage_instance is None: - continue - tasks.append(cast(StorageNameSpace, storage_instance).index_done_callback()) - await asyncio.gather(*tasks) - - def quiz(self, max_samples=1): - loop = create_event_loop() - loop.run_until_complete(self.async_quiz(max_samples)) - - async def async_quiz(self, max_samples=1): - await quiz(self.synthesizer_llm_client, self.graph_storage, self.rephrase_storage, max_samples) - await self.rephrase_storage.index_done_callback() - - def judge(self, re_judge=False, skip=False): - loop = create_event_loop() - loop.run_until_complete(self.async_judge(re_judge, skip)) - - async def async_judge(self, re_judge=False, skip=False): - if skip: - _update_relations = await skip_judge_statement(self.graph_storage) - else: - _update_relations = await judge_statement(self.trainee_llm_client, self.graph_storage, - self.rephrase_storage, re_judge) - await _update_relations.index_done_callback() - - def traverse(self): - loop = create_event_loop() - loop.run_until_complete(self.async_traverse()) - - async def async_traverse(self): - if self.traverse_strategy.qa_form == "atomic": - results = await traverse_graph_atomically(self.synthesizer_llm_client, - self.tokenizer_instance, - self.graph_storage, - self.traverse_strategy, - self.text_chunks_storage, - self.progress_bar) - elif self.traverse_strategy.qa_form == "multi_hop": - results = await traverse_graph_for_multi_hop(self.synthesizer_llm_client, - self.tokenizer_instance, - self.graph_storage, - self.traverse_strategy, - self.text_chunks_storage, - self.progress_bar) - elif self.traverse_strategy.qa_form == "aggregated": - results = await traverse_graph_by_edge(self.synthesizer_llm_client, self.tokenizer_instance, - self.graph_storage, self.traverse_strategy, self.text_chunks_storage, - self.progress_bar) - else: - raise ValueError(f"Unknown qa_form: {self.traverse_strategy.qa_form}") - await self.qa_storage.upsert(results) - await self.qa_storage.index_done_callback() - - def clear(self): - loop = create_event_loop() - loop.run_until_complete(self.async_clear()) - - async def async_clear(self): - await self.full_docs_storage.drop() - await self.text_chunks_storage.drop() - await self.wiki_storage.drop() - await self.graph_storage.clear() - await self.rephrase_storage.drop() - await self.qa_storage.drop() - - logger.info("All caches are cleared") diff --git a/graphgen/judge.py b/graphgen/judge.py deleted file mode 100644 index f05bdf1d..00000000 --- a/graphgen/judge.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -import argparse -import asyncio -from dotenv import load_dotenv - -from .models import NetworkXStorage, JsonKVStorage, OpenAIModel -from .operators import judge_statement - -sys_path = os.path.abspath(os.path.dirname(__file__)) - -load_dotenv() - -def calculate_average_loss(graph: NetworkXStorage): - """ - Calculate the average loss of the graph. 
- - :param graph: NetworkXStorage - :return: float - """ - edges = asyncio.run(graph.get_all_edges()) - total_loss = 0 - for edge in edges: - total_loss += edge[2]['loss'] - return total_loss / len(edges) - - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--input', type=str, default=os.path.join(sys_path, "cache"), help='path to load input graph') - parser.add_argument('--output', type=str, default='cache/output/new_graph.graphml', help='path to save output') - - args = parser.parse_args() - - llm_client = OpenAIModel( - model_name=os.getenv("TRAINEE_MODEL"), - api_key=os.getenv("TRAINEE_API_KEY"), - base_url=os.getenv("TRAINEE_BASE_URL") - ) - - graph_storage = NetworkXStorage( - args.input, - namespace="graph" - ) - average_loss = calculate_average_loss(graph_storage) - print(f"Average loss of the graph: {average_loss}") - - rephrase_storage = JsonKVStorage( - os.path.join(sys_path, "cache"), - namespace="rephrase" - ) - - new_graph = asyncio.run(judge_statement(llm_client, graph_storage, rephrase_storage, re_judge=True)) - - graph_file = asyncio.run(graph_storage.get_graph()) - - new_graph.write_nx_graph(graph_file, args.output) - - average_loss = calculate_average_loss(new_graph) - print(f"Average loss of the graph: {average_loss}") diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py index c2f9e714..4606715b 100644 --- a/graphgen/models/__init__.py +++ b/graphgen/models/__init__.py @@ -1,41 +1,43 @@ -from .text.chunk import Chunk -from .text.text_pair import TextPair - -from .llm.topk_token_model import Token, TopkTokenModel -from .llm.openai_model import OpenAIModel -from .llm.tokenizer import Tokenizer - -from .storage.networkx_storage import NetworkXStorage -from .storage.json_storage import JsonKVStorage - -from .search.wiki_search import WikiSearch - -from .evaluate.length_evaluator import LengthEvaluator -from .evaluate.mtld_evaluator import MTLDEvaluator -from .evaluate.reward_evaluator import RewardEvaluator -from .evaluate.uni_evaluator import UniEvaluator - -from .strategy.travserse_strategy import TraverseStrategy - - -__all__ = [ - # llm models - "OpenAIModel", - "TopkTokenModel", - "Token", - "Tokenizer", - # storage models - "Chunk", - "NetworkXStorage", - "JsonKVStorage", - # search models - "WikiSearch", - # evaluate models - "TextPair", - "LengthEvaluator", - "MTLDEvaluator", - "RewardEvaluator", - "UniEvaluator", - # strategy models - "TraverseStrategy", -] +from .evaluator import LengthEvaluator, MTLDEvaluator, RewardEvaluator, UniEvaluator +from .generator import ( + AggregatedGenerator, + AtomicGenerator, + CoTGenerator, + MultiHopGenerator, + OmicsQAGenerator, + QuizGenerator, + VQAGenerator, +) +from .kg_builder import LightRAGKGBuilder, MMKGBuilder, OmicsKGBuilder +from .llm import HTTPClient, OllamaClient, OpenAIClient +from .partitioner import ( + AnchorBFSPartitioner, + BFSPartitioner, + DFSPartitioner, + ECEPartitioner, + LeidenPartitioner, +) +from .reader import ( + CSVReader, + JSONReader, + ParquetReader, + PDFReader, + PickleReader, + RDFReader, + TXTReader, +) +from .searcher.db.ncbi_searcher import NCBISearch +from .searcher.db.rnacentral_searcher import RNACentralSearch +from .searcher.db.uniprot_searcher import UniProtSearch +from .searcher.kg.wiki_search import WikiSearch +from .searcher.web.bing_search import BingSearch +from .searcher.web.google_search import GoogleSearch +from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter +from .storage import ( + 
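+    # KV and graph storage backends (JSON, RocksDB, NetworkX, Kuzu)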
JsonKVStorage, + KuzuStorage, + NetworkXStorage, + RocksDBCache, + RocksDBKVStorage, +) +from .tokenizer import Tokenizer diff --git a/graphgen/models/embed/embedding.py b/graphgen/models/embed/embedding.py deleted file mode 100644 index 8213b90f..00000000 --- a/graphgen/models/embed/embedding.py +++ /dev/null @@ -1,29 +0,0 @@ -from dataclasses import dataclass -import asyncio -import numpy as np - -class UnlimitedSemaphore: - """A context manager that allows unlimited access.""" - - async def __aenter__(self): - pass - - async def __aexit__(self, exc_type, exc, tb): - pass - -@dataclass -class EmbeddingFunc: - embedding_dim: int - max_token_size: int - func: callable - concurrent_limit: int = 16 - - def __post_init__(self): - if self.concurrent_limit != 0: - self._semaphore = asyncio.Semaphore(self.concurrent_limit) - else: - self._semaphore = UnlimitedSemaphore() - - async def __call__(self, *args, **kwargs) -> np.ndarray: - async with self._semaphore: - return await self.func(*args, **kwargs) diff --git a/graphgen/models/evaluate/length_evaluator.py b/graphgen/models/evaluate/length_evaluator.py deleted file mode 100644 index ba53ff6b..00000000 --- a/graphgen/models/evaluate/length_evaluator.py +++ /dev/null @@ -1,22 +0,0 @@ -from dataclasses import dataclass -from graphgen.models.evaluate.base_evaluator import BaseEvaluator -from graphgen.models.llm.tokenizer import Tokenizer -from graphgen.models.text.text_pair import TextPair -from graphgen.utils import create_event_loop - - -@dataclass -class LengthEvaluator(BaseEvaluator): - tokenizer_name: str = "cl100k_base" - def __post_init__(self): - self.tokenizer = Tokenizer( - model_name=self.tokenizer_name - ) - - async def evaluate_single(self, pair: TextPair) -> float: - loop = create_event_loop() - return await loop.run_in_executor(None, self._calculate_length, pair.answer) - - def _calculate_length(self, text: str) -> float: - tokens = self.tokenizer.encode_string(text) - return len(tokens) diff --git a/graphgen/models/evaluator/__init__.py b/graphgen/models/evaluator/__init__.py new file mode 100644 index 00000000..a9b445b4 --- /dev/null +++ b/graphgen/models/evaluator/__init__.py @@ -0,0 +1,4 @@ +from .length_evaluator import LengthEvaluator +from .mtld_evaluator import MTLDEvaluator +from .reward_evaluator import RewardEvaluator +from .uni_evaluator import UniEvaluator diff --git a/graphgen/models/evaluate/base_evaluator.py b/graphgen/models/evaluator/base_evaluator.py similarity index 55% rename from graphgen/models/evaluate/base_evaluator.py rename to graphgen/models/evaluator/base_evaluator.py index 6c5ae2d5..e93ea262 100644 --- a/graphgen/models/evaluate/base_evaluator.py +++ b/graphgen/models/evaluator/base_evaluator.py @@ -1,22 +1,32 @@ import asyncio -from dataclasses import dataclass from tqdm.asyncio import tqdm as tqdm_async + +from graphgen.bases.datatypes import QAPair from graphgen.utils import create_event_loop -from graphgen.models.text.text_pair import TextPair -@dataclass + class BaseEvaluator: - max_concurrent: int = 100 - results: list[float] = None + def __init__(self, max_concurrent: int = 100): + self.max_concurrent = max_concurrent + self.results: list[float] = None - def evaluate(self, pairs: list[TextPair]) -> list[float]: + def evaluate(self, pairs: list[QAPair]) -> list[float]: """ Evaluate the text and return a score. 
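+        Runs async_evaluate on a newly created event loop when needed; calling it from a
+        running loop raises RuntimeError, so use async_evaluate directly in async code.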
""" - return create_event_loop().run_until_complete(self.async_evaluate(pairs)) - - async def async_evaluate(self, pairs: list[TextPair]) -> list[float]: + loop, created = create_event_loop() + try: + if loop.is_running(): + raise RuntimeError( + "Cannot use evaluate when event loop is already running." + ) + return loop.run_until_complete(self.async_evaluate(pairs)) + finally: + if created: + loop.close() + + async def async_evaluate(self, pairs: list[QAPair]) -> list[float]: semaphore = asyncio.Semaphore(self.max_concurrent) async def evaluate_with_semaphore(pair): @@ -31,10 +41,10 @@ async def evaluate_with_semaphore(pair): results.append(await result) return results - async def evaluate_single(self, pair: TextPair) -> float: + async def evaluate_single(self, pair: QAPair) -> float: raise NotImplementedError() - def get_average_score(self, pairs: list[TextPair]) -> float: + def get_average_score(self, pairs: list[QAPair]) -> float: """ Get the average score of a batch of texts. """ @@ -42,7 +52,7 @@ def get_average_score(self, pairs: list[TextPair]) -> float: self.results = results return sum(self.results) / len(pairs) - def get_min_max_score(self, pairs: list[TextPair]) -> tuple[float, float]: + def get_min_max_score(self, pairs: list[QAPair]) -> tuple[float, float]: """ Get the min and max score of a batch of texts. """ diff --git a/graphgen/models/evaluator/length_evaluator.py b/graphgen/models/evaluator/length_evaluator.py new file mode 100644 index 00000000..9fbc6f3c --- /dev/null +++ b/graphgen/models/evaluator/length_evaluator.py @@ -0,0 +1,21 @@ +import asyncio + +from graphgen.bases.datatypes import QAPair +from graphgen.models.evaluator.base_evaluator import BaseEvaluator +from graphgen.models.tokenizer import Tokenizer + + +class LengthEvaluator(BaseEvaluator): + def __init__(self, tokenizer_name: str = "cl100k_base", max_concurrent: int = 100): + super().__init__(max_concurrent) + self.tokenizer_name = tokenizer_name + self.tokenizer = Tokenizer(model_name=self.tokenizer_name) + + async def evaluate_single(self, pair: QAPair) -> float: + # In async context, we should use the running loop + loop = asyncio.get_running_loop() + return await loop.run_in_executor(None, self._calculate_length, pair.answer) + + def _calculate_length(self, text: str) -> float: + tokens = self.tokenizer.encode(text) + return len(tokens) diff --git a/graphgen/models/evaluate/mtld_evaluator.py b/graphgen/models/evaluator/mtld_evaluator.py similarity index 74% rename from graphgen/models/evaluate/mtld_evaluator.py rename to graphgen/models/evaluator/mtld_evaluator.py index 4ea68875..3423425b 100644 --- a/graphgen/models/evaluate/mtld_evaluator.py +++ b/graphgen/models/evaluator/mtld_evaluator.py @@ -1,23 +1,26 @@ -from dataclasses import dataclass, field from typing import Set -from graphgen.models.evaluate.base_evaluator import BaseEvaluator -from graphgen.models.text.text_pair import TextPair -from graphgen.utils import detect_main_language, NLTKHelper, create_event_loop - +from graphgen.bases.datatypes import QAPair +from graphgen.models.evaluator.base_evaluator import BaseEvaluator +from graphgen.utils import NLTKHelper, detect_main_language nltk_helper = NLTKHelper() -@dataclass + class MTLDEvaluator(BaseEvaluator): """ 衡量文本词汇多样性的指标 """ - stopwords_en: Set[str] = field(default_factory=lambda: set(nltk_helper.get_stopwords("english"))) - stopwords_zh: Set[str] = field(default_factory=lambda: set(nltk_helper.get_stopwords("chinese"))) - async def evaluate_single(self, pair: TextPair) -> float: - loop 
= create_event_loop() + def __init__(self, max_concurrent: int = 100): + super().__init__(max_concurrent) + self.stopwords_en: Set[str] = set(nltk_helper.get_stopwords("english")) + self.stopwords_zh: Set[str] = set(nltk_helper.get_stopwords("chinese")) + + async def evaluate_single(self, pair: QAPair) -> float: + # In async context, we should use the running loop + import asyncio + loop = asyncio.get_running_loop() return await loop.run_in_executor(None, self._calculate_mtld_score, pair.answer) def _calculate_mtld_score(self, text: str, threshold=0.72) -> float: @@ -71,6 +74,6 @@ def _compute_factors(tokens: list, threshold: float) -> float: if ttr <= threshold: factors += 1 else: - factors += (1 - (ttr - threshold) / (1 - threshold)) + factors += 1 - (ttr - threshold) / (1 - threshold) return len(tokens) / factors if factors > 0 else len(tokens) diff --git a/graphgen/models/evaluate/reward_evaluator.py b/graphgen/models/evaluator/reward_evaluator.py similarity index 89% rename from graphgen/models/evaluate/reward_evaluator.py rename to graphgen/models/evaluator/reward_evaluator.py index 2e4c021c..4d2c2fb9 100644 --- a/graphgen/models/evaluate/reward_evaluator.py +++ b/graphgen/models/evaluator/reward_evaluator.py @@ -1,6 +1,8 @@ from dataclasses import dataclass + from tqdm import tqdm -from graphgen.models.text.text_pair import TextPair + +from graphgen.bases.datatypes import QAPair @dataclass @@ -9,19 +11,22 @@ class RewardEvaluator: Reward Model Evaluator. OpenAssistant/reward-model-deberta-v3-large-v2: 分数范围为[-inf, inf],越高越好 """ + reward_name: str = "OpenAssistant/reward-model-deberta-v3-large-v2" max_length: int = 2560 results: list[float] = None def __post_init__(self): import torch + self.num_gpus = torch.cuda.device_count() @staticmethod def process_chunk(rank, pairs, reward_name, max_length, return_dict): import torch from transformers import AutoModelForSequenceClassification, AutoTokenizer - device = f'cuda:{rank}' + + device = f"cuda:{rank}" torch.cuda.set_device(rank) rank_model = AutoModelForSequenceClassification.from_pretrained(reward_name) @@ -37,7 +42,7 @@ def process_chunk(rank, pairs, reward_name, max_length, return_dict): pair.answer, return_tensors="pt", max_length=max_length, - truncation=True + truncation=True, ) inputs = {k: v.to(device) for k, v in inputs.items()} score = rank_model(**inputs).logits[0].item() @@ -45,8 +50,9 @@ def process_chunk(rank, pairs, reward_name, max_length, return_dict): return_dict[rank] = results - def evaluate(self, pairs: list[TextPair]) -> list[float]: + def evaluate(self, pairs: list[QAPair]) -> list[float]: import torch.multiprocessing as mp + chunk_size = len(pairs) // self.num_gpus chunks = [] for i in range(self.num_gpus): @@ -64,7 +70,7 @@ def evaluate(self, pairs: list[TextPair]) -> list[float]: for rank, chunk in enumerate(chunks): p = mp.Process( target=self.process_chunk, - args=(rank, chunk, self.reward_name, self.max_length, return_dict) + args=(rank, chunk, self.reward_name, self.max_length, return_dict), ) p.start() processes.append(p) @@ -84,7 +90,7 @@ def evaluate(self, pairs: list[TextPair]) -> list[float]: return results - def get_average_score(self, pairs: list[TextPair]) -> float: + def get_average_score(self, pairs: list[QAPair]) -> float: """ Get the average score of a batch of texts. 
""" @@ -92,7 +98,7 @@ def get_average_score(self, pairs: list[TextPair]) -> float: self.results = results return sum(self.results) / len(pairs) - def get_min_max_score(self, pairs: list[TextPair]) -> tuple[float, float]: + def get_min_max_score(self, pairs: list[QAPair]) -> tuple[float, float]: """ Get the min and max score of a batch of texts. """ diff --git a/graphgen/models/evaluate/uni_evaluator.py b/graphgen/models/evaluator/uni_evaluator.py similarity index 70% rename from graphgen/models/evaluate/uni_evaluator.py rename to graphgen/models/evaluator/uni_evaluator.py index a334f0a9..20fa3517 100644 --- a/graphgen/models/evaluate/uni_evaluator.py +++ b/graphgen/models/evaluator/uni_evaluator.py @@ -1,40 +1,58 @@ # https://github.com/maszhongming/UniEval/tree/main from dataclasses import dataclass, field + from tqdm import tqdm -from graphgen.models.text.text_pair import TextPair + +from graphgen.bases.datatypes import QAPair def _add_questions(dimension: str, question: str, answer: str): if dimension == "naturalness": - cur_input = 'question: Is this a natural response in the dialogue? response: ' + answer + cur_input = ( + "question: Is this a natural response in the dialogue? response: " + + answer + ) elif dimension == "coherence": - cur_input = 'question: Is this a coherent response given the dialogue history? response: ' \ - + answer + ' dialogue history: ' + question + cur_input = ( + "question: Is this a coherent response given the dialogue history? response: " + + answer + + " dialogue history: " + + question + ) elif dimension == "understandability": - cur_input = 'question: Is this an understandable response in the dialogue? response: ' + answer + cur_input = ( + "question: Is this an understandable response in the dialogue? response: " + + answer + ) else: raise NotImplementedError( - 'The input format for this dimension is still undefined. Please customize it first.') + "The input format for this dimension is still undefined. Please customize it first." 
+ ) return cur_input + @dataclass class UniEvaluator: model_name: str = "MingZhong/unieval-sum" - dimensions: list = field(default_factory=lambda: ['naturalness', 'coherence', 'understandability']) + dimensions: list = field( + default_factory=lambda: ["naturalness", "coherence", "understandability"] + ) max_length: int = 2560 results: dict = None def __post_init__(self): import torch + self.num_gpus = torch.cuda.device_count() self.results = {} @staticmethod def process_chunk(rank, pairs, model_name, max_length, dimension, return_dict): import torch - from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - device = f'cuda:{rank}' + from transformers import AutoModelForSeq2SeqLM, AutoTokenizer + + device = f"cuda:{rank}" torch.cuda.set_device(rank) rank_model = AutoModelForSeq2SeqLM.from_pretrained(model_name) @@ -59,26 +77,26 @@ def process_chunk(rank, pairs, model_name, max_length, dimension, return_dict): max_length=max_length, truncation=True, padding=True, - return_tensors='pt' + return_tensors="pt", ) encoded_tgt = tokenizer( tgt, max_length=max_length, truncation=True, padding=True, - return_tensors='pt' + return_tensors="pt", ) - src_tokens = encoded_src['input_ids'].to(device) - src_mask = encoded_src['attention_mask'].to(device) + src_tokens = encoded_src["input_ids"].to(device) + src_mask = encoded_src["attention_mask"].to(device) - tgt_tokens = encoded_tgt['input_ids'].to(device)[:, 0].unsqueeze(-1) + tgt_tokens = encoded_tgt["input_ids"].to(device)[:, 0].unsqueeze(-1) output = rank_model( input_ids=src_tokens, attention_mask=src_mask, labels=tgt_tokens, - use_cache = False + use_cache=False, ) logits = output.logits.view(-1, rank_model.config.vocab_size) @@ -91,8 +109,9 @@ def process_chunk(rank, pairs, model_name, max_length, dimension, return_dict): return_dict[rank] = results - def evaluate(self, pairs: list[TextPair]) -> list[dict]: + def evaluate(self, pairs: list[QAPair]) -> list[dict]: import torch.multiprocessing as mp + final_results = [] for dimension in self.dimensions: chunk_size = len(pairs) // self.num_gpus @@ -112,7 +131,14 @@ def evaluate(self, pairs: list[TextPair]) -> list[dict]: for rank, chunk in enumerate(chunks): p = mp.Process( target=self.process_chunk, - args=(rank, chunk, self.model_name, self.max_length, dimension, return_dict) + args=( + rank, + chunk, + self.model_name, + self.max_length, + dimension, + return_dict, + ), ) p.start() processes.append(p) @@ -130,12 +156,10 @@ def evaluate(self, pairs: list[TextPair]) -> list[dict]: p.terminate() p.join() - final_results.append({ - dimension: results - }) + final_results.append({dimension: results}) return final_results - def get_average_score(self, pairs: list[TextPair]) -> dict: + def get_average_score(self, pairs: list[QAPair]) -> dict: """ Get the average score of a batch of texts. """ @@ -147,7 +171,7 @@ def get_average_score(self, pairs: list[TextPair]) -> dict: self.results[key] = value return final_results - def get_min_max_score(self, pairs: list[TextPair]) -> dict: + def get_min_max_score(self, pairs: list[QAPair]) -> dict: """ Get the min and max score of a batch of texts. 
""" diff --git a/graphgen/models/extractor/__init__.py b/graphgen/models/extractor/__init__.py new file mode 100644 index 00000000..49db9fc2 --- /dev/null +++ b/graphgen/models/extractor/__init__.py @@ -0,0 +1 @@ +from .schema_guided_extractor import SchemaGuidedExtractor diff --git a/graphgen/models/extractor/key_information_extractor.py b/graphgen/models/extractor/key_information_extractor.py new file mode 100644 index 00000000..1a894fee --- /dev/null +++ b/graphgen/models/extractor/key_information_extractor.py @@ -0,0 +1 @@ +# TODO: text2json diff --git a/graphgen/models/extractor/schema_guided_extractor.py b/graphgen/models/extractor/schema_guided_extractor.py new file mode 100644 index 00000000..74801946 --- /dev/null +++ b/graphgen/models/extractor/schema_guided_extractor.py @@ -0,0 +1,108 @@ +import json +from typing import Dict, List + +from graphgen.bases import BaseExtractor, BaseLLMWrapper +from graphgen.templates import SCHEMA_GUIDED_EXTRACTION_PROMPT +from graphgen.utils import compute_dict_hash, detect_main_language, logger + + +class SchemaGuidedExtractor(BaseExtractor): + """ + Use JSON/YAML Schema or Pydantic Model to guide the LLM to extract structured information from text. + + Usage example: + schema = { + "type": "legal contract", + "description": "A legal contract for leasing property.", + "properties": { + "end_date": {"type": "string", "description": "The end date of the lease."}, + "leased_space": {"type": "string", "description": "Description of the space that is being leased."}, + "lessee": {"type": "string", "description": "The lessee's name (and possibly address)."}, + "lessor": {"type": "string", "description": "The lessor's name (and possibly address)."}, + "signing_date": {"type": "string", "description": "The date the contract was signed."}, + "start_date": {"type": "string", "description": "The start date of the lease."}, + "term_of_payment": {"type": "string", "description": "Description of the payment terms."}, + "designated_use": {"type": "string", + "description": "Description of the designated use of the property being leased."}, + "extension_period": {"type": "string", + "description": "Description of the extension options for the lease."}, + "expiration_date_of_lease": {"type": "string", "description": "The expiration data of the lease."} + }, + "required": ["lessee", "lessor", "start_date", "end_date"] + } + extractor = SchemaGuidedExtractor(llm_client, schema) + result = extractor.extract(text) + + """ + + def __init__(self, llm_client: BaseLLMWrapper, schema: dict): + super().__init__(llm_client) + self.schema = schema + self.required_keys = self.schema.get("required") + if not self.required_keys: + # If no required keys are specified, use all keys from the schema as default + self.required_keys = list(self.schema.get("properties", {}).keys()) + + def build_prompt(self, text: str) -> str: + schema_explanation = "" + for field, details in self.schema.get("properties", {}).items(): + description = details.get("description", "No description provided.") + schema_explanation += f'- "{field}": {description}\n' + + lang = detect_main_language(text) + + prompt = SCHEMA_GUIDED_EXTRACTION_PROMPT[lang].format( + field=self.schema.get("name", "the document"), + schema_explanation=schema_explanation, + examples="", + text=text, + ) + return prompt + + async def extract(self, chunk: dict) -> dict: + _chunk_id = chunk.get("_chunk_id", "") + text = chunk.get("content", "") + + prompt = self.build_prompt(text) + response = await 
self.llm_client.generate_answer(prompt) + try: + extracted_info = json.loads(response) + # Ensure all required keys are present + for key in self.required_keys: + if key not in extracted_info: + extracted_info[key] = "" + if any(extracted_info[key] == "" for key in self.required_keys): + logger.debug("Missing required keys in extraction: %s", extracted_info) + return {} + main_keys_info = {key: extracted_info[key] for key in self.required_keys} + logger.debug("Extracted info: %s", extracted_info) + + # add chunk metadata + extracted_info["_chunk_id"] = _chunk_id + + return { + compute_dict_hash(main_keys_info, prefix="extract-"): extracted_info + } + except json.JSONDecodeError: + logger.error("Failed to parse extraction response: %s", response) + return {} + + @staticmethod + def merge_extractions(extraction_list: List[Dict[str, dict]]) -> Dict[str, dict]: + """ + Merge multiple extraction results based on their hashes. + :param extraction_list: List of extraction results, each is a dict with hash as key and record as value. + :return: Merged extraction results. + """ + merged: Dict[str, dict] = {} + for ext in extraction_list: + for h, rec in ext.items(): + if h not in merged: + merged[h] = rec.copy() + else: + for k, v in rec.items(): + if k not in merged[h] or merged[h][k] == v: + merged[h][k] = v + else: + merged[h][k] = f"{merged[h][k]}{v}" + return merged diff --git a/graphgen/models/generator/__init__.py b/graphgen/models/generator/__init__.py new file mode 100644 index 00000000..ec41f5dc --- /dev/null +++ b/graphgen/models/generator/__init__.py @@ -0,0 +1,7 @@ +from .aggregated_generator import AggregatedGenerator +from .atomic_generator import AtomicGenerator +from .cot_generator import CoTGenerator +from .multi_hop_generator import MultiHopGenerator +from .omics_qa_generator import OmicsQAGenerator +from .quiz_generator import QuizGenerator +from .vqa_generator import VQAGenerator diff --git a/graphgen/models/generator/aggregated_generator.py b/graphgen/models/generator/aggregated_generator.py new file mode 100644 index 00000000..4bad8e99 --- /dev/null +++ b/graphgen/models/generator/aggregated_generator.py @@ -0,0 +1,125 @@ +from typing import Any + +from graphgen.bases import BaseGenerator +from graphgen.templates import AGGREGATED_GENERATION_PROMPT +from graphgen.utils import compute_content_hash, detect_main_language, logger + + +class AggregatedGenerator(BaseGenerator): + """ + Aggregated Generator follows a TWO-STEP process: + 1. rephrase: Rephrase the input nodes and edges into a coherent text that maintains the original meaning. + The rephrased text is considered as answer to be used in the next step. + 2. question generation: Generate relevant questions based on the rephrased text. + """ + + @staticmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + """ + Build prompts for REPHRASE. + :param batch + :return: + """ + nodes, edges = batch + entities_str = "\n".join( + [ + f"{index + 1}. {node[0]}: {node[1]['description']}" + for index, node in enumerate(nodes) + ] + ) + relations_str = "\n".join( + [ + f"{index + 1}. 
{edge[0]} -- {edge[1]}: {edge[2]['description']}" + for index, edge in enumerate(edges) + ] + ) + language = detect_main_language(entities_str + relations_str) + + # TODO: configure add_context + # if add_context: + # original_ids = [ + # node["source_id"].split("")[0] for node in _process_nodes + # ] + [edge[2]["source_id"].split("")[0] for edge in _process_edges] + # original_ids = list(set(original_ids)) + # original_text = await text_chunks_storage.get_by_ids(original_ids) + # original_text = "\n".join( + # [ + # f"{index + 1}. {text['content']}" + # for index, text in enumerate(original_text) + # ] + # ) + prompt = AGGREGATED_GENERATION_PROMPT[language]["ANSWER_REPHRASING"].format( + entities=entities_str, relationships=relations_str + ) + return prompt + + @staticmethod + def parse_rephrased_text(response: str) -> str: + """ + Parse the rephrased text from the response. + :param response: + :return: rephrased text + """ + if "Rephrased Text:" in response: + rephrased_text = response.split("Rephrased Text:")[1].strip() + elif "重述文本:" in response: + rephrased_text = response.split("重述文本:")[1].strip() + else: + rephrased_text = response.strip() + return rephrased_text.strip('"') + + @staticmethod + def _build_prompt_for_question_generation(answer: str) -> str: + """ + Build prompts for QUESTION GENERATION. + :param answer: + :return: + """ + language = detect_main_language(answer) + prompt = AGGREGATED_GENERATION_PROMPT[language]["QUESTION_GENERATION"].format( + answer=answer + ) + return prompt + + @staticmethod + def parse_response(response: str) -> dict: + if response.startswith("Question:"): + question = response[len("Question:") :].strip() + elif response.startswith("问题:"): + question = response[len("问题:") :].strip() + else: + question = response.strip() + return { + "question": question, + } + + async def generate( + self, + batch: tuple[ + list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]] + ], + ) -> dict[str, Any]: + """ + Generate QAs based on a given batch. 
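+        Makes two LLM calls per batch: one to rephrase the subgraph into an answer,
+        then one to generate a matching question for that answer.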
+ :param batch + :return: QA pairs + """ + result = {} + rephrasing_prompt = self.build_prompt(batch) + response = await self.llm_client.generate_answer(rephrasing_prompt) + context = self.parse_rephrased_text(response) + question_generation_prompt = self._build_prompt_for_question_generation(context) + response = await self.llm_client.generate_answer(question_generation_prompt) + question = self.parse_response(response)["question"] + logger.debug("Question: %s", question) + logger.debug("Answer: %s", context) + qa_pairs = { + compute_content_hash(question): { + "question": question, + "answer": context, + } + } + result.update(qa_pairs) + return result diff --git a/graphgen/models/generator/atomic_generator.py b/graphgen/models/generator/atomic_generator.py new file mode 100644 index 00000000..713140d2 --- /dev/null +++ b/graphgen/models/generator/atomic_generator.py @@ -0,0 +1,50 @@ +from typing import Any + +from graphgen.bases import BaseGenerator +from graphgen.templates import ATOMIC_GENERATION_PROMPT +from graphgen.utils import compute_content_hash, detect_main_language, logger + + +class AtomicGenerator(BaseGenerator): + @staticmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + nodes, edges = batch + context = "" + for node in nodes: + context += f"- {node[0]}: {node[1]['description']}\n" + for edge in edges: + context += f"- {edge[0]} - {edge[1]}: {edge[2]['description']}\n" + language = detect_main_language(context) + + prompt = ATOMIC_GENERATION_PROMPT[language].format(context=context) + return prompt + + @staticmethod + def parse_response(response: str) -> dict: + """ + AtomicGenerator normally generates one QA pair per response. + So we just need to parse one QA pair from the response. + :param response: + :return: + """ + if "Question:" in response and "Answer:" in response: + question = response.split("Question:")[1].split("Answer:")[0].strip() + answer = response.split("Answer:")[1].strip() + elif "问题:" in response and "答案:" in response: + question = response.split("问题:")[1].split("答案:")[0].strip() + answer = response.split("答案:")[1].strip() + else: + logger.warning("Failed to parse response: %s", response) + return {} + question = question.strip('"') + answer = answer.strip('"') + logger.debug("Question: %s", question) + logger.debug("Answer: %s", answer) + return { + compute_content_hash(question): { + "question": question, + "answer": answer, + } + } diff --git a/graphgen/models/generator/cot_generator.py b/graphgen/models/generator/cot_generator.py new file mode 100644 index 00000000..a111a6f6 --- /dev/null +++ b/graphgen/models/generator/cot_generator.py @@ -0,0 +1,120 @@ +from typing import Any + +from graphgen.bases import BaseGenerator +from graphgen.templates import COT_GENERATION_PROMPT +from graphgen.utils import compute_content_hash, detect_main_language, logger + + +class CoTGenerator(BaseGenerator): + @staticmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + """ + Build prompts for COT Template Design. + :param batch: + :return: + """ + nodes, edges = batch + entities_str = "\n".join( + [ + f"{index + 1}. {node[0]}: {node[1]['description']}" + for index, node in enumerate(nodes) + ] + ) + relationships_str = "\n".join( + [ + f"{index + 1}. 
{edge[0]} -- {edge[1]}: {edge[2]['description']}" + for index, edge in enumerate(edges) + ] + ) + language = detect_main_language(entities_str + relationships_str) + prompt = COT_GENERATION_PROMPT[language]["COT_TEMPLATE_DESIGN"].format( + entities=entities_str, relationships=relationships_str + ) + return prompt + + @staticmethod + def build_prompt_for_cot_generation( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]], + question: str, + reasoning_path: str, + ) -> str: + """ + Build prompts for COT Generation. + """ + nodes, edges = batch + entities_str = "\n".join( + [ + f"{index + 1}. {node[0]}: {node[1]['description']}" + for index, node in enumerate(nodes) + ] + ) + relationships_str = "\n".join( + [ + f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}" + for index, edge in enumerate(edges) + ] + ) + language = detect_main_language(entities_str + relationships_str) + prompt = COT_GENERATION_PROMPT[language]["COT_GENERATION"].format( + entities=entities_str, + relationships=relationships_str, + question=question, + reasoning_template=reasoning_path, + ) + return prompt + + @staticmethod + def parse_response(response: str) -> dict: + if "Question:" in response and "Reasoning-Path Design:" in response: + question = ( + response.split("Question:")[1] + .split("Reasoning-Path Design:")[0] + .strip() + ) + reasoning_path = response.split("Reasoning-Path Design:")[1].strip() + elif "问题:" in response and "推理路径设计:" in response: + question = response.split("问题:")[1].split("推理路径设计:")[0].strip() + reasoning_path = response.split("推理路径设计:")[1].strip() + else: + logger.warning("Failed to parse CoT template: %s", response) + return {} + + question = question.strip('"') + reasoning_path = reasoning_path.strip('"') + logger.debug("CoT Question: %s", question) + logger.debug("CoT Reasoning Path: %s", reasoning_path) + return { + "question": question, + "reasoning_path": reasoning_path, + } + + async def generate( + self, + batch: tuple[ + list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]] + ], + ) -> dict[str, Any]: + """ + Generate QAs based on a given batch. + :param batch + :return: QA pairs + """ + result = {} + prompt = self.build_prompt(batch) + response = await self.llm_client.generate_answer(prompt) + response = self.parse_response(response) + question, reasoning_path = response["question"], response["reasoning_path"] + prompt = self.build_prompt_for_cot_generation(batch, question, reasoning_path) + cot_answer = await self.llm_client.generate_answer(prompt) + logger.debug("CoT Answer: %s", cot_answer) + qa_pairs = { + compute_content_hash(question): { + "question": question, + "answer": cot_answer, + "reasoning_path": reasoning_path, + } + } + result.update(qa_pairs) + return result diff --git a/graphgen/models/generator/multi_hop_generator.py b/graphgen/models/generator/multi_hop_generator.py new file mode 100644 index 00000000..9098b102 --- /dev/null +++ b/graphgen/models/generator/multi_hop_generator.py @@ -0,0 +1,53 @@ +from typing import Any + +from graphgen.bases import BaseGenerator +from graphgen.templates import MULTI_HOP_GENERATION_PROMPT +from graphgen.utils import compute_content_hash, detect_main_language, logger + + +class MultiHopGenerator(BaseGenerator): + @staticmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + nodes, edges = batch + entities_str = "\n".join( + [ + f"{index + 1}. 
{node[0]}: {node[1]['description']}" + for index, node in enumerate(nodes) + ] + ) + + relationships_str = "\n".join( + [ + f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}" + for index, edge in enumerate(edges) + ] + ) + language = detect_main_language(entities_str + relationships_str) + prompt = MULTI_HOP_GENERATION_PROMPT[language].format( + entities=entities_str, relationships=relationships_str + ) + return prompt + + @staticmethod + def parse_response(response: str) -> dict: + if "Question:" in response and "Answer:" in response: + question = response.split("Question:")[1].split("Answer:")[0].strip() + answer = response.split("Answer:")[1].strip() + elif "问题:" in response and "答案:" in response: + question = response.split("问题:")[1].split("答案:")[0].strip() + answer = response.split("答案:")[1].strip() + else: + logger.warning("Failed to parse response: %s", response) + return {} + question = question.strip('"') + answer = answer.strip('"') + logger.debug("Question: %s", question) + logger.debug("Answer: %s", answer) + return { + compute_content_hash(question): { + "question": question, + "answer": answer, + } + } diff --git a/graphgen/models/generator/omics_qa_generator.py b/graphgen/models/generator/omics_qa_generator.py new file mode 100644 index 00000000..d5ac7ddb --- /dev/null +++ b/graphgen/models/generator/omics_qa_generator.py @@ -0,0 +1,365 @@ +import re +from typing import Any, Optional + +from graphgen.bases import BaseGenerator +from graphgen.templates import OMICS_QA_GENERATION_PROMPT +from graphgen.utils import compute_content_hash, detect_main_language, logger + + +class OmicsQAGenerator(BaseGenerator): + """ + Unified QA generator for multi-omics data (DNA, RNA, Protein). + Automatically extracts and attaches molecule-specific caption information to QA pairs. + """ + + @staticmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + nodes, edges = batch + entities_str = "\n".join( + [ + f"{index + 1}. {node[0]}: {node[1]['description']}" + for index, node in enumerate(nodes) + ] + ) + + relationships_str = "\n".join( + [ + f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}" + for index, edge in enumerate(edges) + ] + ) + language = detect_main_language(entities_str + relationships_str) + prompt = OMICS_QA_GENERATION_PROMPT[language].format( + entities=entities_str, relationships=relationships_str + ) + return prompt + + @staticmethod + def parse_response(response: str) -> Any: + """ + Parse the LLM response and return the generated QAs + :param response + :return: QA pairs + """ + qa_pairs = {} + qa_list = response.strip().split("\n\n") + for qa in qa_list: + match = re.search( + r"Question:\s*(.*?)\s*Answer:\s*(.*)", qa, re.DOTALL + ) or re.search(r"问题:\s*(.*?)\s*答案:\s*(.*)", qa, re.DOTALL) + + if match: + question = match.group(1).strip() + answer = match.group(2).strip() + else: + if qa: + logger.error("Failed to parse QA pair: %s", qa) + continue + question = question.strip('"') + answer = answer.strip('"') + logger.debug("Question: %s", question) + logger.debug("Answer: %s", answer) + qa_pairs[compute_content_hash(question)] = { + "question": question, + "answer": answer, + } + return qa_pairs + + @staticmethod + def _extract_caption(node_data: dict, molecule_type: str) -> Optional[dict]: + """ + Extract molecule-specific caption information from node data. 
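+        Prefers an existing "<type>_caption" field; otherwise builds a caption from
+        type-specific fields (for proteins, UniProt search results and the "protein"
+        field are merged in), falling back to the node description.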
+ + :param node_data: Node data dictionary + :param molecule_type: Type of molecule ("dna", "rna", or "protein") + :return: Caption dictionary or None + """ + molecule_type_lower = molecule_type.lower() + + # Check if there's already a caption field (e.g., protein_caption, dna_caption, rna_caption) + caption_key = f"{molecule_type_lower}_caption" + if caption_key in node_data and node_data[caption_key]: + if isinstance(node_data[caption_key], list) and len(node_data[caption_key]) > 0: + # Always return the first element if it's a dict, otherwise return None for consistency + caption_val = node_data[caption_key][0] + return caption_val if isinstance(caption_val, dict) else None + if isinstance(node_data[caption_key], dict): + return node_data[caption_key] + + # Field mappings for each molecule type + field_mapping = { + "protein": [ + "protein_name", "gene_names", "organism", "function", + "sequence", "id", "database", "entry_name", "uniprot_id" + ], + "dna": [ + "gene_name", "gene_description", "organism", "chromosome", + "genomic_location", "function", "gene_type", "id", + "database", "sequence" + ], + "rna": [ + "rna_type", "description", "organism", "related_genes", + "gene_name", "so_term", "id", "database", + "rnacentral_id", "sequence" + ], + } + + # Extract fields based on molecule type + caption = {} + caption_fields = field_mapping.get(molecule_type_lower, []) + for field in caption_fields: + if field in node_data and node_data[field]: + caption[field] = node_data[field] + + # Special handling for protein: check search results and existing protein field + if molecule_type_lower == "protein": + # Check for search result data (from UniProt search) + if "_search_results" in node_data: + search_results = node_data["_search_results"] + if isinstance(search_results, list) and len(search_results) > 0: + first_result = search_results[0] + if isinstance(first_result, dict): + search_caption = { + "id": first_result.get("id", ""), + "protein_name": first_result.get("protein_name", ""), + "gene_names": first_result.get("gene_names", []), + "organism": first_result.get("organism", ""), + "function": first_result.get("function", []), + "sequence": node_data.get("sequence") or first_result.get("sequence", ""), + "database": "UniProt" + } + # Remove empty fields and return if any data exists + search_caption = {k: v for k, v in search_caption.items() if v} + if search_caption: + return search_caption + + # Merge with existing protein field if present + if "protein" in node_data and node_data["protein"]: + existing_protein = node_data["protein"] + if isinstance(existing_protein, list) and len(existing_protein) > 0: + existing_protein = ( + existing_protein[0] + if isinstance(existing_protein[0], dict) + else existing_protein + ) + if isinstance(existing_protein, dict): + for key, value in existing_protein.items(): + if key not in caption and value: + caption[key] = value + # Ensure sequence from node_data takes precedence + if "sequence" in node_data and node_data["sequence"]: + caption["sequence"] = node_data["sequence"] + + # Fallback to description if no caption found + if not caption and "description" in node_data: + description = node_data["description"] + if isinstance(description, str) and len(description) > 10: + caption["description"] = description + + return caption if caption else None + + @staticmethod + def _detect_molecule_type(nodes: list[tuple[str, dict]]) -> str: + """ + Detect molecule type from nodes. 
+ Priority: Check node type, then check metadata, then check node data fields. + + :param nodes: List of (node_id, node_data) tuples + :return: Detected molecule type ("dna", "rna", "protein", or "unknown") + """ + if not nodes: + return "unknown" + + # Type indicators for each molecule type + type_indicators = { + "protein": { + "fields": ["protein_name", "uniprot_id", "entry_name", "protein_caption"], + "source_prefix": "protein-", + "description_keywords": ["protein"], + }, + "dna": { + "fields": ["gene_name", "chromosome", "genomic_location"], + "source_prefix": "dna-", + "description_keywords": ["gene", "dna", "chromosome"], + }, + "rna": { + "fields": ["rna_type", "rnacentral_id"], + "source_prefix": "rna-", + "description_keywords": ["rna", "transcript"], + }, + } + + for _, node_data in nodes: + # Priority 1: Check explicit type fields (most reliable) + for field in ["type", "molecule_type"]: + value = node_data.get(field, "").lower() + if value in ("dna", "rna", "protein"): + return value + + # Priority 2: Check source_id prefix + source_id = node_data.get("source_id", "").lower() + for mol_type, indicators in type_indicators.items(): + if source_id.startswith(indicators["source_prefix"]): + return mol_type + + # Priority 3: Check type-specific fields + for mol_type, indicators in type_indicators.items(): + if any(key in node_data for key in indicators["fields"]): + # Special check for DNA: need chromosome or genomic_location + if mol_type == "dna" and not any(key in node_data for key in ["chromosome", "genomic_location"]): + continue + return mol_type + + # Priority 4: Check description keywords + description = node_data.get("description", "").lower() + for mol_type, indicators in type_indicators.items(): + keywords = indicators["description_keywords"] + if any(kw in description for kw in keywords): + # Special check: "protein" in description but not "gene" + if mol_type == "protein" and "gene" in description: + continue + return mol_type + + return "unknown" + + async def generate( + self, + batch: tuple[ + list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]] + ], + ) -> dict[str, Any]: + """ + Generate QAs based on a given batch. + Automatically extracts and attaches molecule-specific caption information. 
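        Example (illustrative; node data values are invented):
            nodes = [
                ("TP53", {"type": "protein",
                          "description": "Tumour suppressor protein",
                          "protein_name": "Cellular tumor antigen p53"}),
            ]
            # _detect_molecule_type(nodes) returns "protein" because the explicit
            # "type" field is checked before source_id prefixes or field names;
            # _extract_caption() then builds {"protein_name": "Cellular tumor antigen p53"},
            # which is attached to every generated QA pair under the "protein" key.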
+ + :param batch + :return: QA pairs with attached molecule captions + """ + result = {} + prompt = self.build_prompt(batch) + response = await self.llm_client.generate_answer(prompt) + qa_pairs = self.parse_response(response) # generate one or more QA pairs + + nodes, _ = batch + + # Detect molecule type from nodes + molecule_type = self._detect_molecule_type(nodes) + + # Extract captions for all molecule types from nodes + captions = {"dna": None, "rna": None, "protein": None} + caption_attached = False + + for node in nodes: + _, node_data = node + + # Check for pre-extracted captions (from partition_service) + for mol_type in ["dna", "rna", "protein"]: + caption_key = f"{mol_type}_caption" + if caption_key in node_data and node_data[caption_key]: + captions[mol_type] = node_data[caption_key] + caption_attached = True + + # If no pre-extracted captions, extract from node_data using the detected molecule_type + if not caption_attached: + caption = self._extract_caption(node_data, molecule_type) + if caption: + captions[molecule_type] = caption + caption_attached = True + break # Only need to extract once per batch + + # Attach all captions to QA pairs + for qa in qa_pairs.values(): + qa["dna"] = captions["dna"] if captions["dna"] else "" + qa["rna"] = captions["rna"] if captions["rna"] else "" + qa["protein"] = captions["protein"] if captions["protein"] else "" + + if not caption_attached: + node_sample = ( + dict(list(nodes[0][1].items())[:5]) if nodes else 'No nodes' + ) + logger.warning( + "No caption extracted for molecule_type=%s. Node data sample: %s", + molecule_type, node_sample + ) + + result.update(qa_pairs) + return result + + @staticmethod + def format_generation_results( + results: list[dict], output_data_format: str + ) -> list[dict[str, Any]]: + """ + Format generation results with molecule-specific caption fields. + Supports dna, rna, and protein fields in output. 
+ """ + # Extract QA pairs and molecule captions + qa_items = [ + { + "question": v["question"], + "answer": v["answer"], + "dna": v.get("dna", ""), + "rna": v.get("rna", ""), + "protein": v.get("protein", ""), + } + for item in results + for k, v in item.items() + ] + + # Format based on output format + if output_data_format == "Alpaca": + return [ + { + "instruction": qa["question"], + "input": "", + "output": qa["answer"], + "dna": qa["dna"], + "rna": qa["rna"], + "protein": qa["protein"], + } + for qa in qa_items + ] + if output_data_format == "Sharegpt": + return [ + { + "conversations": [ + { + "from": "human", + "value": [ + { + "text": qa["question"], + "dna": qa["dna"], + "rna": qa["rna"], + "protein": qa["protein"], + } + ], + }, + {"from": "gpt", "value": qa["answer"]}, + ] + } + for qa in qa_items + ] + if output_data_format == "ChatML": + return [ + { + "messages": [ + { + "role": "user", + "content": [ + { + "text": qa["question"], + "dna": qa["dna"], + "rna": qa["rna"], + "protein": qa["protein"], + } + ], + }, + {"role": "assistant", "content": qa["answer"]}, + ] + } + for qa in qa_items + ] + else: + raise ValueError(f"Unknown output data format: {output_data_format}") diff --git a/graphgen/models/generator/quiz_generator.py b/graphgen/models/generator/quiz_generator.py new file mode 100644 index 00000000..d117092d --- /dev/null +++ b/graphgen/models/generator/quiz_generator.py @@ -0,0 +1,70 @@ +from typing import Any + +from graphgen.bases import BaseGenerator +from graphgen.templates import DESCRIPTION_REPHRASING_PROMPT +from graphgen.utils import detect_main_language, logger + + +class QuizGenerator(BaseGenerator): + """ + Quiz Generator rephrases given descriptions to create quiz questions. + """ + + @staticmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + """ + Build prompt for rephrasing the description. + :param batch: A tuple containing (nodes, edges) where nodes/edges + contain description information + :return: Prompt string + """ + # Extract description from batch + # For quiz generator, we expect a special format where + # the description is passed as the first node's description + nodes, edges = batch + if nodes: + description = nodes[0][1].get("description", "") + template_type = nodes[0][1].get("template_type", "TEMPLATE") + elif edges: + description = edges[0][2].get("description", "") + template_type = edges[0][2].get("template_type", "TEMPLATE") + else: + raise ValueError("Batch must contain at least one node or edge with description") + + return QuizGenerator.build_prompt_for_description(description, template_type) + + @staticmethod + def build_prompt_for_description(description: str, template_type: str = "TEMPLATE") -> str: + """ + Build prompt for rephrasing a single description. + :param description: The description to rephrase + :param template_type: Either "TEMPLATE" (same meaning) or "ANTI_TEMPLATE" (opposite meaning) + :return: Prompt string + """ + language = detect_main_language(description) + prompt = DESCRIPTION_REPHRASING_PROMPT[language][template_type].format( + input_sentence=description + ) + return prompt + + @staticmethod + def parse_rephrased_text(response: str) -> str: + """ + Parse the rephrased text from the response. + :param response: + :return: + """ + rephrased_text = response.strip().strip('"') + logger.debug("Rephrased Text: %s", rephrased_text) + return rephrased_text + + @staticmethod + def parse_response(response: str) -> Any: + """ + Parse the LLM response. 
For quiz generator, this returns the rephrased text. + :param response: LLM response + :return: Rephrased text + """ + return QuizGenerator.parse_rephrased_text(response) diff --git a/graphgen/models/generator/vqa_generator.py b/graphgen/models/generator/vqa_generator.py new file mode 100644 index 00000000..91b44862 --- /dev/null +++ b/graphgen/models/generator/vqa_generator.py @@ -0,0 +1,136 @@ +from typing import Any + +from graphgen.bases import BaseGenerator +from graphgen.templates import VQA_GENERATION_PROMPT +from graphgen.utils import compute_content_hash, detect_main_language, logger + + +class VQAGenerator(BaseGenerator): + @staticmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + nodes, edges = batch + entities_str = "\n".join( + [ + f"{index + 1}. {node[0]}: {node[1]['description']}" + for index, node in enumerate(nodes) + ] + ) + + relationships_str = "\n".join( + [ + f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}" + for index, edge in enumerate(edges) + ] + ) + language = detect_main_language(entities_str + relationships_str) + prompt = VQA_GENERATION_PROMPT[language].format( + entities=entities_str, relationships=relationships_str + ) + return prompt + + @staticmethod + def parse_response(response: str) -> Any: + """ + Parse the LLM response and return the generated QAs + :param response + :return: QA pairs + """ + qa_pairs = {} + qa_list = response.strip().split("\n\n") + for qa in qa_list: + if "Question:" in qa and "Answer:" in qa: + question = qa.split("Question:")[1].split("Answer:")[0].strip() + answer = qa.split("Answer:")[1].strip() + elif "问题:" in qa and "答案:" in qa: + question = qa.split("问题:")[1].split("答案:")[0].strip() + answer = qa.split("答案:")[1].strip() + else: + logger.error("Failed to parse QA pair: %s", qa) + continue + question = question.strip('"') + answer = answer.strip('"') + logger.debug("Question: %s", question) + logger.debug("Answer: %s", answer) + qa_pairs[compute_content_hash(question)] = { + "question": question, + "answer": answer, + } + return qa_pairs + + async def generate( + self, + batch: tuple[ + list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]] + ], + ) -> dict[str, Any]: + """ + Generate QAs based on a given batch. 
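        Example of a parsable LLM response (content invented for illustration):
            Question: What landmark is shown in the image?
            Answer: The Eiffel Tower.

            Question: In which city is it located?
            Answer: Paris.
        parse_response() converts each blank-line-separated pair into
        {"question": ..., "answer": ...} keyed by a hash of the question, and
        generate() attaches the source node's img_path whenever image_data is present.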
+ :param batch + :return: QA pairs + """ + result = {} + prompt = self.build_prompt(batch) + response = await self.llm_client.generate_answer(prompt) + qa_pairs = self.parse_response(response) # generate one or more QA pairs + nodes, _ = batch + for node in nodes: + node_data = node[1] + if "image_data" in node_data and node_data["image_data"]: + img_path = node_data["image_data"]["img_path"] + for qa in qa_pairs.values(): + qa["img_path"] = img_path + result.update(qa_pairs) + return result + + @staticmethod + def format_generation_results( + results: list[dict], output_data_format: str + ) -> list[dict[str, Any]]: + if output_data_format == "Alpaca": + results = [ + { + "instruction": v["question"], + "input": "", + "output": v["answer"], + "image": v.get("img_path", ""), + } + for item in results + for k, v in item.items() + ] + elif output_data_format == "Sharegpt": + results = [ + { + "conversations": [ + { + "from": "human", + "value": [ + {"text": v["question"], "image": v.get("img_path", "")} + ], + }, + {"from": "gpt", "value": v["answer"]}, + ] + } + for item in results + for k, v in item.items() + ] + elif output_data_format == "ChatML": + results = [ + { + "messages": [ + { + "role": "user", + "content": [ + {"text": v["question"], "image": v.get("img_path", "")} + ], + }, + {"role": "assistant", "content": v["answer"]}, + ] + } + for item in results + for k, v in item.items() + ] + else: + raise ValueError(f"Unknown output data format: {output_data_format}") + return results diff --git a/graphgen/models/kg_builder/__init__.py b/graphgen/models/kg_builder/__init__.py new file mode 100644 index 00000000..aa0339c6 --- /dev/null +++ b/graphgen/models/kg_builder/__init__.py @@ -0,0 +1,3 @@ +from .light_rag_kg_builder import LightRAGKGBuilder +from .mm_kg_builder import MMKGBuilder +from .omics_kg_builder import OmicsKGBuilder diff --git a/graphgen/models/kg_builder/light_rag_kg_builder.py b/graphgen/models/kg_builder/light_rag_kg_builder.py new file mode 100644 index 00000000..a6185f44 --- /dev/null +++ b/graphgen/models/kg_builder/light_rag_kg_builder.py @@ -0,0 +1,218 @@ +import re +from collections import Counter, defaultdict +from typing import Dict, List, Tuple + +from graphgen.bases import BaseGraphStorage, BaseKGBuilder, BaseLLMWrapper, Chunk +from graphgen.templates import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT +from graphgen.utils import ( + detect_main_language, + handle_single_entity_extraction, + handle_single_relationship_extraction, + logger, + pack_history_conversations, + split_string_by_multi_markers, +) + + +class LightRAGKGBuilder(BaseKGBuilder): + def __init__(self, llm_client: BaseLLMWrapper, max_loop: int = 3): + super().__init__(llm_client) + self.max_loop = max_loop + + async def extract( + self, chunk: Chunk + ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]: + """ + Extract entities and relationships from a single chunk using the LLM client. 
+ :param chunk + :return: (nodes_data, edges_data) + """ + chunk_id = chunk.id + content = chunk.content + + # step 1: language_detection + language = detect_main_language(content) + + hint_prompt = KG_EXTRACTION_PROMPT[language]["TEMPLATE"].format( + **KG_EXTRACTION_PROMPT["FORMAT"], input_text=content + ) + + # step 2: initial glean + final_result = await self.llm_client.generate_answer(hint_prompt) + logger.debug("First extraction result: %s", final_result) + + # step3: iterative refinement + history = pack_history_conversations(hint_prompt, final_result) + for loop_idx in range(self.max_loop): + if_loop_result = await self.llm_client.generate_answer( + text=KG_EXTRACTION_PROMPT[language]["IF_LOOP"], history=history + ) + if_loop_result = if_loop_result.strip().strip('"').strip("'").lower() + if if_loop_result != "yes": + break + + glean_result = await self.llm_client.generate_answer( + text=KG_EXTRACTION_PROMPT[language]["CONTINUE"], history=history + ) + logger.debug("Loop %s glean: %s", loop_idx + 1, glean_result) + + history += pack_history_conversations( + KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result + ) + final_result += glean_result + + # step 4: parse the final result + records = split_string_by_multi_markers( + final_result, + [ + KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"], + KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"], + ], + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + + for record in records: + match = re.search(r"\((.*)\)", record) + if not match: + continue + inner = match.group(1) + + attributes = split_string_by_multi_markers( + inner, [KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]] + ) + + entity = await handle_single_entity_extraction(attributes, chunk_id) + if entity is not None: + nodes[entity["entity_name"]].append(entity) + continue + + relation = await handle_single_relationship_extraction(attributes, chunk_id) + if relation is not None: + key = (relation["src_id"], relation["tgt_id"]) + edges[key].append(relation) + + return dict(nodes), dict(edges) + + async def merge_nodes( + self, + node_data: tuple[str, List[dict]], + kg_instance: BaseGraphStorage, + ) -> None: + entity_name, node_data = node_data + entity_types = [] + source_ids = [] + descriptions = [] + + node = kg_instance.get_node(entity_name) + if node is not None: + entity_types.append(node["entity_type"]) + source_ids.extend( + split_string_by_multi_markers(node["source_id"], [""]) + ) + descriptions.append(node["description"]) + + # take the most frequent entity_type + entity_type = sorted( + Counter([dp["entity_type"] for dp in node_data] + entity_types).items(), + key=lambda x: x[1], + reverse=True, + )[0][0] + + description = "".join( + sorted(set([dp["description"] for dp in node_data] + descriptions)) + ) + description = await self._handle_kg_summary(entity_name, description) + + source_id = "".join( + set([dp["source_id"] for dp in node_data] + source_ids) + ) + + node_data = { + "entity_type": entity_type, + "description": description, + "source_id": source_id, + } + kg_instance.upsert_node(entity_name, node_data=node_data) + + async def merge_edges( + self, + edges_data: tuple[Tuple[str, str], List[dict]], + kg_instance: BaseGraphStorage, + ) -> None: + (src_id, tgt_id), edge_data = edges_data + + source_ids = [] + descriptions = [] + + edge = kg_instance.get_edge(src_id, tgt_id) + if edge is not None: + source_ids.extend( + split_string_by_multi_markers(edge["source_id"], [""]) + ) + descriptions.append(edge["description"]) + + description 
= "".join( + sorted(set([dp["description"] for dp in edge_data] + descriptions)) + ) + source_id = "".join( + set([dp["source_id"] for dp in edge_data] + source_ids) + ) + + for insert_id in [src_id, tgt_id]: + if not kg_instance.has_node(insert_id): + kg_instance.upsert_node( + insert_id, + node_data={ + "source_id": source_id, + "description": description, + "entity_type": "UNKNOWN", + }, + ) + + description = await self._handle_kg_summary( + f"({src_id}, {tgt_id})", description + ) + + kg_instance.upsert_edge( + src_id, + tgt_id, + edge_data={"source_id": source_id, "description": description}, + ) + + async def _handle_kg_summary( + self, + entity_or_relation_name: str, + description: str, + max_summary_tokens: int = 200, + ) -> str: + """ + Handle knowledge graph summary + + :param entity_or_relation_name + :param description + :param max_summary_tokens + :return summary + """ + + tokenizer_instance = self.llm_client.tokenizer + language = detect_main_language(description) + + tokens = tokenizer_instance.encode(description) + if len(tokens) < max_summary_tokens: + return description + + use_description = tokenizer_instance.decode(tokens[:max_summary_tokens]) + prompt = KG_SUMMARIZATION_PROMPT[language]["TEMPLATE"].format( + entity_name=entity_or_relation_name, + description_list=use_description.split(""), + **KG_SUMMARIZATION_PROMPT["FORMAT"], + ) + new_description = await self.llm_client.generate_answer(prompt) + logger.info( + "Entity or relation %s summary: %s", + entity_or_relation_name, + new_description, + ) + return new_description diff --git a/graphgen/models/kg_builder/mm_kg_builder.py b/graphgen/models/kg_builder/mm_kg_builder.py new file mode 100644 index 00000000..f352cb2a --- /dev/null +++ b/graphgen/models/kg_builder/mm_kg_builder.py @@ -0,0 +1,91 @@ +import re +from collections import defaultdict +from typing import Dict, List, Tuple + +from graphgen.bases import Chunk +from graphgen.templates import MMKG_EXTRACTION_PROMPT +from graphgen.utils import ( + detect_main_language, + handle_single_entity_extraction, + handle_single_relationship_extraction, + logger, + split_string_by_multi_markers, +) + +from .light_rag_kg_builder import LightRAGKGBuilder + + +class MMKGBuilder(LightRAGKGBuilder): + async def extract( + self, chunk: Chunk + ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]: + """ + Extract entities and relationships from a single multi-modal chunk using the LLM client. + Expect to get a mini graph which contains a central multi-modal entity + and its related text entities and relationships. + Like: + (image: "image_of_eiffel_tower") --[located_in]--> (text: "Paris") + (image: "image_of_eiffel_tower") --[built_in]--> (text: "1889") + (text: "Eiffel Tower") --[height]--> (text: "324 meters") + :param chunk + """ + chunk_id = chunk.id + chunk_type = chunk.type # image | table | formula | ... 
+ metadata = chunk.metadata + + # choose different extraction strategies based on chunk type + if chunk_type == "image": + image_caption = "\n".join(metadata.get("image_caption", "")) + language = detect_main_language(image_caption) + prompt_template = MMKG_EXTRACTION_PROMPT[language].format( + **MMKG_EXTRACTION_PROMPT["FORMAT"], + chunk_type=chunk_type, + chunk_id=chunk_id, + chunk_text=image_caption, + ) + result = await self.llm_client.generate_answer(prompt_template) + logger.debug("Image chunk extraction result: %s", result) + + # parse the result + records = split_string_by_multi_markers( + result, + [ + MMKG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"], + MMKG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"], + ], + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + + for record in records: + match = re.search(r"\((.*)\)", record) + if not match: + continue + inner = match.group(1) + + attributes = split_string_by_multi_markers( + inner, [MMKG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]] + ) + + entity = await handle_single_entity_extraction(attributes, chunk_id) + if entity is not None: + nodes[entity["entity_name"]].append(entity) + continue + + relation = await handle_single_relationship_extraction( + attributes, chunk_id + ) + if relation is not None: + key = (relation["src_id"], relation["tgt_id"]) + edges[key].append(relation) + + return dict(nodes), dict(edges) + + if chunk_type == "table": + pass # TODO: implement table-based entity and relationship extraction + if chunk_type == "formula": + pass # TODO: implement formula-based entity and relationship extraction + + logger.error("Unsupported chunk type for MMKGBuilder: %s", chunk_type) + return defaultdict(list), defaultdict(list) diff --git a/graphgen/models/kg_builder/omics_kg_builder.py b/graphgen/models/kg_builder/omics_kg_builder.py new file mode 100644 index 00000000..6317a2b6 --- /dev/null +++ b/graphgen/models/kg_builder/omics_kg_builder.py @@ -0,0 +1,280 @@ +import re +from collections import Counter, defaultdict +from typing import Dict, List, Tuple + +from graphgen.bases import BaseGraphStorage, BaseKGBuilder, BaseLLMWrapper, Chunk +from graphgen.templates import KG_SUMMARIZATION_PROMPT, OMICS_KG_EXTRACTION_PROMPT +from graphgen.utils import ( + detect_main_language, + handle_single_entity_extraction, + handle_single_relationship_extraction, + logger, + pack_history_conversations, + split_string_by_multi_markers, +) + + +class OmicsKGBuilder(BaseKGBuilder): + """ + Knowledge graph builder for multi-omics data (DNA, RNA, protein). + Extracts entities and relationships from sequence chunks and their metadata. + """ + + def __init__(self, llm_client: BaseLLMWrapper, max_loop: int = 3): + super().__init__(llm_client) + self.max_loop = max_loop + + async def extract( + self, chunk: Chunk + ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]: + """ + Extract entities and relationships from a sequence chunk using the LLM client. 
+ + :param chunk: Sequence chunk with metadata + :return: (nodes_data, edges_data) + """ + chunk_id = chunk.id + content = chunk.content + metadata = chunk.metadata + + # Extract sequence and metadata information + sequence_chunk = content or metadata.get("sequence", "") + # molecule_type is used in _format_metadata indirectly via metadata dict + _ = metadata.get("molecule_type", "").lower() + + # Build metadata text for prompt + metadata_text = self._format_metadata(metadata) + + # Detect language from metadata text (defaults to English if no Chinese detected) + language = detect_main_language(metadata_text) + + # Build prompt with sequence and metadata + hint_prompt = OMICS_KG_EXTRACTION_PROMPT[language]["TEMPLATE"].format( + **OMICS_KG_EXTRACTION_PROMPT["FORMAT"], + metadata_text=metadata_text, + sequence_chunk=sequence_chunk[:500] if sequence_chunk else "", # Limit sequence length in prompt + ) + + # step 2: initial glean + final_result = await self.llm_client.generate_answer(hint_prompt) + logger.debug("First extraction result: %s", final_result) + + # step 3: iterative refinement + history = pack_history_conversations(hint_prompt, final_result) + for loop_idx in range(self.max_loop): + if_loop_result = await self.llm_client.generate_answer( + text=OMICS_KG_EXTRACTION_PROMPT[language]["IF_LOOP"], history=history + ) + if_loop_result = if_loop_result.strip().strip('"').strip("'").lower() + if if_loop_result != "yes": + break + + glean_result = await self.llm_client.generate_answer( + text=OMICS_KG_EXTRACTION_PROMPT[language]["CONTINUE"], history=history + ) + logger.debug("Loop %s glean: %s", loop_idx + 1, glean_result) + + history += pack_history_conversations( + OMICS_KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result + ) + final_result += glean_result + + # step 4: parse the final result + records = split_string_by_multi_markers( + final_result, + [ + OMICS_KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"], + OMICS_KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"], + ], + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + + for record in records: + match = re.search(r"\((.*)\)", record) + if not match: + continue + inner = match.group(1) + + attributes = split_string_by_multi_markers( + inner, [OMICS_KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]] + ) + + entity = await handle_single_entity_extraction(attributes, chunk_id) + if entity is not None: + nodes[entity["entity_name"]].append(entity) + continue + + relation = await handle_single_relationship_extraction(attributes, chunk_id) + if relation is not None: + key = (relation["src_id"], relation["tgt_id"]) + edges[key].append(relation) + + return dict(nodes), dict(edges) + + @staticmethod + def _format_metadata(metadata: dict) -> str: + """ + Format metadata dictionary into a readable text string for the prompt. + + :param metadata: Metadata dictionary from chunk + :return: Formatted metadata text + """ + # Filter out internal fields and format + exclude_fields = { + "_doc_id", + "_chunk_id", + "chunk_index", + "total_chunks", + "length", + "type", + "content", + "sequence", + } + + metadata_items = [] + for key, value in metadata.items(): + if key in exclude_fields: + continue + if value is None or value == "": + continue + if isinstance(value, list): + value = ", ".join(str(v) for v in value) + metadata_items.append(f"{key}: {value}") + + return "\n".join(metadata_items) if metadata_items else "No additional metadata available." 
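As a quick illustration of the filtering above, here is a minimal sketch; the metadata values are invented, and the private helper is called directly only for demonstration:

```python
from graphgen.models.kg_builder import OmicsKGBuilder

# Hypothetical metadata for a protein chunk (values invented for illustration).
metadata = {
    "_doc_id": "doc-001",            # internal field, filtered out
    "molecule_type": "protein",
    "organism": "Homo sapiens",
    "gene_names": ["TP53", "P53"],   # list values are joined with ", "
    "sequence": "MEEPQSDPSV",        # dropped here; the sequence is passed to the prompt separately
}

print(OmicsKGBuilder._format_metadata(metadata))
# molecule_type: protein
# organism: Homo sapiens
# gene_names: TP53, P53
```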
+ + async def merge_nodes( + self, + node_data: tuple[str, List[dict]], + kg_instance: BaseGraphStorage, + ) -> None: + """Merge extracted nodes into the knowledge graph.""" + entity_name, node_data = node_data + entity_types = [] + source_ids = [] + descriptions = [] + + node = kg_instance.get_node(entity_name) + if node is not None: + entity_types.append(node["entity_type"]) + source_ids.extend( + split_string_by_multi_markers(node["source_id"], [""]) + ) + descriptions.append(node["description"]) + + # Take the most frequent entity_type + entity_type = sorted( + Counter([dp["entity_type"] for dp in node_data] + entity_types).items(), + key=lambda x: x[1], + reverse=True, + )[0][0] + + description = "".join( + sorted(set([dp["description"] for dp in node_data] + descriptions)) + ) + description = await self._handle_kg_summary(entity_name, description) + + source_id = "".join( + set([dp["source_id"] for dp in node_data] + source_ids) + ) + + node_data_dict = { + "entity_type": entity_type, + "description": description, + "source_id": source_id, + } + + # Preserve sequence from existing node if present (e.g., added by partition_service) + if node is not None and "sequence" in node and node["sequence"]: + node_data_dict["sequence"] = node["sequence"] + + kg_instance.upsert_node(entity_name, node_data=node_data_dict) + + async def merge_edges( + self, + edges_data: tuple[Tuple[str, str], List[dict]], + kg_instance: BaseGraphStorage, + ) -> None: + """Merge extracted edges into the knowledge graph.""" + (src_id, tgt_id), edge_data = edges_data + + # Skip self-loops (edges where source and target are the same) + # This can happen when LLM extracts invalid relationships + if src_id == tgt_id: + logger.debug("Skipping self-loop edge: (%s, %s)", src_id, tgt_id) + return + + source_ids = [] + descriptions = [] + + edge = kg_instance.get_edge(src_id, tgt_id) + if edge is not None: + source_ids.extend( + split_string_by_multi_markers(edge["source_id"], [""]) + ) + descriptions.append(edge["description"]) + + description = "".join( + sorted(set([dp["description"] for dp in edge_data] + descriptions)) + ) + source_id = "".join( + set([dp["source_id"] for dp in edge_data] + source_ids) + ) + + for insert_id in [src_id, tgt_id]: + if not kg_instance.has_node(insert_id): + kg_instance.upsert_node( + insert_id, + node_data={ + "source_id": source_id, + "description": description, + "entity_type": "UNKNOWN", + }, + ) + + description = await self._handle_kg_summary( + f"({src_id}, {tgt_id})", description + ) + + kg_instance.upsert_edge( + src_id, + tgt_id, + edge_data={"source_id": source_id, "description": description}, + ) + + async def _handle_kg_summary( + self, + entity_or_relation_name: str, + description: str, + max_summary_tokens: int = 200, + ) -> str: + """ + Handle knowledge graph summary for omics entities/relations. 
+ + :param entity_or_relation_name + :param description + :param max_summary_tokens + :return summary + """ + tokenizer_instance = self.llm_client.tokenizer + language = detect_main_language(description) + + tokens = tokenizer_instance.encode(description) + if len(tokens) < max_summary_tokens: + return description + + use_description = tokenizer_instance.decode(tokens[:max_summary_tokens]) + prompt = KG_SUMMARIZATION_PROMPT[language]["TEMPLATE"].format( + entity_name=entity_or_relation_name, + description_list=use_description.split(""), + **KG_SUMMARIZATION_PROMPT["FORMAT"], + ) + new_description = await self.llm_client.generate_answer(prompt) + logger.info( + "Entity or relation %s summary: %s", + entity_or_relation_name, + new_description, + ) + return new_description diff --git a/graphgen/models/llm/__init__.py b/graphgen/models/llm/__init__.py index e69de29b..c70395d5 100644 --- a/graphgen/models/llm/__init__.py +++ b/graphgen/models/llm/__init__.py @@ -0,0 +1,4 @@ +from .api.http_client import HTTPClient +from .api.ollama_client import OllamaClient +from .api.openai_client import OpenAIClient +from .local.hf_wrapper import HuggingFaceWrapper diff --git a/graphgen/models/evaluate/__init__.py b/graphgen/models/llm/api/__init__.py similarity index 100% rename from graphgen/models/evaluate/__init__.py rename to graphgen/models/llm/api/__init__.py diff --git a/graphgen/models/llm/api/http_client.py b/graphgen/models/llm/api/http_client.py new file mode 100644 index 00000000..2c3b0acd --- /dev/null +++ b/graphgen/models/llm/api/http_client.py @@ -0,0 +1,197 @@ +import asyncio +import math +from typing import Any, Dict, List, Optional + +import aiohttp +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from graphgen.bases.base_llm_wrapper import BaseLLMWrapper +from graphgen.bases.datatypes import Token +from graphgen.models.llm.limitter import RPM, TPM + + +class HTTPClient(BaseLLMWrapper): + """ + A generic async HTTP client for LLMs compatible with OpenAI's chat/completions format. + It uses aiohttp for making requests and includes retry logic and token usage tracking. 
+ Usage example: + client = HTTPClient( + model_name="gpt-4o-mini", + base_url="http://localhost:8080", + api_key="your_api_key", + json_mode=True, + seed=42, + topk_per_token=5, + request_limit=True, + ) + + answer = await client.generate_answer("Hello, world!") + tokens = await client.generate_topk_per_token("Hello, world!") + """ + + _instance: Optional["HTTPClient"] = None + _lock = asyncio.Lock() + + def __new__(cls, **kwargs): + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __init__( + self, + *, + model: str, + base_url: str, + api_key: Optional[str] = None, + json_mode: bool = False, + seed: Optional[int] = None, + topk_per_token: int = 5, + request_limit: bool = False, + rpm: Optional[RPM] = None, + tpm: Optional[TPM] = None, + **kwargs: Any, + ): + # Initialize only once in the singleton pattern + if getattr(self, "_initialized", False): + return + self._initialized: bool = True + super().__init__(**kwargs) + self.model_name = model + self.base_url = base_url.rstrip("/") + self.api_key = api_key + self.json_mode = json_mode + self.seed = seed + self.topk_per_token = topk_per_token + self.request_limit = request_limit + self.rpm = rpm or RPM() + self.tpm = tpm or TPM() + + self.token_usage: List[Dict[str, int]] = [] + self._session: Optional[aiohttp.ClientSession] = None + + @property + def session(self) -> aiohttp.ClientSession: + if self._session is None or self._session.closed: + headers = ( + {"Authorization": f"Bearer {self.api_key}"} if self.api_key else {} + ) + self._session = aiohttp.ClientSession(headers=headers) + return self._session + + async def close(self): + if self._session and not self._session.closed: + await self._session.close() + + def _build_body(self, text: str, history: List[str]) -> Dict[str, Any]: + messages = [] + if self.system_prompt: + messages.append({"role": "system", "content": self.system_prompt}) + + # chatml format: alternating user and assistant messages + if history and isinstance(history[0], dict): + messages.extend(history) + + messages.append({"role": "user", "content": text}) + + body = { + "model": self.model_name, + "messages": messages, + "temperature": self.temperature, + "top_p": self.top_p, + "max_tokens": self.max_tokens, + } + if self.seed: + body["seed"] = self.seed + if self.json_mode: + body["response_format"] = {"type": "json_object"} + return body + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError)), + ) + async def generate_answer( + self, + text: str, + history: Optional[List[str]] = None, + **extra: Any, + ) -> str: + body = self._build_body(text, history or []) + prompt_tokens = sum( + len(self.tokenizer.encode(m["content"])) for m in body["messages"] + ) + est = prompt_tokens + body["max_tokens"] + + if self.request_limit: + await self.rpm.wait(silent=True) + await self.tpm.wait(est, silent=True) + + async with self.session.post( + f"{self.base_url}/chat/completions", + json=body, + timeout=aiohttp.ClientTimeout(total=60), + ) as resp: + resp.raise_for_status() + data = await resp.json() + + msg = data["choices"][0]["message"]["content"] + if "usage" in data: + self.token_usage.append( + { + "prompt_tokens": data["usage"]["prompt_tokens"], + "completion_tokens": data["usage"]["completion_tokens"], + "total_tokens": data["usage"]["total_tokens"], + } + ) + return self.filter_think_tags(msg) + + @retry( + stop=stop_after_attempt(5), + 
wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError)), + ) + async def generate_topk_per_token( + self, + text: str, + history: Optional[List[str]] = None, + **extra: Any, + ) -> List[Token]: + body = self._build_body(text, history or []) + body["max_tokens"] = 1 + if self.topk_per_token > 0: + body["logprobs"] = True + body["top_logprobs"] = self.topk_per_token + + async with self.session.post( + f"{self.base_url}/chat/completions", + json=body, + timeout=aiohttp.ClientTimeout(total=60), + ) as resp: + resp.raise_for_status() + data = await resp.json() + + token_logprobs = data["choices"][0]["logprobs"]["content"] + tokens = [] + for item in token_logprobs: + candidates = [ + Token(t["token"], math.exp(t["logprob"])) for t in item["top_logprobs"] + ] + tokens.append( + Token( + item["token"], math.exp(item["logprob"]), top_candidates=candidates + ) + ) + return tokens + + async def generate_inputs_prob( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + raise NotImplementedError( + "generate_inputs_prob is not implemented in HTTPClient" + ) diff --git a/graphgen/models/llm/api/ollama_client.py b/graphgen/models/llm/api/ollama_client.py new file mode 100644 index 00000000..9a4946a6 --- /dev/null +++ b/graphgen/models/llm/api/ollama_client.py @@ -0,0 +1,105 @@ +from typing import Any, Dict, List, Optional + +from graphgen.bases.base_llm_wrapper import BaseLLMWrapper +from graphgen.bases.datatypes import Token +from graphgen.models.llm.limitter import RPM, TPM + + +class OllamaClient(BaseLLMWrapper): + """ + Requires a local or remote Ollama server to be running (default port 11434). + The top_logprobs field is not yet implemented by the official API. + """ + + def __init__( + self, + *, + model: str = "gemma3", + base_url: str = "http://localhost:11434", + json_mode: bool = False, + seed: Optional[int] = None, + topk_per_token: int = 5, + request_limit: bool = False, + rpm: Optional[RPM] = None, + tpm: Optional[TPM] = None, + **kwargs: Any, + ): + try: + import ollama + except ImportError as e: + raise ImportError( + "Ollama SDK is not installed." + "It is required to use OllamaClient." + "Please install it with `pip install ollama`." 
+ ) from e + super().__init__(**kwargs) + self.model_name = model + self.base_url = base_url + self.json_mode = json_mode + self.seed = seed + self.topk_per_token = topk_per_token + self.request_limit = request_limit + self.rpm = rpm or RPM() + self.tpm = tpm or TPM() + self.token_usage: List[Dict[str, int]] = [] + + self.client = ollama.AsyncClient(host=self.base_url) + + async def generate_answer( + self, + text: str, + history: Optional[List[Dict[str, str]]] = None, + **extra: Any, + ) -> str: + messages = [] + if self.system_prompt: + messages.append({"role": "system", "content": self.system_prompt}) + if history: + messages.extend(history) + messages.append({"role": "user", "content": text}) + + options = { + "temperature": self.temperature, + "top_p": self.top_p, + "num_predict": self.max_tokens, + } + if self.seed is not None: + options["seed"] = self.seed + + prompt_tokens = sum(len(self.tokenizer.encode(m["content"])) for m in messages) + est = prompt_tokens + self.max_tokens + if self.request_limit: + await self.rpm.wait(silent=True) + await self.tpm.wait(est, silent=True) + + response = await self.client.chat( + model=self.model_name, + messages=messages, + format="json" if self.json_mode else "", + options=options, + stream=False, + ) + + usage = response.get("prompt_eval_count", 0), response.get("eval_count", 0) + self.token_usage.append( + { + "prompt_tokens": usage[0], + "completion_tokens": usage[1], + "total_tokens": sum(usage), + } + ) + content = response["message"]["content"] + return self.filter_think_tags(content) + + async def generate_topk_per_token( + self, + text: str, + history: Optional[List[Dict[str, str]]] = None, + **extra: Any, + ) -> List[Token]: + raise NotImplementedError("Ollama API does not support per-token top-k yet.") + + async def generate_inputs_prob( + self, text: str, history: Optional[List[Dict[str, str]]] = None, **extra: Any + ) -> List[Token]: + raise NotImplementedError("Ollama API does not support per-token logprobs yet.") diff --git a/graphgen/models/llm/api/openai_client.py b/graphgen/models/llm/api/openai_client.py new file mode 100644 index 00000000..532b981c --- /dev/null +++ b/graphgen/models/llm/api/openai_client.py @@ -0,0 +1,182 @@ +import math +from typing import Any, Dict, List, Optional + +import openai +from openai import APIConnectionError, APITimeoutError, AsyncOpenAI, AsyncAzureOpenAI, RateLimitError +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from graphgen.bases.base_llm_wrapper import BaseLLMWrapper +from graphgen.bases.datatypes import Token +from graphgen.models.llm.limitter import RPM, TPM + + +def get_top_response_tokens(response: openai.ChatCompletion) -> List[Token]: + token_logprobs = response.choices[0].logprobs.content + tokens = [] + for token_prob in token_logprobs: + prob = math.exp(token_prob.logprob) + candidate_tokens = [ + Token(t.token, math.exp(t.logprob)) for t in token_prob.top_logprobs + ] + token = Token(token_prob.token, prob, top_candidates=candidate_tokens) + tokens.append(token) + return tokens + + +class OpenAIClient(BaseLLMWrapper): + def __init__( + self, + *, + model: str = "gpt-4o-mini", + api_key: Optional[str] = None, + base_url: Optional[str] = None, + api_version: Optional[str] = None, + json_mode: bool = False, + seed: Optional[int] = None, + topk_per_token: int = 5, # number of topk tokens to generate for each token + request_limit: bool = False, + rpm: Optional[RPM] = None, + tpm: Optional[TPM] = None, + backend: str = 
"openai_api", + **kwargs: Any, + ): + super().__init__(**kwargs) + self.model = model + self.api_key = api_key + self.api_version = api_version # required for Azure OpenAI + self.base_url = base_url + self.json_mode = json_mode + self.seed = seed + self.topk_per_token = topk_per_token + + self.token_usage: list = [] + self.request_limit = request_limit + self.rpm = rpm or RPM() + self.tpm = tpm or TPM() + + assert ( + backend in ("openai_api", "azure_openai_api") + ), f"Unsupported backend '{backend}'. Use 'openai_api' or 'azure_openai_api'." + self.backend = backend + + self.__post_init__() + + def __post_init__(self): + + api_name = self.backend.replace("_", " ") + assert self.api_key is not None, f"Please provide api key to access {api_name}." + if self.backend == "openai_api": + self.client = AsyncOpenAI( + api_key=self.api_key or "dummy", base_url=self.base_url + ) + elif self.backend == "azure_openai_api": + assert self.api_version is not None, f"Please provide api_version for {api_name}." + assert self.base_url is not None, f"Please provide base_url for {api_name}." + self.client = AsyncAzureOpenAI( + api_key=self.api_key, + azure_endpoint=self.base_url, + api_version=self.api_version, + azure_deployment=self.model, + ) + else: + raise ValueError(f"Unsupported backend {self.backend}. Use 'openai_api' or 'azure_openai_api'.") + + def _pre_generate(self, text: str, history: List[str]) -> Dict: + kwargs = { + "temperature": self.temperature, + "top_p": self.top_p, + "max_tokens": self.max_tokens, + } + if self.seed: + kwargs["seed"] = self.seed + if self.json_mode: + kwargs["response_format"] = {"type": "json_object"} + + messages = [] + if self.system_prompt: + messages.append({"role": "system", "content": self.system_prompt}) + messages.append({"role": "user", "content": text}) + + if history: + assert len(history) % 2 == 0, "History should have even number of elements." 
+ messages = history + messages + + kwargs["messages"] = messages + return kwargs + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type( + (RateLimitError, APIConnectionError, APITimeoutError) + ), + ) + async def generate_topk_per_token( + self, + text: str, + history: Optional[List[str]] = None, + **extra: Any, + ) -> List[Token]: + kwargs = self._pre_generate(text, history) + if self.topk_per_token > 0: + kwargs["logprobs"] = True + kwargs["top_logprobs"] = self.topk_per_token + + # Limit max_tokens to 1 to avoid long completions + kwargs["max_tokens"] = 1 + + completion = await self.client.chat.completions.create( # pylint: disable=E1125 + model=self.model, **kwargs + ) + + tokens = get_top_response_tokens(completion) + + return tokens + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type( + (RateLimitError, APIConnectionError, APITimeoutError) + ), + ) + async def generate_answer( + self, + text: str, + history: Optional[List[str]] = None, + **extra: Any, + ) -> str: + kwargs = self._pre_generate(text, history) + + prompt_tokens = 0 + for message in kwargs["messages"]: + prompt_tokens += len(self.tokenizer.encode(message["content"])) + estimated_tokens = prompt_tokens + kwargs["max_tokens"] + + if self.request_limit: + await self.rpm.wait(silent=True) + await self.tpm.wait(estimated_tokens, silent=True) + + completion = await self.client.chat.completions.create( # pylint: disable=E1125 + model=self.model, **kwargs + ) + if hasattr(completion, "usage"): + self.token_usage.append( + { + "prompt_tokens": completion.usage.prompt_tokens, + "completion_tokens": completion.usage.completion_tokens, + "total_tokens": completion.usage.total_tokens, + } + ) + return self.filter_think_tags(completion.choices[0].message.content) + + async def generate_inputs_prob( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + """Generate probabilities for each token in the input.""" + raise NotImplementedError diff --git a/graphgen/models/llm/limitter.py b/graphgen/models/llm/limitter.py index 01cb1f70..5aee4501 100644 --- a/graphgen/models/llm/limitter.py +++ b/graphgen/models/llm/limitter.py @@ -1,17 +1,17 @@ +import asyncio import time from datetime import datetime, timedelta -import asyncio from graphgen.utils import logger class RPM: - def __init__(self, rpm: int = 1000): self.rpm = rpm - self.record = {'rpm_slot': self.get_minute_slot(), 'counter': 0} + self.record = {"rpm_slot": self.get_minute_slot(), "counter": 0} - def get_minute_slot(self): + @staticmethod + def get_minute_slot(): current_time = time.time() dt_object = datetime.fromtimestamp(current_time) total_minutes_since_midnight = dt_object.hour * 60 + dt_object.minute @@ -22,37 +22,35 @@ async def wait(self, silent=False): dt_object = datetime.fromtimestamp(current) minute_slot = self.get_minute_slot() - if self.record['rpm_slot'] == minute_slot: + if self.record["rpm_slot"] == minute_slot: # check RPM exceed - if self.record['counter'] >= self.rpm: + if self.record["counter"] >= self.rpm: # wait until next minute - next_minute = dt_object.replace( - second=0, microsecond=0) + timedelta(minutes=1) + next_minute = dt_object.replace(second=0, microsecond=0) + timedelta( + minutes=1 + ) _next = next_minute.timestamp() sleep_time = abs(_next - current) if not silent: - logger.info('RPM sleep %s', sleep_time) + logger.info("RPM sleep %s", sleep_time) await 
asyncio.sleep(sleep_time) - self.record = { - 'rpm_slot': self.get_minute_slot(), - 'counter': 0 - } + self.record = {"rpm_slot": self.get_minute_slot(), "counter": 0} else: - self.record = {'rpm_slot': self.get_minute_slot(), 'counter': 0} - self.record['counter'] += 1 + self.record = {"rpm_slot": self.get_minute_slot(), "counter": 0} + self.record["counter"] += 1 if not silent: logger.debug(self.record) class TPM: - def __init__(self, tpm: int = 20000): self.tpm = tpm - self.record = {'tpm_slot': self.get_minute_slot(), 'counter': 0} + self.record = {"tpm_slot": self.get_minute_slot(), "counter": 0} - def get_minute_slot(self): + @staticmethod + def get_minute_slot(): current_time = time.time() dt_object = datetime.fromtimestamp(current_time) total_minutes_since_midnight = dt_object.hour * 60 + dt_object.minute @@ -64,25 +62,25 @@ async def wait(self, token_count, silent=False): minute_slot = self.get_minute_slot() # get next slot, skip - if self.record['tpm_slot'] != minute_slot: - self.record = {'tpm_slot': minute_slot, 'counter': token_count} + if self.record["tpm_slot"] != minute_slot: + self.record = {"tpm_slot": minute_slot, "counter": token_count} return # check RPM exceed - self.record['counter'] += token_count - if self.record['counter'] > self.tpm: + old_counter = self.record["counter"] + self.record["counter"] += token_count + if self.record["counter"] > self.tpm: + logger.info("Current TPM: %s, limit: %s", old_counter, self.tpm) # wait until next minute - next_minute = dt_object.replace( - second=0, microsecond=0) + timedelta(minutes=1) + next_minute = dt_object.replace(second=0, microsecond=0) + timedelta( + minutes=1 + ) _next = next_minute.timestamp() sleep_time = abs(_next - current) - logger.info('TPM sleep %s', sleep_time) + logger.warning("TPM limit exceeded, wait %s seconds", sleep_time) await asyncio.sleep(sleep_time) - self.record = { - 'tpm_slot': self.get_minute_slot(), - 'counter': token_count - } + self.record = {"tpm_slot": self.get_minute_slot(), "counter": token_count} if not silent: logger.debug(self.record) diff --git a/graphgen/models/search/__init__.py b/graphgen/models/llm/local/__init__.py similarity index 100% rename from graphgen/models/search/__init__.py rename to graphgen/models/llm/local/__init__.py diff --git a/graphgen/models/llm/local/hf_wrapper.py b/graphgen/models/llm/local/hf_wrapper.py new file mode 100644 index 00000000..b0538aad --- /dev/null +++ b/graphgen/models/llm/local/hf_wrapper.py @@ -0,0 +1,147 @@ +from typing import Any, List, Optional + +from graphgen.bases.base_llm_wrapper import BaseLLMWrapper +from graphgen.bases.datatypes import Token + + +class HuggingFaceWrapper(BaseLLMWrapper): + """ + Async inference backend based on HuggingFace Transformers + """ + + def __init__( + self, + model: str, + torch_dtype="auto", + device_map="auto", + trust_remote_code=True, + temperature=0.0, + top_p=1.0, + topk=5, + **kwargs: Any, + ): + super().__init__(temperature=temperature, top_p=top_p, **kwargs) + + try: + import torch + from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + GenerationConfig, + ) + except ImportError as exc: + raise ImportError( + "HuggingFaceWrapper requires torch, transformers and accelerate. 
" + "Install them with: pip install torch transformers accelerate" + ) from exc + + self.torch = torch + self.AutoTokenizer = AutoTokenizer + self.AutoModelForCausalLM = AutoModelForCausalLM + self.GenerationConfig = GenerationConfig + + self.tokenizer = AutoTokenizer.from_pretrained( + model, trust_remote_code=trust_remote_code + ) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + + self.model = AutoModelForCausalLM.from_pretrained( + model, + torch_dtype=torch_dtype, + device_map=device_map, + trust_remote_code=trust_remote_code, + ) + self.model.eval() + self.temperature = temperature + self.top_p = top_p + self.topk = topk + + @staticmethod + def _build_inputs(prompt: str, history: Optional[List[str]] = None) -> str: + msgs = history or [] + lines = [] + for m in msgs: + if isinstance(m, dict): + role = m.get("role", "") + content = m.get("content", "") + lines.append(f"{role}: {content}") + else: + lines.append(str(m)) + lines.append(prompt) + return "\n".join(lines) + + async def generate_answer( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> str: + full = self._build_inputs(text, history) + inputs = self.tokenizer(full, return_tensors="pt").to(self.model.device) + + gen_kwargs = { + "max_new_tokens": extra.get("max_new_tokens", 512), + "do_sample": self.temperature > 0, + "temperature": self.temperature if self.temperature > 0 else 1.0, + "pad_token_id": self.tokenizer.eos_token_id, + } + + # Add top_p and top_k only if temperature > 0 + if self.temperature > 0: + gen_kwargs.update(top_p=self.top_p, top_k=self.topk) + + gen_config = self.GenerationConfig(**gen_kwargs) + + with self.torch.no_grad(): + out = self.model.generate(**inputs, generation_config=gen_config) + + gen = out[0, inputs.input_ids.shape[-1] :] + return self.tokenizer.decode(gen, skip_special_tokens=True) + + async def generate_topk_per_token( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + full = self._build_inputs(text, history) + inputs = self.tokenizer(full, return_tensors="pt").to(self.model.device) + + with self.torch.no_grad(): + out = self.model.generate( + **inputs, + max_new_tokens=1, + do_sample=False, + temperature=1.0, + return_dict_in_generate=True, + output_scores=True, + pad_token_id=self.tokenizer.eos_token_id, + ) + + scores = out.scores[0][0] # (vocab,) + probs = self.torch.softmax(scores, dim=-1) + top_probs, top_idx = self.torch.topk(probs, k=self.topk) + + tokens = [] + for p, idx in zip(top_probs.cpu().numpy(), top_idx.cpu().numpy()): + tokens.append(Token(self.tokenizer.decode([idx]), float(p))) + return tokens + + async def generate_inputs_prob( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + full = self._build_inputs(text, history) + ids = self.tokenizer.encode(full) + logprobs = [] + + for i in range(1, len(ids) + 1): + trunc = ids[: i - 1] + ids[i:] if i < len(ids) else ids[:-1] + inputs = self.torch.tensor([trunc]).to(self.model.device) + + with self.torch.no_grad(): + logits = self.model(inputs).logits[0, -1, :] + probs = self.torch.softmax(logits, dim=-1) + + true_id = ids[i - 1] + logprobs.append( + Token( + self.tokenizer.decode([true_id]), + float(probs[true_id].cpu()), + ) + ) + return logprobs diff --git a/graphgen/models/llm/local/sglang_wrapper.py b/graphgen/models/llm/local/sglang_wrapper.py new file mode 100644 index 00000000..1918fc79 --- /dev/null +++ b/graphgen/models/llm/local/sglang_wrapper.py @@ -0,0 +1,140 @@ 
+import math +from typing import Any, Dict, List, Optional + +from graphgen.bases.base_llm_wrapper import BaseLLMWrapper +from graphgen.bases.datatypes import Token + + +class SGLangWrapper(BaseLLMWrapper): + """ + Async inference backend based on SGLang offline engine. + """ + + def __init__( + self, + model: str, + tp_size: int = 1, + temperature: float = 0.0, + top_p: float = 1.0, + topk: int = 5, + **kwargs: Any, + ): + super().__init__(temperature=temperature, top_p=top_p, **kwargs) + try: + import sglang as sgl + from sglang.utils import async_stream_and_merge, stream_and_merge + except ImportError as exc: + raise ImportError( + "SGLangWrapper requires sglang. Install it with: " + "uv pip install sglang --prerelease=allow" + ) from exc + + self.model_path: str = model + self.temperature = temperature + self.top_p = top_p + self.topk = topk + self.tp_size = int(tp_size) + + # Initialise the offline engine + self.engine = sgl.Engine(model_path=self.model_path, tp_size=self.tp_size) + + # Keep helpers for streaming + self.async_stream_and_merge = async_stream_and_merge + self.stream_and_merge = stream_and_merge + + @staticmethod + def _build_sampling_params( + temperature: float, + top_p: float, + max_tokens: int, + topk: int, + logprobs: bool = False, + ) -> Dict[str, Any]: + """Build SGLang-compatible sampling-params dict.""" + params = { + "temperature": temperature, + "top_p": top_p, + "max_new_tokens": max_tokens, + } + if logprobs and topk > 0: + params["logprobs"] = topk + return params + + def _prep_prompt(self, text: str, history: Optional[List[dict]] = None) -> str: + """Convert raw text (+ optional history) into a single prompt string.""" + parts = [] + if self.system_prompt: + parts.append(self.system_prompt) + if history: + assert len(history) % 2 == 0, "History must have even length (u/a turns)." 
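        # Illustrative (content invented): with system_prompt="You are a helpful assistant",
        # one user/assistant turn in history, and text="Continue the answer.", the parts
        # collected below are joined with "\n" into a single flat prompt string:
        #   "You are a helpful assistant\n<user turn>\n<assistant turn>\nContinue the answer."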
+ parts.extend([item["content"] for item in history]) + parts.append(text) + return "\n".join(parts) + + def _tokens_from_output(self, output: Dict[str, Any]) -> List[Token]: + tokens: List[Token] = [] + + meta = output.get("meta_info", {}) + logprobs = meta.get("output_token_logprobs", []) + topks = meta.get("output_top_logprobs", []) + + tokenizer = self.engine.tokenizer_manager.tokenizer + + for idx, (lp, tid, _) in enumerate(logprobs): + prob = math.exp(lp) + tok_str = tokenizer.decode([tid]) + + top_candidates = [] + if self.topk > 0 and idx < len(topks): + for t_lp, t_tid, _ in topks[idx][: self.topk]: + top_candidates.append( + Token(text=tokenizer.decode([t_tid]), prob=math.exp(t_lp)) + ) + + tokens.append(Token(text=tok_str, prob=prob, top_candidates=top_candidates)) + + return tokens + + async def generate_answer( + self, + text: str, + history: Optional[List[str]] = None, + **extra: Any, + ) -> str: + prompt = self._prep_prompt(text, history) + sampling_params = self._build_sampling_params( + temperature=self.temperature, + top_p=self.top_p, + max_tokens=self.max_tokens, + topk=0, # no logprobs needed for simple generation + ) + + outputs = await self.engine.async_generate([prompt], sampling_params) + return self.filter_think_tags(outputs[0]["text"]) + + async def generate_topk_per_token( + self, + text: str, + history: Optional[List[str]] = None, + **extra: Any, + ) -> List[Token]: + prompt = self._prep_prompt(text, history) + sampling_params = self._build_sampling_params( + temperature=self.temperature, + top_p=self.top_p, + max_tokens=1, # keep short for token-level analysis + topk=self.topk, + ) + + outputs = await self.engine.async_generate( + [prompt], sampling_params, return_logprob=True, top_logprobs_num=5 + ) + print(outputs) + return self._tokens_from_output(outputs[0]) + + async def generate_inputs_prob( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + raise NotImplementedError( + "SGLangWrapper does not support per-token logprobs yet." + ) diff --git a/graphgen/models/llm/local/tgi_wrapper.py b/graphgen/models/llm/local/tgi_wrapper.py new file mode 100644 index 00000000..a722f6ea --- /dev/null +++ b/graphgen/models/llm/local/tgi_wrapper.py @@ -0,0 +1,36 @@ +from typing import Any, List, Optional + +from graphgen.bases import BaseLLMWrapper +from graphgen.bases.datatypes import Token + + +# TODO: implement TGIWrapper methods +class TGIWrapper(BaseLLMWrapper): + """ + Async inference backend based on TGI (Text-Generation-Inference) + """ + + def __init__( + self, + model_url: str, # e.g. 
"http://localhost:8080" + temperature: float = 0.0, + top_p: float = 1.0, + topk: int = 5, + **kwargs: Any + ): + super().__init__(temperature=temperature, top_p=top_p, **kwargs) + + async def generate_answer( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> str: + pass + + async def generate_topk_per_token( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + pass + + async def generate_inputs_prob( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + pass diff --git a/graphgen/models/llm/local/trt_wrapper.py b/graphgen/models/llm/local/trt_wrapper.py new file mode 100644 index 00000000..078f5ba9 --- /dev/null +++ b/graphgen/models/llm/local/trt_wrapper.py @@ -0,0 +1,26 @@ +from typing import Any, List, Optional + +from graphgen.bases import BaseLLMWrapper +from graphgen.bases.datatypes import Token + + +# TODO: implement TensorRTWrapper methods +class TensorRTWrapper(BaseLLMWrapper): + """ + Async inference backend based on TensorRT-LLM + """ + + async def generate_answer( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> str: + pass + + async def generate_topk_per_token( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + pass + + async def generate_inputs_prob( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + pass diff --git a/graphgen/models/llm/local/vllm_wrapper.py b/graphgen/models/llm/local/vllm_wrapper.py new file mode 100644 index 00000000..5d18e99c --- /dev/null +++ b/graphgen/models/llm/local/vllm_wrapper.py @@ -0,0 +1,126 @@ +import uuid +import math +from typing import Any, List, Optional +from graphgen.bases.base_llm_wrapper import BaseLLMWrapper +from graphgen.bases.datatypes import Token + +class VLLMWrapper(BaseLLMWrapper): + """ + Async inference backend based on vLLM. + """ + def __init__( + self, + model: str, + tensor_parallel_size: int = 1, + gpu_memory_utilization: float = 0.9, + temperature: float = 0.0, + top_p: float = 1.0, + topk: int = 5, + **kwargs: Any, + ): + super().__init__(temperature=temperature, top_p=top_p, **kwargs) + try: + from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams + except ImportError as exc: + raise ImportError( + "VLLMWrapper requires vllm. 
Install it with: uv pip install vllm" + ) from exc + + self.SamplingParams = SamplingParams + + engine_args = AsyncEngineArgs( + model=model, + tensor_parallel_size=int(tensor_parallel_size), + gpu_memory_utilization=float(gpu_memory_utilization), + trust_remote_code=kwargs.get("trust_remote_code", True), + disable_log_stats=False, + ) + self.engine = AsyncLLMEngine.from_engine_args(engine_args) + self.temperature = temperature + self.top_p = top_p + self.topk = topk + + @staticmethod + def _build_inputs(prompt: str, history: Optional[List[str]] = None) -> str: + msgs = history or [] + lines = [] + for m in msgs: + if isinstance(m, dict): + role = m.get("role", "") + content = m.get("content", "") + lines.append(f"{role}: {content}") + else: + lines.append(str(m)) + lines.append(prompt) + return "\n".join(lines) + + async def generate_answer( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> str: + full_prompt = self._build_inputs(text, history) + request_id = f"graphgen_req_{uuid.uuid4()}" + + sp = self.SamplingParams( + temperature=self.temperature if self.temperature > 0 else 1.0, + top_p=self.top_p if self.temperature > 0 else 1.0, + max_tokens=extra.get("max_new_tokens", 512), + ) + + result_generator = self.engine.generate(full_prompt, sp, request_id=request_id) + + final_output = None + async for request_output in result_generator: + final_output = request_output + + if not final_output or not final_output.outputs: + return "" + + return final_output.outputs[0].text + + async def generate_topk_per_token( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + full_prompt = self._build_inputs(text, history) + request_id = f"graphgen_topk_{uuid.uuid4()}" + + sp = self.SamplingParams( + temperature=0, + max_tokens=1, + logprobs=self.topk, + prompt_logprobs=1, + ) + + result_generator = self.engine.generate(full_prompt, sp, request_id=request_id) + + final_output = None + async for request_output in result_generator: + final_output = request_output + + if not final_output or not final_output.outputs or not final_output.outputs[0].logprobs: + return [] + + top_logprobs = final_output.outputs[0].logprobs[0] + + candidate_tokens = [] + for _, logprob_obj in top_logprobs.items(): + tok_str = logprob_obj.decoded_token.strip() if logprob_obj.decoded_token else "" + prob = float(math.exp(logprob_obj.logprob)) + candidate_tokens.append(Token(tok_str, prob)) + + candidate_tokens.sort(key=lambda x: -x.prob) + + if candidate_tokens: + main_token = Token( + text=candidate_tokens[0].text, + prob=candidate_tokens[0].prob, + top_candidates=candidate_tokens + ) + return [main_token] + return [] + + async def generate_inputs_prob( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + raise NotImplementedError( + "VLLMWrapper does not support per-token logprobs yet." 
+ ) diff --git a/graphgen/models/llm/openai_model.py b/graphgen/models/llm/openai_model.py deleted file mode 100644 index 6973c1ce..00000000 --- a/graphgen/models/llm/openai_model.py +++ /dev/null @@ -1,130 +0,0 @@ -import math -from dataclasses import dataclass, field -from typing import List, Dict, Optional -import openai -from openai import AsyncOpenAI, RateLimitError, APIConnectionError, APITimeoutError -from tenacity import ( - retry, - stop_after_attempt, - wait_exponential, - retry_if_exception_type, -) - -from graphgen.models.llm.topk_token_model import TopkTokenModel, Token -from graphgen.models.llm.tokenizer import Tokenizer -from graphgen.models.llm.limitter import RPM, TPM - -def get_top_response_tokens(response: openai.ChatCompletion) -> List[Token]: - token_logprobs = response.choices[0].logprobs.content - tokens = [] - for token_prob in token_logprobs: - prob = math.exp(token_prob.logprob) - candidate_tokens = [ - Token(t.token, math.exp(t.logprob)) - for t in token_prob.top_logprobs - ] - token = Token(token_prob.token, prob, top_candidates=candidate_tokens) - tokens.append(token) - return tokens - -@dataclass -class OpenAIModel(TopkTokenModel): - model_name: str = "gpt-4o-mini" - api_key: str = None - base_url: str = None - - system_prompt: str = "" - json_mode: bool = False - seed: int = None - - token_usage: list = field(default_factory=list) - request_limit: bool = False - rpm: RPM = field(default_factory=lambda: RPM(rpm=1000)) - tpm: TPM = field(default_factory=lambda: TPM(tpm=50000)) - - - def __post_init__(self): - assert self.api_key is not None, "Please provide api key to access openai api." - if self.api_key == "": - self.api_key = "none" - self.client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url) - - def _pre_generate(self, text: str, history: List[str]) -> Dict: - kwargs = { - "temperature": self.temperature, - "top_p": self.topp, - "max_tokens": self.max_tokens, - } - if self.seed: - kwargs["seed"] = self.seed - if self.json_mode: - kwargs["response_format"] = {"type": "json_object"} - - messages = [] - if self.system_prompt: - messages.append({"role": "system", "content": self.system_prompt}) - messages.append({"role": "user", "content": text}) - - if history: - assert len(history) % 2 == 0, "History should have even number of elements." 
- messages = history + messages - - kwargs['messages']= messages - return kwargs - - - @retry( - stop=stop_after_attempt(5), - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_if_exception_type((RateLimitError, APIConnectionError, APITimeoutError)), - ) - async def generate_topk_per_token(self, text: str, history: Optional[List[str]] = None) -> List[Token]: - kwargs = self._pre_generate(text, history) - if self.topk_per_token > 0: - kwargs["logprobs"] = True - kwargs["top_logprobs"] = self.topk_per_token - - # Limit max_tokens to 1 to avoid long completions - kwargs["max_tokens"] = 1 - - completion = await self.client.chat.completions.create( # pylint: disable=E1125 - model=self.model_name, - **kwargs - ) - - tokens = get_top_response_tokens(completion) - - return tokens - - @retry( - stop=stop_after_attempt(5), - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_if_exception_type((RateLimitError, APIConnectionError, APITimeoutError)), - ) - async def generate_answer(self, text: str, history: Optional[List[str]] = None, temperature: int = 0) -> str: - kwargs = self._pre_generate(text, history) - kwargs["temperature"] = temperature - - prompt_tokens = 0 - for message in kwargs['messages']: - prompt_tokens += len(Tokenizer().encode_string(message['content'])) - estimated_tokens = prompt_tokens + kwargs['max_tokens'] - - if self.request_limit: - await self.rpm.wait(silent=True) - await self.tpm.wait(estimated_tokens, silent=True) - - completion = await self.client.chat.completions.create( # pylint: disable=E1125 - model=self.model_name, - **kwargs - ) - if hasattr(completion, "usage"): - self.token_usage.append({ - "prompt_tokens": completion.usage.prompt_tokens, - "completion_tokens": completion.usage.completion_tokens, - "total_tokens": completion.usage.total_tokens, - }) - return completion.choices[0].message.content - - async def generate_inputs_prob(self, text: str, history: Optional[List[str]] = None) -> List[Token]: - raise NotImplementedError diff --git a/graphgen/models/llm/tokenizer.py b/graphgen/models/llm/tokenizer.py deleted file mode 100644 index 6a1c4b22..00000000 --- a/graphgen/models/llm/tokenizer.py +++ /dev/null @@ -1,73 +0,0 @@ -from dataclasses import dataclass -from typing import List -import tiktoken - -try: - from transformers import AutoTokenizer - TRANSFORMERS_AVAILABLE = True -except ImportError: - AutoTokenizer = None - TRANSFORMERS_AVAILABLE = False - - -def get_tokenizer(tokenizer_name: str = "cl100k_base"): - """ - Get a tokenizer instance by name. 
- - :param tokenizer_name: tokenizer name, tiktoken encoding name or Hugging Face model name - :return: tokenizer instance - """ - if tokenizer_name in tiktoken.list_encoding_names(): - return tiktoken.get_encoding(tokenizer_name) - if TRANSFORMERS_AVAILABLE: - try: - return AutoTokenizer.from_pretrained(tokenizer_name) - except Exception as e: - raise ValueError(f"Failed to load tokenizer from Hugging Face: {e}") from e - else: - raise ValueError("Hugging Face Transformers is not available, please install it first.") - -@dataclass -class Tokenizer: - model_name: str = "cl100k_base" - - def __post_init__(self): - self.tokenizer = get_tokenizer(self.model_name) - - def encode_string(self, text: str) -> List[int]: - """ - Encode text to tokens - - :param text - :return: tokens - """ - return self.tokenizer.encode(text) - - def decode_tokens(self, tokens: List[int]) -> str: - """ - Decode tokens to text - - :param tokens - :return: text - """ - return self.tokenizer.decode(tokens) - - def chunk_by_token_size( - self, content: str, overlap_token_size=128, max_token_size=1024 - ): - tokens = self.encode_string(content) - results = [] - for index, start in enumerate( - range(0, len(tokens), max_token_size - overlap_token_size) - ): - chunk_content = self.decode_tokens( - tokens[start : start + max_token_size] - ) - results.append( - { - "tokens": min(max_token_size, len(tokens) - start), - "content": chunk_content.strip(), - "chunk_order_index": index, - } - ) - return results diff --git a/graphgen/models/llm/topk_token_model.py b/graphgen/models/llm/topk_token_model.py deleted file mode 100644 index b7595cb1..00000000 --- a/graphgen/models/llm/topk_token_model.py +++ /dev/null @@ -1,48 +0,0 @@ -import math -from dataclasses import dataclass, field -from typing import List, Union, Optional - - -@dataclass -class Token: - text: str - prob: float - top_candidates: List = field(default_factory=list) - ppl: Union[float, None] = field(default=None) - - @property - def logprob(self) -> float: - return math.log(self.prob) - - -@dataclass -class TopkTokenModel: - do_sample: bool = False - temperature: float = 0 - max_tokens: int = 4096 - repetition_penalty: float = 1.05 - num_beams: int = 1 - topk: int = 50 - topp: float = 0.95 - - topk_per_token: int = 5 # number of topk tokens to generate for each token - - async def generate_topk_per_token(self, text: str) -> List[Token]: - """ - Generate prob, text and candidates for each token of the model's output. - This function is used to visualize the inference process. - """ - raise NotImplementedError - - async def generate_inputs_prob(self, text: str, history: Optional[List[str]] = None) -> List[Token]: - """ - Generate prob and text for each token of the input text. - This function is used to visualize the ppl. - """ - raise NotImplementedError - - async def generate_answer(self, text: str, history: Optional[List[str]] = None) -> str: - """ - Generate answer from the model. 
- """ - raise NotImplementedError diff --git a/graphgen/models/partitioner/__init__.py b/graphgen/models/partitioner/__init__.py new file mode 100644 index 00000000..2e1bcb68 --- /dev/null +++ b/graphgen/models/partitioner/__init__.py @@ -0,0 +1,5 @@ +from .anchor_bfs_partitioner import AnchorBFSPartitioner +from .bfs_partitioner import BFSPartitioner +from .dfs_partitioner import DFSPartitioner +from .ece_partitioner import ECEPartitioner +from .leiden_partitioner import LeidenPartitioner diff --git a/graphgen/models/partitioner/anchor_bfs_partitioner.py b/graphgen/models/partitioner/anchor_bfs_partitioner.py new file mode 100644 index 00000000..d75dc2e8 --- /dev/null +++ b/graphgen/models/partitioner/anchor_bfs_partitioner.py @@ -0,0 +1,190 @@ +import random +from collections import deque +from typing import Any, Iterable, List, Literal, Set, Tuple, Union + +from graphgen.bases import BaseGraphStorage +from graphgen.bases.datatypes import Community + +from .bfs_partitioner import BFSPartitioner + +NODE_UNIT: str = "n" +EDGE_UNIT: str = "e" + + +class AnchorBFSPartitioner(BFSPartitioner): + """ + Anchor BFS partitioner that partitions the graph into communities of a fixed size. + 1. Randomly choose a node of a specified type as the anchor. + 2. Expand the community using BFS until the max unit size is reached.(A unit is a node or an edge.) + 3. Non-anchor units can only be "pulled" into a community and never become seeds themselves. + For example, for VQA tasks, we may want to use image nodes as anchors and expand to nearby text nodes and edges. + + Supports multiple anchor types for multi-omics data: anchor_type can be a single string or a list of strings. + When a list is provided, nodes matching any of the types in the list can serve as anchors. + """ + + def __init__( + self, + *, + anchor_type: Union[ + Literal["image", "dna", "rna", "protein"], + List[Literal["dna", "rna", "protein"]], + ] = "image", + anchor_ids: Set[str] | None = None, + ) -> None: + super().__init__() + # Normalize anchor_type to always be a list for internal processing + if isinstance(anchor_type, str): + self.anchor_types = [anchor_type] + else: + self.anchor_types = list(anchor_type) + self.anchor_ids = anchor_ids + + def partition( + self, + g: BaseGraphStorage, + max_units_per_community: int = 1, + **kwargs: Any, + ) -> Iterable[Community]: + nodes = g.get_all_nodes() # List[tuple[id, meta]] + edges = g.get_all_edges() # List[tuple[u, v, meta]] + + adj, _ = self._build_adjacency_list(nodes, edges) + + anchors: Set[str] = self._pick_anchor_ids(nodes) + if not anchors: + return # if no anchors, return nothing + + used_n: set[str] = set() + used_e: set[frozenset[str]] = set() + + seeds = list(anchors) + random.shuffle(seeds) + + for seed_node in seeds: + if seed_node in used_n: + continue + comm_n, comm_e = self._grow_community( + seed_node, adj, max_units_per_community, used_n, used_e + ) + if comm_n or comm_e: + yield Community(id=seed_node, nodes=comm_n, edges=comm_e) + + def _pick_anchor_ids( + self, + nodes: List[tuple[str, dict]], + ) -> Set[str]: + if self.anchor_ids is not None: + return self.anchor_ids + + anchor_ids: Set[str] = set() + anchor_types_lower = [at.lower() for at in self.anchor_types] + + for node_id, meta in nodes: + # Check if node matches any of the anchor types + matched = False + + # Check 1: entity_type (for image, etc.) 
+ node_type = str(meta.get("entity_type", "")).lower() + for anchor_type_lower in anchor_types_lower: + if anchor_type_lower in node_type: + anchor_ids.add(node_id) + matched = True + break + + if matched: + continue + + # Check 2: molecule_type (for omics data: dna, rna, protein) + molecule_type = str(meta.get("molecule_type", "")).lower() + if molecule_type in anchor_types_lower: + anchor_ids.add(node_id) + continue + + # Check 3: source_id prefix (for omics data: dna-, rna-, protein-) + source_id = str(meta.get("source_id", "")).lower() + for anchor_type_lower in anchor_types_lower: + if source_id.startswith(f"{anchor_type_lower}-"): + anchor_ids.add(node_id) + matched = True + break + + if matched: + continue + + # Check 4: Check if source_id contains multiple IDs separated by + if "" in source_id: + source_ids = source_id.split("") + for sid in source_ids: + sid = sid.strip() + for anchor_type_lower in anchor_types_lower: + if sid.startswith(f"{anchor_type_lower}-"): + anchor_ids.add(node_id) + matched = True + break + if matched: + break + + return anchor_ids + + @staticmethod + def _grow_community( + seed: str, + adj: dict[str, List[str]], + max_units: int, + used_n: set[str], + used_e: set[frozenset[str]], + ) -> Tuple[List[str], List[Tuple[str, str]]]: + """ + Grow a community from the seed node using BFS. + :param seed: seed node id + :param adj: adjacency list + :param max_units: maximum number of units (nodes + edges) in the community + :param used_n: set of used node ids + :param used_e: set of used edge keys + :return: (list of node ids, list of edge tuples) + """ + comm_n: List[str] = [] + comm_e: List[Tuple[str, str]] = [] + queue: deque[tuple[str, Any]] = deque([(NODE_UNIT, seed)]) + cnt = 0 + + while queue and cnt < max_units: + k, it = queue.popleft() + + if k == NODE_UNIT: + if it in used_n: + continue + used_n.add(it) + comm_n.append(it) + cnt += 1 + for nei in adj[it]: + e_key = frozenset((it, nei)) + if e_key not in used_e: + queue.append((EDGE_UNIT, e_key)) + else: # EDGE_UNIT + if it in used_e: + continue + used_e.add(it) + # Convert frozenset to tuple for edge representation + # Note: Self-loops should be filtered during graph construction, + # but we handle edge cases defensively + try: + u, v = tuple(it) + except ValueError: + # Handle edge case: frozenset with unexpected number of elements + # This should not happen if graph construction is correct + edge_nodes = list(it) + if len(edge_nodes) == 1: + # Self-loop edge (should have been filtered during graph construction) + u, v = edge_nodes[0], edge_nodes[0] + else: + # Invalid edge, skip it + continue + comm_e.append((u, v)) + cnt += 1 + for n in it: + if n not in used_n: + queue.append((NODE_UNIT, n)) + + return comm_n, comm_e diff --git a/graphgen/models/partitioner/bfs_partitioner.py b/graphgen/models/partitioner/bfs_partitioner.py new file mode 100644 index 00000000..994e08e8 --- /dev/null +++ b/graphgen/models/partitioner/bfs_partitioner.py @@ -0,0 +1,76 @@ +import random +from collections import deque +from typing import Any, Iterable, List + +from graphgen.bases import BaseGraphStorage, BasePartitioner +from graphgen.bases.datatypes import Community + +NODE_UNIT: str = "n" +EDGE_UNIT: str = "e" + + +class BFSPartitioner(BasePartitioner): + """ + BFS partitioner that partitions the graph into communities of a fixed size. + 1. Randomly choose a unit. + 2. Expand the community using BFS until the max unit size is reached. + (A unit is a node or an edge.) 
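+
+    A rough usage sketch (the concrete BaseGraphStorage backend and the size
+    limit are placeholders, not prescriptions):
+
+        partitioner = BFSPartitioner()
+        for community in partitioner.partition(g, max_units_per_community=10):
+            print(community.id, len(community.nodes), len(community.edges))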
+ """ + + def partition( + self, + g: BaseGraphStorage, + max_units_per_community: int = 1, + **kwargs: Any, + ) -> Iterable[Community]: + nodes = g.get_all_nodes() + edges = g.get_all_edges() + + adj, _ = self._build_adjacency_list(nodes, edges) + + used_n: set[str] = set() + used_e: set[frozenset[str]] = set() + + units = [(NODE_UNIT, n[0]) for n in nodes] + [ + (EDGE_UNIT, frozenset((u, v))) for u, v, _ in edges + ] + random.shuffle(units) + + for kind, seed in units: + if (kind == NODE_UNIT and seed in used_n) or ( + kind == EDGE_UNIT and seed in used_e + ): + continue + + comm_n: List[str] = [] + comm_e: List[tuple[str, str]] = [] + queue: deque[tuple[str, Any]] = deque([(kind, seed)]) + cnt = 0 + + while queue and cnt < max_units_per_community: + k, it = queue.popleft() + if k == NODE_UNIT: + if it in used_n: + continue + used_n.add(it) + comm_n.append(it) + cnt += 1 + for nei in adj[it]: + e_key = frozenset((it, nei)) + if e_key not in used_e: + queue.append((EDGE_UNIT, e_key)) + else: + if it in used_e: + continue + used_e.add(it) + + u, v = it + comm_e.append((u, v)) + cnt += 1 + # push nodes that are not visited + for n in it: + if n not in used_n: + queue.append((NODE_UNIT, n)) + + if comm_n or comm_e: + yield Community(id=seed, nodes=comm_n, edges=comm_e) diff --git a/graphgen/models/partitioner/dfs_partitioner.py b/graphgen/models/partitioner/dfs_partitioner.py new file mode 100644 index 00000000..4d93ad7f --- /dev/null +++ b/graphgen/models/partitioner/dfs_partitioner.py @@ -0,0 +1,74 @@ +import random +from collections.abc import Iterable +from typing import Any + +from graphgen.bases import BaseGraphStorage, BasePartitioner +from graphgen.bases.datatypes import Community + +NODE_UNIT: str = "n" +EDGE_UNIT: str = "e" + + +class DFSPartitioner(BasePartitioner): + """ + DFS partitioner that partitions the graph into communities of a fixed size. + 1. Randomly choose a unit. + 2. Random walk using DFS until the community reaches the max unit size. + (In GraphGen, a unit is defined as a node or an edge.) 
+ """ + + def partition( + self, + g: BaseGraphStorage, + max_units_per_community: int = 1, + **kwargs: Any, + ) -> Iterable[Community]: + nodes = g.get_all_nodes() + edges = g.get_all_edges() + + adj, _ = self._build_adjacency_list(nodes, edges) + + used_n: set[str] = set() + used_e: set[frozenset[str]] = set() + + units = [(NODE_UNIT, n[0]) for n in nodes] + [ + (EDGE_UNIT, frozenset((u, v))) for u, v, _ in edges + ] + random.shuffle(units) + + for kind, seed in units: + if (kind == NODE_UNIT and seed in used_n) or ( + kind == EDGE_UNIT and seed in used_e + ): + continue + + comm_n, comm_e = [], [] + stack = [(kind, seed)] + cnt = 0 + + while stack and cnt < max_units_per_community: + k, it = stack.pop() + if k == NODE_UNIT: + if it in used_n: + continue + used_n.add(it) + comm_n.append(it) + cnt += 1 + for nei in adj[it]: + e_key = frozenset((it, nei)) + if e_key not in used_e: + stack.append((EDGE_UNIT, e_key)) + break + else: + if it in used_e: + continue + used_e.add(it) + comm_e.append(tuple(it)) + cnt += 1 + # push neighboring nodes + for n in it: + if n not in used_n: + stack.append((NODE_UNIT, n)) + + if comm_n or comm_e: + yield Community(id=seed, nodes=comm_n, edges=comm_e) diff --git a/graphgen/models/partitioner/ece_partitioner.py b/graphgen/models/partitioner/ece_partitioner.py new file mode 100644 index 00000000..fcf776c7 --- /dev/null +++ b/graphgen/models/partitioner/ece_partitioner.py @@ -0,0 +1,156 @@ +import random +from collections import deque +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple + +from tqdm import tqdm + +from graphgen.bases import BaseGraphStorage +from graphgen.bases.datatypes import Community +from graphgen.models.partitioner.bfs_partitioner import BFSPartitioner + +NODE_UNIT: str = "n" +EDGE_UNIT: str = "e" + + +class ECEPartitioner(BFSPartitioner): + """ + ECE partitioner that partitions the graph into communities based on Expected Calibration Error (ECE). + We calculate ECE for units in KG (represented as 'comprehension loss') + and group units with similar ECE values into the same community. + 1. Select a sampling strategy. + 2. Choose a unit based on the sampling strategy. + 2. Expand the community using BFS. + 3. When expending, prefer to add units with the sampling strategy. + 4. Stop when the max unit size is reached or the max input length is reached. + (A unit is a node or an edge.) 
+ """ + + @staticmethod + def _sort_units(units: list, edge_sampling: str) -> list: + """ + Sort units with edge sampling strategy + + :param units: total units + :param edge_sampling: edge sampling strategy (random, min_loss, max_loss) + :return: sorted units + """ + if edge_sampling == "random": + random.shuffle(units) + elif edge_sampling == "min_loss": + units = sorted( + units, + key=lambda x: x[-1]["loss"], + ) + elif edge_sampling == "max_loss": + units = sorted( + units, + key=lambda x: x[-1]["loss"], + reverse=True, + ) + else: + raise ValueError(f"Invalid edge sampling: {edge_sampling}") + return units + + def partition( + self, + g: BaseGraphStorage, + max_units_per_community: int = 10, + min_units_per_community: int = 1, + max_tokens_per_community: int = 10240, + unit_sampling: str = "random", + **kwargs: Any, + ) -> Iterable[Community]: + nodes: List[Tuple[str, dict]] = g.get_all_nodes() + edges: List[Tuple[str, str, dict]] = g.get_all_edges() + + adj, _ = self._build_adjacency_list(nodes, edges) + node_dict = dict(nodes) + edge_dict = {frozenset((u, v)): d for u, v, d in edges} + + all_units: List[Tuple[str, Any, dict]] = [ + (NODE_UNIT, nid, d) for nid, d in nodes + ] + [(EDGE_UNIT, frozenset((u, v)), d) for u, v, d in edges] + + used_n: Set[str] = set() + used_e: Set[frozenset[str]] = set() + + all_units = self._sort_units(all_units, unit_sampling) + + def _grow_community(seed_unit: Tuple[str, Any, dict]) -> Optional[Community]: + nonlocal used_n, used_e + + community_nodes: Dict[str, dict] = {} + community_edges: Dict[frozenset[str], dict] = {} + queue = deque() + token_sum = 0 + + def _add_unit(u): + nonlocal token_sum + t, i, d = u + if t == NODE_UNIT: # node + if i in used_n or i in community_nodes: + return False + community_nodes[i] = d + used_n.add(i) + else: # edge + if i in used_e or i in community_edges: + return False + community_edges[i] = d + used_e.add(i) + token_sum += d.get("length", 0) + return True + + _add_unit(seed_unit) + queue.append(seed_unit) + + # BFS + while queue: + if ( + len(community_nodes) + len(community_edges) + >= max_units_per_community + or token_sum >= max_tokens_per_community + ): + break + + cur_type, cur_id, _ = queue.popleft() + + neighbors: List[Tuple[str, Any, dict]] = [] + if cur_type == NODE_UNIT: + for nb_id in adj.get(cur_id, []): + e_key = frozenset((cur_id, nb_id)) + if e_key not in used_e and e_key not in community_edges: + neighbors.append((EDGE_UNIT, e_key, edge_dict[e_key])) + else: + for n_id in cur_id: + if n_id not in used_n and n_id not in community_nodes: + neighbors.append((NODE_UNIT, n_id, node_dict[n_id])) + + neighbors = self._sort_units(neighbors, unit_sampling) + for nb in neighbors: + if ( + len(community_nodes) + len(community_edges) + >= max_units_per_community + or token_sum >= max_tokens_per_community + ): + break + if _add_unit(nb): + queue.append(nb) + + if len(community_nodes) + len(community_edges) < min_units_per_community: + return None + + return Community( + id=seed_unit[1], + nodes=list(community_nodes.keys()), + edges=[(u, v) for (u, v), _ in community_edges.items()], + ) + + for unit in tqdm(all_units, desc="ECE partition"): + utype, uid, _ = unit + if (utype == NODE_UNIT and uid in used_n) or ( + utype == EDGE_UNIT and uid in used_e + ): + continue + comm = _grow_community(unit) + if comm: + yield comm diff --git a/graphgen/models/partitioner/leiden_partitioner.py b/graphgen/models/partitioner/leiden_partitioner.py new file mode 100644 index 00000000..b62b8544 --- /dev/null +++ 
b/graphgen/models/partitioner/leiden_partitioner.py @@ -0,0 +1,114 @@ +from collections import defaultdict +from typing import Any, Dict, List, Set, Tuple + +import igraph as ig +from leidenalg import ModularityVertexPartition, find_partition + +from graphgen.bases import BaseGraphStorage, BasePartitioner +from graphgen.bases.datatypes import Community + + +class LeidenPartitioner(BasePartitioner): + """ + Leiden partitioner that partitions the graph into communities using the Leiden algorithm. + """ + + def partition( + self, + g: BaseGraphStorage, + max_size: int = 20, + use_lcc: bool = False, + random_seed: int = 42, + **kwargs: Any, + ) -> List[Community]: + """ + Leiden Partition follows these steps: + 1. export the graph from graph storage + 2. use the leiden algorithm to detect communities, get {node: community_id} + 3. split large communities if max_size is given + 4. convert {node: community_id} to List[Community] + :param g + :param max_size: maximum size of each community, if None or <=0, no limit + :param use_lcc: whether to use the largest connected component only + :param random_seed + :param kwargs: other parameters for the leiden algorithm + :return: + """ + nodes = g.get_all_nodes() # List[Tuple[str, dict]] + edges = g.get_all_edges() # List[Tuple[str, str, dict]] + + node2cid: Dict[str, int] = self._run_leiden(nodes, edges, use_lcc, random_seed) + + if max_size is not None and max_size > 0: + node2cid = self._split_communities(node2cid, max_size) + + cid2nodes: Dict[int, List[str]] = defaultdict(list) + for n, cid in node2cid.items(): + cid2nodes[cid].append(n) + + communities: List[Community] = [] + for cid, nodes in cid2nodes.items(): + node_set: Set[str] = set(nodes) + comm_edges: List[Tuple[str, str]] = [ + (u, v) for u, v, _ in edges if u in node_set and v in node_set + ] + communities.append(Community(id=cid, nodes=nodes, edges=comm_edges)) + return communities + + @staticmethod + def _run_leiden( + nodes: List[Tuple[str, dict]], + edges: List[Tuple[str, str, dict]], + use_lcc: bool = False, + random_seed: int = 42, + ) -> Dict[str, int]: + # build igraph + ig_graph = ig.Graph.TupleList(((u, v) for u, v, _ in edges), directed=False) + + # remove isolated nodes + ig_graph.delete_vertices(ig_graph.vs.select(_degree_eq=0)) + + node2cid: Dict[str, int] = {} + if use_lcc: + lcc = ig_graph.components().giant() + partition = find_partition(lcc, ModularityVertexPartition, seed=random_seed) + for part_id, cluster in enumerate(partition): + for v in cluster: + node2cid[lcc.vs[v]["name"]] = part_id + else: + offset = 0 + for component in ig_graph.components(): + subgraph = ig_graph.induced_subgraph(component) + partition = find_partition( + subgraph, ModularityVertexPartition, seed=random_seed + ) + for part_id, cluster in enumerate(partition): + for v in cluster: + original_node = subgraph.vs[v]["name"] + node2cid[original_node] = part_id + offset + offset += len(partition) + return node2cid + + @staticmethod + def _split_communities(node2cid: Dict[str, int], max_size: int) -> Dict[str, int]: + """ + Split communities larger than max_size into smaller sub-communities. 
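+        For example, a 45-node community with max_size=20 is split into chunks of
+        20, 20 and 5 nodes, each assigned a fresh community id.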
+ """ + cid2nodes: Dict[int, List[str]] = defaultdict(list) + for n, cid in node2cid.items(): + cid2nodes[cid].append(n) + + new_mapping: Dict[str, int] = {} + new_cid = 0 + for nodes in cid2nodes.values(): + if len(nodes) <= max_size: + for n in nodes: + new_mapping[n] = new_cid + new_cid += 1 + else: + for start in range(0, len(nodes), max_size): + chunk = nodes[start : start + max_size] + for n in chunk: + new_mapping[n] = new_cid + new_cid += 1 + return new_mapping diff --git a/graphgen/models/reader/__init__.py b/graphgen/models/reader/__init__.py new file mode 100644 index 00000000..220460c3 --- /dev/null +++ b/graphgen/models/reader/__init__.py @@ -0,0 +1,7 @@ +from .csv_reader import CSVReader +from .json_reader import JSONReader +from .parquet_reader import ParquetReader +from .pdf_reader import PDFReader +from .pickle_reader import PickleReader +from .rdf_reader import RDFReader +from .txt_reader import TXTReader diff --git a/graphgen/models/reader/csv_reader.py b/graphgen/models/reader/csv_reader.py new file mode 100644 index 00000000..a0343d97 --- /dev/null +++ b/graphgen/models/reader/csv_reader.py @@ -0,0 +1,28 @@ +from typing import List, Union + +import ray +from ray.data import Dataset + +from graphgen.bases.base_reader import BaseReader + + +class CSVReader(BaseReader): + """ + Reader for CSV files. + Columns: + - type: The type of the document (e.g., "text", "image", etc.) + - if type is "text", "content" column must be present. + """ + + def read(self, input_path: Union[str, List[str]]) -> Dataset: + """ + Read CSV files and return Ray Dataset. + + :param input_path: Path to CSV file or list of CSV files. + :return: Ray Dataset containing validated and filtered data. + """ + + ds = ray.data.read_csv(input_path) + ds = ds.map_batches(self._validate_batch, batch_format="pandas") + ds = ds.filter(self._should_keep_item) + return ds diff --git a/graphgen/models/reader/json_reader.py b/graphgen/models/reader/json_reader.py new file mode 100644 index 00000000..a25abee8 --- /dev/null +++ b/graphgen/models/reader/json_reader.py @@ -0,0 +1,111 @@ +import json +import os +from typing import Any, Dict, Iterator, List, Union + +import ray +import ray.data + +from graphgen.bases.base_reader import BaseReader +from graphgen.utils import logger + + +class JSONReader(BaseReader): + """ + Reader for JSON and JSONL files. + Columns: + - type: The type of the document (e.g., "text", "image", etc.) + - if type is "text", "content" column must be present. + """ + + def read(self, input_path: Union[str, List[str]]) -> ray.data.Dataset: + """ + Read JSON file and return Ray Dataset. + :param input_path: Path to JSON/JSONL file or list of JSON/JSONL files. + :return: Ray Dataset containing validated and filtered data. 
+ """ + if self.modalities and len(self.modalities) >= 2: + ds: ray.data.Dataset = ray.data.from_items([]) + for file in input_path if isinstance(input_path, list) else [input_path]: + data = [] + if file.endswith(".jsonl"): + with open(file, "r", encoding="utf-8") as f: + for line in f: + item = json.loads(line) + data.append(item) + else: + with open(file, "r", encoding="utf-8") as f: + data = json.load(f) + data = self._unify_schema(data) + file_ds: ray.data.Dataset = ray.data.from_items(data) + ds = ds.union(file_ds) # type: ignore + else: + ds = ray.data.read_json(input_path) + ds = ds.map_batches(self._validate_batch, batch_format="pandas") + ds = ds.filter(self._should_keep_item) + return ds + + def read_stream(self, file_path: str) -> Iterator[Dict[str, Any]]: + """ + Stream read JSONL files line by line without loading entire file into memory. + Returns an iterator that yields filtered documents. + + :param file_path: Path to the JSONL file. + :return: Iterator of dictionaries containing the data. + """ + if not file_path.endswith(".jsonl"): + raise ValueError("read_stream only supports JSONL files, not JSON files") + + with open(file_path, "r", encoding="utf-8") as f: + for line in f: + try: + doc = json.loads(line) + assert "type" in doc, f"Missing 'type' in document: {doc}" + if doc.get("type") == "text" and self.text_column not in doc: + raise ValueError( + f"Missing '{self.text_column}' in document: {doc}" + ) + + # Apply filtering logic inline (similar to BaseReader.filter) + if doc.get("type") == "text": + content = doc.get(self.text_column, "").strip() + if content: + yield doc + elif doc.get("type") in ("image", "table", "equation"): + img_path = doc.get("img_path") + if self._image_exists(img_path): + yield doc + else: + yield doc + except json.JSONDecodeError as e: + logger.error("Error decoding JSON line: %s. Error: %s", line, e) + + @staticmethod + def _image_exists(path_or_url: str, timeout: int = 3) -> bool: + """ + Check if an image exists at the given local path or URL. + :param path_or_url: Local file path or remote URL of the image. + :param timeout: Timeout for remote URL requests in seconds. + :return: True if the image exists, False otherwise. + """ + if not path_or_url: + return False + if not path_or_url.startswith(("http://", "https://", "ftp://")): + path = path_or_url.replace("file://", "", 1) + path = os.path.abspath(path) + return os.path.isfile(path) + try: + import requests + resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout) + return resp.status_code == 200 + except Exception: + return False + + @staticmethod + def _unify_schema(data): + """ + Unify schema for JSON data. + """ + for item in data: + if "content" in item and isinstance(item["content"], dict): + item["content"] = json.dumps(item["content"]) + return data diff --git a/graphgen/models/reader/jsonl_reader.py b/graphgen/models/reader/jsonl_reader.py new file mode 100644 index 00000000..c4979ad3 --- /dev/null +++ b/graphgen/models/reader/jsonl_reader.py @@ -0,0 +1,87 @@ +import json +import os +from typing import Any, Dict, Iterator, List, Union + +from graphgen.bases.base_reader import BaseReader +from graphgen.utils import logger + + +class JSONLReader(BaseReader): + """ + Reader for JSONL files. + Columns: + - type: The type of the document (e.g., "text", "image", etc.) + - if type is "text", "content" column must be present. 
+ """ + + def read(self, input_path: Union[str, List[str]]) -> List[Dict[str, Any]]: + docs = [] + # Handle both single file and list of files + file_paths = input_path if isinstance(input_path, list) else [input_path] + for file_path in file_paths: + with open(file_path, "r", encoding="utf-8") as f: + for line in f: + try: + doc = json.loads(line) + assert "type" in doc, f"Missing 'type' in document: {doc}" + if doc.get("type") == "text" and self.text_column not in doc: + raise ValueError( + f"Missing '{self.text_column}' in document: {doc}" + ) + docs.append(doc) + except json.JSONDecodeError as e: + logger.error("Error decoding JSON line: %s. Error: %s", line, e) + return self.filter(docs) + + def read_stream(self, file_path: str) -> Iterator[Dict[str, Any]]: + """ + Stream read JSONL files line by line without loading entire file into memory. + Returns an iterator that yields filtered documents. + + :param file_path: Path to the JSONL file. + :return: Iterator of dictionaries containing the data. + """ + with open(file_path, "r", encoding="utf-8") as f: + for line in f: + try: + doc = json.loads(line) + assert "type" in doc, f"Missing 'type' in document: {doc}" + if doc.get("type") == "text" and self.text_column not in doc: + raise ValueError( + f"Missing '{self.text_column}' in document: {doc}" + ) + + # Apply filtering logic inline (similar to BaseReader.filter) + if doc.get("type") == "text": + content = doc.get(self.text_column, "").strip() + if content: + yield doc + elif doc.get("type") in ("image", "table", "equation"): + img_path = doc.get("img_path") + if self._image_exists(img_path): + yield doc + else: + yield doc + except json.JSONDecodeError as e: + logger.error("Error decoding JSON line: %s. Error: %s", line, e) + + @staticmethod + def _image_exists(path_or_url: str, timeout: int = 3) -> bool: + """ + Check if an image exists at the given local path or URL. + :param path_or_url: Local file path or remote URL of the image. + :param timeout: Timeout for remote URL requests in seconds. + :return: True if the image exists, False otherwise. + """ + if not path_or_url: + return False + if not path_or_url.startswith(("http://", "https://", "ftp://")): + path = path_or_url.replace("file://", "", 1) + path = os.path.abspath(path) + return os.path.isfile(path) + try: + import requests + resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout) + return resp.status_code == 200 + except Exception: + return False diff --git a/graphgen/models/reader/parquet_reader.py b/graphgen/models/reader/parquet_reader.py new file mode 100644 index 00000000..dd289e31 --- /dev/null +++ b/graphgen/models/reader/parquet_reader.py @@ -0,0 +1,30 @@ +from typing import List, Union + +import ray +from ray.data import Dataset + +from graphgen.bases.base_reader import BaseReader + + +class ParquetReader(BaseReader): + """ + Read parquet files, requiring the schema to be restored to List[Dict[str, Any]]. + Columns: + - type: The type of the document (e.g., "text", "image", etc.) + - if type is "text", "content" column must be present. + """ + + def read(self, input_path: Union[str, List[str]]) -> Dataset: + """ + Read Parquet files using Ray Data. + + :param input_path: Path to Parquet file or list of Parquet files. + :return: Ray Dataset containing validated documents. 
+ """ + if not ray.is_initialized(): + ray.init() + + ds = ray.data.read_parquet(input_path) + ds = ds.map_batches(self._validate_batch, batch_format="pandas") + ds = ds.filter(self._should_keep_item) + return ds diff --git a/graphgen/models/reader/pdf_reader.py b/graphgen/models/reader/pdf_reader.py new file mode 100644 index 00000000..55dab30b --- /dev/null +++ b/graphgen/models/reader/pdf_reader.py @@ -0,0 +1,248 @@ +import json +import os +import subprocess +import tempfile +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +import ray +from ray.data import Dataset + +from graphgen.bases.base_reader import BaseReader +from graphgen.models.reader.txt_reader import TXTReader +from graphgen.utils import logger, pick_device + + +class PDFReader(BaseReader): + """ + PDF files are converted using MinerU, see [MinerU](https://github.com/opendatalab/MinerU). + After conversion, the resulting markdown file is parsed into text, images, tables, and formulas which can be used + for multi-modal graph generation. + """ + + def __init__( + self, + *, + output_dir: Optional[Union[str, Path]] = None, + method: str = "auto", # auto | txt | ocr + lang: Optional[str] = None, # ch / en / ja / ... + backend: Optional[ + str + ] = None, # pipeline | vlm-transformers | vlm-sglang-engine | vlm-sglang-client + device: Optional[str] = "auto", # cpu | cuda | cuda:0 | npu | mps | auto + source: Optional[str] = None, # huggingface | modelscope | local + vlm_url: Optional[str] = None, # 当 backend=vlm-sglang-client 时必填 + start_page: Optional[int] = None, # 0-based + end_page: Optional[int] = None, # 0-based, inclusive + formula: bool = True, + table: bool = True, + return_assets: bool = True, + **other_mineru_kwargs: Any, + ): + super().__init__() + self.output_dir = os.path.join(output_dir, "mineru") if output_dir else None + + if device == "auto": + device = pick_device() + + self._default_kwargs: Dict[str, Any] = { + "method": method, + "lang": lang, + "backend": backend, + "device": device, + "source": source, + "vlm_url": vlm_url, + "start_page": start_page, + "end_page": end_page, + "formula": formula, + "table": table, + **other_mineru_kwargs, + } + self._default_kwargs = { + k: v for k, v in self._default_kwargs.items() if v is not None + } + self.return_assets = return_assets + self.parser = MinerUParser() + self.txt_reader = TXTReader() + + def read( + self, + input_path: Union[str, List[str]], + **override, + ) -> Dataset: + + # Ensure input_path is a list + if isinstance(input_path, str): + input_path = [input_path] + + paths_ds = ray.data.from_items(input_path) + + def process_pdf(row: Dict[str, Any]) -> List[Dict[str, Any]]: + try: + pdf_path = row["item"] + kwargs = {**self._default_kwargs, **override} + return self._call_mineru(Path(pdf_path), kwargs) + except Exception as e: + logger.error("Failed to process %s: %s", row, e) + return [] + + docs_ds = paths_ds.flat_map(process_pdf) + docs_ds = docs_ds.filter(self._should_keep_item) + + return docs_ds + + def _call_mineru( + self, pdf_path: Path, kwargs: Dict[str, Any] + ) -> List[Dict[str, Any]]: + output_dir: Optional[str] = None + if self.output_dir: + output_dir = str(self.output_dir) + + return self.parser.parse_pdf(pdf_path, output_dir=output_dir, **kwargs) + + def _locate_md(self, pdf_path: Path, kwargs: Dict[str, Any]) -> Optional[Path]: + out_dir = ( + Path(self.output_dir) if self.output_dir else Path(tempfile.gettempdir()) + ) + method = kwargs.get("method", "auto") + backend = kwargs.get("backend", "") + if 
backend.startswith("vlm-"): + method = "vlm" + + candidate = Path( + os.path.join(out_dir, pdf_path.stem, method, f"{pdf_path.stem}.md") + ) + if candidate.exists(): + return candidate + candidate = Path(os.path.join(out_dir, f"{pdf_path.stem}.md")) + if candidate.exists(): + return candidate + return None + + +class MinerUParser: + def __init__(self) -> None: + self._check_bin() + + @staticmethod + def parse_pdf( + pdf_path: Union[str, Path], + output_dir: Optional[Union[str, Path]] = None, + method: str = "auto", + device: str = "cpu", + **kw: Any, + ) -> List[Dict[str, Any]]: + pdf = Path(pdf_path).expanduser().resolve() + if not pdf.is_file(): + raise FileNotFoundError(pdf) + + out = ( + Path(output_dir) if output_dir else Path(tempfile.mkdtemp(prefix="mineru_")) + ) + out.mkdir(parents=True, exist_ok=True) + + cached = MinerUParser._try_load_cached_result(str(out), pdf.stem, method) + if cached is not None: + return cached + + MinerUParser._run_mineru(pdf, out, method, device, **kw) + + cached = MinerUParser._try_load_cached_result(str(out), pdf.stem, method) + return cached if cached is not None else [] + + @staticmethod + def _try_load_cached_result( + out_dir: str, pdf_stem: str, method: str + ) -> Optional[List[Dict[str, Any]]]: + """ + try to load cached json result from MinerU output. + :param out_dir: + :param pdf_stem: + :param method: + :return: + """ + json_file = os.path.join( + out_dir, pdf_stem, method, f"{pdf_stem}_content_list.json" + ) + if not os.path.exists(json_file): + return None + + try: + with open(json_file, encoding="utf-8") as f: + data = json.load(f) + except Exception as exc: # pylint: disable=broad-except + logger.warning("Failed to load cached MinerU result: %s", exc) + return None + + base = os.path.dirname(json_file) + results = [] + for it in data: + for key in ("img_path", "table_img_path", "equation_img_path"): + rel_path = it.get(key) + if rel_path: + it[key] = str(Path(base).joinpath(rel_path).resolve()) + if it["type"] == "text": + it["content"] = it["text"] + del it["text"] + for key in ("page_idx", "bbox", "text_level"): + if it.get(key) is not None: + del it[key] + results.append(it) + return results + + @staticmethod + def _run_mineru( + pdf: Path, + out: Path, + method: str, + device: str, + **kw: Any, + ) -> None: + cmd = [ + "mineru", + "-p", + str(pdf), + "-o", + str(out), + "-m", + method, + "-d", + device, + ] + for k, v in kw.items(): + if v is None: + continue + if isinstance(v, bool): + cmd += [f"--{k}", str(v).lower()] + else: + cmd += [f"--{k}", str(v)] + + logger.info("Parsing PDF with MinerU: %s", pdf) + logger.debug("Running MinerU command: %s", " ".join(cmd)) + + proc = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding="utf-8", + errors="ignore", + check=False, + ) + if proc.returncode != 0: + raise RuntimeError(f"MinerU failed: {proc.stderr or proc.stdout}") + + @staticmethod + def _check_bin() -> None: + try: + subprocess.run( + ["mineru", "--version"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + check=True, + ) + except (subprocess.CalledProcessError, FileNotFoundError) as exc: + raise RuntimeError( + "MinerU is not installed or not found in PATH. 
Please install it from pip: \n" + "pip install -U 'mineru[core]'" + ) from exc diff --git a/graphgen/models/reader/pickle_reader.py b/graphgen/models/reader/pickle_reader.py new file mode 100644 index 00000000..6e3d1949 --- /dev/null +++ b/graphgen/models/reader/pickle_reader.py @@ -0,0 +1,78 @@ +import pickle +from typing import List, Union + +import pandas as pd +import ray +from ray.data import Dataset + +from graphgen.bases.base_reader import BaseReader +from graphgen.utils import logger + + +class PickleReader(BaseReader): + """ + Read pickle files, requiring the schema to be restored to List[Dict[str, Any]]. + Each pickle file should contain a list of dictionaries with at least: + - type: The type of the document (e.g., "text", "image", etc.) + - if type is "text", "content" column must be present. + + Note: Uses ray.data.read_binary_files as ray.data.read_pickle is not available. + For Ray >= 2.5, consider using read_pickle if available in your version. + """ + + def read( + self, + input_path: Union[str, List[str]], + ) -> Dataset: + """ + Read Pickle files using Ray Data. + + :param input_path: Path to pickle file or list of pickle files. + :return: Ray Dataset containing validated documents. + """ + if not ray.is_initialized(): + ray.init() + + # Use read_binary_files as a reliable alternative to read_pickle + ds = ray.data.read_binary_files(input_path, include_paths=True) + + # Deserialize pickle files and flatten into individual records + def deserialize_batch(batch: pd.DataFrame) -> pd.DataFrame: + all_records = [] + for _, row in batch.iterrows(): + try: + # Load pickle data from bytes + data = pickle.loads(row["bytes"]) + + # Validate structure + if not isinstance(data, list): + logger.error( + "Pickle file {row['path']} must contain a list, got {type(data)}" + ) + continue + + if not all(isinstance(item, dict) for item in data): + logger.error( + "Pickle file {row['path']} must contain a list of dictionaries" + ) + continue + + # Flatten: each dict in the list becomes a separate row + all_records.extend(data) + except Exception as e: + logger.error( + "Failed to deserialize pickle file %s: %s", row["path"], str(e) + ) + continue + + return pd.DataFrame(all_records) + + # Apply deserialization and flattening + ds = ds.map_batches(deserialize_batch, batch_format="pandas") + + # Validate the schema + ds = ds.map_batches(self._validate_batch, batch_format="pandas") + + # Filter valid items + ds = ds.filter(self._should_keep_item) + return ds diff --git a/graphgen/models/reader/rdf_reader.py b/graphgen/models/reader/rdf_reader.py new file mode 100644 index 00000000..9670107a --- /dev/null +++ b/graphgen/models/reader/rdf_reader.py @@ -0,0 +1,128 @@ +from pathlib import Path +from typing import Any, Dict, List, Union + +import ray +import rdflib +from ray.data import Dataset +from rdflib import Literal +from rdflib.util import guess_format + +from graphgen.bases.base_reader import BaseReader +from graphgen.utils import logger + + +class RDFReader(BaseReader): + """ + Reader for RDF files that extracts triples and represents them as dictionaries. + + Uses Ray Data for distributed processing of multiple RDF files. + """ + + def __init__(self, *, text_column: str = "content", **kwargs): + """ + Initialize RDFReader. + + :param text_column: The column name for text content (default: "content"). + """ + super().__init__(**kwargs) + self.text_column = text_column + + def read( + self, + input_path: Union[str, List[str]], + ) -> Dataset: + """ + Read RDF file(s) using Ray Data. 
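+        Each distinct RDF subject becomes one document: its literal objects are
+        joined into the text column and every predicate is preserved under
+        "properties".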
+ + :param input_path: Path to RDF file or list of RDF files. + :return: Ray Dataset containing extracted documents. + """ + if not ray.is_initialized(): + ray.init() + + # Ensure input_path is a list to prevent Ray from splitting string into characters + if isinstance(input_path, str): + input_path = [input_path] + + # Create dataset from file paths + paths_ds = ray.data.from_items(input_path) + + def process_rdf(row: Dict[str, Any]) -> List[Dict[str, Any]]: + """Process a single RDF file and return list of documents.""" + try: + file_path = row["item"] + return self._parse_rdf_file(Path(file_path)) + except Exception as e: + logger.error( + "Failed to process RDF file %s: %s", row.get("item", "unknown"), e + ) + return [] + + # Process files in parallel and flatten results + docs_ds = paths_ds.flat_map(process_rdf) + + # Filter valid documents + docs_ds = docs_ds.filter(self._should_keep_item) + + return docs_ds + + def _parse_rdf_file(self, file_path: Path) -> List[Dict[str, Any]]: + """ + Parse a single RDF file and extract documents. + + :param file_path: Path to RDF file. + :return: List of document dictionaries. + """ + if not file_path.is_file(): + raise FileNotFoundError(f"RDF file not found: {file_path}") + + g = rdflib.Graph() + fmt = guess_format(str(file_path)) + + try: + g.parse(str(file_path), format=fmt) + except Exception as e: + raise ValueError(f"Cannot parse RDF file {file_path}: {e}") from e + + docs: List[Dict[str, Any]] = [] + + # Process each unique subject in the RDF graph + for subj in set(g.subjects()): + literals = [] + props = {} + + # Extract all triples for this subject + for _, pred, obj in g.triples((subj, None, None)): + pred_str = str(pred) + obj_str = str(obj) + + # Collect literal values as text content + if isinstance(obj, Literal): + literals.append(obj_str) + + # Store all properties (including non-literals) + props.setdefault(pred_str, []).append(obj_str) + + # Join all literal values as the text content + text = " ".join(literals).strip() + if not text: + logger.warning( + "Subject %s in %s has no literal values; document will have empty '%s' field.", + subj, + file_path, + self.text_column, + ) + + # Create document dictionary + doc = { + "id": str(subj), + self.text_column: text, + "properties": props, + "source_file": str(file_path), + } + docs.append(doc) + + if not docs: + logger.warning("RDF file %s contains no valid documents.", file_path) + + return docs diff --git a/graphgen/models/reader/txt_reader.py b/graphgen/models/reader/txt_reader.py new file mode 100644 index 00000000..51a47de2 --- /dev/null +++ b/graphgen/models/reader/txt_reader.py @@ -0,0 +1,32 @@ +from typing import List, Union + +import ray +from ray.data import Dataset + +from graphgen.bases.base_reader import BaseReader + + +class TXTReader(BaseReader): + def read( + self, + input_path: Union[str, List[str]], + ) -> Dataset: + """ + Read text files from the specified input path. + :param input_path: Path to the input text file or list of text files. + :return: Ray Dataset containing the read text data. 
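+
+        Each input file becomes a single row of the form
+        {"type": "text", <text_column>: <UTF-8 decoded file contents>}, which is
+        then passed through the shared _should_keep_item filter.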
+ """ + docs_ds = ray.data.read_binary_files( + input_path, + include_paths=False, + ) + + docs_ds = docs_ds.map( + lambda row: { + "type": "text", + self.text_column: row["bytes"].decode("utf-8"), + } + ) + + docs_ds = docs_ds.filter(self._should_keep_item) + return docs_ds diff --git a/graphgen/models/strategy/__init__.py b/graphgen/models/searcher/__init__.py similarity index 100% rename from graphgen/models/strategy/__init__.py rename to graphgen/models/searcher/__init__.py diff --git a/graphgen/models/text/__init__.py b/graphgen/models/searcher/db/__init__.py similarity index 100% rename from graphgen/models/text/__init__.py rename to graphgen/models/searcher/db/__init__.py diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py new file mode 100644 index 00000000..efaacf4b --- /dev/null +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -0,0 +1,577 @@ +import asyncio +import os +import re +import subprocess +import tempfile +from concurrent.futures import ThreadPoolExecutor +from functools import lru_cache +from http.client import IncompleteRead +from typing import Dict, Optional + +from Bio import Entrez, SeqIO +from Bio.Blast import NCBIWWW, NCBIXML +from requests.exceptions import RequestException +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from graphgen.bases import BaseSearcher + + +@lru_cache(maxsize=None) +def _get_pool(): + return ThreadPoolExecutor(max_workers=20) # NOTE:can increase for better parallelism + + +# ensure only one NCBI request at a time +_blast_lock = asyncio.Lock() + + +class NCBISearch(BaseSearcher): + """ + NCBI Search client to search DNA/GenBank/Entrez databases. + 1) Get the gene/DNA by accession number or gene ID. + 2) Search with keywords or gene names (fuzzy search). + 3) Search with FASTA sequence (BLAST search for DNA sequences). + + API Documentation: https://www.ncbi.nlm.nih.gov/home/develop/api/ + Note: NCBI has rate limits (max 3 requests per second), delays are required between requests. + """ + + def __init__( + self, + use_local_blast: bool = False, + local_blast_db: str = "nt_db", + email: str = "email@example.com", + api_key: str = "", + tool: str = "GraphGen", + blast_num_threads: int = 4, + working_dir: str = "cache", + ): + """ + Initialize the NCBI Search client. + + Args: + use_local_blast (bool): Whether to use local BLAST database. + local_blast_db (str): Path to the local BLAST database. + email (str): Email address for NCBI API requests. + api_key (str): API key for NCBI API requests, see https://account.ncbi.nlm.nih.gov/settings/. + tool (str): Tool name for NCBI API requests. + blast_num_threads (int): Number of threads for BLAST search. + working_dir (str): Working directory for log files. + """ + super().__init__(working_dir=working_dir) + Entrez.timeout = 60 # 60 seconds timeout + Entrez.email = email + Entrez.tool = tool + if api_key: + Entrez.api_key = api_key + Entrez.max_tries = 10 if api_key else 3 + Entrez.sleep_between_tries = 5 + self.use_local_blast = use_local_blast + self.local_blast_db = local_blast_db + self.blast_num_threads = blast_num_threads + if self.use_local_blast: + # Check for single-file database (.nhr) or multi-file database (.00.nhr) + db_exists = ( + os.path.isfile(f"{self.local_blast_db}.nhr") or + os.path.isfile(f"{self.local_blast_db}.00.nhr") + ) + if not db_exists: + self.logger.error("Local BLAST database files not found. 
Please check the path.") + self.logger.error("Expected: %s.nhr or %s.00.nhr", self.local_blast_db, self.local_blast_db) + self.use_local_blast = False + + @staticmethod + def _nested_get(data: dict, *keys, default=None): + """Safely traverse nested dictionaries.""" + for key in keys: + if not isinstance(data, dict): + return default + data = data.get(key, default) + return data + + @staticmethod + def _infer_molecule_type_detail(accession: Optional[str], gene_type: Optional[int] = None) -> Optional[str]: + """Infer molecule_type_detail from accession prefix or gene type.""" + if accession: + # Map accession prefixes to molecule types + prefix_map = { + ("NM_", "XM_"): "mRNA", + ("NC_", "NT_"): "genomic DNA", + ("NR_", "XR_"): "RNA", + ("NG_",): "genomic region", + } + for prefixes, mol_type in prefix_map.items(): + if accession.startswith(prefixes): + return mol_type + # Fallback: infer from gene type if available + if gene_type is not None: + gene_type_map = { + 3: "rRNA", + 4: "tRNA", + 5: "snRNA", + 6: "ncRNA", + } + return gene_type_map.get(gene_type) + return None + + def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: + """ + Convert an Entrez gene record to a dictionary. + All extraction logic is inlined for maximum clarity and performance. + """ + if not gene_record: + raise ValueError("Empty gene record") + + data = gene_record[0] + locus = (data.get("Entrezgene_locus") or [{}])[0] + + # Extract common nested paths once + gene_ref = self._nested_get(data, "Entrezgene_gene", "Gene-ref", default={}) + biosource = self._nested_get(data, "Entrezgene_source", "BioSource", default={}) + + # Process synonyms + synonyms_raw = gene_ref.get("Gene-ref_syn", []) + gene_synonyms = [] + if isinstance(synonyms_raw, list): + for syn in synonyms_raw: + gene_synonyms.append(syn.get("Gene-ref_syn_E") if isinstance(syn, dict) else str(syn)) + elif synonyms_raw: + gene_synonyms.append(str(synonyms_raw)) + + # Extract location info + label = locus.get("Gene-commentary_label", "") + chromosome_match = re.search(r"Chromosome\s+(\S+)", str(label)) if label else None + + seq_interval = self._nested_get( + locus, "Gene-commentary_seqs", 0, "Seq-loc_int", "Seq-interval", default={} + ) + genomic_location = ( + f"{seq_interval.get('Seq-interval_from')}-{seq_interval.get('Seq-interval_to')}" + if seq_interval.get('Seq-interval_from') and seq_interval.get('Seq-interval_to') + else None + ) + + # Extract representative accession (prefer type 3 = mRNA/transcript) + representative_accession = next( + ( + product.get("Gene-commentary_accession") + for product in locus.get("Gene-commentary_products", []) + if product.get("Gene-commentary_type") == "3" + ), + None, + ) + # Fallback: if no type 3 accession, try any available accession + if not representative_accession: + representative_accession = next( + ( + product.get("Gene-commentary_accession") + for product in locus.get("Gene-commentary_products", []) + if product.get("Gene-commentary_accession") + ), + None, + ) + + # Extract function + function = data.get("Entrezgene_summary") or next( + ( + comment.get("Gene-commentary_comment") + for comment in data.get("Entrezgene_comments", []) + if isinstance(comment, dict) + and "function" in str(comment.get("Gene-commentary_heading", "")).lower() + ), + None, + ) + + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": gene_id, + "gene_name": gene_ref.get("Gene-ref_locus", "N/A"), + "gene_description": gene_ref.get("Gene-ref_desc", "N/A"), + "organism": self._nested_get( + biosource, 
"BioSource_org", "Org-ref", "Org-ref_taxname", default="N/A" + ), + "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}", + "gene_synonyms": gene_synonyms or None, + "gene_type": { + "1": "protein-coding", + "2": "pseudo", + "3": "rRNA", + "4": "tRNA", + "5": "snRNA", + "6": "ncRNA", + "7": "other", + }.get(str(data.get("Entrezgene_type")), f"type_{data.get('Entrezgene_type')}"), + "chromosome": chromosome_match.group(1) if chromosome_match else None, + "genomic_location": genomic_location, + "function": function, + # Fields from accession-based queries + "title": None, + "sequence": None, + "sequence_length": None, + "gene_id": gene_id, + "molecule_type_detail": self._infer_molecule_type_detail( + representative_accession, data.get("Entrezgene_type") + ), + "_representative_accession": representative_accession, + } + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) + def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]: + """Get gene information by Gene ID.""" + def _extract_metadata_from_genbank(result: dict, accession: str): + """Extract metadata from GenBank format (title, features, organism, etc.).""" + with Entrez.efetch(db="nuccore", id=accession, rettype="gb", retmode="text") as handle: + record = SeqIO.read(handle, "genbank") + + result["title"] = record.description + result["molecule_type_detail"] = self._infer_molecule_type_detail(accession) or "N/A" + + for feature in record.features: + if feature.type == "source": + if 'chromosome' in feature.qualifiers: + result["chromosome"] = feature.qualifiers['chromosome'][0] + + if feature.location: + start = int(feature.location.start) + 1 + end = int(feature.location.end) + result["genomic_location"] = f"{start}-{end}" + + break + + if not result.get("organism") and 'organism' in record.annotations: + result["organism"] = record.annotations['organism'] + + return result + + def _extract_sequence_from_fasta(result: dict, accession: str): + """Extract sequence from FASTA format (more reliable than GenBank for CON-type records).""" + try: + with Entrez.efetch(db="nuccore", id=accession, rettype="fasta", retmode="text") as fasta_handle: + fasta_record = SeqIO.read(fasta_handle, "fasta") + result["sequence"] = str(fasta_record.seq) + result["sequence_length"] = len(fasta_record.seq) + except Exception as fasta_exc: + self.logger.warning( + "Failed to extract sequence from accession %s using FASTA format: %s", + accession, fasta_exc + ) + result["sequence"] = None + result["sequence_length"] = None + return result + + def _extract_sequence(result: dict, accession: str): + """ + Extract sequence using the appropriate method based on configuration. + If use_local_blast=True, use local database. Otherwise, use NCBI API. + Always fetches sequence (no option to skip). + """ + # If using local BLAST, use local database + if self.use_local_blast: + sequence = self._extract_sequence_from_local_db(accession) + + if sequence: + result["sequence"] = sequence + result["sequence_length"] = len(sequence) + else: + # Failed to extract from local DB, set to None (no fallback to API) + result["sequence"] = None + result["sequence_length"] = None + self.logger.warning( + "Failed to extract sequence from local DB for accession %s. 
" + "Not falling back to NCBI API as use_local_blast=True.", + accession + ) + else: + # Use NCBI API to fetch sequence + result = _extract_sequence_from_fasta(result, accession) + + return result + + try: + with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle: + gene_record = Entrez.read(handle) + + if not gene_record: + return None + + result = self._gene_record_to_dict(gene_record, gene_id) + + if accession := (preferred_accession or result.get("_representative_accession")): + result = _extract_metadata_from_genbank(result, accession) + # Extract sequence using appropriate method + result = _extract_sequence(result, accession) + + result.pop("_representative_accession", None) + return result + except (RequestException, IncompleteRead): + raise + except Exception as exc: + self.logger.error("Gene ID %s not found: %s", gene_id, exc) + return None + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) + def get_by_accession(self, accession: str) -> Optional[dict]: + """Get sequence information by accession number.""" + def _extract_gene_id(link_handle): + """Extract GeneID from elink results.""" + links = Entrez.read(link_handle) + if not links or "LinkSetDb" not in links[0]: + return None + + for link_set in links[0]["LinkSetDb"]: + if link_set.get("DbTo") != "gene": + continue + + link = (link_set.get("Link") or link_set.get("IdList", [{}]))[0] + return str(link.get("Id") if isinstance(link, dict) else link) + + try: + # TODO: support accession number with version number (e.g., NM_000546.3) + with Entrez.elink(dbfrom="nuccore", db="gene", id=accession) as link_handle: + gene_id = _extract_gene_id(link_handle) + + if not gene_id: + self.logger.warning("Accession %s has no associated GeneID", accession) + return None + + result = self.get_by_gene_id(gene_id, preferred_accession=accession) + + if result: + result["id"] = accession + result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}" + + return result + except (RequestException, IncompleteRead): + raise + except Exception as exc: + self.logger.error("Accession %s not found: %s", accession, exc) + return None + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) + def get_best_hit(self, keyword: str) -> Optional[dict]: + """Search NCBI Gene database with a keyword and return the best hit.""" + if not keyword.strip(): + return None + + try: + for search_term in [f"{keyword}[Gene] OR {keyword}[All Fields]", keyword]: + with Entrez.esearch(db="gene", term=search_term, retmax=1, sort="relevance") as search_handle: + search_results = Entrez.read(search_handle) + + if len(gene_id := search_results.get("IdList", [])) > 0: + result = self.get_by_gene_id(gene_id[0]) + return result + except (RequestException, IncompleteRead): + raise + except Exception as e: + self.logger.error("Keyword %s not found: %s", keyword, e) + return None + + def _extract_sequence_from_local_db(self, accession: str) -> Optional[str]: + """Extract sequence from local BLAST database using blastdbcmd.""" + try: + cmd = [ + "blastdbcmd", + "-db", self.local_blast_db, + "-entry", accession, + "-outfmt", "%s" # Only sequence, no header + ] + sequence = subprocess.check_output( + cmd, + text=True, + timeout=10, # 10 second timeout for local extraction + stderr=subprocess.DEVNULL + ).strip() + 
return sequence if sequence else None + except subprocess.TimeoutExpired: + self.logger.warning("Timeout extracting sequence from local DB for accession %s", accession) + return None + except Exception as exc: + self.logger.warning("Failed to extract sequence from local DB for accession %s: %s", accession, exc) + return None + + def _local_blast(self, seq: str, threshold: float) -> Optional[str]: + """ + Perform local BLAST search using local BLAST database. + Optimized with multi-threading and faster output format. + """ + try: + with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp: + tmp.write(f">query\n{seq}\n") + tmp_name = tmp.name + + # Optimized BLAST command with: + # - num_threads: Use multiple threads for faster search + # - outfmt 6 sacc: Only return accession (minimal output) + # - max_target_seqs 1: Only need the best hit + # - evalue: Threshold for significance + cmd = [ + "blastn", "-db", self.local_blast_db, "-query", tmp_name, + "-evalue", str(threshold), + "-max_target_seqs", "1", + "-num_threads", str(self.blast_num_threads), + "-outfmt", "6 sacc" # Only accession, tab-separated + ] + self.logger.debug("Running local blastn (threads=%d): %s", + self.blast_num_threads, " ".join(cmd)) + + # Run BLAST with timeout to avoid hanging + try: + out = subprocess.check_output( + cmd, + text=True, + timeout=300, # 5 minute timeout for BLAST search + stderr=subprocess.DEVNULL # Suppress BLAST warnings to reduce I/O + ).strip() + except subprocess.TimeoutExpired: + self.logger.warning("BLAST search timed out after 5 minutes for sequence") + os.remove(tmp_name) + return None + + os.remove(tmp_name) + return out.split("\n", maxsplit=1)[0] if out else None + except Exception as exc: + self.logger.error("Local blastn failed: %s", exc) + # Clean up temp file if it still exists + try: + if 'tmp_name' in locals(): + os.remove(tmp_name) + except Exception: + pass + return None + + def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: + """Search NCBI with a DNA sequence using BLAST.""" + + def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: + """Extract and normalize DNA sequence from input.""" + if sequence.startswith(">"): + seq = "".join(sequence.strip().split("\n")[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + return seq if re.fullmatch(r"[ATCGN]+", seq, re.I) else None + + + def _process_network_blast_result(blast_record, seq: str, threshold: float) -> Optional[dict]: + """Process network BLAST result and return dictionary or None.""" + if not blast_record.alignments: + self.logger.info("No BLAST hits found for the given sequence.") + return None + + best_alignment = blast_record.alignments[0] + best_hsp = best_alignment.hsps[0] + if best_hsp.expect > threshold: + self.logger.info("No BLAST hits below the threshold E-value.") + return None + + hit_id = best_alignment.hit_id + if accession_match := re.search(r"ref\|([^|]+)", hit_id): + return self.get_by_accession(accession_match.group(1).split(".")[0]) + + # If unable to extract accession, return basic information + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": hit_id, + "title": best_alignment.title, + "sequence_length": len(seq), + "e_value": best_hsp.expect, + "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0, + "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}", + } + + try: + if not (seq := _extract_and_normalize_sequence(sequence)): + self.logger.error("Empty or 
invalid DNA sequence provided.") + return None + + # Try local BLAST first if enabled + if self.use_local_blast: + accession = self._local_blast(seq, threshold) + + if accession: + self.logger.debug("Local BLAST found accession: %s", accession) + # When using local BLAST, skip sequence fetching by default (faster, fewer API calls) + # Sequence is already known from the query, so we only need metadata + result = self.get_by_accession(accession) + return result + + self.logger.info( + "Local BLAST found no match for sequence. " + "API fallback disabled when using local database." + ) + return None + + # Fall back to network BLAST only if local BLAST is not enabled + self.logger.debug("Falling back to NCBIWWW.qblast") + with NCBIWWW.qblast("blastn", "nr", seq, hitlist_size=1, expect=threshold) as result_handle: + result = _process_network_blast_result(NCBIXML.read(result_handle), seq, threshold) + return result + except (RequestException, IncompleteRead): + raise + except Exception as e: + self.logger.error("BLAST search failed: %s", e) + return None + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) + async def search(self, query: str, threshold: float = 0.01, **kwargs) -> Optional[Dict]: + """Search NCBI with either a gene ID, accession number, keyword, or DNA sequence.""" + if not query or not isinstance(query, str): + self.logger.error("Empty or non-string input.") + return None + + query = query.strip() + self.logger.debug("NCBI search query: %s", query) + + loop = asyncio.get_running_loop() + + # Auto-detect query type and execute in thread pool + # All methods need lock because they all call NCBI API (rate limit: max 3 requests per second) + # Even if get_by_fasta uses local BLAST, it still calls get_by_accession which needs API + async def _execute_with_lock(func, *args): + """Execute function with lock for NCBI API calls.""" + async with _blast_lock: + return await loop.run_in_executor(_get_pool(), func, *args) + + if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): + # FASTA sequence: always use lock (even with local BLAST, get_by_accession needs API) + result = await _execute_with_lock(self.get_by_fasta, query, threshold) + elif re.fullmatch(r"^\d+$", query): + # Gene ID: always use lock (network API call) + result = await _execute_with_lock(self.get_by_gene_id, query) + elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I): + # Accession: always use lock (network API call) + result = await _execute_with_lock(self.get_by_accession, query) + else: + # Keyword: always use lock (network API call) + result = await _execute_with_lock(self.get_best_hit, query) + + if result: + result["_search_query"] = query + return result diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py new file mode 100644 index 00000000..25dbc73c --- /dev/null +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -0,0 +1,395 @@ +import asyncio +import os +import re +import subprocess +from concurrent.futures import ThreadPoolExecutor +from functools import lru_cache +import tempfile +from typing import Dict, Optional, List, Any, Set + +import hashlib +import requests +import aiohttp +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from graphgen.bases import BaseSearcher + + +@lru_cache(maxsize=None) +def _get_pool(): + return 
ThreadPoolExecutor(max_workers=20) # NOTE:can increase for better parallelism + +class RNACentralSearch(BaseSearcher): + """ + RNAcentral Search client to search RNA databases. + 1) Get RNA by RNAcentral ID. + 2) Search with keywords or RNA names (fuzzy search). + 3) Search with RNA sequence. + + API Documentation: https://rnacentral.org/api/v1 + """ + + def __init__( + self, + use_local_blast: bool = False, + local_blast_db: str = "rna_db", + api_timeout: int = 30, + blast_num_threads: int = 4, + working_dir: str = "cache", + ): + super().__init__(working_dir=working_dir) + self.base_url = "https://rnacentral.org/api/v1" + self.headers = {"Accept": "application/json"} + self.use_local_blast = use_local_blast + self.local_blast_db = local_blast_db + self.api_timeout = api_timeout + self.blast_num_threads = blast_num_threads # Number of threads for BLAST search + + if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"): + self.logger.error("Local BLAST database files not found. Please check the path.") + self.use_local_blast = False + + @staticmethod + def _rna_data_to_dict( + rna_id: str, + rna_data: Dict[str, Any], + xrefs_data: Optional[List[Dict[str, Any]]] = None + ) -> Dict[str, Any]: + organisms, gene_names, so_terms = set(), set(), set() + modifications: List[Any] = [] + + for xref in xrefs_data or []: + acc = xref.get("accession", {}) + if s := acc.get("species"): + organisms.add(s) + gene_value = acc.get("gene") + if isinstance(gene_value, str) and (g := gene_value.strip()): + gene_names.add(g) + if m := xref.get("modifications"): + modifications.extend(m) + if b := acc.get("biotype"): + so_terms.add(b) + + def format_unique_values(values: Set[str]) -> Optional[str]: + if not values: + return None + if len(values) == 1: + return next(iter(values)) + return ", ".join(sorted(values)) + + xrefs_info = { + "organism": format_unique_values(organisms), + "gene_name": format_unique_values(gene_names), + "related_genes": list(gene_names) if gene_names else None, + "modifications": modifications or None, + "so_term": format_unique_values(so_terms), + } + + fallback_rules = { + "organism": ["organism", "species"], + "related_genes": ["related_genes", "genes"], + "gene_name": ["gene_name", "gene"], + "so_term": ["so_term"], + "modifications": ["modifications"], + } + + def resolve_field(field_name: str) -> Any: + if (value := xrefs_info.get(field_name)) is not None: + return value + + for key in fallback_rules[field_name]: + if (value := rna_data.get(key)) is not None: + return value + + return None + + organism = resolve_field("organism") + gene_name = resolve_field("gene_name") + so_term = resolve_field("so_term") + modifications = resolve_field("modifications") + + related_genes = resolve_field("related_genes") + if not related_genes and (single_gene := rna_data.get("gene_name")): + related_genes = [single_gene] + + sequence = rna_data.get("sequence", "") + + return { + "molecule_type": "RNA", + "database": "RNAcentral", + "id": rna_id, + "rnacentral_id": rna_data.get("rnacentral_id", rna_id), + "sequence": sequence, + "sequence_length": rna_data.get("length", len(sequence)), + "rna_type": rna_data.get("rna_type", "N/A"), + "description": rna_data.get("description", "N/A"), + "url": f"https://rnacentral.org/rna/{rna_id}", + "organism": organism, + "related_genes": related_genes or None, + "gene_name": gene_name, + "so_term": so_term, + "modifications": modifications, + } + + @staticmethod + def _calculate_md5(sequence: str) -> str: + """ + Calculate MD5 hash for RNA 
sequence as per RNAcentral spec. + - Replace U with T + - Convert to uppercase + - Encode as ASCII + """ + # Normalize sequence + normalized_seq = sequence.replace("U", "T").replace("u", "t").upper() + if not re.fullmatch(r"[ATCGN]+", normalized_seq): + raise ValueError(f"Invalid sequence characters after normalization: {normalized_seq[:50]}...") + + return hashlib.md5(normalized_seq.encode("ascii")).hexdigest() + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=10), + retry=retry_if_exception_type((requests.Timeout, requests.RequestException)), + reraise=False, + ) + def get_by_rna_id(self, rna_id: str) -> Optional[dict]: + """ + Get RNA information by RNAcentral ID. + :param rna_id: RNAcentral ID (e.g., URS0000000001). + :return: A dictionary containing RNA information or None if not found. + """ + try: + url = f"{self.base_url}/rna/{rna_id}" + url += "?flat=true" + + resp = requests.get(url, headers=self.headers, timeout=self.api_timeout) + resp.raise_for_status() + + rna_data = resp.json() + xrefs_data = rna_data.get("xrefs", []) + result = self._rna_data_to_dict(rna_id, rna_data, xrefs_data) + return result + except requests.Timeout as e: + self.logger.warning("Timeout getting RNA ID %s (timeout=%ds): %s", rna_id, self.api_timeout, e) + return None + except requests.RequestException as e: + self.logger.error("Network error getting RNA ID %s: %s", rna_id, e) + return None + except Exception as e: # pylint: disable=broad-except + self.logger.error("Unexpected error getting RNA ID %s: %s", rna_id, e) + return None + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=10), + retry=retry_if_exception_type((requests.Timeout, requests.RequestException)), + reraise=False, + ) + def get_best_hit(self, keyword: str) -> Optional[dict]: + """ + Search RNAcentral with a keyword and return the best hit. + :param keyword: The search keyword (e.g., miRNA name, RNA name). + :return: Dictionary with RNA information or None. + """ + keyword = keyword.strip() + if not keyword: + self.logger.warning("Empty keyword provided to get_best_hit") + return None + + try: + url = f"{self.base_url}/rna" + params = {"search": keyword, "format": "json"} + resp = requests.get(url, params=params, headers=self.headers, timeout=self.api_timeout) + resp.raise_for_status() + + data = resp.json() + results = data.get("results", []) + + if not results: + self.logger.info("No search results for keyword: %s", keyword) + return None + + first_result = results[0] + rna_id = first_result.get("rnacentral_id") + + if rna_id: + detailed = self.get_by_rna_id(rna_id) + if detailed: + return detailed + self.logger.debug("Using search result data for %s", rna_id or "unknown") + return self._rna_data_to_dict(rna_id or "", first_result) + + except requests.RequestException as e: + self.logger.error("Network error searching keyword '%s': %s", keyword, e) + return None + except Exception as e: + self.logger.error("Unexpected error searching keyword '%s': %s", keyword, e) + return None + + def _local_blast(self, seq: str, threshold: float) -> Optional[str]: + """ + Perform local BLAST search using local BLAST database. + Optimized with multi-threading and faster output format. 
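+        :param seq: Raw nucleotide sequence; it is written to a temporary FASTA file
+                    and used as the blastn query.
+        :param threshold: E-value threshold passed to blastn via -evalue.
+        :return: Accession of the best hit, or None if there is no hit or BLAST fails.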
+ """ + try: + # Use temporary file for query sequence + with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp: + tmp.write(f">query\n{seq}\n") + tmp_name = tmp.name + + # Optimized BLAST command with: + # - num_threads: Use multiple threads for faster search + # - outfmt 6 sacc: Only return accession (minimal output) + # - max_target_seqs 1: Only need the best hit + # - evalue: Threshold for significance + cmd = [ + "blastn", "-db", self.local_blast_db, "-query", tmp_name, + "-evalue", str(threshold), + "-max_target_seqs", "1", + "-num_threads", str(self.blast_num_threads), + "-outfmt", "6 sacc" # Only accession, tab-separated + ] + self.logger.debug("Running local blastn for RNA (threads=%d): %s", + self.blast_num_threads, " ".join(cmd)) + + # Run BLAST with timeout to avoid hanging + try: + out = subprocess.check_output( + cmd, + text=True, + timeout=300, # 5 minute timeout for BLAST search + stderr=subprocess.DEVNULL # Suppress BLAST warnings to reduce I/O + ).strip() + except subprocess.TimeoutExpired: + self.logger.warning("BLAST search timed out after 5 minutes for sequence") + os.remove(tmp_name) + return None + + os.remove(tmp_name) + return out.split("\n", maxsplit=1)[0] if out else None + except Exception as exc: + self.logger.error("Local blastn failed: %s", exc) + # Clean up temp file if it still exists + try: + if 'tmp_name' in locals(): + os.remove(tmp_name) + except Exception: + pass + return None + + def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: + """ + Search RNAcentral with an RNA sequence. + Tries local BLAST first if enabled, falls back to RNAcentral API. + Unified approach: Find RNA ID from sequence search, then call get_by_rna_id() for complete information. + :param sequence: RNA sequence (FASTA format or raw sequence). + :param threshold: E-value threshold for BLAST search. + :return: A dictionary containing complete RNA information or None if not found. + """ + def _extract_sequence(sequence: str) -> Optional[str]: + """Extract and normalize RNA sequence from input.""" + if sequence.startswith(">"): + seq_lines = sequence.strip().split("\n") + seq = "".join(seq_lines[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + # Accept both U (original RNA) and T + return seq if seq and re.fullmatch(r"[AUCGTN\s]+", seq, re.I) else None + + try: + seq = _extract_sequence(sequence) + if not seq: + self.logger.error("Empty or invalid RNA sequence provided.") + return None + + # Try local BLAST first if enabled + if self.use_local_blast: + accession = self._local_blast(seq, threshold) + if accession: + self.logger.debug("Local BLAST found accession: %s", accession) + detailed = self.get_by_rna_id(accession) + if detailed: + return detailed + self.logger.info( + "Local BLAST found accession %s but could not retrieve metadata from API.", + accession + ) + return None + self.logger.info( + "Local BLAST found no match for sequence. " + "API fallback disabled when using local database." 
+ ) + return None + + # Fall back to RNAcentral API only if local BLAST is not enabled + self.logger.debug("Falling back to RNAcentral API.") + + md5_hash = self._calculate_md5(seq) + search_url = f"{self.base_url}/rna" + params = {"md5": md5_hash, "format": "json"} + + resp = requests.get(search_url, params=params, headers=self.headers, timeout=60) + resp.raise_for_status() + + search_results = resp.json() + results = search_results.get("results", []) + + if not results: + self.logger.info("No exact match found in RNAcentral for sequence") + return None + + rna_id = results[0].get("rnacentral_id") + if rna_id: + detailed = self.get_by_rna_id(rna_id) + if detailed: + return detailed + # Fallback: use search result data if get_by_rna_id returns None + self.logger.debug("Using search result data for %s (get_by_rna_id returned None)", rna_id) + return self._rna_data_to_dict(rna_id, results[0]) + + self.logger.error("No RNAcentral ID found in search results.") + return None + except Exception as e: + self.logger.error("Sequence search failed: %s", e) + return None + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=10), + retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError)), + reraise=True, + ) + async def search(self, query: str, threshold: float = 0.1, **kwargs) -> Optional[Dict]: + """Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence.""" + if not query or not isinstance(query, str): + self.logger.error("Empty or non-string input.") + return None + + query = query.strip() + self.logger.debug("RNAcentral search query: %s", query) + + loop = asyncio.get_running_loop() + + # check if RNA sequence (AUCG or ATCG characters, contains U or T) + # Note: Sequences with T are also RNA sequences + is_rna_sequence = query.startswith(">") or ( + re.fullmatch(r"[AUCGTN\s]+", query, re.I) and + ("U" in query.upper() or "T" in query.upper()) + ) + if is_rna_sequence: + result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold) + # check if RNAcentral ID (typically starts with URS) + elif re.fullmatch(r"URS\d+", query, re.I): + result = await loop.run_in_executor(_get_pool(), self.get_by_rna_id, query) + else: + # otherwise treat as keyword + result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) + + if result: + result["_search_query"] = query + return result diff --git a/graphgen/models/searcher/db/uniprot_searcher.py b/graphgen/models/searcher/db/uniprot_searcher.py new file mode 100644 index 00000000..32115e34 --- /dev/null +++ b/graphgen/models/searcher/db/uniprot_searcher.py @@ -0,0 +1,298 @@ +import asyncio +import os +import re +import subprocess +import tempfile +from concurrent.futures import ThreadPoolExecutor +from functools import lru_cache +from io import StringIO +from typing import Dict, Optional + +from Bio import ExPASy, SeqIO, SwissProt, UniProt +from Bio.Blast import NCBIWWW, NCBIXML +from requests.exceptions import RequestException +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from graphgen.bases import BaseSearcher + + +@lru_cache(maxsize=None) +def _get_pool(): + return ThreadPoolExecutor(max_workers=20) # NOTE:can increase for better parallelism + + +# ensure only one BLAST searcher at a time +_blast_lock = asyncio.Lock() + + +class UniProtSearch(BaseSearcher): + """ + UniProt Search client to searcher with UniProt. + 1) Get the protein by accession number. 
+ 2) Search with keywords or protein names (fuzzy searcher). + 3) Search with FASTA sequence (BLAST searcher). Note that NCBIWWW does not support async. + """ + + def __init__( + self, + use_local_blast: bool = False, + local_blast_db: str = "sp_db", + blast_num_threads: int = 4, + working_dir: str = "cache", + ): + super().__init__(working_dir=working_dir) + self.use_local_blast = use_local_blast + self.local_blast_db = local_blast_db + self.blast_num_threads = blast_num_threads # Number of threads for BLAST search + + if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.phr"): + self.logger.error("Local BLAST database files not found. Please check the path.") + self.use_local_blast = False + + def get_by_accession(self, accession: str) -> Optional[dict]: + try: + handle = ExPASy.get_sprot_raw(accession) + record = SwissProt.read(handle) + handle.close() + return self._swissprot_to_dict(record) + except RequestException: # network-related errors + raise + except Exception as exc: # pylint: disable=broad-except + self.logger.error("Accession %s not found: %s", accession, exc) + return None + + @staticmethod + def _swissprot_to_dict(record: SwissProt.Record) -> dict: + """error + Convert a SwissProt.Record to a dictionary. + """ + functions = [] + for line in record.comments: + if line.startswith("FUNCTION:"): + functions.append(line[9:].strip()) + + return { + "molecule_type": "protein", + "database": "UniProt", + "id": record.accessions[0], + "entry_name": record.entry_name, + "gene_names": record.gene_name, + "protein_name": record.description.split(";")[0].split("=")[-1], + "organism": record.organism.split(" (")[0], + "sequence": str(record.sequence), + "function": functions, + "url": f"https://www.uniprot.org/uniprot/{record.accessions[0]}", + } + + def get_best_hit(self, keyword: str) -> Optional[Dict]: + """ + Search UniProt with a keyword and return the best hit. + :param keyword: The searcher keyword. + :return: A dictionary containing the best hit information or None if not found. + """ + if not keyword.strip(): + return None + + try: + iterator = UniProt.search(keyword, fields=None, batch_size=1) + hit = next(iterator, None) + if hit is None: + return None + return self.get_by_accession(hit["primaryAccession"]) + + except RequestException: + raise + except Exception as e: # pylint: disable=broad-except + self.logger.error("Keyword %s not found: %s", keyword, e) + return None + + def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]: + """ + Search UniProt with a FASTA sequence and return the best hit. + :param fasta_sequence: The FASTA sequence. + :param threshold: E-value threshold for BLAST searcher. + :return: A dictionary containing the best hit information or None if not found. + """ + try: + if fasta_sequence.startswith(">"): + seq = str(list(SeqIO.parse(StringIO(fasta_sequence), "fasta"))[0].seq) + else: + seq = fasta_sequence.strip() + except Exception as e: # pylint: disable=broad-except + self.logger.error("Invalid FASTA sequence: %s", e) + return None + + if not seq: + self.logger.error("Empty FASTA sequence provided.") + return None + + if self.use_local_blast: + accession = self._local_blast(seq, threshold) + if accession: + self.logger.debug("Local BLAST found accession: %s", accession) + return self.get_by_accession(accession) + self.logger.info( + "Local BLAST found no match for sequence. " + "API fallback disabled when using local database." 
+ ) + return None + + # Fall back to network BLAST only if local BLAST is not enabled + self.logger.debug("Falling back to NCBIWWW.qblast.") + + # UniProtKB/Swiss-Prot BLAST API + try: + self.logger.debug( + "Performing BLAST searcher for the given sequence: %s", seq + ) + result_handle = NCBIWWW.qblast( + program="blastp", + database="swissprot", + sequence=seq, + hitlist_size=1, + expect=threshold, + ) + blast_record = NCBIXML.read(result_handle) + except RequestException: + raise + except Exception as e: # pylint: disable=broad-except + self.logger.error("BLAST searcher failed: %s", e) + return None + + if not blast_record.alignments: + self.logger.info("No BLAST hits found for the given sequence.") + return None + + best_alignment = blast_record.alignments[0] + best_hsp = best_alignment.hsps[0] + if best_hsp.expect > threshold: + self.logger.info("No BLAST hits below the threshold E-value.") + return None + + # like sp|P01308.1|INS_HUMAN + hit_id = best_alignment.hit_id + accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id + return self.get_by_accession(accession) + + def _local_blast(self, seq: str, threshold: float) -> Optional[str]: + """ + Perform local BLAST search using local BLAST database. + Optimized with multi-threading and faster output format. + :param seq: The protein sequence. + :param threshold: E-value threshold for BLAST searcher. + :return: The accession number of the best hit or None if not found. + """ + try: + with tempfile.NamedTemporaryFile( + mode="w+", suffix=".fa", delete=False + ) as tmp: + tmp.write(f">query\n{seq}\n") + tmp_name = tmp.name + + # Optimized BLAST command with: + # - num_threads: Use multiple threads for faster search + # - outfmt 6 sacc: Only return accession (minimal output) + # - max_target_seqs 1: Only need the best hit + # - evalue: Threshold for significance + cmd = [ + "blastp", + "-db", + self.local_blast_db, + "-query", + tmp_name, + "-evalue", + str(threshold), + "-max_target_seqs", + "1", + "-num_threads", + str(self.blast_num_threads), + "-outfmt", + "6 sacc", # Only accession, tab-separated + ] + self.logger.debug("Running local blastp (threads=%d): %s", + self.blast_num_threads, " ".join(cmd)) + + # Run BLAST with timeout to avoid hanging + try: + out = subprocess.check_output( + cmd, + text=True, + timeout=300, # 5 minute timeout for BLAST search + stderr=subprocess.DEVNULL # Suppress BLAST warnings to reduce I/O + ).strip() + except subprocess.TimeoutExpired: + self.logger.warning("BLAST search timed out after 5 minutes for sequence") + os.remove(tmp_name) + return None + + os.remove(tmp_name) + if out: + return out.split("\n", maxsplit=1)[0] + return None + except Exception as exc: # pylint: disable=broad-except + self.logger.error("Local blastp failed: %s", exc) + return None + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type(RequestException), + reraise=True, + ) + async def search( + self, query: str, threshold: float = 0.7, **kwargs + ) -> Optional[Dict]: + """ + Search UniProt with either an accession number, keyword, or FASTA sequence. + :param query: The searcher query (accession number, keyword, or FASTA sequence). + :param threshold: E-value threshold for BLAST searcher. + :return: A dictionary containing the best hit information or None if not found. 
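+
+        Hypothetical usage sketch (the accession is illustrative and a network
+        connection to UniProt is assumed):
+            import asyncio
+            hit = asyncio.run(UniProtSearch().search("P01308"))
+            print(hit["protein_name"] if hit else "not found")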
+ """ + + # auto detect query type + if not query or not isinstance(query, str): + self.logger.error("Empty or non-string input.") + return None + query = query.strip() + + self.logger.debug("UniProt searcher query: %s", query) + + loop = asyncio.get_running_loop() + + # check if fasta sequence + if query.startswith(">") or re.fullmatch( + r"[ACDEFGHIKLMNPQRSTVWY\s]+", query, re.I + ): + # Only use lock for network BLAST (NCBIWWW), local BLAST can run in parallel + if self.use_local_blast: + # Local BLAST can run in parallel, no lock needed + result = await loop.run_in_executor( + _get_pool(), self.get_by_fasta, query, threshold + ) + else: + # Network BLAST needs lock to respect rate limits + async with _blast_lock: + result = await loop.run_in_executor( + _get_pool(), self.get_by_fasta, query, threshold + ) + + # check if accession number + # UniProt accession IDs: 6-10 characters, must start with a letter + # Format: [A-Z][A-Z0-9]{5,9} (6-10 chars total: 1 letter + 5-9 alphanumeric) + elif re.fullmatch(r"[A-Z][A-Z0-9]{5,9}", query, re.I): + result = await loop.run_in_executor( + _get_pool(), self.get_by_accession, query + ) + + else: + # otherwise treat as keyword + result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) + + if result: + result["_search_query"] = query + return result diff --git a/graphgen/models/searcher/kg/__init__.py b/graphgen/models/searcher/kg/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/models/search/wiki_search.py b/graphgen/models/searcher/kg/wiki_search.py similarity index 87% rename from graphgen/models/search/wiki_search.py rename to graphgen/models/searcher/kg/wiki_search.py index db312a2b..2d8686cf 100644 --- a/graphgen/models/search/wiki_search.py +++ b/graphgen/models/searcher/kg/wiki_search.py @@ -1,21 +1,20 @@ from typing import List, Union -from dataclasses import dataclass import wikipedia from wikipedia import set_lang + from graphgen.utils import detect_main_language, logger -@dataclass class WikiSearch: @staticmethod def set_language(language: str): assert language in ["en", "zh"], "Only support English and Chinese" set_lang(language) - async def search(self, query: str) -> Union[List[str], None]: + async def search(self, query: str, num_results: int = 1) -> Union[List[str], None]: self.set_language(detect_main_language(query)) - return wikipedia.search(query) + return wikipedia.search(query, results=num_results, suggestion=False) async def summary(self, query: str) -> Union[str, None]: self.set_language(detect_main_language(query)) diff --git a/graphgen/models/searcher/web/__init__.py b/graphgen/models/searcher/web/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/models/searcher/web/bing_search.py b/graphgen/models/searcher/web/bing_search.py new file mode 100644 index 00000000..77ae2110 --- /dev/null +++ b/graphgen/models/searcher/web/bing_search.py @@ -0,0 +1,41 @@ +import requests +from fastapi import HTTPException + +from graphgen.utils import logger + +BING_SEARCH_V7_ENDPOINT = "https://api.bing.microsoft.com/v7.0/search" +BING_MKT = "en-US" + + +class BingSearch: + """ + Bing Search client to searcher with Bing. + """ + + def __init__(self, subscription_key: str): + self.subscription_key = subscription_key + + def search(self, query: str, num_results: int = 1): + """ + Search with Bing and return the contexts. + :param query: The searcher query. + :param num_results: The number of results to return. + :return: A list of searcher results. 
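+
+        Each context is a Bing "webPages" entry (a dict with fields such as "name",
+        "url" and "snippet"). Hypothetical usage (the key is a placeholder):
+            contexts = BingSearch("your-bing-key").search("knowledge graph", num_results=3)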
+ """ + params = {"q": query, "mkt": BING_MKT, "count": num_results} + response = requests.get( + BING_SEARCH_V7_ENDPOINT, + headers={"Ocp-Apim-Subscription-Key": self.subscription_key}, + params=params, + timeout=10, + ) + if not response.ok: + logger.error("Search engine error: %s", response.text) + raise HTTPException(response.status_code, "Search engine error.") + json_content = response.json() + try: + contexts = json_content["webPages"]["value"][:num_results] + except KeyError: + logger.error("Error encountered: %s", json_content) + return [] + return contexts diff --git a/graphgen/models/searcher/web/google_search.py b/graphgen/models/searcher/web/google_search.py new file mode 100644 index 00000000..0d598f3a --- /dev/null +++ b/graphgen/models/searcher/web/google_search.py @@ -0,0 +1,42 @@ +import requests +from fastapi import HTTPException + +from graphgen.utils import logger + +GOOGLE_SEARCH_ENDPOINT = "https://customsearch.googleapis.com/customsearch/v1" + + +class GoogleSearch: + def __init__(self, subscription_key: str, cx: str): + """ + Initialize the Google Search client with the subscription key and custom searcher engine ID. + :param subscription_key: Your Google API subscription key. + :param cx: Your custom searcher engine ID. + """ + self.subscription_key = subscription_key + self.cx = cx + + def search(self, query: str, num_results: int = 1): + """ + Search with Google and return the contexts. + :param query: The searcher query. + :param num_results: The number of results to return. + :return: A list of searcher results. + """ + params = { + "key": self.subscription_key, + "cx": self.cx, + "q": query, + "num": num_results, + } + response = requests.get(GOOGLE_SEARCH_ENDPOINT, params=params, timeout=10) + if not response.ok: + logger.error("Search engine error: %s", response.text) + raise HTTPException(response.status_code, "Search engine error.") + json_content = response.json() + try: + contexts = json_content["items"][:num_results] + except KeyError: + logger.error("Error encountered: %s", json_content) + return [] + return contexts diff --git a/graphgen/models/splitter/__init__.py b/graphgen/models/splitter/__init__.py new file mode 100644 index 00000000..4f8a427c --- /dev/null +++ b/graphgen/models/splitter/__init__.py @@ -0,0 +1,4 @@ +from .recursive_character_splitter import ( + ChineseRecursiveTextSplitter, + RecursiveCharacterSplitter, +) diff --git a/graphgen/models/splitter/character_splitter.py b/graphgen/models/splitter/character_splitter.py new file mode 100644 index 00000000..8877c861 --- /dev/null +++ b/graphgen/models/splitter/character_splitter.py @@ -0,0 +1,26 @@ +import re +from typing import Any, List + +from graphgen.bases.base_splitter import BaseSplitter + + +class CharacterSplitter(BaseSplitter): + """Splitting text that looks at characters.""" + + def __init__( + self, separator: str = "\n\n", is_separator_regex: bool = False, **kwargs: Any + ) -> None: + """Create a new TextSplitter.""" + super().__init__(**kwargs) + self._separator = separator + self._is_separator_regex = is_separator_regex + + def split_text(self, text: str) -> List[str]: + """Split incoming text and return chunks.""" + # First we naively chunk the large input into a bunch of smaller ones. 
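+        # The separator is matched literally unless is_separator_regex=True, in which
+        # case it is used as a regex pattern verbatim; the resulting splits are then
+        # merged back into chunks by the base class's _merge_splits.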
+ separator = ( + self._separator if self._is_separator_regex else re.escape(self._separator) + ) + splits = self._split_text_with_regex(text, separator, self.keep_separator) + _separator = "" if self.keep_separator else self._separator + return self._merge_splits(splits, _separator) diff --git a/graphgen/models/splitter/markdown_splitter.py b/graphgen/models/splitter/markdown_splitter.py new file mode 100644 index 00000000..40b6a44e --- /dev/null +++ b/graphgen/models/splitter/markdown_splitter.py @@ -0,0 +1,33 @@ +from typing import Any + +from graphgen.models.splitter.recursive_character_splitter import ( + RecursiveCharacterSplitter, +) + + +class MarkdownTextRefSplitter(RecursiveCharacterSplitter): + """Attempts to chunk the text along Markdown-formatted headings.""" + + def __init__(self, **kwargs: Any) -> None: + """Initialize a MarkdownTextRefSplitter.""" + separators = [ + # First, try to chunk along Markdown headings (starting with level 2) + "\n#{1,6} ", + # Note the alternative syntax for headings (below) is not handled here + # Heading level 2 + # --------------- + # End of code block + "```\n", + # Horizontal lines + "\n\\*\\*\\*+\n", + "\n---+\n", + "\n___+\n", + # Note: horizontal lines defined by three or more of ***, ---, or ___ + # are handled by the regexes above, but alternative syntaxes (e.g., with spaces) + # are not handled. + "\n\n", + "\n", + " ", + "", + ] + super().__init__(separators=separators, **kwargs) diff --git a/graphgen/models/splitter/recursive_character_splitter.py b/graphgen/models/splitter/recursive_character_splitter.py new file mode 100644 index 00000000..b1ee8e06 --- /dev/null +++ b/graphgen/models/splitter/recursive_character_splitter.py @@ -0,0 +1,149 @@ +import re +from typing import Any, List, Optional + +from graphgen.bases.base_splitter import BaseSplitter + + +class RecursiveCharacterSplitter(BaseSplitter): + """Splitting text by recursively look at characters. + + Recursively tries to chunk by different characters to find one that works. + """ + + def __init__( + self, + separators: Optional[List[str]] = None, + keep_separator: bool = True, + is_separator_regex: bool = False, + **kwargs: Any, + ) -> None: + """Create a new TextSplitter.""" + super().__init__(keep_separator=keep_separator, **kwargs) + self._separators = separators or ["\n\n", "\n", " ", ""] + self._is_separator_regex = is_separator_regex + + def _split_text(self, text: str, separators: List[str]) -> List[str]: + """Split incoming text and return chunks.""" + final_chunks = [] + # Get appropriate separator to use + separator = separators[-1] + new_separators = [] + for i, _s in enumerate(separators): + _separator = _s if self._is_separator_regex else re.escape(_s) + if _s == "": + separator = _s + break + if re.search(_separator, text): + separator = _s + new_separators = separators[i + 1 :] + break + + _separator = separator if self._is_separator_regex else re.escape(separator) + splits = self._split_text_with_regex(text, _separator, self.keep_separator) + + # Now go merging things, recursively splitting longer texts. 
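+        # Pieces that already fit within chunk_size are buffered in _good_splits and
+        # merged together; oversized pieces are recursively re-split with the
+        # remaining, finer-grained separators (or appended as-is if none are left).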
+ _good_splits = [] + _separator = "" if self.keep_separator else separator + for s in splits: + if self.length_function(s) < self.chunk_size: + _good_splits.append(s) + else: + if _good_splits: + merged_text = self._merge_splits(_good_splits, _separator) + final_chunks.extend(merged_text) + _good_splits = [] + if not new_separators: + final_chunks.append(s) + else: + other_info = self._split_text(s, new_separators) + final_chunks.extend(other_info) + if _good_splits: + merged_text = self._merge_splits(_good_splits, _separator) + final_chunks.extend(merged_text) + return final_chunks + + def split_text(self, text: str) -> List[str]: + return self._split_text(text, self._separators) + + +class ChineseRecursiveTextSplitter(RecursiveCharacterSplitter): + def __init__( + self, + separators: Optional[List[str]] = None, + keep_separator: bool = True, + is_separator_regex: bool = True, + **kwargs: Any, + ) -> None: + super().__init__(keep_separator=keep_separator, **kwargs) + self._separators = separators or [ + "\n\n", + "\n", + "。|!|?", + r"\.\s|\!\s|\?\s", + r";|;\s", + r",|,\s", + ] + self._is_separator_regex = is_separator_regex + + def _split_text_with_regex_from_end( + self, text: str, separator: str, keep_separator: bool + ) -> List[str]: + # Now that we have the separator, chunk the text + if separator: + if keep_separator: + # The parentheses in the pattern keep the delimiters in the result. + _splits = re.split(f"({separator})", text) + splits = ["".join(i) for i in zip(_splits[0::2], _splits[1::2])] + if len(_splits) % 2 == 1: + splits += _splits[-1:] + else: + splits = re.split(separator, text) + else: + splits = list(text) + return [s for s in splits if s != ""] + + def _split_text(self, text: str, separators: List[str]) -> List[str]: + """Split incoming text and return chunks.""" + final_chunks = [] + # Get appropriate separator to use + separator = separators[-1] + new_separators = [] + for i, _s in enumerate(separators): + _separator = _s if self._is_separator_regex else re.escape(_s) + if _s == "": + separator = _s + break + if re.search(_separator, text): + separator = _s + new_separators = separators[i + 1 :] + break + + _separator = separator if self._is_separator_regex else re.escape(separator) + splits = self._split_text_with_regex_from_end( + text, _separator, self.keep_separator + ) + + # Now go merging things, recursively splitting longer texts. 
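+        # Same merge-or-recurse strategy as the parent class, but the splits come from
+        # _split_text_with_regex_from_end, which keeps the Chinese/Western sentence
+        # punctuation attached to the end of the preceding fragment.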
+ _good_splits = [] + _separator = "" if self.keep_separator else separator + for s in splits: + if self.length_function(s) < self.chunk_size: + _good_splits.append(s) + else: + if _good_splits: + merged_text = self._merge_splits(_good_splits, _separator) + final_chunks.extend(merged_text) + _good_splits = [] + if not new_separators: + final_chunks.append(s) + else: + other_info = self._split_text(s, new_separators) + final_chunks.extend(other_info) + if _good_splits: + merged_text = self._merge_splits(_good_splits, _separator) + final_chunks.extend(merged_text) + return [ + re.sub(r"\n{2,}", "\n", chunk.strip()) + for chunk in final_chunks + if chunk.strip() != "" + ] diff --git a/graphgen/models/splitter/sequence_splitter.py b/graphgen/models/splitter/sequence_splitter.py new file mode 100644 index 00000000..e0c62bbc --- /dev/null +++ b/graphgen/models/splitter/sequence_splitter.py @@ -0,0 +1,119 @@ +import re +from typing import List, Optional + +from graphgen.bases.base_splitter import BaseSplitter +from graphgen.utils.log import logger + + +class SequenceSplitter(BaseSplitter): + """ + Splitter for biological sequences (DNA, RNA, protein). + Supports chunking by fixed length with overlap. + """ + + def __init__( + self, + chunk_size: int = 1000, + chunk_overlap: int = 100, + length_function=None, + **kwargs, + ): + """ + Initialize sequence splitter. + + :param chunk_size: Maximum length of each chunk (in sequence characters) + :param chunk_overlap: Number of characters to overlap between chunks + :param length_function: Function to calculate length (default: len) + """ + if length_function is None: + length_function = len + super().__init__( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + length_function=length_function, + **kwargs, + ) + + def split_text(self, text: str) -> List[str]: + """ + Split a sequence into chunks of fixed size with overlap. + + :param text: The sequence to split (may include FASTA header) + :return: List of sequence chunks + """ + # Remove FASTA header if present + sequence = self._extract_sequence(text) + + if not sequence: + logger.warning("Empty sequence provided to SequenceSplitter") + return [] + + # If sequence is shorter than chunk_size, return as single chunk + if len(sequence) <= self.chunk_size: + return [sequence] + + chunks = [] + start = 0 + step = self.chunk_size - self.chunk_overlap + + while start < len(sequence): + end = min(start + self.chunk_size, len(sequence)) + chunk = sequence[start:end] + chunks.append(chunk) + start += step + + # Avoid infinite loop if step is 0 or negative + if step <= 0: + break + + return chunks + + @staticmethod + def _extract_sequence(text: str) -> str: + """ + Extract sequence from text, removing FASTA headers and whitespace. + + :param text: Input text (may contain FASTA header) + :return: Clean sequence string + """ + # Remove FASTA header lines (lines starting with >) + lines = text.split("\n") + sequence_lines = [line for line in lines if not line.strip().startswith(">")] + + # Join and remove whitespace + sequence = "".join(sequence_lines) + sequence = re.sub(r"\s+", "", sequence) + + return sequence.upper() # Normalize to uppercase + + @staticmethod + def detect_sequence_type(sequence: str) -> Optional[str]: + """ + Detect the type of sequence (DNA, RNA, or protein). 
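+        Heuristic: letters that do not occur in the nucleotide alphabet used here
+        (E, F, I, L, P, Q, X, Z) imply protein, a U implies RNA, and a sequence made
+        only of A/T/G/C/N defaults to DNA; anything else falls back to protein.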
+ + :param sequence: The sequence string + :return: "dna", "rna", "protein", or None if cannot determine + """ + # Remove FASTA header and whitespace + clean_seq = SequenceSplitter._extract_sequence(sequence) + + if not clean_seq: + return None + + # Check for protein-specific amino acids + protein_chars = set("EFILPQXZ") # Amino acids not in DNA/RNA + if any(char in clean_seq for char in protein_chars): + return "protein" + + # Check for RNA-specific character (U) + if "U" in clean_seq.upper(): + return "rna" + + # Check if contains only DNA/RNA characters (A, T, G, C, N) + dna_rna_chars = set("ATGCUN") + if all(char.upper() in dna_rna_chars for char in clean_seq): + # Default to DNA if ambiguous (could be DNA or RNA without U) + return "dna" + + # If contains other characters, might be protein + return "protein" diff --git a/graphgen/models/storage/__init__.py b/graphgen/models/storage/__init__.py index e69de29b..889a074c 100644 --- a/graphgen/models/storage/__init__.py +++ b/graphgen/models/storage/__init__.py @@ -0,0 +1,6 @@ +from graphgen.models.storage.graph.kuzu_storage import KuzuStorage +from graphgen.models.storage.graph.networkx_storage import NetworkXStorage +from graphgen.models.storage.kv.json_storage import JsonKVStorage +from graphgen.models.storage.kv.rocksdb_storage import RocksDBKVStorage + +from .rocksdb_cache import RocksDBCache diff --git a/graphgen/models/storage/base_storage.py b/graphgen/models/storage/base_storage.py deleted file mode 100644 index 2e70a3cb..00000000 --- a/graphgen/models/storage/base_storage.py +++ /dev/null @@ -1,94 +0,0 @@ -from dataclasses import dataclass -from typing import Union, Generic, TypeVar -from graphgen.models.embed.embedding import EmbeddingFunc - -T = TypeVar("T") - -@dataclass -class StorageNameSpace: - working_dir: str = None - namespace: str = None - - async def index_done_callback(self): - """commit the storage operations after indexing""" - - async def query_done_callback(self): - """commit the storage operations after querying""" - - -@dataclass -class BaseKVStorage(Generic[T], StorageNameSpace): - embedding_func: EmbeddingFunc = None - - async def all_keys(self) -> list[str]: - raise NotImplementedError - - async def get_by_id(self, id: str) -> Union[T, None]: - raise NotImplementedError - - async def get_by_ids( - self, ids: list[str], fields: Union[set[str], None] = None - ) -> list[Union[T, None]]: - raise NotImplementedError - - async def filter_keys(self, data: list[str]) -> set[str]: - """return un-exist keys""" - raise NotImplementedError - - async def upsert(self, data: dict[str, T]): - raise NotImplementedError - - async def drop(self): - raise NotImplementedError - -@dataclass -class BaseGraphStorage(StorageNameSpace): - embedding_func: EmbeddingFunc = None - - async def has_node(self, node_id: str) -> bool: - raise NotImplementedError - - async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: - raise NotImplementedError - - async def node_degree(self, node_id: str) -> int: - raise NotImplementedError - - async def edge_degree(self, src_id: str, tgt_id: str) -> int: - raise NotImplementedError - - async def get_node(self, node_id: str) -> Union[dict, None]: - raise NotImplementedError - - async def update_node(self, node_id: str, node_data: dict[str, str]): - raise NotImplementedError - - async def get_all_nodes(self) -> Union[list[dict], None]: - raise NotImplementedError - - async def get_edge( - self, source_node_id: str, target_node_id: str - ) -> Union[dict, None]: - raise 
NotImplementedError - - async def update_edge(self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]): - raise NotImplementedError - - async def get_all_edges(self) -> Union[list[dict], None]: - raise NotImplementedError - - async def get_node_edges( - self, source_node_id: str - ) -> Union[list[tuple[str, str]], None]: - raise NotImplementedError - - async def upsert_node(self, node_id: str, node_data: dict[str, str]): - raise NotImplementedError - - async def upsert_edge( - self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] - ): - raise NotImplementedError - - async def delete_node(self, node_id: str): - raise NotImplementedError diff --git a/graphgen/models/storage/graph/__init__.py b/graphgen/models/storage/graph/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/models/storage/graph/kuzu_storage.py b/graphgen/models/storage/graph/kuzu_storage.py new file mode 100644 index 00000000..4a221b8e --- /dev/null +++ b/graphgen/models/storage/graph/kuzu_storage.py @@ -0,0 +1,256 @@ +import json +import os +import shutil +from dataclasses import dataclass +from typing import Any + +try: + import kuzu +except ImportError: + kuzu = None + +from graphgen.bases.base_storage import BaseGraphStorage + + +@dataclass +class KuzuStorage(BaseGraphStorage): + """ + Graph storage implementation based on KuzuDB. + Since KuzuDB is a structured graph database and GraphGen uses dynamic dictionaries for properties, + we map the data to a generic schema: + - Node Table 'Entity': {id: STRING, data: STRING (JSON)} + - Rel Table 'Relation': {FROM Entity TO Entity, data: STRING (JSON)} + """ + + working_dir: str = None + namespace: str = None + _db: Any = None + _conn: Any = None + + def __post_init__(self): + if kuzu is None: + raise ImportError( + "KuzuDB is not installed. Please install it via `pip install kuzu`." 
+            )
+
+        self.db_path = os.path.join(self.working_dir, f"{self.namespace}_kuzu")
+        self._init_db()
+
+    def _init_db(self):
+        # KuzuDB automatically creates the directory
+        self._db = kuzu.Database(self.db_path)
+        self._conn = kuzu.Connection(self._db)
+        self._init_schema()
+        print(f"KuzuDB initialized at {self.db_path}")
+
+    def _init_schema(self):
+        """Initialize the generic Node and Edge tables if they don't exist."""
+        # Check and create Node table
+        try:
+            # We use a generic table name "Entity" to store all nodes
+            self._conn.execute(
+                "CREATE NODE TABLE Entity(id STRING, data STRING, PRIMARY KEY(id))"
+            )
+            print("Created KuzuDB Node Table 'Entity'")
+        except RuntimeError as e:
+            # Kuzu raises RuntimeError if the table already exists; safe to ignore here
+            print("Node Table 'Entity' already exists or error:", e)
+
+        # Check and create Edge table
+        try:
+            # We use a generic table name "Relation" to store all edges
+            self._conn.execute(
+                "CREATE REL TABLE Relation(FROM Entity TO Entity, data STRING)"
+            )
+            print("Created KuzuDB Rel Table 'Relation'")
+        except RuntimeError as e:
+            print("Rel Table 'Relation' already exists or error:", e)
+
+    def index_done_callback(self):
+        """KuzuDB commits are ACID and applied immediately, so no explicit flush is needed here."""
+
+    def has_node(self, node_id: str) -> bool:
+        result = self._conn.execute(
+            "MATCH (a:Entity {id: $id}) RETURN count(a)", {"id": node_id}
+        )
+        count = result.get_next()[0]
+        return count > 0
+
+    def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
+        result = self._conn.execute(
+            "MATCH (a:Entity {id: $src})-[e:Relation]->(b:Entity {id: $dst}) RETURN count(e)",
+            {"src": source_node_id, "dst": target_node_id},
+        )
+        count = result.get_next()[0]
+        return count > 0
+
+    def node_degree(self, node_id: str) -> int:
+        # Calculate total degree (incoming + outgoing)
+        query = """
+            MATCH (a:Entity {id: $id})-[e:Relation]-(b:Entity)
+            RETURN count(e)
+        """
+        result = self._conn.execute(query, {"id": node_id})
+        if result.has_next():
+            return result.get_next()[0]
+        return 0
+
+    def edge_degree(self, src_id: str, tgt_id: str) -> int:
+        # Count the Relation edges between this specific pair of nodes.
+        # Kuzu supports multi-edges, so we count them.
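+        # Note: unlike node_degree, this only counts directed src -> tgt edges.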
+ query = """ + MATCH (a:Entity {id: $src})-[e:Relation]->(b:Entity {id: $dst}) + RETURN count(e) + """ + result = self._conn.execute(query, {"src": src_id, "dst": tgt_id}) + if result.has_next(): + return result.get_next()[0] + return 0 + + def get_node(self, node_id: str) -> Any: + result = self._conn.execute( + "MATCH (a:Entity {id: $id}) RETURN a.data", {"id": node_id} + ) + if result.has_next(): + data_str = result.get_next()[0] + return json.loads(data_str) if data_str else {} + return None + + def update_node(self, node_id: str, node_data: dict[str, str]): + current_data = self.get_node(node_id) + if current_data is None: + print(f"Node {node_id} not found for update.") + return + + # Merge existing data with new data + current_data.update(node_data) + json_data = json.dumps(current_data, ensure_ascii=False) + + self._conn.execute( + "MATCH (a:Entity {id: $id}) SET a.data = $data", + {"id": node_id, "data": json_data}, + ) + + def get_all_nodes(self) -> Any: + """Returns List[Tuple[id, data_dict]]""" + result = self._conn.execute("MATCH (a:Entity) RETURN a.id, a.data") + nodes = [] + while result.has_next(): + row = result.get_next() + nodes.append((row[0], json.loads(row[1]))) + return nodes + + def get_edge(self, source_node_id: str, target_node_id: str): + # Warning: If multiple edges exist, this returns the first one found + query = """ + MATCH (a:Entity {id: $src})-[e:Relation]->(b:Entity {id: $dst}) + RETURN e.data + """ + result = self._conn.execute( + query, {"src": source_node_id, "dst": target_node_id} + ) + if result.has_next(): + data_str = result.get_next()[0] + return json.loads(data_str) if data_str else {} + return None + + def update_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + current_data = self.get_edge(source_node_id, target_node_id) + if current_data is None: + print(f"Edge {source_node_id}->{target_node_id} not found for update.") + return + + current_data.update(edge_data) + json_data = json.dumps(current_data, ensure_ascii=False) + + query = """ + MATCH (a:Entity {id: $src})-[e:Relation]->(b:Entity {id: $dst}) + SET e.data = $data + """ + self._conn.execute( + query, {"src": source_node_id, "dst": target_node_id, "data": json_data} + ) + + def get_all_edges(self) -> Any: + """Returns List[Tuple[src, dst, data_dict]]""" + query = "MATCH (a:Entity)-[e:Relation]->(b:Entity) RETURN a.id, b.id, e.data" + result = self._conn.execute(query) + edges = [] + while result.has_next(): + row = result.get_next() + edges.append((row[0], row[1], json.loads(row[2]))) + return edges + + def get_node_edges(self, source_node_id: str) -> Any: + """Returns generic edges connected to this node (outgoing)""" + query = """ + MATCH (a:Entity {id: $src})-[e:Relation]->(b:Entity) + RETURN a.id, b.id, e.data + """ + result = self._conn.execute(query, {"src": source_node_id}) + edges = [] + while result.has_next(): + row = result.get_next() + edges.append((row[0], row[1], json.loads(row[2]))) + return edges + + def upsert_node(self, node_id: str, node_data: dict[str, str]): + """ + Insert or Update node. + Kuzu supports MERGE clause (similar to Neo4j) to handle upserts. + """ + json_data = json.dumps(node_data, ensure_ascii=False) + query = """ + MERGE (a:Entity {id: $id}) + ON MATCH SET a.data = $data + ON CREATE SET a.data = $data + """ + self._conn.execute(query, {"id": node_id, "data": json_data}) + + def upsert_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + """ + Insert or Update edge. 
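+        Uses Cypher MERGE with ON CREATE / ON MATCH, so repeated upserts are idempotent.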
+ Note: We explicitly ensure nodes exist before merging the edge to avoid errors, + although GraphGen generally creates nodes before edges. + """ + # Ensure source node exists + if not self.has_node(source_node_id): + self.upsert_node(source_node_id, {}) + # Ensure target node exists + if not self.has_node(target_node_id): + self.upsert_node(target_node_id, {}) + + json_data = json.dumps(edge_data, ensure_ascii=False) + query = """ + MATCH (a:Entity {id: $src}), (b:Entity {id: $dst}) + MERGE (a)-[e:Relation]->(b) + ON MATCH SET e.data = $data + ON CREATE SET e.data = $data + """ + self._conn.execute( + query, {"src": source_node_id, "dst": target_node_id, "data": json_data} + ) + + def delete_node(self, node_id: str): + # DETACH DELETE removes the node and all connected edges + query = "MATCH (a:Entity {id: $id}) DETACH DELETE a" + self._conn.execute(query, {"id": node_id}) + print(f"Node {node_id} deleted from KuzuDB.") + + def clear(self): + """Clear all data but keep schema (or drop tables).""" + self._conn.execute("MATCH (n) DETACH DELETE n") + print(f"Graph {self.namespace} cleared.") + + def reload(self): + """For databases that need reloading, KuzuDB auto-manages this.""" + + def drop(self): + """Completely remove the database folder.""" + if self.db_path and os.path.exists(self.db_path): + shutil.rmtree(self.db_path) + print(f"Dropped KuzuDB at {self.db_path}") diff --git a/graphgen/models/storage/networkx_storage.py b/graphgen/models/storage/graph/networkx_storage.py similarity index 63% rename from graphgen/models/storage/networkx_storage.py rename to graphgen/models/storage/graph/networkx_storage.py index 92643760..7fb73b79 100644 --- a/graphgen/models/storage/networkx_storage.py +++ b/graphgen/models/storage/graph/networkx_storage.py @@ -1,11 +1,12 @@ -import os import html -from typing import Any, Union, cast, Optional +import os from dataclasses import dataclass +from typing import Any, Optional, Union, cast + import networkx as nx -from graphgen.utils import logger -from .base_storage import BaseGraphStorage +from graphgen.bases.base_storage import BaseGraphStorage + @dataclass class NetworkXStorage(BaseGraphStorage): @@ -17,7 +18,6 @@ def load_nx_graph(file_name) -> Optional[nx.Graph]: @staticmethod def write_nx_graph(graph: nx.Graph, file_name): - logger.info("Writing graph with %d nodes, %d edges", graph.number_of_nodes(), graph.number_of_edges()) nx.write_graphml(graph, file_name) @staticmethod @@ -69,77 +69,81 @@ def _get_edge_key(source: Any, target: Any) -> str: def __post_init__(self): """ - 如果图文件存在,则加载图文件,否则创建一个新图 + Initialize the NetworkX graph storage by loading an existing graph from a GraphML file, + if it exists, or creating a new empty graph otherwise. 
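+        The graph is written back to the same GraphML file by index_done_callback().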
""" self._graphml_xml_file = os.path.join( self.working_dir, f"{self.namespace}.graphml" ) preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file) - if preloaded_graph is not None: - logger.info( - "Loaded graph from %s with %d nodes, %d edges", self._graphml_xml_file, - preloaded_graph.number_of_nodes(), preloaded_graph.number_of_edges() + if preloaded_graph: + print( + f"Loaded graph from {self._graphml_xml_file} with " + f"{preloaded_graph.number_of_nodes()} nodes, " + f"{preloaded_graph.number_of_edges()} edges" ) self._graph = preloaded_graph or nx.Graph() - async def index_done_callback(self): + def index_done_callback(self): NetworkXStorage.write_nx_graph(self._graph, self._graphml_xml_file) - async def has_node(self, node_id: str) -> bool: + def has_node(self, node_id: str) -> bool: return self._graph.has_node(node_id) - async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: + def has_edge(self, source_node_id: str, target_node_id: str) -> bool: return self._graph.has_edge(source_node_id, target_node_id) - async def get_node(self, node_id: str) -> Union[dict, None]: + def get_node(self, node_id: str) -> Union[dict, None]: return self._graph.nodes.get(node_id) - async def get_all_nodes(self) -> Union[list[dict], None]: - return self._graph.nodes(data=True) + def get_all_nodes(self) -> Union[list[tuple[str, dict]], None]: + return list(self._graph.nodes(data=True)) - async def node_degree(self, node_id: str) -> int: - return self._graph.degree(node_id) + def node_degree(self, node_id: str) -> int: + return int(self._graph.degree[node_id]) - async def edge_degree(self, src_id: str, tgt_id: str) -> int: - return self._graph.degree(src_id) + self._graph.degree(tgt_id) + def edge_degree(self, src_id: str, tgt_id: str) -> int: + return int(self._graph.degree[src_id] + self._graph.degree[tgt_id]) - async def get_edge( - self, source_node_id: str, target_node_id: str - ) -> Union[dict, None]: + def get_edge(self, source_node_id: str, target_node_id: str) -> Union[dict, None]: return self._graph.edges.get((source_node_id, target_node_id)) - async def get_all_edges(self) -> Union[list[dict], None]: - return self._graph.edges(data=True) + def get_all_edges(self) -> Union[list[tuple[str, str, dict]], None]: + return list(self._graph.edges(data=True)) - async def get_node_edges(self, source_node_id: str) -> Union[list[tuple[str, str]], None]: + def get_node_edges(self, source_node_id: str) -> Union[list[tuple[str, str]], None]: if self._graph.has_node(source_node_id): return list(self._graph.edges(source_node_id, data=True)) return None - async def get_graph(self) -> nx.Graph: + def get_graph(self) -> nx.Graph: return self._graph - async def upsert_node(self, node_id: str, node_data: dict[str, str]): + def upsert_node(self, node_id: str, node_data: dict[str, str]): self._graph.add_node(node_id, **node_data) - async def update_node(self, node_id: str, node_data: dict[str, str]): + def update_node(self, node_id: str, node_data: dict[str, str]): if self._graph.has_node(node_id): self._graph.nodes[node_id].update(node_data) else: - logger.warning("Node %s not found in the graph for update.", node_id) + print(f"Node {node_id} not found in the graph for update.") - async def upsert_edge( + def upsert_edge( self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] ): self._graph.add_edge(source_node_id, target_node_id, **edge_data) - async def update_edge(self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]): + def update_edge( + 
self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): if self._graph.has_edge(source_node_id, target_node_id): self._graph.edges[(source_node_id, target_node_id)].update(edge_data) else: - logger.warning("Edge %s -> %s not found in the graph for update.", source_node_id, target_node_id) + print( + f"Edge {source_node_id} -> {target_node_id} not found in the graph for update." + ) - async def delete_node(self, node_id: str): + def delete_node(self, node_id: str): """ Delete a node from the graph based on the specified node_id. @@ -147,13 +151,19 @@ async def delete_node(self, node_id: str): """ if self._graph.has_node(node_id): self._graph.remove_node(node_id) - logger.info("Node %s deleted from the graph.", node_id) + print(f"Node {node_id} deleted from the graph.") else: - logger.warning("Node %s not found in the graph for deletion.", node_id) + print(f"Node {node_id} not found in the graph for deletion.") - async def clear(self): + def clear(self): """ Clear the graph by removing all nodes and edges. """ self._graph.clear() - logger.info("Graph %s cleared.", self.namespace) + print(f"Graph {self.namespace} cleared.") + + def reload(self): + """ + Reload the graph from the GraphML file. + """ + self.__post_init__() diff --git a/graphgen/models/storage/json_storage.py b/graphgen/models/storage/json_storage.py deleted file mode 100644 index e4f21e6c..00000000 --- a/graphgen/models/storage/json_storage.py +++ /dev/null @@ -1,51 +0,0 @@ -import os - -from dataclasses import dataclass -from graphgen.utils import logger, load_json, write_json -from graphgen.models.storage.base_storage import BaseKVStorage - - -@dataclass -class JsonKVStorage(BaseKVStorage): - _data: dict[str, str] = None - - def __post_init__(self): - self._file_name = os.path.join(self.working_dir, f"{self.namespace}.json") - self._data = load_json(self._file_name) or {} - logger.info("Load KV %s with %d data", self.namespace, len(self._data)) - - @property - def data(self): - return self._data - - async def all_keys(self) -> list[str]: - return list(self._data.keys()) - - async def index_done_callback(self): - write_json(self._data, self._file_name) - - async def get_by_id(self, id): - return self._data.get(id, None) - - async def get_by_ids(self, ids, fields=None) -> list: - if fields is None: - return [self._data.get(id, None) for id in ids] - return [ - ( - {k: v for k, v in self._data[id].items() if k in fields} - if self._data.get(id, None) - else None - ) - for id in ids - ] - - async def filter_keys(self, data: list[str]) -> set[str]: - return {s for s in data if s not in self._data} - - async def upsert(self, data: dict): - left_data = {k: v for k, v in data.items() if k not in self._data} - self._data.update(left_data) - return left_data - - async def drop(self): - self._data = {} diff --git a/graphgen/models/storage/kv/__init__.py b/graphgen/models/storage/kv/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/models/storage/kv/json_storage.py b/graphgen/models/storage/kv/json_storage.py new file mode 100644 index 00000000..4af8d1e5 --- /dev/null +++ b/graphgen/models/storage/kv/json_storage.py @@ -0,0 +1,97 @@ +import os +from dataclasses import dataclass +from typing import Iterator, Tuple + +from graphgen.bases.base_storage import BaseKVStorage +from graphgen.utils import load_json, write_json + + +@dataclass +class JsonKVStorage(BaseKVStorage): + _data: dict[str, dict] = None + + def __post_init__(self): + self._file_name = os.path.join(self.working_dir, 
f"{self.namespace}.json") + self._data = load_json(self._file_name) or {} + print(f"Load KV {self.namespace} with {len(self._data)} data") + + @property + def data(self): + return self._data + + def all_keys(self) -> list[str]: + return list(self._data.keys()) + + def index_done_callback(self): + write_json(self._data, self._file_name) + + def get_by_id(self, id): + return self._data.get(id, None) + + def get_by_ids(self, ids, fields=None) -> list: + if fields is None: + return [self._data.get(id, None) for id in ids] + return [ + ( + {k: v for k, v in self._data[id].items() if k in fields} + if self._data.get(id, None) + else None + ) + for id in ids + ] + + def get_all(self) -> dict[str, dict]: + return self._data + + def iter_items(self) -> Iterator[Tuple[str, dict]]: + """ + Iterate over all items without loading everything into memory at once. + Returns an iterator of (key, value) tuples. + """ + for key, value in self._data.items(): + yield key, value + + def get_batch(self, keys: list[str]) -> dict[str, dict]: + """ + Get a batch of items by their keys. + + :param keys: List of keys to retrieve. + :return: Dictionary of {key: value} for the requested keys. + """ + return {key: self._data.get(key) for key in keys if key in self._data} + + def iter_batches(self, batch_size: int = 10000) -> Iterator[dict[str, dict]]: + """ + Iterate over items in batches to avoid loading everything into memory. + + :param batch_size: Number of items per batch. + :return: Iterator of dictionaries, each containing up to batch_size items. + """ + batch = {} + count = 0 + for key, value in self._data.items(): + batch[key] = value + count += 1 + if count >= batch_size: + yield batch + batch = {} + count = 0 + if batch: + yield batch + + def filter_keys(self, data: list[str]) -> set[str]: + return {s for s in data if s not in self._data} + + def upsert(self, data: dict): + left_data = {k: v for k, v in data.items() if k not in self._data} + if left_data: + self._data.update(left_data) + return left_data + + def drop(self): + if self._data: + self._data.clear() + + def reload(self): + self._data = load_json(self._file_name) or {} + print(f"Reload KV {self.namespace} with {len(self._data)} data") diff --git a/graphgen/models/storage/kv/rocksdb_storage.py b/graphgen/models/storage/kv/rocksdb_storage.py new file mode 100644 index 00000000..45055b93 --- /dev/null +++ b/graphgen/models/storage/kv/rocksdb_storage.py @@ -0,0 +1,82 @@ +import os +from dataclasses import dataclass +from typing import Any, Dict, List, Set + +# rocksdict is a lightweight C wrapper around RocksDB for Python, pylint may not recognize it +# pylint: disable=no-name-in-module +from rocksdict import Rdict + +from graphgen.bases.base_storage import BaseKVStorage + + +@dataclass +class RocksDBKVStorage(BaseKVStorage): + _db: Rdict = None + _db_path: str = None + + def __post_init__(self): + self._db_path = os.path.join(self.working_dir, f"{self.namespace}.db") + self._db = Rdict(self._db_path) + print( + f"RocksDBKVStorage initialized for namespace '{self.namespace}' at '{self._db_path}'" + ) + + @property + def data(self): + return self._db + + def all_keys(self) -> List[str]: + return list(self._db.keys()) + + def index_done_callback(self): + self._db.flush() + print(f"RocksDB flushed for {self.namespace}") + + def get_by_id(self, id: str) -> Any: + return self._db.get(id, None) + + def get_by_ids(self, ids: List[str], fields: List[str] = None) -> List[Any]: + result = [] + for index in ids: + item = self._db.get(index, None) + if item is 
None: + result.append(None) + continue + + if fields is None: + result.append(item) + else: + result.append({k: v for k, v in item.items() if k in fields}) + return result + + def get_all(self) -> Dict[str, Dict]: + return dict(self._db) + + def filter_keys(self, data: List[str]) -> Set[str]: + return {s for s in data if s not in self._db} + + def upsert(self, data: Dict[str, Any]): + left_data = {} + for k, v in data.items(): + if k not in self._db: + left_data[k] = v + + if left_data: + for k, v in left_data.items(): + self._db[k] = v + # if left_data is very large, it is recommended to use self._db.write_batch() for optimization + + return left_data + + def drop(self): + self._db.close() + Rdict.destroy(self._db_path) + self._db = Rdict(self._db_path) + print(f"Dropped RocksDB {self.namespace}") + + def close(self): + if self._db: + self._db.close() + + def reload(self): + """For databases that need reloading, RocksDB auto-manages this.""" diff --git a/graphgen/models/storage/rocksdb_cache.py b/graphgen/models/storage/rocksdb_cache.py new file mode 100644 index 00000000..2345b5b5 --- /dev/null +++ b/graphgen/models/storage/rocksdb_cache.py @@ -0,0 +1,43 @@ +from pathlib import Path +from typing import Any, Iterator, Optional + +# rocksdict is a lightweight C wrapper around RocksDB for Python, pylint may not recognize it +# pylint: disable=no-name-in-module +from rocksdict import Rdict + + +class RocksDBCache: + def __init__(self, cache_dir: str): + self.db_path = Path(cache_dir) + self.db = Rdict(str(self.db_path)) + + def get(self, key: str) -> Optional[Any]: + return self.db.get(key) + + def set(self, key: str, value: Any): + self.db[key] = value + + def delete(self, key: str): + try: + del self.db[key] + except KeyError: + # If the key does not exist, do nothing (deletion is idempotent for caches) + pass + + def close(self): + if hasattr(self, "db") and self.db is not None: + self.db.close() + self.db = None + + def __del__(self): + # Ensure the database is closed when the object is destroyed + self.close() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + def __iter__(self) -> Iterator[str]: + return iter(self.db.keys()) diff --git a/graphgen/models/strategy/base_strategy.py b/graphgen/models/strategy/base_strategy.py deleted file mode 100644 index 70e0cc54..00000000 --- a/graphgen/models/strategy/base_strategy.py +++ /dev/null @@ -1,5 +0,0 @@ -from dataclasses import dataclass - -@dataclass -class BaseStrategy: - pass diff --git a/graphgen/models/strategy/travserse_strategy.py b/graphgen/models/strategy/travserse_strategy.py deleted file mode 100644 index 06882c5f..00000000 --- a/graphgen/models/strategy/travserse_strategy.py +++ /dev/null @@ -1,30 +0,0 @@ -from dataclasses import dataclass, fields - -from graphgen.models.strategy.base_strategy import BaseStrategy - - -@dataclass -class TraverseStrategy(BaseStrategy): - # 生成的QA形式:原子、多跳、聚合型 - qa_form: str = "atomic" # "atomic" or "multi_hop" or "aggregated" - # 最大边数和最大token数方法中选择一个生效 - expand_method: str = "max_tokens" # "max_width" or "max_tokens" - # 单向拓展还是双向拓展 - bidirectional: bool = True - # 每个方向拓展的最大边数 - max_extra_edges: int = 5 - # 最长token数 - max_tokens: int = 256 - # 每个方向拓展的最大深度 - max_depth: int = 2 - # 同一层中选边的策略(如果是双向拓展,同一层指的是两边连接的边的集合) - edge_sampling: str = "max_loss" # "max_loss" or "min_loss" or "random" - # 孤立节点的处理策略 - isolated_node_strategy: str = "add" # "add" or "ignore" - loss_strategy: str = "only_edge" # only_edge, both - - def to_yaml(self): - 
strategy_dict = {} - for f in fields(self): - strategy_dict[f.name] = getattr(self, f.name) - return {"traverse_strategy": strategy_dict} diff --git a/graphgen/models/text/chunk.py b/graphgen/models/text/chunk.py deleted file mode 100644 index 9678949f..00000000 --- a/graphgen/models/text/chunk.py +++ /dev/null @@ -1,7 +0,0 @@ -from dataclasses import dataclass - - -@dataclass -class Chunk: - id : str - content: str diff --git a/graphgen/models/text/text_pair.py b/graphgen/models/text/text_pair.py deleted file mode 100644 index f9a971f1..00000000 --- a/graphgen/models/text/text_pair.py +++ /dev/null @@ -1,9 +0,0 @@ -from dataclasses import dataclass - -@dataclass -class TextPair: - """ - A pair of input data. - """ - question: str - answer: str diff --git a/graphgen/models/tokenizer/__init__.py b/graphgen/models/tokenizer/__init__.py new file mode 100644 index 00000000..6712f918 --- /dev/null +++ b/graphgen/models/tokenizer/__init__.py @@ -0,0 +1,50 @@ +from typing import List + +from graphgen.bases import BaseTokenizer + +from .tiktoken_tokenizer import TiktokenTokenizer + +try: + from transformers import AutoTokenizer + + _HF_AVAILABLE = True +except ImportError: + _HF_AVAILABLE = False + + +def get_tokenizer_impl(tokenizer_name: str = "cl100k_base") -> BaseTokenizer: + import tiktoken + + if tokenizer_name in tiktoken.list_encoding_names(): + return TiktokenTokenizer(model_name=tokenizer_name) + + # 2. HuggingFace + if _HF_AVAILABLE: + from .hf_tokenizer import HFTokenizer + + return HFTokenizer(model_name=tokenizer_name) + + raise ValueError( + f"Unknown tokenizer {tokenizer_name} and HuggingFace not available." + ) + + +class Tokenizer(BaseTokenizer): + """ + Encapsulates different tokenization implementations based on the specified model name. 
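+    Falls back to a HuggingFace tokenizer when the model name is not a known tiktoken encoding.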
+ """ + + def __init__(self, model_name: str = "cl100k_base"): + super().__init__(model_name) + if not self.model_name: + raise ValueError("TOKENIZER_MODEL must be specified in the ENV variables.") + self._impl = get_tokenizer_impl(self.model_name) + + def encode(self, text: str) -> List[int]: + return self._impl.encode(text) + + def decode(self, token_ids: List[int]) -> str: + return self._impl.decode(token_ids) + + def count_tokens(self, text: str) -> int: + return self._impl.count_tokens(text) diff --git a/graphgen/models/tokenizer/hf_tokenizer.py b/graphgen/models/tokenizer/hf_tokenizer.py new file mode 100644 index 00000000..c43ddd7d --- /dev/null +++ b/graphgen/models/tokenizer/hf_tokenizer.py @@ -0,0 +1,17 @@ +from typing import List + +from transformers import AutoTokenizer + +from graphgen.bases import BaseTokenizer + + +class HFTokenizer(BaseTokenizer): + def __init__(self, model_name: str = "cl100k_base"): + super().__init__(model_name) + self.enc = AutoTokenizer.from_pretrained(self.model_name) + + def encode(self, text: str) -> List[int]: + return self.enc.encode(text, add_special_tokens=False) + + def decode(self, token_ids: List[int]) -> str: + return self.enc.decode(token_ids, skip_special_tokens=True) diff --git a/graphgen/models/tokenizer/tiktoken_tokenizer.py b/graphgen/models/tokenizer/tiktoken_tokenizer.py new file mode 100644 index 00000000..6145d070 --- /dev/null +++ b/graphgen/models/tokenizer/tiktoken_tokenizer.py @@ -0,0 +1,17 @@ +from typing import List + +import tiktoken + +from graphgen.bases import BaseTokenizer + + +class TiktokenTokenizer(BaseTokenizer): + def __init__(self, model_name: str = "cl100k_base"): + super().__init__(model_name) + self.enc = tiktoken.get_encoding(self.model_name) + + def encode(self, text: str) -> List[int]: + return self.enc.encode(text) + + def decode(self, token_ids: List[int]) -> str: + return self.enc.decode(token_ids) diff --git a/graphgen/models/vis/__init__.py b/graphgen/models/vis/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/models/vis/community_visualizer.py b/graphgen/models/vis/community_visualizer.py new file mode 100644 index 00000000..05551014 --- /dev/null +++ b/graphgen/models/vis/community_visualizer.py @@ -0,0 +1,48 @@ +from dataclasses import dataclass +from typing import Dict + +import matplotlib.pyplot as plt +import networkx as nx + + +@dataclass +class Visualizer: + """ + Class for visualizing graphs using NetworkX and Matplotlib. 
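+    Nodes are colored by community id; only the "spring" layout is currently supported.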
+ """ + + graph: nx.Graph = None + communities: Dict[str, int] = None + layout: str = "spring" + max_nodes: int = 1000 + node_size: int = 10 + alpha: float = 0.6 + + def visualize(self, save_path: str = None): + n = self.graph.number_of_nodes() + if self.layout == "spring": + k = max(0.1, 1.0 / (n**0.5)) + pos = nx.spring_layout(self.graph, k=k, seed=42) + else: + raise ValueError(f"Unknown layout: {self.layout}") + + plt.figure(figsize=(10, 10)) + + node_colors = [self.communities.get(node, 0) for node in self.graph.nodes()] + + nx.draw_networkx_nodes( + self.graph, + pos, + node_size=self.node_size, + node_color=node_colors, + cmap=plt.cm.tab20, + alpha=self.alpha, + ) + nx.draw_networkx_edges(self.graph, pos, alpha=0.3, width=0.2) + plt.axis("off") + + if save_path: + plt.savefig(save_path, dpi=300, bbox_inches="tight") + print("Saved to", save_path) + else: + plt.show() diff --git a/graphgen/operators/__init__.py b/graphgen/operators/__init__.py index 8ef14fdc..d8e78927 100644 --- a/graphgen/operators/__init__.py +++ b/graphgen/operators/__init__.py @@ -1,16 +1,21 @@ -from .extract_kg import extract_kg -from .quiz import quiz -from .judge import judge_statement, skip_judge_statement -from .search_wikipedia import search_wikipedia -from .traverse_graph import traverse_graph_by_edge, traverse_graph_atomically, traverse_graph_for_multi_hop +from .build_kg import BuildKGService +from .chunk import ChunkService +from .extract import ExtractService +from .generate import GenerateService +from .judge import JudgeService +from .partition import PartitionService +from .quiz import QuizService +from .read import read, read_files +from .search import SearchService -__all__ = [ - "extract_kg", - "quiz", - "judge_statement", - "skip_judge_statement", - "search_wikipedia", - "traverse_graph_by_edge", - "traverse_graph_atomically", - "traverse_graph_for_multi_hop" -] +operators = { + "read": read, + "chunk": ChunkService, + "build_kg": BuildKGService, + "quiz": QuizService, + "judge": JudgeService, + "extract": ExtractService, + "search": SearchService, + "partition": PartitionService, + "generate": GenerateService, +} diff --git a/graphgen/operators/build_kg/__init__.py b/graphgen/operators/build_kg/__init__.py new file mode 100644 index 00000000..a8b22ce9 --- /dev/null +++ b/graphgen/operators/build_kg/__init__.py @@ -0,0 +1 @@ +from .build_kg_service import BuildKGService diff --git a/graphgen/operators/build_kg/build_kg_service.py b/graphgen/operators/build_kg/build_kg_service.py new file mode 100644 index 00000000..76dba8a9 --- /dev/null +++ b/graphgen/operators/build_kg/build_kg_service.py @@ -0,0 +1,78 @@ +from typing import List + +import pandas as pd + +from graphgen.bases import BaseGraphStorage, BaseLLMWrapper, BaseOperator +from graphgen.bases.datatypes import Chunk +from graphgen.common import init_llm, init_storage +from graphgen.utils import logger + +from .build_mm_kg import build_mm_kg +from .build_omics_kg import build_omics_kg +from .build_text_kg import build_text_kg + + +class BuildKGService(BaseOperator): + def __init__(self, working_dir: str = "cache"): + super().__init__(working_dir=working_dir, op_name="build_kg_service") + self.llm_client: BaseLLMWrapper = init_llm("synthesizer") + self.graph_storage: BaseGraphStorage = init_storage( + backend="kuzu", working_dir=working_dir, namespace="graph" + ) + + def process(self, batch: pd.DataFrame) -> pd.DataFrame: + docs = batch.to_dict(orient="records") + docs = [Chunk.from_dict(doc["_chunk_id"], doc) for doc in docs] + + # consume 
the chunks and build kg + self.build_kg(docs) + return pd.DataFrame([{"status": "kg_building_completed"}]) + + def build_kg(self, chunks: List[Chunk]) -> None: + """ + Build knowledge graph (KG) and merge into kg_instance + """ + text_chunks = [chunk for chunk in chunks if chunk.type == "text"] + mm_chunks = [ + chunk + for chunk in chunks + if chunk.type in ("image", "video", "table", "formula") + ] + omics_chunks = [ + chunk + for chunk in chunks + if chunk.type in ("dna", "rna", "protein") + ] + + if len(text_chunks) == 0: + logger.info("All text chunks are already in the storage") + else: + logger.info("[Text Entity and Relation Extraction] processing ...") + build_text_kg( + llm_client=self.llm_client, + kg_instance=self.graph_storage, + chunks=text_chunks, + ) + if len(mm_chunks) == 0: + logger.info("All multi-modal chunks are already in the storage") + else: + logger.info("[Multi-modal Entity and Relation Extraction] processing ...") + build_mm_kg( + llm_client=self.llm_client, + kg_instance=self.graph_storage, + chunks=mm_chunks, + ) + if len(omics_chunks) == 0: + logger.info("All omics chunks are already in the storage") + else: + logger.info( + "[Omics Entity and Relation Extraction] processing %d chunks (DNA/RNA/protein)...", + len(omics_chunks) + ) + build_omics_kg( + llm_client=self.llm_client, + kg_instance=self.graph_storage, + chunks=omics_chunks, + ) + + self.graph_storage.index_done_callback() diff --git a/graphgen/operators/build_kg/build_mm_kg.py b/graphgen/operators/build_kg/build_mm_kg.py new file mode 100644 index 00000000..ee0459ea --- /dev/null +++ b/graphgen/operators/build_kg/build_mm_kg.py @@ -0,0 +1,50 @@ +from collections import defaultdict +from typing import List + +from graphgen.bases import BaseLLMWrapper +from graphgen.bases.base_storage import BaseGraphStorage +from graphgen.bases.datatypes import Chunk +from graphgen.models import MMKGBuilder +from graphgen.utils import run_concurrent + + +def build_mm_kg( + llm_client: BaseLLMWrapper, + kg_instance: BaseGraphStorage, + chunks: List[Chunk], +): + """ + Build multi-modal KG and merge into kg_instance + :param llm_client: Synthesizer LLM model to extract entities and relationships + :param kg_instance + :param chunks + :return: + """ + mm_builder = MMKGBuilder(llm_client=llm_client) + + results = run_concurrent( + mm_builder.extract, + chunks, + desc="[2/4] Extracting entities and relationships from multi-modal chunks", + unit="chunk", + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + for n, e in results: + for k, v in n.items(): + nodes[k].extend(v) + for k, v in e.items(): + edges[tuple(sorted(k))].extend(v) + + run_concurrent( + lambda kv: mm_builder.merge_nodes(kv, kg_instance=kg_instance), + list(nodes.items()), + desc="Inserting entities into storage", + ) + + run_concurrent( + lambda kv: mm_builder.merge_edges(kv, kg_instance=kg_instance), + list(edges.items()), + desc="Inserting relationships into storage", + ) diff --git a/graphgen/operators/build_kg/build_omics_kg.py b/graphgen/operators/build_kg/build_omics_kg.py new file mode 100644 index 00000000..503bb7eb --- /dev/null +++ b/graphgen/operators/build_kg/build_omics_kg.py @@ -0,0 +1,51 @@ +from collections import defaultdict +from typing import List + +from graphgen.bases import BaseLLMWrapper +from graphgen.bases.base_storage import BaseGraphStorage +from graphgen.bases.datatypes import Chunk +from graphgen.models import OmicsKGBuilder +from graphgen.utils import run_concurrent + + +def build_omics_kg( + llm_client: 
BaseLLMWrapper, + kg_instance: BaseGraphStorage, + chunks: List[Chunk], +): + """ + Build knowledge graph from multi-omics chunks (DNA, RNA, protein). + + :param llm_client: Synthesizer LLM model to extract entities and relationships + :param kg_instance: Graph storage instance + :param chunks: List of omics chunks + :return: None + """ + kg_builder = OmicsKGBuilder(llm_client=llm_client, max_loop=3) + + results = run_concurrent( + kg_builder.extract, + chunks, + desc="[2/4] Extracting entities and relationships from omics chunks", + unit="chunk", + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + for n, e in results: + for k, v in n.items(): + nodes[k].extend(v) + for k, v in e.items(): + edges[tuple(sorted(k))].extend(v) + + run_concurrent( + lambda kv: kg_builder.merge_nodes(kv, kg_instance=kg_instance), + list(nodes.items()), + desc="Inserting omics entities into storage", + ) + + run_concurrent( + lambda kv: kg_builder.merge_edges(kv, kg_instance=kg_instance), + list(edges.items()), + desc="Inserting omics relationships into storage", + ) diff --git a/graphgen/operators/build_kg/build_text_kg.py b/graphgen/operators/build_kg/build_text_kg.py new file mode 100644 index 00000000..1b5a8762 --- /dev/null +++ b/graphgen/operators/build_kg/build_text_kg.py @@ -0,0 +1,50 @@ +from collections import defaultdict +from typing import List + +from graphgen.bases import BaseLLMWrapper +from graphgen.bases.base_storage import BaseGraphStorage +from graphgen.bases.datatypes import Chunk +from graphgen.models import LightRAGKGBuilder +from graphgen.utils import run_concurrent + + +def build_text_kg( + llm_client: BaseLLMWrapper, + kg_instance: BaseGraphStorage, + chunks: List[Chunk], +): + """ + :param llm_client: Synthesizer LLM model to extract entities and relationships + :param kg_instance + :param chunks + :return: + """ + + kg_builder = LightRAGKGBuilder(llm_client=llm_client, max_loop=3) + + results = run_concurrent( + kg_builder.extract, + chunks, + desc="[2/4]Extracting entities and relationships from chunks", + unit="chunk", + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + for n, e in results: + for k, v in n.items(): + nodes[k].extend(v) + for k, v in e.items(): + edges[tuple(sorted(k))].extend(v) + + run_concurrent( + lambda kv: kg_builder.merge_nodes(kv, kg_instance=kg_instance), + list(nodes.items()), + desc="Inserting entities into storage", + ) + + run_concurrent( + lambda kv: kg_builder.merge_edges(kv, kg_instance=kg_instance), + list(edges.items()), + desc="Inserting relationships into storage", + ) diff --git a/graphgen/operators/chunk/__init__.py b/graphgen/operators/chunk/__init__.py new file mode 100644 index 00000000..f2f116f7 --- /dev/null +++ b/graphgen/operators/chunk/__init__.py @@ -0,0 +1 @@ +from .chunk_service import ChunkService diff --git a/graphgen/operators/chunk/chunk_service.py b/graphgen/operators/chunk/chunk_service.py new file mode 100644 index 00000000..b6775764 --- /dev/null +++ b/graphgen/operators/chunk/chunk_service.py @@ -0,0 +1,101 @@ +import os +from functools import lru_cache +from typing import Union + +import pandas as pd + +from graphgen.bases import BaseOperator +from graphgen.common import init_storage +from graphgen.models import ( + ChineseRecursiveTextSplitter, + RecursiveCharacterSplitter, + Tokenizer, +) +from graphgen.utils import compute_content_hash, detect_main_language + +_MAPPING = { + "en": RecursiveCharacterSplitter, + "zh": ChineseRecursiveTextSplitter, +} + +SplitterT = Union[RecursiveCharacterSplitter, 
ChineseRecursiveTextSplitter] + + +@lru_cache(maxsize=None) +def _get_splitter(language: str, frozen_kwargs: frozenset) -> SplitterT: + cls = _MAPPING[language] + kwargs = dict(frozen_kwargs) + return cls(**kwargs) + + +def split_chunks(text: str, language: str = "en", **kwargs) -> list: + if language not in _MAPPING: + raise ValueError( + f"Unsupported language: {language}. " + f"Supported languages are: {list(_MAPPING.keys())}" + ) + frozen_kwargs = frozenset( + (k, tuple(v) if isinstance(v, list) else v) for k, v in kwargs.items() + ) + splitter = _get_splitter(language, frozen_kwargs) + return splitter.split_text(text) + + +class ChunkService(BaseOperator): + def __init__(self, working_dir: str = "cache", **chunk_kwargs): + super().__init__(working_dir=working_dir, op_name="chunk_service") + tokenizer_model = os.getenv("TOKENIZER_MODEL", "cl100k_base") + self.tokenizer_instance: Tokenizer = Tokenizer(model_name=tokenizer_model) + self.chunk_storage = init_storage( + backend="rocksdb", + working_dir=working_dir, + namespace="chunk", + ) + self.chunk_kwargs = chunk_kwargs + + def process(self, batch: pd.DataFrame) -> pd.DataFrame: + docs = batch.to_dict(orient="records") + return pd.DataFrame(self.chunk_documents(docs)) + + def chunk_documents(self, new_docs: list) -> list: + chunks = [] + for doc in new_docs: + doc_id = doc.get("_doc_id") + doc_type = doc.get("type") + + if doc_type == "text": + doc_language = detect_main_language(doc["content"]) + text_chunks = split_chunks( + doc["content"], + language=doc_language, + **self.chunk_kwargs, + ) + + chunks.extend( + [ + { + "_chunk_id": compute_content_hash( + chunk_text, prefix="chunk-" + ), + "content": chunk_text, + "type": "text", + "_doc_id": doc_id, + "length": len(self.tokenizer_instance.encode(chunk_text)) + if self.tokenizer_instance + else len(chunk_text), + "language": doc_language, + } + for chunk_text in text_chunks + ] + ) + else: + # other types of documents(images, sequences) are not chunked + chunks.append( + { + "_chunk_id": doc_id.replace("doc-", f"{doc_type}-"), + **doc, + } + ) + self.chunk_storage.upsert({chunk["_chunk_id"]: chunk for chunk in chunks}) + self.chunk_storage.index_done_callback() + return chunks diff --git a/graphgen/operators/evaluate/__init__.py b/graphgen/operators/evaluate/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/operators/evaluate/evaluate.py b/graphgen/operators/evaluate/evaluate.py new file mode 100644 index 00000000..fdbfbf82 --- /dev/null +++ b/graphgen/operators/evaluate/evaluate.py @@ -0,0 +1,177 @@ +# TODO: this module needs refactoring to merge into GraphGen framework +"""Evaluate the quality of the generated text using various metrics""" + +import argparse +import json +import os + +import pandas as pd +from dotenv import load_dotenv + +from graphgen.bases.datatypes import QAPair +from graphgen.models import ( + LengthEvaluator, + MTLDEvaluator, + RewardEvaluator, + UniEvaluator, +) +from graphgen.utils import logger, set_logger + +sys_path = os.path.abspath(os.path.dirname(__file__)) +set_logger(os.path.join(sys_path, "cache", "logs", "evaluate.log")) + +load_dotenv() + + +def evaluate_length(corpus, tokenizer_name): + length_evaluator = LengthEvaluator(tokenizer_name=tokenizer_name) + logger.info("Length evaluator loaded") + scores = length_evaluator.get_average_score(corpus) + logger.info("Length scores: %s", scores) + return scores + + +def evaluate_mtld(corpus): + mtld_evaluator = MTLDEvaluator() + logger.info("MTLD evaluator loaded") + scores 
= mtld_evaluator.get_average_score(corpus) + logger.info("MTLD scores: %s", scores) + min_max_scores = mtld_evaluator.get_min_max_score(corpus) + logger.info("MTLD min max scores: %s", min_max_scores) + return scores, min_max_scores + + +def evaluate_reward(corpus, reward_model_names): + scores = [] + for reward_name in reward_model_names: + reward_evaluator = RewardEvaluator(reward_name=reward_name) + logger.info("Loaded reward model: %s", reward_name) + average_score = reward_evaluator.get_average_score(corpus) + logger.info("%s scores: %s", reward_name, average_score) + min_max_scores = reward_evaluator.get_min_max_score(corpus) + logger.info("%s min max scores: %s", reward_name, min_max_scores) + scores.append( + { + "reward_name": reward_name.split("/")[-1], + "score": average_score, + "min_max_scores": min_max_scores, + } + ) + del reward_evaluator + clean_gpu_cache() + return scores + + +def evaluate_uni(corpus, uni_model_name): + uni_evaluator = UniEvaluator(model_name=uni_model_name) + logger.info("Uni evaluator loaded with model %s", uni_model_name) + uni_scores = uni_evaluator.get_average_score(corpus) + for key, value in uni_scores.items(): + logger.info("Uni %s scores: %s", key, value) + min_max_scores = uni_evaluator.get_min_max_score(corpus) + for key, value in min_max_scores.items(): + logger.info("Uni %s min max scores: %s", key, value) + del uni_evaluator + clean_gpu_cache() + return ( + uni_scores["naturalness"], + uni_scores["coherence"], + uni_scores["understandability"], + min_max_scores["naturalness"], + min_max_scores["coherence"], + min_max_scores["understandability"], + ) + + +def clean_gpu_cache(): + import torch + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + +if __name__ == "__main__": + import torch.multiprocessing as mp + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--folder", type=str, default="cache/data", help="folder to load data" + ) + parser.add_argument( + "--output", type=str, default="cache/output", help="path to save output" + ) + + parser.add_argument( + "--tokenizer", type=str, default="cl100k_base", help="tokenizer name" + ) + parser.add_argument( + "--reward", + type=str, + default="OpenAssistant/reward-model-deberta-v3-large-v2", + help="Comma-separated list of reward models", + ) + parser.add_argument( + "--uni", type=str, default="MingZhong/unieval-sum", help="uni model name" + ) + + args = parser.parse_args() + + if not os.path.exists(args.folder): + raise ValueError(f"Folder {args.folder} does not exist") + + if not os.path.exists(args.output): + os.makedirs(args.output) + + reward_models = args.reward.split(",") + + results = [] + + logger.info("Data loaded from %s", args.folder) + mp.set_start_method("spawn") + + for file in os.listdir(args.folder): + if file.endswith(".json"): + logger.info("Processing %s", file) + with open(os.path.join(args.folder, file), "r", encoding="utf-8") as f: + data = json.load(f) + data = [ + QAPair(question=data[key]["question"], answer=data[key]["answer"]) + for key in data + ] + + length_scores = evaluate_length(data, args.tokenizer) + mtld_scores, min_max_mtld_scores = evaluate_mtld(data) + reward_scores = evaluate_reward(data, reward_models) + ( + uni_naturalness_scores, + uni_coherence_scores, + uni_understandability_scores, + min_max_uni_naturalness_scores, + min_max_uni_coherence_scores, + min_max_uni_understandability_scores, + ) = evaluate_uni(data, args.uni) + + result = { + "file": file, + "number": len(data), + "length": length_scores, + "mtld": mtld_scores, + 
"mtld_min_max": min_max_mtld_scores, + "uni_naturalness": uni_naturalness_scores, + "uni_coherence": uni_coherence_scores, + "uni_understandability": uni_understandability_scores, + "uni_naturalness_min_max": min_max_uni_naturalness_scores, + "uni_coherence_min_max": min_max_uni_coherence_scores, + "uni_understandability_min_max": min_max_uni_understandability_scores, + } + for reward_score in reward_scores: + result[reward_score["reward_name"]] = reward_score["score"] + result[f"{reward_score['reward_name']}_min_max"] = reward_score[ + "min_max_scores" + ] + + results.append(result) + + results = pd.DataFrame(results) + results.to_csv(os.path.join(args.output, "evaluation.csv"), index=False) diff --git a/graphgen/operators/extract/__init__.py b/graphgen/operators/extract/__init__.py new file mode 100644 index 00000000..6c7c2b94 --- /dev/null +++ b/graphgen/operators/extract/__init__.py @@ -0,0 +1 @@ +from .extract_service import ExtractService diff --git a/graphgen/operators/extract/extract_service.py b/graphgen/operators/extract/extract_service.py new file mode 100644 index 00000000..33987fcb --- /dev/null +++ b/graphgen/operators/extract/extract_service.py @@ -0,0 +1,45 @@ +import json + +import pandas as pd + +from graphgen.bases import BaseLLMWrapper, BaseOperator +from graphgen.common import init_llm +from graphgen.models.extractor import SchemaGuidedExtractor +from graphgen.utils import logger, run_concurrent + + +class ExtractService(BaseOperator): + def __init__(self, working_dir: str = "cache", **extract_kwargs): + super().__init__(working_dir=working_dir, op_name="extract_service") + self.llm_client: BaseLLMWrapper = init_llm("synthesizer") + self.extract_kwargs = extract_kwargs + self.method = self.extract_kwargs.get("method") + if self.method == "schema_guided": + schema_file = self.extract_kwargs.get("schema_path") + with open(schema_file, "r", encoding="utf-8") as f: + schema = json.load(f) + self.extractor = SchemaGuidedExtractor(self.llm_client, schema) + else: + raise ValueError(f"Unsupported extraction method: {self.method}") + + def process(self, batch: pd.DataFrame) -> pd.DataFrame: + items = batch.to_dict(orient="records") + return pd.DataFrame(self.extract(items)) + + def extract(self, items: list[dict]) -> list[dict]: + + logger.info("Start extracting information from %d items", len(items)) + + results = run_concurrent( + self.extractor.extract, + items, + desc="Extracting information", + unit="item", + ) + results = self.extractor.merge_extractions(results) + + results = [ + {"_extract_id": key, "extracted_data": value} + for key, value in results.items() + ] + return results diff --git a/graphgen/operators/extract_kg.py b/graphgen/operators/extract_kg.py deleted file mode 100644 index 3fad5525..00000000 --- a/graphgen/operators/extract_kg.py +++ /dev/null @@ -1,132 +0,0 @@ -import re -import asyncio -from typing import List -from collections import defaultdict - -import gradio as gr -from tqdm.asyncio import tqdm as tqdm_async -from graphgen.models import Chunk, OpenAIModel, Tokenizer -from graphgen.models.storage.base_storage import BaseGraphStorage -from graphgen.templates import KG_EXTRACTION_PROMPT -from graphgen.utils import (logger, pack_history_conversations, split_string_by_multi_markers, - handle_single_entity_extraction, handle_single_relationship_extraction, - detect_if_chinese) -from graphgen.operators.merge_kg import merge_nodes, merge_edges - - -# pylint: disable=too-many-statements -async def extract_kg( - llm_client: OpenAIModel, - kg_instance: 
BaseGraphStorage, - tokenizer_instance: Tokenizer, - chunks: List[Chunk], - progress_bar: gr.Progress = None, - max_concurrent: int = 1000 -): - """ - :param llm_client: Synthesizer LLM model to extract entities and relationships - :param kg_instance - :param tokenizer_instance - :param chunks - :param progress_bar: Gradio progress bar to show the progress of the extraction - :param max_concurrent - :return: - """ - - semaphore = asyncio.Semaphore(max_concurrent) - - async def _process_single_content(chunk: Chunk, max_loop: int = 3): - async with semaphore: - chunk_id = chunk.id - content = chunk.content - if detect_if_chinese(content): - language = "Chinese" - else: - language = "English" - KG_EXTRACTION_PROMPT["FORMAT"]["language"] = language - - hint_prompt = KG_EXTRACTION_PROMPT[language]["TEMPLATE"].format( - **KG_EXTRACTION_PROMPT["FORMAT"], input_text=content - ) - - final_result = await llm_client.generate_answer(hint_prompt) - logger.info('First result: %s', final_result) - - history = pack_history_conversations(hint_prompt, final_result) - for loop_index in range(max_loop): - if_loop_result = await llm_client.generate_answer( - text=KG_EXTRACTION_PROMPT[language]["IF_LOOP"], - history=history - ) - if_loop_result = if_loop_result.strip().strip('"').strip("'").lower() - if if_loop_result != "yes": - break - - glean_result = await llm_client.generate_answer( - text=KG_EXTRACTION_PROMPT[language]["CONTINUE"], - history=history - ) - logger.info('Loop %s glean: %s', loop_index, glean_result) - - history += pack_history_conversations(KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result) - final_result += glean_result - if loop_index == max_loop - 1: - break - - records = split_string_by_multi_markers( - final_result, - [ - KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"], - KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"]], - ) - - nodes = defaultdict(list) - edges = defaultdict(list) - - for record in records: - record = re.search(r"\((.*)\)", record) - if record is None: - continue - record = record.group(1) # 提取括号内的内容 - record_attributes = split_string_by_multi_markers( - record, [KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]] - ) - - entity = await handle_single_entity_extraction(record_attributes, chunk_id) - if entity is not None: - nodes[entity["entity_name"]].append(entity) - continue - relation = await handle_single_relationship_extraction(record_attributes, chunk_id) - if relation is not None: - edges[(relation["src_id"], relation["tgt_id"])].append(relation) - return dict(nodes), dict(edges) - - results = [] - chunk_number = len(chunks) - async for result in tqdm_async( - asyncio.as_completed([_process_single_content(c) for c in chunks]), - total=len(chunks), - desc="[3/4]Extracting entities and relationships from chunks", - unit="chunk", - ): - try: - if progress_bar is not None: - progress_bar(len(results) / chunk_number, desc="[3/4]Extracting entities and relationships from chunks") - results.append(await result) - if progress_bar is not None and len(results) == chunk_number: - progress_bar(1, desc="[3/4]Extracting entities and relationships from chunks") - except Exception as e: # pylint: disable=broad-except - logger.error("Error occurred while extracting entities and relationships from chunks: %s", e) - - nodes = defaultdict(list) - edges = defaultdict(list) - for n, e in results: - for k, v in n.items(): - nodes[k].extend(v) - for k, v in e.items(): - edges[tuple(sorted(k))].extend(v) - - await merge_nodes(nodes, kg_instance, llm_client, 
tokenizer_instance) - await merge_edges(edges, kg_instance, llm_client, tokenizer_instance) - - return kg_instance diff --git a/graphgen/operators/generate/__init__.py b/graphgen/operators/generate/__init__.py new file mode 100644 index 00000000..04057ce6 --- /dev/null +++ b/graphgen/operators/generate/__init__.py @@ -0,0 +1 @@ +from .generate_service import GenerateService diff --git a/graphgen/operators/generate/generate_service.py b/graphgen/operators/generate/generate_service.py new file mode 100644 index 00000000..7ad52dec --- /dev/null +++ b/graphgen/operators/generate/generate_service.py @@ -0,0 +1,71 @@ +import pandas as pd + +from graphgen.bases import BaseLLMWrapper, BaseOperator +from graphgen.common import init_llm +from graphgen.models import ( + AggregatedGenerator, + AtomicGenerator, + CoTGenerator, + MultiHopGenerator, + OmicsQAGenerator, + VQAGenerator, +) +from graphgen.utils import logger, run_concurrent + + +class GenerateService(BaseOperator): + """ + Generate question-answer pairs based on nodes and edges. + """ + + def __init__( + self, + working_dir: str = "cache", + method: str = "aggregated", + data_format: str = "ChatML", + ): + super().__init__(working_dir=working_dir, op_name="generate_service") + self.llm_client: BaseLLMWrapper = init_llm("synthesizer") + + self.method = method + self.data_format = data_format + + if self.method == "atomic": + self.generator = AtomicGenerator(self.llm_client) + elif self.method == "aggregated": + self.generator = AggregatedGenerator(self.llm_client) + elif self.method == "multi_hop": + self.generator = MultiHopGenerator(self.llm_client) + elif self.method == "cot": + self.generator = CoTGenerator(self.llm_client) + elif self.method == "omics_qa": + self.generator = OmicsQAGenerator(self.llm_client) + elif self.method in ["vqa"]: + self.generator = VQAGenerator(self.llm_client) + else: + raise ValueError(f"Unsupported generation mode: {method}") + + def process(self, batch: pd.DataFrame) -> pd.DataFrame: + items = batch.to_dict(orient="records") + return pd.DataFrame(self.generate(items)) + + def generate(self, items: list[dict]) -> list[dict]: + """ + Generate question-answer pairs based on nodes and edges. 
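+        Each item must provide the subgraph "nodes" and "edges" consumed by the configured generator.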
+ :param items + :return: QA pairs + """ + logger.info("[Generation] mode: %s, batches: %d", self.method, len(items)) + items = [(item["nodes"], item["edges"]) for item in items] + results = run_concurrent( + self.generator.generate, + items, + desc="[4/4]Generating QAs", + unit="batch", + ) + + results = self.generator.format_generation_results( + results, output_data_format=self.data_format + ) + + return results diff --git a/graphgen/operators/judge.py b/graphgen/operators/judge.py deleted file mode 100644 index 0292e1e4..00000000 --- a/graphgen/operators/judge.py +++ /dev/null @@ -1,188 +0,0 @@ -import math -import asyncio -from tqdm.asyncio import tqdm as tqdm_async -from graphgen.models import NetworkXStorage, OpenAIModel, JsonKVStorage -from graphgen.utils import logger, yes_no_loss_entropy -from graphgen.templates import STATEMENT_JUDGEMENT_PROMPT - - -async def judge_statement( # pylint: disable=too-many-statements - trainee_llm_client: OpenAIModel, - graph_storage: NetworkXStorage, - rephrase_storage: JsonKVStorage, - re_judge: bool = False, - max_concurrent: int = 1000) -> NetworkXStorage: - """ - Get all edges and nodes and judge them - - :param trainee_llm_client: judge the statements to get comprehension loss - :param graph_storage: graph storage instance - :param rephrase_storage: rephrase storage instance - :param re_judge: re-judge the relations - :param max_concurrent: max concurrent - :return: - """ - - semaphore = asyncio.Semaphore(max_concurrent) - - async def _judge_single_relation( - edge: tuple, - ): - async with semaphore: - source_id = edge[0] - target_id = edge[1] - edge_data = edge[2] - - if (not re_judge) and "loss" in edge_data and edge_data["loss"] is not None: - logger.info("Edge %s -> %s already judged, loss: %s, skip", source_id, target_id, edge_data["loss"]) - return source_id, target_id, edge_data - - description = edge_data["description"] - - try: - descriptions = await rephrase_storage.get_by_id(description) - assert descriptions is not None - - judgements = [] - gts = [gt for _, gt in descriptions] - for description, gt in descriptions: - judgement = await trainee_llm_client.generate_topk_per_token( - STATEMENT_JUDGEMENT_PROMPT['TEMPLATE'].format(statement=description) - ) - judgements.append(judgement[0].top_candidates) - - loss = yes_no_loss_entropy(judgements, gts) - - logger.info("Edge %s -> %s description: %s loss: %s", source_id, target_id, description, loss) - - edge_data["loss"] = loss - except Exception as e: # pylint: disable=broad-except - logger.error("Error in judging relation %s -> %s: %s", source_id, target_id, e) - logger.info("Use default loss 0.1") - edge_data["loss"] = -math.log(0.1) - - await graph_storage.update_edge(source_id, target_id, edge_data) - return source_id, target_id, edge_data - - edges = await graph_storage.get_all_edges() - - results = [] - for result in tqdm_async( - asyncio.as_completed([_judge_single_relation(edge) for edge in edges]), - total=len(edges), - desc="Judging relations" - ): - results.append(await result) - - async def _judge_single_entity( - node: tuple, - ): - async with semaphore: - node_id = node[0] - node_data = node[1] - - if (not re_judge) and "loss" in node_data and node_data["loss"] is not None: - logger.info("Node %s already judged, loss: %s, skip", node_id, node_data["loss"]) - return node_id, node_data - - description = node_data["description"] - - try: - descriptions = await rephrase_storage.get_by_id(description) - assert descriptions is not None - - judgements = [] - gts = [gt for _, gt 
in descriptions] - for description, gt in descriptions: - judgement = await trainee_llm_client.generate_topk_per_token( - STATEMENT_JUDGEMENT_PROMPT['TEMPLATE'].format(statement=description) - ) - judgements.append(judgement[0].top_candidates) - - loss = yes_no_loss_entropy(judgements, gts) - - logger.info("Node %s description: %s loss: %s", node_id, description, loss) - - node_data["loss"] = loss - except Exception as e: # pylint: disable=broad-except - logger.error("Error in judging entity %s: %s", node_id, e) - logger.info("Use default loss 0.1") - node_data["loss"] = -math.log(0.1) - - await graph_storage.update_node(node_id, node_data) - return node_id, node_data - - nodes = await graph_storage.get_all_nodes() - - results = [] - for result in tqdm_async( - asyncio.as_completed([_judge_single_entity(node) for node in nodes]), - total=len(nodes), - desc="Judging entities" - ): - results.append(await result) - - return graph_storage - -async def skip_judge_statement( - graph_storage: NetworkXStorage, - max_concurrent: int = 1000 -): - """ - Skip the judgement of the statement - :param graph_storage: graph storage instance - :param max_concurrent: max concurrent - :return: - """ - semaphore = asyncio.Semaphore(max_concurrent) - - async def _skip_single_relation( - edge: tuple, - ): - async with semaphore: - source_id = edge[0] - target_id = edge[1] - edge_data = edge[2] - - if "loss" in edge_data and edge_data["loss"] is not None: - logger.info("Edge %s -> %s already judged, loss: %s, skip", source_id, target_id, edge_data["loss"]) - return source_id, target_id, edge_data - - edge_data["loss"] = -math.log(0.1) - await graph_storage.update_edge(source_id, target_id, edge_data) - return source_id, target_id, edge_data - - edges = await graph_storage.get_all_edges() - results = [] - for result in tqdm_async( - asyncio.as_completed([_skip_single_relation(edge) for edge in edges]), - total=len(edges), - desc="Skipping judgement of relations" - ): - results.append(await result) - - async def _skip_single_entity( - node: tuple, - ): - async with semaphore: - node_id = node[0] - node_data = node[1] - - if "loss" in node_data and node_data["loss"] is not None: - logger.info("Node %s already judged, loss: %s, skip", node_id, node_data["loss"]) - return node_id, node_data - - node_data["loss"] = -math.log(0.1) - await graph_storage.update_node(node_id, node_data) - return node_id, node_data - - nodes = await graph_storage.get_all_nodes() - results = [] - for result in tqdm_async( - asyncio.as_completed([_skip_single_entity(node) for node in nodes]), - total=len(nodes), - desc="Skipping judgement of entities" - ): - results.append(await result) - - return graph_storage diff --git a/graphgen/operators/judge/__init__.py b/graphgen/operators/judge/__init__.py new file mode 100644 index 00000000..32ccf5c2 --- /dev/null +++ b/graphgen/operators/judge/__init__.py @@ -0,0 +1 @@ +from .judge_service import JudgeService diff --git a/graphgen/operators/judge/judge_service.py b/graphgen/operators/judge/judge_service.py new file mode 100644 index 00000000..35797084 --- /dev/null +++ b/graphgen/operators/judge/judge_service.py @@ -0,0 +1,70 @@ +import math + +import pandas as pd + +from graphgen.bases import BaseGraphStorage, BaseLLMWrapper, BaseOperator +from graphgen.common import init_llm, init_storage +from graphgen.templates import STATEMENT_JUDGEMENT_PROMPT +from graphgen.utils import logger, run_concurrent, yes_no_loss_entropy + + +class JudgeService(BaseOperator): + """Service for judging graph edges and 
nodes using a trainee LLM.""" + + def __init__(self, working_dir: str = "cache"): + super().__init__(working_dir=working_dir, op_name="judge_service") + self.llm_client: BaseLLMWrapper = init_llm("trainee") + self.graph_storage: BaseGraphStorage = init_storage( + backend="kuzu", + working_dir=working_dir, + namespace="graph", + ) + + def process(self, batch: pd.DataFrame) -> pd.DataFrame: + items = batch.to_dict(orient="records") + self.graph_storage.reload() + self.judge(items) + return pd.DataFrame([{"status": "judging_completed"}]) + + async def _process_single_judge(self, item: dict) -> dict: + description = item["description"] + try: + judgement = await self.llm_client.generate_topk_per_token( + STATEMENT_JUDGEMENT_PROMPT["TEMPLATE"].format(statement=description) + ) + top_candidates = judgement[0].top_candidates + gt = item.get("ground_truth", "yes") + loss = yes_no_loss_entropy([top_candidates], [gt]) + logger.debug("Description: %s Loss: %s", description, loss) + item["loss"] = loss + except Exception as e: # pylint: disable=broad-except + logger.error("Error in judging description: %s", e) + logger.info("Use default loss 0.1") + item["loss"] = -math.log(0.1) + return item + + def judge(self, items: list[dict]) -> None: + """ + Judge the description in the item and compute the loss. + """ + results = run_concurrent( + self._process_single_judge, + items, + desc="Judging descriptions", + unit="description", + ) + # Update the graph storage with the computed losses + for item in results: + index = item["index"] + loss = item["loss"] + if isinstance(index, str): + node_id = index + node_data = self.graph_storage.get_node(node_id) + node_data["loss"] = loss + self.graph_storage.update_node(node_id, node_data) + elif isinstance(index, tuple): + edge_source, edge_target = index + edge_data = self.graph_storage.get_edge(edge_source, edge_target) + edge_data["loss"] = loss + self.graph_storage.update_edge(edge_source, edge_target, edge_data) + self.graph_storage.index_done_callback() diff --git a/graphgen/operators/merge_kg.py b/graphgen/operators/merge_kg.py deleted file mode 100644 index 33aa1395..00000000 --- a/graphgen/operators/merge_kg.py +++ /dev/null @@ -1,215 +0,0 @@ -from collections import Counter -import asyncio -from tqdm.asyncio import tqdm as tqdm_async - -from graphgen.utils.format import split_string_by_multi_markers -from graphgen.utils import logger, detect_main_language -from graphgen.models import TopkTokenModel, Tokenizer -from graphgen.models.storage.base_storage import BaseGraphStorage -from graphgen.templates import KG_SUMMARIZATION_PROMPT, KG_EXTRACTION_PROMPT - -async def _handle_kg_summary( - entity_or_relation_name: str, - description: str, - llm_client: TopkTokenModel, - tokenizer_instance: Tokenizer, - max_summary_tokens: int = 200 -) -> str: - """ - 处理实体或关系的描述信息 - - :param entity_or_relation_name - :param description - :param llm_client - :param tokenizer_instance - :param max_summary_tokens - :return: new description - """ - language = detect_main_language(description) - if language == "en": - language = "English" - else: - language = "Chinese" - KG_EXTRACTION_PROMPT["FORMAT"]["language"] = language - - tokens = tokenizer_instance.encode_string(description) - if len(tokens) < max_summary_tokens: - return description - - use_description = tokenizer_instance.decode_tokens(tokens[:max_summary_tokens]) - prompt = KG_SUMMARIZATION_PROMPT[language]["TEMPLATE"].format( - entity_name=entity_or_relation_name, - description_list=use_description.split(''), - 
**KG_SUMMARIZATION_PROMPT["FORMAT"] - ) - new_description = await llm_client.generate_answer(prompt) - logger.info("Entity or relation %s summary: %s", entity_or_relation_name, new_description) - return new_description - - -async def merge_nodes( - nodes_data: dict, - kg_instance: BaseGraphStorage, - llm_client: TopkTokenModel, - tokenizer_instance: Tokenizer, - max_concurrent: int = 1000 -): - """ - Merge nodes - - :param nodes_data - :param kg_instance - :param llm_client - :param tokenizer_instance - :param max_concurrent - :return - """ - - semaphore = asyncio.Semaphore(max_concurrent) - - async def process_single_node(entity_name: str, node_data: list[dict]): - async with semaphore: - entity_types = [] - source_ids = [] - descriptions = [] - - node = await kg_instance.get_node(entity_name) - if node is not None: - entity_types.append(node["entity_type"]) - source_ids.extend( - split_string_by_multi_markers(node["source_id"], ['']) - ) - descriptions.append(node["description"]) - - # 统计当前节点数据和已有节点数据的entity_type出现次数,取出现次数最多的entity_type - entity_type = sorted( - Counter( - [dp["entity_type"] for dp in node_data] + entity_types - ).items(), - key=lambda x: x[1], - reverse=True, - )[0][0] - - description = ''.join( - sorted(set([dp["description"] for dp in node_data] + descriptions)) - ) - description = await _handle_kg_summary( - entity_name, description, llm_client, tokenizer_instance - ) - - source_id = ''.join( - set([dp["source_id"] for dp in node_data] + source_ids) - ) - - node_data = { - "entity_type": entity_type, - "description": description, - "source_id": source_id - } - await kg_instance.upsert_node( - entity_name, - node_data=node_data - ) - node_data["entity_name"] = entity_name - return node_data - - logger.info("Inserting entities into storage...") - entities_data = [] - for result in tqdm_async( - asyncio.as_completed( - [process_single_node(k, v) for k, v in nodes_data.items()] - ), - total=len(nodes_data), - desc="Inserting entities into storage", - unit="entity", - ): - try: - entities_data.append(await result) - except Exception as e: # pylint: disable=broad-except - logger.error("Error occurred while inserting entities into storage: %s", e) - - -async def merge_edges( - edges_data: dict, - kg_instance: BaseGraphStorage, - llm_client: TopkTokenModel, - tokenizer_instance: Tokenizer, - max_concurrent: int = 1000 -): - """ - Merge edges - - :param edges_data - :param kg_instance - :param llm_client - :param tokenizer_instance - :param max_concurrent - :return - """ - - semaphore = asyncio.Semaphore(max_concurrent) - - async def process_single_edge(src_id: str, tgt_id: str, edge_data: list[dict]): - async with semaphore: - source_ids = [] - descriptions = [] - - edge = await kg_instance.get_edge(src_id, tgt_id) - if edge is not None: - source_ids.extend( - split_string_by_multi_markers(edge["source_id"], ['']) - ) - descriptions.append(edge["description"]) - - description = ''.join( - sorted(set([dp["description"] for dp in edge_data] + descriptions)) - ) - source_id = ''.join( - set([dp["source_id"] for dp in edge_data] + source_ids) - ) - - for insert_id in [src_id, tgt_id]: - if not await kg_instance.has_node(insert_id): - await kg_instance.upsert_node( - insert_id, - node_data={ - "source_id": source_id, - "description": description, - "entity_type": "UNKNOWN" - } - ) - - description = await _handle_kg_summary( - f"({src_id}, {tgt_id})", description, llm_client, tokenizer_instance - ) - - await kg_instance.upsert_edge( - src_id, - tgt_id, - edge_data={ - "source_id": 
source_id, - "description": description - } - ) - - edge_data = { - "src_id": src_id, - "tgt_id": tgt_id, - "description": description - } - return edge_data - - logger.info("Inserting relationships into storage...") - relationships_data = [] - for result in tqdm_async( - asyncio.as_completed( - [process_single_edge(src_id, tgt_id, v) for (src_id, tgt_id), v in edges_data.items()] - ), - total=len(edges_data), - desc="Inserting relationships into storage", - unit="relationship", - ): - try: - relationships_data.append(await result) - except Exception as e: # pylint: disable=broad-except - logger.error("Error occurred while inserting relationships into storage: %s", e) diff --git a/graphgen/operators/partition/__init__.py b/graphgen/operators/partition/__init__.py new file mode 100644 index 00000000..8d586b95 --- /dev/null +++ b/graphgen/operators/partition/__init__.py @@ -0,0 +1 @@ +from .partition_service import PartitionService diff --git a/graphgen/operators/partition/partition_service.py b/graphgen/operators/partition/partition_service.py new file mode 100644 index 00000000..e56a093e --- /dev/null +++ b/graphgen/operators/partition/partition_service.py @@ -0,0 +1,292 @@ +import json +import os +from typing import Iterable + +import pandas as pd + +from graphgen.bases import BaseGraphStorage, BaseKVStorage, BaseOperator, BaseTokenizer +from graphgen.common import init_storage +from graphgen.models import ( + AnchorBFSPartitioner, + BFSPartitioner, + DFSPartitioner, + ECEPartitioner, + LeidenPartitioner, + Tokenizer, +) +from graphgen.utils import logger + + +class PartitionService(BaseOperator): + def __init__(self, working_dir: str = "cache", **partition_kwargs): + super().__init__(working_dir=working_dir, op_name="partition_service") + self.kg_instance: BaseGraphStorage = init_storage( + backend="kuzu", + working_dir=working_dir, + namespace="graph", + ) + self.chunk_storage: BaseKVStorage = init_storage( + backend="rocksdb", + working_dir=working_dir, + namespace="chunk", + ) + tokenizer_model = os.getenv("TOKENIZER_MODEL", "cl100k_base") + self.tokenizer_instance: BaseTokenizer = Tokenizer(model_name=tokenizer_model) + self.partition_kwargs = partition_kwargs + + def process(self, batch: pd.DataFrame) -> Iterable[pd.DataFrame]: + # this operator does not consume any batch data + # but for compatibility we keep the interface + _ = batch.to_dict(orient="records") + self.kg_instance.reload() + self.chunk_storage.reload() + + yield from self.partition() + + def partition(self) -> Iterable[pd.DataFrame]: + method = self.partition_kwargs["method"] + method_params = self.partition_kwargs["method_params"] + if method == "bfs": + logger.info("Partitioning knowledge graph using BFS method.") + partitioner = BFSPartitioner() + elif method == "dfs": + logger.info("Partitioning knowledge graph using DFS method.") + partitioner = DFSPartitioner() + elif method == "ece": + logger.info("Partitioning knowledge graph using ECE method.") + # TODO: before ECE partitioning, we need to: + # 1. 'quiz' and 'judge' to get the comprehension loss if unit_sampling is not random + # 2. 
pre-tokenize nodes and edges to get the token length + self._pre_tokenize() + partitioner = ECEPartitioner() + elif method == "leiden": + logger.info("Partitioning knowledge graph using Leiden method.") + partitioner = LeidenPartitioner() + elif method == "anchor_bfs": + logger.info("Partitioning knowledge graph using Anchor BFS method.") + anchor_type = method_params.get("anchor_type") + if isinstance(anchor_type, list): + logger.info("Using multiple anchor types: %s", anchor_type) + else: + logger.info("Using single anchor type: %s", anchor_type) + partitioner = AnchorBFSPartitioner( + anchor_type=anchor_type, + anchor_ids=set(method_params.get("anchor_ids", [])) + if method_params.get("anchor_ids") + else None, + ) + else: + raise ValueError(f"Unsupported partition method: {method}") + + communities = partitioner.partition(g=self.kg_instance, **method_params) + + for community in communities: + batch = partitioner.community2batch(community, g=self.kg_instance) + batch = self._attach_additional_data_to_node(batch) + + yield pd.DataFrame( + { + "nodes": [batch[0]], + "edges": [batch[1]], + } + ) + + def _pre_tokenize(self) -> None: + """Pre-tokenize all nodes and edges to add token length information.""" + logger.info("Starting pre-tokenization of nodes and edges...") + + nodes = self.kg_instance.get_all_nodes() + edges = self.kg_instance.get_all_edges() + + # Process nodes + for node_id, node_data in nodes: + if "length" not in node_data: + try: + description = node_data.get("description", "") + tokens = self.tokenizer_instance.encode(description) + node_data["length"] = len(tokens) + self.kg_instance.update_node(node_id, node_data) + except Exception as e: + logger.warning("Failed to tokenize node %s: %s", node_id, e) + node_data["length"] = 0 + + # Process edges + for u, v, edge_data in edges: + if "length" not in edge_data: + try: + description = edge_data.get("description", "") + tokens = self.tokenizer_instance.encode(description) + edge_data["length"] = len(tokens) + self.kg_instance.update_edge(u, v, edge_data) + except Exception as e: + logger.warning("Failed to tokenize edge %s-%s: %s", u, v, e) + edge_data["length"] = 0 + + # Persist changes + self.kg_instance.index_done_callback() + logger.info("Pre-tokenization completed.") + + def _attach_additional_data_to_node(self, batch: tuple) -> tuple: + """ + Attach additional data from chunk_storage to nodes in the batch. + :param batch: tuple of (nodes_data, edges_data) + :return: updated batch with additional data attached to nodes + """ + nodes_data, edges_data = batch + + for node_id, node_data in nodes_data: + entity_type = (node_data.get("entity_type") or "").lower() + + if not entity_type: + continue + + source_ids = [ + sid.strip() + for sid in node_data.get("source_id", "").split("") + if sid.strip() + ] + + if not source_ids: + continue + + # Handle images + if "image" in entity_type: + image_chunks = [ + data + for sid in source_ids + if "image" in sid.lower() + and (data := self.chunk_storage.get_by_id(sid)) + ] + if image_chunks: + # The generator expects a dictionary with an 'img_path' key, not a list of captions. + # We'll use the first image chunk found for this node. 
+ node_data["image_data"] = json.loads(image_chunks[0]["content"]) + logger.debug("Attached image data to node %s", node_id) + + # Handle omics data (protein/dna/rna) + molecule_type = None + if entity_type in ("protein", "dna", "rna"): + molecule_type = entity_type + else: + # Infer from source_id prefix + for sid in source_ids: + sid_lower = sid.lower() + if sid_lower.startswith("protein-"): + molecule_type = "protein" + break + if sid_lower.startswith("dna-"): + molecule_type = "dna" + break + if sid_lower.startswith("rna-"): + molecule_type = "rna" + break + + if molecule_type: + omics_chunks = [ + data + for sid in source_ids + if (data := self.chunk_storage.get_by_id(sid)) + ] + + if not omics_chunks: + logger.warning( + "No chunks found for node %s (type: %s) with source_ids: %s", + node_id, molecule_type, source_ids + ) + continue + + def get_chunk_value(chunk: dict, field: str): + # First check root level of chunk + if field in chunk: + return chunk[field] + # Then check metadata if it exists and is a dict + chunk_metadata = chunk.get("metadata") + if isinstance(chunk_metadata, dict) and field in chunk_metadata: + return chunk_metadata[field] + return None + + # Group chunks by molecule type to preserve all types of sequences + chunks_by_type = {"dna": [], "rna": [], "protein": []} + for chunk in omics_chunks: + chunk_id = chunk.get("_chunk_id", "").lower() + if chunk_id.startswith("dna-"): + chunks_by_type["dna"].append(chunk) + elif chunk_id.startswith("rna-"): + chunks_by_type["rna"].append(chunk) + elif chunk_id.startswith("protein-"): + chunks_by_type["protein"].append(chunk) + + # Field mappings for each molecule type + field_mapping = { + "protein": [ + "protein_name", "gene_names", "organism", "function", + "sequence", "id", "database", "entry_name", "uniprot_id" + ], + "dna": [ + "gene_name", "gene_description", "organism", "chromosome", + "genomic_location", "function", "gene_type", "sequence", + "id", "database" + ], + "rna": [ + "rna_type", "description", "organism", "related_genes", + "gene_name", "so_term", "sequence", "id", "database", + "rnacentral_id" + ], + } + + # Extract and store captions for each molecule type + for mol_type in ["dna", "rna", "protein"]: + type_chunks = chunks_by_type[mol_type] + if not type_chunks: + continue + + # Use the first chunk of this type + type_chunk = type_chunks[0] + caption = {} + + # Extract all relevant fields for this molecule type + for field in field_mapping.get(mol_type, []): + value = get_chunk_value(type_chunk, field) + if value: + caption[field] = value + + # Store caption if it has any data + if caption: + caption_key = f"{mol_type}_caption" + node_data[caption_key] = caption + logger.debug("Stored %s caption for node %s with %d fields", mol_type, node_id, len(caption)) + + # For backward compatibility, also attach sequence and other fields from the primary molecule type + # Use the detected molecule_type or default to the first available type + primary_chunk = None + if chunks_by_type.get(molecule_type): + primary_chunk = chunks_by_type[molecule_type][0] + elif chunks_by_type["dna"]: + primary_chunk = chunks_by_type["dna"][0] + elif chunks_by_type["rna"]: + primary_chunk = chunks_by_type["rna"][0] + elif chunks_by_type["protein"]: + primary_chunk = chunks_by_type["protein"][0] + else: + primary_chunk = omics_chunks[0] + + # Attach sequence if not already present (for backward compatibility) + if "sequence" not in node_data: + sequence = get_chunk_value(primary_chunk, "sequence") + if sequence: + node_data["sequence"] 
= sequence + + # Attach molecule_type if not present + if "molecule_type" not in node_data: + chunk_molecule_type = get_chunk_value(primary_chunk, "molecule_type") + if chunk_molecule_type: + node_data["molecule_type"] = chunk_molecule_type + + # Attach molecule-specific fields from primary chunk (for backward compatibility) + for field in field_mapping.get(molecule_type, []): + if field not in node_data: + value = get_chunk_value(primary_chunk, field) + if value: + node_data[field] = value + + return nodes_data, edges_data diff --git a/graphgen/operators/preprocess/__init__.py b/graphgen/operators/preprocess/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/operators/resolute_coreference.py b/graphgen/operators/preprocess/resolute_coreference.py similarity index 60% rename from graphgen/operators/resolute_coreference.py rename to graphgen/operators/preprocess/resolute_coreference.py index 4a1012fb..a4da6a86 100644 --- a/graphgen/operators/resolute_coreference.py +++ b/graphgen/operators/preprocess/resolute_coreference.py @@ -1,12 +1,14 @@ from typing import List -from graphgen.models import Chunk -from graphgen.models import OpenAIModel -from graphgen.templates import COREFERENCE_RESOLUTION_TEMPLATE + +from graphgen.bases.datatypes import Chunk +from graphgen.models import OpenAIClient +from graphgen.templates import COREFERENCE_RESOLUTION_PROMPT from graphgen.utils import detect_main_language + async def resolute_coreference( - llm_client: OpenAIModel, - chunks: List[Chunk]) -> List[Chunk]: + llm_client: OpenAIClient, chunks: List[Chunk] +) -> List[Chunk]: """ Resolute conference @@ -23,9 +25,8 @@ async def resolute_coreference( for _, chunk in enumerate(chunks[1:]): language = detect_main_language(chunk.content) result = await llm_client.generate_answer( - COREFERENCE_RESOLUTION_TEMPLATE[language].format( - reference = results[0].content, - input_sentence = chunk.content + COREFERENCE_RESOLUTION_PROMPT[language].format( + reference=results[0].content, input_sentence=chunk.content ) ) results.append(Chunk(id=chunk.id, content=result)) diff --git a/graphgen/operators/quiz.py b/graphgen/operators/quiz.py deleted file mode 100644 index 36edddb1..00000000 --- a/graphgen/operators/quiz.py +++ /dev/null @@ -1,109 +0,0 @@ -import asyncio -from collections import defaultdict - -from tqdm.asyncio import tqdm as tqdm_async -from graphgen.models import JsonKVStorage, OpenAIModel, NetworkXStorage -from graphgen.utils import logger, detect_main_language -from graphgen.templates import DESCRIPTION_REPHRASING_PROMPT - - -async def quiz( - synth_llm_client: OpenAIModel, - graph_storage: NetworkXStorage, - rephrase_storage: JsonKVStorage, - max_samples: int = 1, - max_concurrent: int = 1000) -> JsonKVStorage: - """ - Get all edges and quiz them - - :param synth_llm_client: generate statements - :param graph_storage: graph storage instance - :param rephrase_storage: rephrase storage instance - :param max_samples: max samples for each edge - :param max_concurrent: max concurrent - :return: - """ - - semaphore = asyncio.Semaphore(max_concurrent) - - async def _process_single_quiz( - des: str, - prompt: str, - gt: str - ): - async with semaphore: - try: - # 如果在rephrase_storage中已经存在,直接取出 - descriptions = await rephrase_storage.get_by_id(des) - if descriptions: - return None - - new_description = await synth_llm_client.generate_answer( - prompt, - temperature=1 - ) - return {des: [(new_description, gt)]} - - except Exception as e: # pylint: disable=broad-except - 
logger.error("Error when quizzing description %s: %s", des, e) - return None - - - edges = await graph_storage.get_all_edges() - nodes = await graph_storage.get_all_nodes() - - results = defaultdict(list) - tasks = [] - for edge in edges: - edge_data = edge[2] - - description = edge_data["description"] - language = "English" if detect_main_language(description) == "en" else "Chinese" - - results[description] = [(description, 'yes')] - - for i in range(max_samples): - if i > 0: - tasks.append( - _process_single_quiz(description, - DESCRIPTION_REPHRASING_PROMPT[language]['TEMPLATE'].format( - input_sentence=description), 'yes') - ) - tasks.append(_process_single_quiz(description, - DESCRIPTION_REPHRASING_PROMPT[language]['ANTI_TEMPLATE'].format( - input_sentence=description), 'no')) - - for node in nodes: - node_data = node[1] - description = node_data["description"] - language = "English" if detect_main_language(description) == "en" else "Chinese" - - results[description] = [(description, 'yes')] - - for i in range(max_samples): - if i > 0: - tasks.append( - _process_single_quiz(description, - DESCRIPTION_REPHRASING_PROMPT[language]['TEMPLATE'].format( - input_sentence=description), 'yes') - ) - tasks.append(_process_single_quiz(description, - DESCRIPTION_REPHRASING_PROMPT[language]['ANTI_TEMPLATE'].format( - input_sentence=description), 'no')) - - for result in tqdm_async( - asyncio.as_completed(tasks), - total=len(tasks), - desc="Quizzing descriptions" - ): - new_result = await result - if new_result: - for key, value in new_result.items(): - results[key].extend(value) - - for key, value in results.items(): - results[key] = list(set(value)) - await rephrase_storage.upsert({key: results[key]}) - - - return rephrase_storage diff --git a/graphgen/operators/quiz/__init__.py b/graphgen/operators/quiz/__init__.py new file mode 100644 index 00000000..2a931f4b --- /dev/null +++ b/graphgen/operators/quiz/__init__.py @@ -0,0 +1 @@ +from .quiz_service import QuizService diff --git a/graphgen/operators/quiz/quiz_service.py b/graphgen/operators/quiz/quiz_service.py new file mode 100644 index 00000000..66dc1193 --- /dev/null +++ b/graphgen/operators/quiz/quiz_service.py @@ -0,0 +1,112 @@ +from collections.abc import Iterable + +import pandas as pd + +from graphgen.bases import BaseGraphStorage, BaseKVStorage, BaseLLMWrapper, BaseOperator +from graphgen.common import init_llm, init_storage +from graphgen.models import QuizGenerator +from graphgen.utils import compute_dict_hash, logger, run_concurrent + + +class QuizService(BaseOperator): + def __init__( + self, + working_dir: str = "cache", + quiz_samples: int = 1, + concurrency_limit: int = 200, + ): + super().__init__(working_dir=working_dir, op_name="quiz_service") + self.quiz_samples = quiz_samples + self.llm_client: BaseLLMWrapper = init_llm("synthesizer") + self.graph_storage: BaseGraphStorage = init_storage( + backend="networkx", working_dir=working_dir, namespace="graph" + ) + # { _quiz_id: { "description": str, "quizzes": List[Tuple[str, str]] } } + self.quiz_storage: BaseKVStorage = init_storage( + backend="rocksdb", working_dir=working_dir, namespace="quiz" + ) + self.generator = QuizGenerator(self.llm_client) + self.concurrency_limit = concurrency_limit + + def process(self, batch: pd.DataFrame) -> Iterable[pd.DataFrame]: + # this operator does not consume any batch data + # but for compatibility we keep the interface + _ = batch.to_dict(orient="records") + self.graph_storage.reload() + yield from self.quiz() + + async def 
_process_single_quiz(self, item: tuple) -> dict | None: + # if quiz in quiz_storage exists already, directly get it + index, desc = item + _quiz_id = compute_dict_hash({"index": index, "description": desc}) + if self.quiz_storage.get_by_id(_quiz_id): + return None + + tasks = [] + for i in range(self.quiz_samples): + if i > 0: + tasks.append((desc, "TEMPLATE", "yes")) + tasks.append((desc, "ANTI_TEMPLATE", "no")) + try: + quizzes = [] + for d, template_type, gt in tasks: + prompt = self.generator.build_prompt_for_description(d, template_type) + new_description = await self.llm_client.generate_answer( + prompt, temperature=1 + ) + rephrased_text = self.generator.parse_rephrased_text(new_description) + quizzes.append((rephrased_text, gt)) + return { + "_quiz_id": _quiz_id, + "description": desc, + "index": index, + "quizzes": quizzes, + } + except Exception as e: + logger.error("Error when quizzing description %s: %s", item, e) + return None + + def quiz(self) -> Iterable[pd.DataFrame]: + """ + Get all nodes and edges and quiz their descriptions using QuizGenerator. + """ + edges = self.graph_storage.get_all_edges() + nodes = self.graph_storage.get_all_nodes() + + items = [] + + for edge in edges: + edge_data = edge[2] + desc = edge_data["description"] + items.append(((edge[0], edge[1]), desc)) + + for node in nodes: + node_data = node[1] + desc = node_data["description"] + items.append((node[0], desc)) + + logger.info("Total descriptions to quiz: %d", len(items)) + + for i in range(0, len(items), self.concurrency_limit): + batch_items = items[i : i + self.concurrency_limit] + batch_results = run_concurrent( + self._process_single_quiz, + batch_items, + desc=f"Quizzing descriptions ({i} / {i + len(batch_items)})", + unit="description", + ) + + final_results = [] + for new_result in batch_results: + if new_result: + self.quiz_storage.upsert( + { + new_result["_quiz_id"]: { + "description": new_result["description"], + "quizzes": new_result["quizzes"], + } + } + ) + final_results.append(new_result) + self.quiz_storage.index_done_callback() + yield pd.DataFrame(final_results) diff --git a/graphgen/operators/read/__init__.py b/graphgen/operators/read/__init__.py new file mode 100644 index 00000000..cd22453e --- /dev/null +++ b/graphgen/operators/read/__init__.py @@ -0,0 +1 @@ +from .read import read, read_files diff --git a/graphgen/operators/read/parallel_file_scanner.py b/graphgen/operators/read/parallel_file_scanner.py new file mode 100644 index 00000000..84219139 --- /dev/null +++ b/graphgen/operators/read/parallel_file_scanner.py @@ -0,0 +1,216 @@ +import os +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path +from typing import Any, Dict, List, Set, Union + +from graphgen.models import RocksDBCache + + +class ParallelFileScanner: + def __init__( + self, cache_dir: str, allowed_suffix, rescan: bool = False, max_workers: int = 4 + ): + self.cache = RocksDBCache(os.path.join(cache_dir, "input_paths.db")) + self.allowed_suffix = set(allowed_suffix) if allowed_suffix else None + self.rescan = rescan + self.max_workers = max_workers + + def scan( + self, paths: Union[str, List[str]], recursive: bool = True + ) -> Dict[str, Any]: + if isinstance(paths, str): + paths = [paths] + + results = {} + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + future_to_path = {} + for p in paths: + if os.path.exists(p): + future = executor.submit( + self._scan_files, Path(p).resolve(), recursive, set() + ) + future_to_path[future] = p + + for 
future in as_completed(future_to_path): + path = future_to_path[future] + try: + results[path] = future.result() + except Exception as e: + results[path] = { + "error": str(e), + "files": [], + "dirs": [], + "stats": {}, + } + return results + + def _scan_files( + self, path: Path, recursive: bool, visited: Set[str] + ) -> Dict[str, Any]: + path_str = str(path) + + # Avoid cycles due to symlinks + if path_str in visited: + return self._empty_result(path_str) + + # cache check + cache_key = f"scan::{path_str}::recursive::{recursive}" + cached = self.cache.get(cache_key) + if cached and not self.rescan: + return cached["data"] + + files, dirs = [], [] + stats = {"total_size": 0, "file_count": 0, "dir_count": 0, "errors": 0} + + try: + path_stat = path.stat() + if path.is_file(): + return self._scan_single_file(path, path_str, path_stat) + if path.is_dir(): + with os.scandir(path_str) as entries: + for entry in entries: + try: + entry_stat = entry.stat(follow_symlinks=False) + + if entry.is_dir(): + dirs.append( + { + "path": entry.path, + "name": entry.name, + "mtime": entry_stat.st_mtime, + } + ) + stats["dir_count"] += 1 + else: + # allowed suffix filter + if not self._is_allowed_file(Path(entry.path)): + continue + files.append( + { + "path": entry.path, + "name": entry.name, + "size": entry_stat.st_size, + "mtime": entry_stat.st_mtime, + } + ) + stats["total_size"] += entry_stat.st_size + stats["file_count"] += 1 + + except OSError: + stats["errors"] += 1 + + except (PermissionError, FileNotFoundError, OSError) as e: + return {"error": str(e), "files": [], "dirs": [], "stats": stats} + + if recursive: + sub_visited = visited | {path_str} + sub_results = self._scan_subdirs(dirs, sub_visited) + + for sub_data in sub_results.values(): + files.extend(sub_data.get("files", [])) + stats["total_size"] += sub_data["stats"].get("total_size", 0) + stats["file_count"] += sub_data["stats"].get("file_count", 0) + + result = {"path": path_str, "files": files, "dirs": dirs, "stats": stats} + self._cache_result(cache_key, result, path) + return result + + def _scan_single_file( + self, path: Path, path_str: str, stat: os.stat_result + ) -> Dict[str, Any]: + """Scan a single file and return its metadata""" + if not self._is_allowed_file(path): + return self._empty_result(path_str) + + return { + "path": path_str, + "files": [ + { + "path": path_str, + "name": path.name, + "size": stat.st_size, + "mtime": stat.st_mtime, + } + ], + "dirs": [], + "stats": { + "total_size": stat.st_size, + "file_count": 1, + "dir_count": 0, + "errors": 0, + }, + } + + def _scan_subdirs(self, dir_list: List[Dict], visited: Set[str]) -> Dict[str, Any]: + """ + Parallel scan subdirectories + :param dir_list + :param visited + :return: + """ + results = {} + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + futures = { + executor.submit(self._scan_files, Path(d["path"]), True, visited): d[ + "path" + ] + for d in dir_list + } + + for future in as_completed(futures): + path = futures[future] + try: + results[path] = future.result() + except Exception as e: + results[path] = { + "error": str(e), + "files": [], + "dirs": [], + "stats": {}, + } + + return results + + def _cache_result(self, key: str, result: Dict, path: Path): + """Cache the scan result""" + self.cache.set( + key, + { + "data": result, + "dir_mtime": path.stat().st_mtime, + "cached_at": time.time(), + }, + ) + + def _is_allowed_file(self, path: Path) -> bool: + """Check if the file has an allowed suffix""" + if self.allowed_suffix is None: + 
return True + suffix = path.suffix.lower().lstrip(".") + return suffix in self.allowed_suffix + + def invalidate(self, path: str): + """Invalidate cache for a specific path""" + path = Path(path).resolve() + keys = [k for k in self.cache if k.startswith(f"scan::{path}")] + for k in keys: + self.cache.delete(k) + + def close(self): + self.cache.close() + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + @staticmethod + def _empty_result(path: str) -> Dict[str, Any]: + return { + "path": path, + "files": [], + "dirs": [], + "stats": {"total_size": 0, "file_count": 0, "dir_count": 0, "errors": 0}, + } diff --git a/graphgen/operators/read/read.py b/graphgen/operators/read/read.py new file mode 100644 index 00000000..0f23160f --- /dev/null +++ b/graphgen/operators/read/read.py @@ -0,0 +1,199 @@ +from pathlib import Path +from typing import Any, Dict, Iterator, List, Optional, Union + +import ray + +from graphgen.models import ( + CSVReader, + JSONReader, + ParquetReader, + PDFReader, + PickleReader, + RDFReader, + TXTReader, +) +from graphgen.utils import compute_mm_hash, logger + +from .parallel_file_scanner import ParallelFileScanner + +_MAPPING = { + "jsonl": JSONReader, + "json": JSONReader, + "txt": TXTReader, + "csv": CSVReader, + "md": TXTReader, + "pdf": PDFReader, + "parquet": ParquetReader, + "pickle": PickleReader, + "rdf": RDFReader, + "owl": RDFReader, + "ttl": RDFReader, +} + + +def _build_reader(suffix: str, cache_dir: str | None, **reader_kwargs): + """Factory function to build appropriate reader instance""" + suffix = suffix.lower() + reader_cls = _MAPPING.get(suffix) + if not reader_cls: + raise ValueError(f"Unsupported file suffix: {suffix}") + + # Special handling for PDFReader which needs output_dir + if suffix == "pdf": + if cache_dir is None: + raise ValueError("cache_dir must be provided for PDFReader") + return reader_cls(output_dir=cache_dir, **reader_kwargs) + + return reader_cls(**reader_kwargs) + + +def read( + input_path: Union[str, List[str]], + allowed_suffix: Optional[List[str]] = None, + cache_dir: Optional[str] = "cache", + parallelism: int = 4, + recursive: bool = True, + **reader_kwargs: Any, +) -> ray.data.Dataset: + """ + Unified entry point to read files of multiple types using Ray Data. + + :param input_path: File or directory path(s) to read from + :param allowed_suffix: List of allowed file suffixes (e.g., ['pdf', 'txt']) + :param cache_dir: Directory to cache intermediate files (PDF processing) + :param parallelism: Number of parallel workers + :param recursive: Whether to scan directories recursively + :param reader_kwargs: Additional kwargs passed to readers + :return: Ray Dataset containing all documents + """ + try: + # 1. Scan all paths to discover files + logger.info("[READ] Scanning paths: %s", input_path) + scanner = ParallelFileScanner( + cache_dir=cache_dir, + allowed_suffix=allowed_suffix, + rescan=False, + max_workers=parallelism if parallelism > 0 else 1, + ) + + all_files = [] + scan_results = scanner.scan(input_path, recursive=recursive) + + for result in scan_results.values(): + all_files.extend(result.get("files", [])) + + logger.info("[READ] Found %d files to process", len(all_files)) + + if not all_files: + raise ValueError("No files found to read.") + + # 2. 
Group files by suffix to use appropriate reader + files_by_suffix = {} + for file_info in all_files: + suffix = Path(file_info["path"]).suffix.lower().lstrip(".") + if allowed_suffix and suffix not in [ + s.lower().lstrip(".") for s in allowed_suffix + ]: + continue + files_by_suffix.setdefault(suffix, []).append(file_info["path"]) + + # 3. Create read tasks + read_tasks = [] + for suffix, file_paths in files_by_suffix.items(): + reader = _build_reader(suffix, cache_dir, **reader_kwargs) + ds = reader.read(file_paths) + read_tasks.append(ds) + + # 4. Combine all datasets + if not read_tasks: + raise ValueError("No datasets created from the provided files.") + + if len(read_tasks) == 1: + combined_ds = read_tasks[0] + else: + combined_ds = read_tasks[0].union(*read_tasks[1:]) + + combined_ds = combined_ds.map( + lambda record: { + **record, + "_doc_id": compute_mm_hash(record, prefix="doc-"), + } + ) + + logger.info("[READ] Successfully read files from %s", input_path) + return combined_ds + + except Exception as e: + logger.error("[READ] Failed to read files from %s: %s", input_path, e) + raise + + +def read_files( + input_file: str, + allowed_suffix: Optional[List[str]] = None, + cache_dir: Optional[str] = None, + max_workers: int = 4, + rescan: bool = False, +) -> Iterator[Dict[str, Any]]: + """ + Read files from a path using parallel scanning and appropriate readers. + Returns an iterator for streaming (backward compatibility with graphgen.py). + + Args: + input_file: Path to a file or directory + allowed_suffix: List of file suffixes to read. If None, uses all supported types + cache_dir: Directory for caching PDF extraction and scan results + max_workers: Number of workers for parallel scanning + rescan: Whether to force rescan even if cached results exist + + Returns: + Iterator of dictionaries containing the data (for streaming) + """ + path = Path(input_file).expanduser() + if not path.exists(): + raise FileNotFoundError(f"input_path not found: {input_file}") + + if allowed_suffix is None: + support_suffix = set(_MAPPING.keys()) + else: + support_suffix = {s.lower().lstrip(".") for s in allowed_suffix} + + with ParallelFileScanner( + cache_dir=cache_dir or "cache", + allowed_suffix=support_suffix, + rescan=rescan, + max_workers=max_workers, + ) as scanner: + scan_results = scanner.scan(str(path), recursive=True) + + # Extract files from scan results + files_to_read = [] + for path_result in scan_results.values(): + if "error" in path_result: + logger.warning("Error scanning %s: %s", path_result.path, path_result.error) + continue + files_to_read.extend(path_result.get("files", [])) + + logger.info( + "Found %d eligible file(s) under folder %s (allowed_suffix=%s)", + len(files_to_read), + input_file, + support_suffix, + ) + + for file_info in files_to_read: + try: + file_path = file_info["path"] + suffix = Path(file_path).suffix.lstrip(".").lower() + reader = _build_reader(suffix, cache_dir) + + # Prefer stream reading if available (for memory efficiency) + if hasattr(reader, "read_stream"): + yield from reader.read_stream(file_path) + else: + # Fallback to regular read() method - convert Ray Dataset to iterator + ds = reader.read([file_path]) + yield from ds.iter_rows() + + except Exception as e: # pylint: disable=broad-except + logger.exception("Error reading %s: %s", file_info.get("path"), e) diff --git a/graphgen/operators/search/__init__.py b/graphgen/operators/search/__init__.py new file mode 100644 index 00000000..47144c77 --- /dev/null +++ 
b/graphgen/operators/search/__init__.py @@ -0,0 +1 @@ +from .search_service import SearchService diff --git a/graphgen/operators/search/multi_omics_search.py b/graphgen/operators/search/multi_omics_search.py new file mode 100644 index 00000000..fbe10f06 --- /dev/null +++ b/graphgen/operators/search/multi_omics_search.py @@ -0,0 +1,29 @@ +import re +from typing import Dict, Optional + +from graphgen.models import UniProtSearch + + +def _fetch_uniprot(entry: str) -> Optional[Dict]: + entry = entry.strip() + client = UniProtSearch() + + # 1. first try accession search + if re.fullmatch( + r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}", entry + ): + return client.get_by_accession(entry) + + # 2. then try keyword search + return client.get_best_hit(entry) + + +def multi_omics_search(entry: str) -> Dict: + """ + Multi-omics search function that tries to fetch protein/gene information. + """ + # TODO: Extend this function to include more omics databases as needed. + result = _fetch_uniprot(entry) + if result: + return {"input": entry, "uniprot": result} + return {"input": entry, "uniprot": None} diff --git a/graphgen/operators/search/search_service.py b/graphgen/operators/search/search_service.py new file mode 100644 index 00000000..a948b612 --- /dev/null +++ b/graphgen/operators/search/search_service.py @@ -0,0 +1,412 @@ +""" +To use Google Web Search API, +follow the instructions [here](https://developers.google.com/custom-search/v1/overview) +to get your Google searcher api key. + +To use Bing Web Search API, +follow the instructions [here](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) +and obtain your Bing subscription key. +""" + +import pandas as pd + +from graphgen.bases import BaseOperator +from graphgen.utils import create_event_loop, run_concurrent + + +class SearchService(BaseOperator): + """ + Service class for performing searches across multiple data sources. + Provides search functionality for UniProt, NCBI, and RNAcentral databases. + """ + + def __init__( + self, + working_dir: str = "cache", + data_sources: list = None, + ncbi_params: dict = None, + uniprot_params: dict = None, + rnacentral_params: dict = None, + save_interval: int = 1000, + **kwargs, + ): + super().__init__(working_dir=working_dir, op_name="search_service") + self.working_dir = working_dir + + # Build search_config dictionary from parameters + self.search_config = { + "data_sources": data_sources or [], + } + + if ncbi_params: + self.search_config["ncbi_params"] = ncbi_params + if uniprot_params: + self.search_config["uniprot_params"] = uniprot_params + if rnacentral_params: + self.search_config["rnacentral_params"] = rnacentral_params + + self.save_interval = save_interval + self.search_storage = None # Optional: can be initialized if needed for saving intermediate results + + async def _perform_searches(self, seed_data: dict) -> dict: + """ + Internal method to perform searches across multiple search types and aggregate the results. + :param seed_data: A dictionary containing seed data with entity names. 
+ :return: A dictionary with search results + """ + results = {} + data_sources = self.search_config.get("data_sources", []) + + for data_source in data_sources: + data = list(seed_data.values()) + data = [d["content"] for d in data if "content" in d] + data = list(set(data)) # Remove duplicates + + # Prepare save callback for this data source + def make_save_callback(source_name): + def save_callback(intermediate_results, completed_count): + """Save intermediate search results.""" + if self.search_storage is None: + return + + # Convert results list to dict format + # Results are tuples of (query, result_dict) or just result_dict + batch_results = {} + for result in intermediate_results: + if result is None: + continue + # Check if result is a dict with _search_query key + if isinstance(result, dict) and "_search_query" in result: + query = result["_search_query"] + # Create a key for the result (using query as key) + key = f"{source_name}:{query}" + batch_results[key] = result + elif isinstance(result, dict): + # If no _search_query, use a generated key + key = f"{source_name}:{completed_count}" + batch_results[key] = result + + if batch_results: + # Filter out already existing keys + new_keys = self.search_storage.filter_keys(list(batch_results.keys())) + new_results = {k: v for k, v in batch_results.items() if k in new_keys} + if new_results: + self.search_storage.upsert(new_results) + self.search_storage.index_done_callback() + self.logger.debug("Saved %d intermediate results for %s", len(new_results), source_name) + + return save_callback + + if data_source == "uniprot": + from graphgen.models import UniProtSearch + + uniprot_params = self.search_config.get("uniprot_params", {}).copy() + # Get max_concurrent from config before passing params to constructor + max_concurrent = uniprot_params.pop("max_concurrent", None) + + uniprot_search_client = UniProtSearch( + working_dir=self.working_dir, + **uniprot_params + ) + + uniprot_results = await run_concurrent( + uniprot_search_client.search, + data, + desc="Searching UniProt database", + unit="keyword", + save_interval=self.save_interval if self.save_interval > 0 else 0, + save_callback=( + make_save_callback("uniprot") + if self.search_storage and self.save_interval > 0 + else None + ), + max_concurrent=max_concurrent, + ) + results[data_source] = uniprot_results + + elif data_source == "ncbi": + from graphgen.models import NCBISearch + + ncbi_params = self.search_config.get("ncbi_params", {}).copy() + # Get max_concurrent from config before passing params to constructor + max_concurrent = ncbi_params.pop("max_concurrent", None) + + ncbi_search_client = NCBISearch( + working_dir=self.working_dir, + **ncbi_params + ) + + ncbi_results = await run_concurrent( + ncbi_search_client.search, + data, + desc="Searching NCBI database", + unit="keyword", + save_interval=self.save_interval if self.save_interval > 0 else 0, + save_callback=( + make_save_callback("ncbi") + if self.search_storage and self.save_interval > 0 + else None + ), + max_concurrent=max_concurrent, + ) + results[data_source] = ncbi_results + + elif data_source == "rnacentral": + from graphgen.models import RNACentralSearch + + rnacentral_params = self.search_config.get("rnacentral_params", {}).copy() + # Get max_concurrent from config before passing params to constructor + max_concurrent = rnacentral_params.pop("max_concurrent", None) + + rnacentral_search_client = RNACentralSearch( + working_dir=self.working_dir, + **rnacentral_params + ) + + rnacentral_results = await 
run_concurrent( + rnacentral_search_client.search, + data, + desc="Searching RNAcentral database", + unit="keyword", + save_interval=self.save_interval if self.save_interval > 0 else 0, + save_callback=( + make_save_callback("rnacentral") + if self.search_storage and self.save_interval > 0 + else None + ), + max_concurrent=max_concurrent, + ) + results[data_source] = rnacentral_results + + else: + self.logger.error("Data source %s not supported.", data_source) + continue + + return results + + def _is_already_searched(self, doc: dict) -> bool: + """ + Check if a document already contains search results. + + :param doc: Document dictionary + :return: True if document appears to already contain search results + """ + # Check for data_source field (added by search_service) + if "data_source" in doc and doc["data_source"]: + return True + + # Check for database field (added by search operations) + if "database" in doc and doc["database"] in ["UniProt", "NCBI", "RNAcentral"]: + # Also check for molecule_type to confirm it's a search result + if "molecule_type" in doc and doc["molecule_type"] in ["DNA", "RNA", "protein"]: + return True + + # Check for search-specific fields that indicate search results + search_indicators = [ + "uniprot_id", "entry_name", # UniProt + "gene_id", "gene_name", "chromosome", # NCBI + "rnacentral_id", "rna_type", # RNAcentral + ] + if any(indicator in doc for indicator in search_indicators): + # Make sure it's not just metadata by checking for database or molecule_type + if "database" in doc or "molecule_type" in doc: + return True + + return False + + def _normalize_searched_data(self, doc: dict) -> dict: + """ + Normalize a document that already contains search results to the expected format. + + :param doc: Document dictionary with search results + :return: Normalized document dictionary + """ + # Ensure required fields exist + doc_id = doc.get("_doc_id") + if not doc_id: + # Generate doc_id from id or other fields + raw_doc_id = doc.get("id") or doc.get("_search_query") or f"doc-{hash(str(doc))}" + doc_id = str(raw_doc_id) + + # Ensure doc_id starts with "doc-" prefix + if not doc_id.startswith("doc-"): + doc_id = f"doc-{doc_id}" + + # Determine document type from molecule_type or existing type + doc_type = doc.get("type", "text") + if doc_type == "text" and "molecule_type" in doc: + molecule_type = doc.get("molecule_type", "").lower() + if molecule_type in ["dna", "rna", "protein"]: + doc_type = molecule_type + + # Ensure data_source field exists + data_source = doc.get("data_source") + if not data_source: + # Infer from database field + database = doc.get("database", "").lower() + if "uniprot" in database: + data_source = "uniprot" + elif "ncbi" in database: + data_source = "ncbi" + elif "rnacentral" in database or "rna" in database: + data_source = "rnacentral" + + # Build or preserve content field + content = doc.get("content") + if not content or content.strip() == "": + # Build content from available fields if missing + content_parts = [] + if doc.get("title"): + content_parts.append(f"Title: {doc['title']}") + if doc.get("description"): + content_parts.append(f"Description: {doc['description']}") + if doc.get("function"): + func = doc["function"] + if isinstance(func, list): + func = ", ".join(str(f) for f in func) + content_parts.append(f"Function: {func}") + if doc.get("sequence"): + content_parts.append(f"Sequence: {doc['sequence']}") + + if not content_parts: + # Fallback: create content from key fields + key_fields = ["protein_name", "gene_name", 
"gene_description", "organism"] + for field in key_fields: + if field in doc and doc[field]: + content_parts.append(f"{field}: {doc[field]}") + + content = "\n".join(content_parts) if content_parts else str(doc) + + # Create normalized row + normalized_doc = { + "_doc_id": doc_id, + "type": doc_type, + "content": content, + "data_source": data_source, + **doc, # Include all original fields for metadata + } + + return normalized_doc + + def process(self, batch: pd.DataFrame) -> pd.DataFrame: + """ + Process a batch of documents and perform searches. + This is the Ray Data operator interface. + + If input data already contains search results (detected by presence of + data_source, database, or search-specific fields), the search step is + skipped and the data is normalized and returned directly. + + :param batch: DataFrame containing documents with at least '_doc_id' and 'content' columns + :return: DataFrame containing search results + """ + # Convert DataFrame to dictionary format + docs = batch.to_dict(orient="records") + + # Check if data already contains search results + already_searched = all(self._is_already_searched(doc) for doc in docs if doc) + + if already_searched: + # Data already contains search results, normalize and return directly + self.logger.info( + "Input data already contains search results. " + "Skipping search step and normalizing data." + ) + result_rows = [] + for doc in docs: + if not doc: + continue + normalized_doc = self._normalize_searched_data(doc) + result_rows.append(normalized_doc) + + if not result_rows: + self.logger.warning("No documents found in batch") + return pd.DataFrame(columns=["_doc_id", "type", "content", "data_source"]) + + return pd.DataFrame(result_rows) + + # Data doesn't contain search results, perform search as usual + seed_data = {doc.get("_doc_id", f"doc-{i}"): doc for i, doc in enumerate(docs)} + + # Perform searches asynchronously + loop, created = create_event_loop() + try: + if loop.is_running(): + # If loop is already running, we can't use run_until_complete + # This shouldn't happen in normal usage, but handle it gracefully + raise RuntimeError( + "Cannot use process when event loop is already running. " + "This is likely a Ray worker configuration issue." 
+ ) + search_results = loop.run_until_complete( + self._perform_searches(seed_data) + ) + finally: + # Only close the loop if we created it + if created: + loop.close() + + # Convert search_results from {data_source: [results]} to DataFrame + # Each result becomes a document row compatible with chunk service + result_rows = [] + + for data_source, result_list in search_results.items(): + if not isinstance(result_list, list): + continue + + for result in result_list: + if result is None: + continue + + # Convert search result to document format expected by chunk service + # Build content from available fields + content_parts = [] + if result.get("title"): + content_parts.append(f"Title: {result['title']}") + if result.get("description"): + content_parts.append(f"Description: {result['description']}") + if result.get("function"): + content_parts.append(f"Function: {result['function']}") + if result.get("sequence"): + content_parts.append(f"Sequence: {result['sequence']}") + + # If no content parts, use a default or combine all fields + if not content_parts: + # Fallback: create content from all string fields + content_parts = [ + f"{k}: {v}" + for k, v in result.items() + if isinstance(v, (str, int, float)) and k != "_search_query" + ] + + content = "\n".join(content_parts) if content_parts else str(result) + + # Determine document type from molecule_type or default to "text" + doc_type = result.get("molecule_type", "text").lower() + if doc_type not in ["text", "dna", "rna", "protein"]: + doc_type = "text" + + # Convert to string to handle Ray Data ListElement and other types + raw_doc_id = result.get("id") or result.get("_search_query") or f"search-{len(result_rows)}" + doc_id = str(raw_doc_id) + + # Ensure doc_id starts with "doc-" prefix + if not doc_id.startswith("doc-"): + doc_id = f"doc-{doc_id}" + + # Create document row with all result fields plus required fields + row = { + "_doc_id": doc_id, + "type": doc_type, + "content": content, + "data_source": data_source, + **result, # Include all original result fields for metadata + } + result_rows.append(row) + + if not result_rows: + self.logger.warning("No search results generated for this batch") + # Return empty DataFrame with expected structure + return pd.DataFrame(columns=["_doc_id", "type", "content", "data_source"]) + + return pd.DataFrame(result_rows) diff --git a/graphgen/operators/search_wikipedia.py b/graphgen/operators/search_wikipedia.py deleted file mode 100644 index d3d7e283..00000000 --- a/graphgen/operators/search_wikipedia.py +++ /dev/null @@ -1,71 +0,0 @@ -import asyncio -from graphgen.models import WikiSearch, OpenAIModel -from graphgen.models.storage.base_storage import BaseGraphStorage -from graphgen.templates import SEARCH_JUDGEMENT_PROMPT -from graphgen.utils import logger - - -async def _process_single_entity(entity_name: str, - description: str, - llm_client: OpenAIModel, - wiki_search_client: WikiSearch) -> tuple[str, None] | tuple[str, str]: - """ - Process single entity - - """ - search_results = await wiki_search_client.search(entity_name) - if not search_results: - return entity_name, None - examples = "\n".join(SEARCH_JUDGEMENT_PROMPT["EXAMPLES"]) - search_results.append("None of the above") - - search_results_str = "\n".join([f"{i + 1}. 
{sr}" for i, sr in enumerate(search_results)]) - prompt = SEARCH_JUDGEMENT_PROMPT["TEMPLATE"].format( - examples=examples, - entity_name=entity_name, - description=description, - search_results=search_results_str, - ) - response = await llm_client.generate_answer(prompt) - - try: - response = response.strip() - response = int(response) - if response < 1 or response >= len(search_results): - response = None - else: - response = await wiki_search_client.summary(search_results[response - 1]) - except ValueError: - response = None - - logger.info("Entity %s search result: %s response: %s", entity_name, str(search_results), response) - - return entity_name, response - -async def search_wikipedia(llm_client: OpenAIModel, - wiki_search_client: WikiSearch, - knowledge_graph_instance: BaseGraphStorage,) -> dict: - """ - Search wikipedia for entities - - :param llm_client: LLM model - :param wiki_search_client: wiki search client - :param knowledge_graph_instance: knowledge graph instance - :return: nodes with search results - """ - - - nodes = await knowledge_graph_instance.get_all_nodes() - nodes = list(nodes) - wiki_data = {} - - tasks = [ - _process_single_entity(node[0].strip('"'), node[1]["description"], llm_client, wiki_search_client) - for node in nodes - ] - - for task in asyncio.as_completed(tasks): - result = await task - wiki_data[result[0]] = result[1] - - return wiki_data diff --git a/graphgen/operators/split_graph.py b/graphgen/operators/split_graph.py deleted file mode 100644 index e2e2b5ca..00000000 --- a/graphgen/operators/split_graph.py +++ /dev/null @@ -1,333 +0,0 @@ -import random -from collections import defaultdict -from tqdm.asyncio import tqdm as tqdm_async -from graphgen.utils import logger - -from graphgen.models import NetworkXStorage, TraverseStrategy - -async def _get_node_info( - node_id: str, - graph_storage: NetworkXStorage, -)-> dict: - """ - Get node info - - :param node_id: node id - :param graph_storage: graph storage instance - :return: node info - """ - node_data = await graph_storage.get_node(node_id) - return { - "node_id": node_id, - **node_data - } - - -def _get_level_n_edges_by_max_width( - edge_adj_list: dict, - node_dict: dict, - edges: list, - nodes, - src_edge: tuple, - max_depth: int, - bidirectional: bool, - max_extra_edges: int, - edge_sampling: str, - loss_strategy: str = "only_edge" -) -> list: - """ - Get level n edges for an edge. 
- n is decided by max_depth in traverse_strategy - - :param edge_adj_list - :param node_dict - :param edges - :param nodes - :param src_edge - :param max_depth - :param bidirectional - :param max_extra_edges - :param edge_sampling - :return: level n edges - """ - src_id, tgt_id, _ = src_edge - - level_n_edges = [] - - start_nodes = {tgt_id} if not bidirectional else {src_id, tgt_id} - - while max_depth > 0 and max_extra_edges > 0: - max_depth -= 1 - - candidate_edges = [ - edges[edge_id] - for node in start_nodes - for edge_id in edge_adj_list[node] - if not edges[edge_id][2].get("visited", False) - ] - - if not candidate_edges: - break - - if len(candidate_edges) >= max_extra_edges: - if loss_strategy == "both": - er_tuples = [([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge) for edge in candidate_edges] - candidate_edges = _sort_tuples(er_tuples, edge_sampling)[:max_extra_edges] - elif loss_strategy == "only_edge": - candidate_edges = _sort_edges(candidate_edges, edge_sampling)[:max_extra_edges] - else: - raise ValueError(f"Invalid loss strategy: {loss_strategy}") - - for edge in candidate_edges: - level_n_edges.append(edge) - edge[2]["visited"] = True - break - - max_extra_edges -= len(candidate_edges) - new_start_nodes = set() - - for edge in candidate_edges: - level_n_edges.append(edge) - edge[2]["visited"] = True - - if not edge[0] in start_nodes: - new_start_nodes.add(edge[0]) - if not edge[1] in start_nodes: - new_start_nodes.add(edge[1]) - - start_nodes = new_start_nodes - - return level_n_edges - - -def _get_level_n_edges_by_max_tokens( - edge_adj_list: dict, - node_dict: dict, - edges: list, - nodes: list, - src_edge: tuple, - max_depth: int, - bidirectional: bool, - max_tokens: int, - edge_sampling: str, - loss_strategy: str = "only_edge" -) -> list: - """ - Get level n edges for an edge. - n is decided by max_depth in traverse_strategy. 
- - :param edge_adj_list - :param node_dict - :param edges - :param nodes - :param src_edge - :param max_depth - :param bidirectional - :param max_tokens - :param edge_sampling - :return: level n edges - """ - src_id, tgt_id, src_edge_data = src_edge - - max_tokens -= (src_edge_data["length"] + nodes[node_dict[src_id]][1]["length"] - + nodes[node_dict[tgt_id]][1]["length"]) - - level_n_edges = [] - - start_nodes = {tgt_id} if not bidirectional else {src_id, tgt_id} - temp_nodes = {src_id, tgt_id} - - while max_depth > 0 and max_tokens > 0: - max_depth -= 1 - - candidate_edges = [ - edges[edge_id] - for node in start_nodes - for edge_id in edge_adj_list[node] - if not edges[edge_id][2].get("visited", False) - ] - - if not candidate_edges: - break - - if loss_strategy == "both": - er_tuples = [([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge) for edge in candidate_edges] - candidate_edges = _sort_tuples(er_tuples, edge_sampling) - elif loss_strategy == "only_edge": - candidate_edges = _sort_edges(candidate_edges, edge_sampling) - else: - raise ValueError(f"Invalid loss strategy: {loss_strategy}") - - for edge in candidate_edges: - max_tokens -= edge[2]["length"] - if not edge[0] in temp_nodes: - max_tokens -= nodes[node_dict[edge[0]]][1]["length"] - if not edge[1] in temp_nodes: - max_tokens -= nodes[node_dict[edge[1]]][1]["length"] - - if max_tokens < 0: - return level_n_edges - - level_n_edges.append(edge) - edge[2]["visited"] = True - temp_nodes.add(edge[0]) - temp_nodes.add(edge[1]) - - new_start_nodes = set() - for edge in candidate_edges: - if not edge[0] in start_nodes: - new_start_nodes.add(edge[0]) - if not edge[1] in start_nodes: - new_start_nodes.add(edge[1]) - - start_nodes = new_start_nodes - - return level_n_edges - - -def _sort_tuples(er_tuples: list, edge_sampling: str) -> list: - """ - Sort edges with edge sampling strategy - - :param er_tuples: [(nodes:list, edge:tuple)] - :param edge_sampling: edge sampling strategy (random, min_loss, max_loss) - :return: sorted edges - """ - if edge_sampling == "random": - er_tuples = random.sample(er_tuples, len(er_tuples)) - elif edge_sampling == "min_loss": - er_tuples = sorted(er_tuples, key=lambda x: sum(node[1]["loss"] for node in x[0]) + x[1][2]["loss"]) - elif edge_sampling == "max_loss": - er_tuples = sorted(er_tuples, key=lambda x: sum(node[1]["loss"] for node in x[0]) + x[1][2]["loss"], - reverse=True) - else: - raise ValueError(f"Invalid edge sampling: {edge_sampling}") - edges = [edge for _, edge in er_tuples] - return edges - -def _sort_edges(edges: list, edge_sampling: str) -> list: - """ - Sort edges with edge sampling strategy - - :param edges: total edges - :param edge_sampling: edge sampling strategy (random, min_loss, max_loss) - :return: sorted edges - """ - if edge_sampling == "random": - random.shuffle(edges) - elif edge_sampling == "min_loss": - edges = sorted(edges, key=lambda x: x[2]["loss"]) - elif edge_sampling == "max_loss": - edges = sorted(edges, key=lambda x: x[2]["loss"], reverse=True) - else: - raise ValueError(f"Invalid edge sampling: {edge_sampling}") - return edges - -async def get_batches_with_strategy( # pylint: disable=too-many-branches - nodes: list, - edges: list, - graph_storage: NetworkXStorage, - traverse_strategy: TraverseStrategy -): - expand_method = traverse_strategy.expand_method - if expand_method == "max_width": - logger.info("Using max width strategy") - elif expand_method == "max_tokens": - logger.info("Using max tokens strategy") - else: - raise ValueError(f"Invalid 
expand method: {expand_method}") - - max_depth = traverse_strategy.max_depth - edge_sampling = traverse_strategy.edge_sampling - - # 构建临接矩阵 - edge_adj_list = defaultdict(list) - node_dict = {} - processing_batches = [] - - node_cache = {} - - async def get_cached_node_info(node_id: str) -> dict: - if node_id not in node_cache: - node_cache[node_id] = await _get_node_info(node_id, graph_storage) - return node_cache[node_id] - - for i, (node_name, _) in enumerate(nodes): - node_dict[node_name] = i - - if traverse_strategy.loss_strategy == "both": - er_tuples = [([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge) for edge in edges] - edges = _sort_tuples(er_tuples, edge_sampling) - elif traverse_strategy.loss_strategy == "only_edge": - edges = _sort_edges(edges, edge_sampling) - else: - raise ValueError(f"Invalid loss strategy: {traverse_strategy.loss_strategy}") - - for i, (src, tgt, _) in enumerate(edges): - edge_adj_list[src].append(i) - edge_adj_list[tgt].append(i) - - for edge in tqdm_async(edges, desc="Preparing batches"): - if "visited" in edge[2] and edge[2]["visited"]: - continue - - edge[2]["visited"] = True - - _process_nodes = [] - _process_edges = [] - - src_id = edge[0] - tgt_id = edge[1] - - _process_nodes.extend([await get_cached_node_info(src_id), - await get_cached_node_info(tgt_id)]) - _process_edges.append(edge) - - if expand_method == "max_width": - level_n_edges = _get_level_n_edges_by_max_width( - edge_adj_list, node_dict, edges, nodes, edge, max_depth, - traverse_strategy.bidirectional, traverse_strategy.max_extra_edges, - edge_sampling, traverse_strategy.loss_strategy - ) - else: - level_n_edges = _get_level_n_edges_by_max_tokens( - edge_adj_list, node_dict, edges, nodes, edge, max_depth, - traverse_strategy.bidirectional, traverse_strategy.max_tokens, - edge_sampling, traverse_strategy.loss_strategy - ) - - for _edge in level_n_edges: - _process_nodes.append(await get_cached_node_info(_edge[0])) - _process_nodes.append(await get_cached_node_info(_edge[1])) - _process_edges.append(_edge) - - # 去重 - _process_nodes = list({node['node_id']: node for node in _process_nodes}.values()) - _process_edges = list({(edge[0], edge[1]): edge for edge in _process_edges}.values()) - - processing_batches.append((_process_nodes, _process_edges)) - - logger.info("Processing batches: %d", len(processing_batches)) - - # isolate nodes - isolated_node_strategy = traverse_strategy.isolated_node_strategy - if isolated_node_strategy == "add": - processing_batches = await _add_isolated_nodes(nodes, processing_batches, graph_storage) - logger.info("Processing batches after adding isolated nodes: %d", len(processing_batches)) - - return processing_batches - -async def _add_isolated_nodes( - nodes: list, - processing_batches: list, - graph_storage: NetworkXStorage, -) -> list: - visited_nodes = set() - for _process_nodes, _process_edges in processing_batches: - for node in _process_nodes: - visited_nodes.add(node["node_id"]) - for node in nodes: - if node[0] not in visited_nodes: - _process_nodes = [await _get_node_info(node[0], graph_storage)] - processing_batches.append((_process_nodes, [])) - return processing_batches diff --git a/graphgen/operators/traverse_graph.py b/graphgen/operators/traverse_graph.py deleted file mode 100644 index 947033ed..00000000 --- a/graphgen/operators/traverse_graph.py +++ /dev/null @@ -1,485 +0,0 @@ -import asyncio -import gradio as gr - -from tqdm.asyncio import tqdm as tqdm_async - -from graphgen.models import OpenAIModel, NetworkXStorage, 
TraverseStrategy, Tokenizer, JsonKVStorage -from graphgen.templates import ANSWER_REPHRASING_PROMPT, QUESTION_GENERATION_PROMPT, MULTI_HOP_GENERATION_PROMPT -from graphgen.utils import detect_main_language, compute_content_hash, logger -from graphgen.operators.split_graph import get_batches_with_strategy - - -async def _pre_tokenize(graph_storage: NetworkXStorage, - tokenizer: Tokenizer, - edges: list, - nodes: list) -> tuple: - - sem = asyncio.Semaphore(1000) - async def handle_edge(edge: tuple) -> tuple: - async with sem: - if 'length' not in edge[2]: - edge[2]['length'] = len( - await asyncio.get_event_loop().run_in_executor(None, - tokenizer.encode_string, - edge[2]['description'])) - return edge - - async def handle_node(node: dict) -> dict: - async with sem: - if 'length' not in node[1]: - node[1]['length'] = len( - await asyncio.get_event_loop().run_in_executor(None, - tokenizer.encode_string, - node[1]['description'])) - return node - - new_edges = [] - new_nodes = [] - - for result in tqdm_async(asyncio.as_completed([handle_edge(edge) for edge in edges]), - total=len(edges), desc="Pre-tokenizing edges"): - new_edge = await result - await graph_storage.update_edge(new_edge[0], new_edge[1], new_edge[2]) - new_edges.append(new_edge) - - for result in tqdm_async(asyncio.as_completed([handle_node(node) for node in nodes]), - total=len(nodes), desc="Pre-tokenizing nodes"): - new_node = await result - await graph_storage.update_node(new_node[0], new_node[1]) - new_nodes.append(new_node) - - await graph_storage.index_done_callback() - return new_edges, new_nodes - -async def _construct_rephrasing_prompt(_process_nodes: list, - _process_edges: list, - text_chunks_storage: JsonKVStorage, - add_context: bool = False - ) -> str: - entities = [ - f"{_process_node['node_id']}: {_process_node['description']}" for _process_node in _process_nodes - ] - relations = [ - f"{_process_edge[0]} -- {_process_edge[1]}: {_process_edge[2]['description']}" - for _process_edge in _process_edges - ] - - entities_str = "\n".join([f"{index + 1}. {entity}" for index, entity in enumerate(entities)]) - relations_str = "\n".join([f"{index + 1}. {relation}" for index, relation in enumerate(relations)]) - language = "Chinese" if detect_main_language(entities_str + relations_str) == "zh" else "English" - - if add_context: - original_ids = ([node['source_id'].split('')[0] for node in _process_nodes] + - [edge[2]['source_id'].split('')[0] for edge in _process_edges]) - - original_ids = list(set(original_ids)) - original_text = await text_chunks_storage.get_by_ids(original_ids) - original_text = "\n".join([f"{index + 1}. 
{text['content']}" for index, text in enumerate(original_text)]) - - prompt = ANSWER_REPHRASING_PROMPT[language]['CONTEXT_TEMPLATE'].format( - language=language, - original_text=original_text, - entities=entities_str, - relationships=relations_str - ) - return prompt - - prompt = ANSWER_REPHRASING_PROMPT[language]['TEMPLATE'].format( - language=language, - entities=entities_str, - relationships=relations_str - ) - return prompt - -def get_loss_tercile(losses: list) -> (float, float): - losses = sorted(losses) - q1_index = int(len(losses) * (1 / 3)) - q2_index = int(len(losses) * (2 / 3)) - - return losses[q1_index], losses[q2_index] - -def get_average_loss(batch: tuple, loss_strategy: str) -> float: - if loss_strategy == "only_edge": - return sum(edge[2]['loss'] for edge in batch[1]) / len(batch[1]) - if loss_strategy == "both": - return sum(edge[2]['loss'] for edge in batch[1]) + sum(node['loss'] for node in batch[0]) / \ - (len(batch[0]) + len(batch[1])) - raise ValueError("Invalid loss strategy") - -def _post_process_synthetic_data(data): - block = data.split("\n\n") - qas = [] - for line in block: - if "Question:" in line and "Answer:" in line: - question = line.split("Question:")[1].split("Answer:")[0].strip() - answer = line.split("Answer:")[1].strip() - qas.append({ - "question": question, - "answer": answer - }) - elif "问题:" in line and "答案:" in line: - question = line.split("问题:")[1].split("答案:")[0].strip() - answer = line.split("答案:")[1].strip() - qas.append({ - "question": question, - "answer": answer - }) - elif "问题:" in line and "回答:" in line: - question = line.split("问题:")[1].split("回答:")[0].strip() - answer = line.split("回答:")[1].strip() - qas.append({ - "question": question, - "answer": answer - }) - return qas - -async def traverse_graph_by_edge( - llm_client: OpenAIModel, - tokenizer: Tokenizer, - graph_storage: NetworkXStorage, - traverse_strategy: TraverseStrategy, - text_chunks_storage: JsonKVStorage, - progress_bar: gr.Progress = None, - max_concurrent: int = 1000 -) -> dict: - """ - Traverse the graph - - :param llm_client - :param tokenizer - :param graph_storage - :param traverse_strategy - :param text_chunks_storage - :param progress_bar - :param max_concurrent - :return: question and answer - """ - - semaphore = asyncio.Semaphore(max_concurrent) - - async def _process_nodes_and_edges( - _process_nodes: list, - _process_edges: list, - ) -> str: - prompt = await _construct_rephrasing_prompt( - _process_nodes, - _process_edges, - text_chunks_storage, - add_context = False - ) - context = await llm_client.generate_answer(prompt) - - # post-process the context - if context.startswith("Rephrased Text:"): - context = context[len("Rephrased Text:"):].strip() - elif context.startswith("重述文本:"): - context = context[len("重述文本:"):].strip() - - return context - - async def _process_single_batch( - _process_batch: tuple, - question_type: str = "single" - ) -> dict: - async with semaphore: - context = await _process_nodes_and_edges( - _process_batch[0], - _process_batch[1], - ) - - language = "Chinese" if detect_main_language(context) == "zh" else "English" - pre_length = sum(node['length'] for node in _process_batch[0]) \ - + sum(edge[2]['length'] for edge in _process_batch[1]) - - if question_type == "single": - question = await llm_client.generate_answer( - QUESTION_GENERATION_PROMPT[language]['SINGLE_TEMPLATE'].format( - answer=context - ) - ) - if question.startswith("Question:"): - question = question[len("Question:"):].strip() - elif question.startswith("问题:"): - 
question = question[len("问题:"):].strip() - - logger.info("%d nodes and %d edges processed", len(_process_batch[0]), len(_process_batch[1])) - logger.info("Pre-length: %s", pre_length) - logger.info("Question: %s", question) - logger.info("Answer: %s", context) - - return { - compute_content_hash(context): { - "question": question, - "answer": context, - "loss": get_average_loss(_process_batch, traverse_strategy.loss_strategy) - } - } - - content = await llm_client.generate_answer( - QUESTION_GENERATION_PROMPT[language]['MULTI_TEMPLATE'].format( - doc=context - ) - ) - qas = _post_process_synthetic_data(content) - - if len(qas) == 0: - print(content) - logger.error("Error occurred while processing batch, question or answer is None") - return {} - - final_results = {} - logger.info("%d nodes and %d edges processed", len(_process_batch[0]), len(_process_batch[1])) - logger.info("Pre-length: %s", pre_length) - for qa in qas: - logger.info("Question: %s", qa['question']) - logger.info("Answer: %s", qa['answer']) - final_results[compute_content_hash(qa['question'])] = { - "question": qa['question'], - "answer": qa['answer'], - "loss": get_average_loss(_process_batch, traverse_strategy.loss_strategy) - } - return final_results - - results = {} - edges = list(await graph_storage.get_all_edges()) - nodes = list(await graph_storage.get_all_nodes()) - - edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes) - - processing_batches = await get_batches_with_strategy( - nodes, - edges, - graph_storage, - traverse_strategy - ) - - for result in tqdm_async(asyncio.as_completed( - [_process_single_batch(batch) for batch in processing_batches] - ), total=len(processing_batches), desc="[4/4]Generating QAs"): - try: - if progress_bar is not None: - progress_bar(len(results) / len(processing_batches), desc="[4/4]Generating QAs") - results.update(await result) - if progress_bar is not None and len(results) == len(processing_batches): - progress_bar(1, desc="[4/4]Generating QAs") - except Exception as e: # pylint: disable=broad-except - logger.error("Error occurred while generating QA: %s", e) - - return results - - -async def traverse_graph_atomically( - llm_client: OpenAIModel, - tokenizer: Tokenizer, - graph_storage: NetworkXStorage, - traverse_strategy: TraverseStrategy, - text_chunks_storage: JsonKVStorage, - progress_bar: gr.Progress = None, - max_concurrent: int = 1000 -) -> dict: - """ - Traverse the graph atomicly - - :param llm_client - :param tokenizer - :param graph_storage - :param traverse_strategy - :param text_chunks_storage - :param progress_bar - :param max_concurrent - :return: question and answer - """ - assert traverse_strategy.qa_form == "atomic" - - semaphore = asyncio.Semaphore(max_concurrent) - async def _generate_question( - node_or_edge: tuple - ): - if len(node_or_edge) == 2: - des = node_or_edge[0] + ": " + node_or_edge[1]['description'] - loss = node_or_edge[1]['loss'] - else: - des = node_or_edge[2]['description'] - loss = node_or_edge[2]['loss'] - - async with semaphore: - try: - language = "Chinese" if detect_main_language(des) == "zh" else "English" - - qa = await llm_client.generate_answer( - QUESTION_GENERATION_PROMPT[language]['SINGLE_QA_TEMPLATE'].format( - doc=des - ) - ) - - if "Question:" in qa and "Answer:" in qa: - question = qa.split("Question:")[1].split("Answer:")[0].strip() - answer = qa.split("Answer:")[1].strip() - elif "问题:" in qa and "答案:" in qa: - question = qa.split("问题:")[1].split("答案:")[0].strip() - answer = qa.split("答案:")[1].strip() - 
else: - return {} - - question = question.strip("\"") - answer = answer.strip("\"") - - logger.info("Question: %s", question) - logger.info("Answer: %s", answer) - return { - compute_content_hash(question): { - "question": question, - "answer": answer, - "loss": loss - } - } - except Exception as e: # pylint: disable=broad-except - logger.error("Error occurred while generating question: %s", e) - return {} - - results = {} - edges = list(await graph_storage.get_all_edges()) - nodes = list(await graph_storage.get_all_nodes()) - - edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes) - - tasks = [] - for node in nodes: - if "" in node[1]['description']: - description_list = node[1]['description'].split("") - for item in description_list: - tasks.append((node[0], {"description": item, 'loss': node[1]['loss']})) - else: - tasks.append((node[0], node[1])) - for edge in edges: - if "" in edge[2]['description']: - description_list = edge[2]['description'].split("") - for item in description_list: - tasks.append((edge[0], edge[1], {"description": item, 'loss': edge[2]['loss']})) - else: - tasks.append((edge[0], edge[1], edge[2])) - - for result in tqdm_async( - asyncio.as_completed([_generate_question(task) for task in tasks]), - total=len(tasks), - desc="[4/4]Generating QAs" - ): - try: - if progress_bar is not None: - progress_bar(len(results) / len(tasks), desc="[4/4]Generating QAs") - results.update(await result) - if progress_bar is not None and len(results) == len(tasks): - progress_bar(1, desc="[4/4]Generating QAs") - except Exception as e: # pylint: disable=broad-except - logger.error("Error occurred while generating QA: %s", e) - return results - -async def traverse_graph_for_multi_hop( - llm_client: OpenAIModel, - tokenizer: Tokenizer, - graph_storage: NetworkXStorage, - traverse_strategy: TraverseStrategy, - text_chunks_storage: JsonKVStorage, - progress_bar: gr.Progress = None, - max_concurrent: int = 1000 -) -> dict: - """ - Traverse the graph for multi-hop - - :param llm_client - :param tokenizer - :param graph_storage - :param traverse_strategy - :param text_chunks_storage - :param progress_bar - :param max_concurrent - :return: question and answer - """ - assert traverse_strategy.qa_form == "multi_hop" - - semaphore = asyncio.Semaphore(max_concurrent) - - results = {} - edges = list(await graph_storage.get_all_edges()) - nodes = list(await graph_storage.get_all_nodes()) - - edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes) - - processing_batches = await get_batches_with_strategy( - nodes, - edges, - graph_storage, - traverse_strategy - ) - - async def _process_single_batch( - _process_batch: tuple - ) -> dict: - async with semaphore: - try: - language = "Chinese" if detect_main_language(_process_batch[0][0]['description']) == "zh" else "English" - - _process_nodes = _process_batch[0] - _process_edges = _process_batch[1] - - entities = [ - f"{_process_node['node_id']}: {_process_node['description']}" for _process_node in _process_nodes - ] - - relations = [ - f"{_process_edge[0]} -- {_process_edge[1]}: {_process_edge[2]['description']}" - for _process_edge in _process_edges - ] - - entities_str = "\n".join([f"{index + 1}. {entity}" for index, entity in enumerate(entities)]) - relations_str = "\n".join([f"{index + 1}. 
{relation}" for index, relation in enumerate(relations)]) - - prompt = MULTI_HOP_GENERATION_PROMPT[language].format( - entities=entities_str, - relationships=relations_str - ) - - context = await llm_client.generate_answer(prompt) - - # post-process the context - if "Question:" in context and "Answer:" in context: - question = context.split("Question:")[1].split("Answer:")[0].strip() - answer = context.split("Answer:")[1].strip() - elif "问题:" in context and "答案:" in context: - question = context.split("问题:")[1].split("答案:")[0].strip() - answer = context.split("答案:")[1].strip() - else: - return {} - - question = question.strip("\"") - answer = answer.strip("\"") - - logger.info("Question: %s", question) - logger.info("Answer: %s", answer) - - return { - compute_content_hash(question): { - "question": question, - "answer": answer, - "loss": get_average_loss(_process_batch, traverse_strategy.loss_strategy), - } - } - - except Exception as e: # pylint: disable=broad-except - logger.error("Error occurred while processing batch: %s", e) - return {} - - async for result in tqdm_async( - asyncio.as_completed([_process_single_batch(batch) for batch in processing_batches]), - total=len(processing_batches), - desc="[4/4]Generating QAs" - ): - try: - if progress_bar is not None: - progress_bar(len(results) / len(processing_batches), desc="[4/4]Generating QAs") - results.update(await result) - if progress_bar is not None and len(results) == len(processing_batches): - progress_bar(1, desc="[4/4]Generating QAs") - except Exception as e: # pylint: disable=broad-except - logger.error("Error occurred while generating QA: %s", e) - return results diff --git a/graphgen/run.py b/graphgen/run.py new file mode 100644 index 00000000..b56f79c6 --- /dev/null +++ b/graphgen/run.py @@ -0,0 +1,151 @@ +import argparse +import logging +import os +import sys +import time +from importlib import resources +from typing import Any, Dict + +import ray +import yaml +from dotenv import load_dotenv +from ray.data.block import Block +from ray.data.datasource.filename_provider import FilenameProvider + +from graphgen.engine import Engine +from graphgen.operators import operators +from graphgen.utils import CURRENT_LOGGER_VAR, logger, set_logger + +sys_path = os.path.abspath(os.path.dirname(__file__)) + +load_dotenv() + +# Suppress non-error output temporarily +# Save original streams for restoration +_original_stdout = sys.stdout +_original_stderr = sys.stderr +_devnull = None + + +def set_working_dir(folder): + os.makedirs(folder, exist_ok=True) + + +def save_config(config_path, global_config): + if not os.path.exists(os.path.dirname(config_path)): + os.makedirs(os.path.dirname(config_path)) + with open(config_path, "w", encoding="utf-8") as config_file: + yaml.dump( + global_config, config_file, default_flow_style=False, allow_unicode=True + ) + + +class NodeFilenameProvider(FilenameProvider): + def __init__(self, node_id: str): + self.node_id = node_id + + def get_filename_for_block( + self, block: Block, write_uuid: str, task_index: int, block_index: int + ) -> str: + # format: {node_id}_{write_uuid}_{task_index:06}_{block_index:06}.json + return f"{self.node_id}_{write_uuid}_{task_index:06d}_{block_index:06d}.jsonl" + + def get_filename_for_row( + self, + row: Dict[str, Any], + write_uuid: str, + task_index: int, + block_index: int, + row_index: int, + ) -> str: + raise NotImplementedError( + f"Row-based filenames are not supported by write_json. 
" + f"Node: {self.node_id}, write_uuid: {write_uuid}" + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config_file", + help="Config parameters for GraphGen.", + default=resources.files("graphgen") + .joinpath("configs") + .joinpath("aggregated_config.yaml"), + type=str, + ) + + args = parser.parse_args() + + with open(args.config_file, "r", encoding="utf-8") as f: + config = yaml.load(f, Loader=yaml.FullLoader) + + working_dir = config.get("global_params", {}).get("working_dir", "cache") + unique_id = int(time.time()) + output_path = os.path.join(working_dir, "output", f"{unique_id}") + set_working_dir(output_path) + log_path = os.path.join(working_dir, "logs", "Driver.log") + driver_logger = set_logger( + log_path, + name="GraphGen", + console_level=logging.ERROR, + if_stream=True, + ) + CURRENT_LOGGER_VAR.set(driver_logger) + logger.info( + "GraphGen with unique ID %s logging to %s", + unique_id, + log_path, + ) + + # Temporarily suppress non-error output (print statements, third-party libraries, Ray Data progress) + # Only redirect stdout to preserve stderr for logger error output + global _devnull + _devnull = open(os.devnull, 'w', encoding='utf-8') + sys.stdout = _devnull + + try: + engine = Engine(config, operators) + ds = ray.data.from_items([]) + results = engine.execute(ds) + + for node_id, dataset in results.items(): + node_output_path = os.path.join(output_path, f"{node_id}") + os.makedirs(node_output_path, exist_ok=True) + dataset.write_json( + node_output_path, + filename_provider=NodeFilenameProvider(node_id), + pandas_json_args_fn=lambda: { + "force_ascii": False, + "orient": "records", + "lines": True, + }, + ) + logger.info("Node %s results saved to %s", node_id, node_output_path) + + save_config(os.path.join(output_path, "config.yaml"), config) + logger.info("GraphGen completed successfully. 
Data saved to %s", output_path) + finally: + # Restore original stdout before printing results + sys.stdout = _original_stdout + if _devnull: + _devnull.close() + _devnull = None + + # Print save information to console + if 'results' in locals() and results: + print("\n" + "="*60) + print("GraphGen execution completed successfully!") + print("="*60) + for node_id, dataset in results.items(): + node_output_path = os.path.join(output_path, f"{node_id}") + print(f"✓ Node '{node_id}' results saved to: {node_output_path}") + print(f"✓ Config saved to: {os.path.join(output_path, 'config.yaml')}") + print(f"✓ Logs saved to: {log_path}") + print("="*60 + "\n") + else: + print("\n⚠️ Warning: No results were generated.\n") + + +if __name__ == "__main__": + main() diff --git a/graphgen/templates/__init__.py b/graphgen/templates/__init__.py index 6e362d08..99e297ee 100644 --- a/graphgen/templates/__init__.py +++ b/graphgen/templates/__init__.py @@ -1,9 +1,20 @@ -from .kg_extraction import KG_EXTRACTION_PROMPT -from .kg_summarization import KG_SUMMARIZATION_PROMPT -from .search_judgement import SEARCH_JUDGEMENT_PROMPT +from .coreference_resolution import COREFERENCE_RESOLUTION_PROMPT from .description_rephrasing import DESCRIPTION_REPHRASING_PROMPT -from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT -from .answer_rephrasing import ANSWER_REPHRASING_PROMPT +from .extraction import SCHEMA_GUIDED_EXTRACTION_PROMPT +from .generation import ( + AGGREGATED_GENERATION_PROMPT, + ATOMIC_GENERATION_PROMPT, + COT_GENERATION_PROMPT, + MULTI_HOP_GENERATION_PROMPT, + OMICS_QA_GENERATION_PROMPT, + VQA_GENERATION_PROMPT, +) +from .kg import ( + KG_EXTRACTION_PROMPT, + KG_SUMMARIZATION_PROMPT, + MMKG_EXTRACTION_PROMPT, + OMICS_KG_EXTRACTION_PROMPT, +) from .question_generation import QUESTION_GENERATION_PROMPT -from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT -from .coreference_resolution import COREFERENCE_RESOLUTION_TEMPLATE +from .search_judgement import SEARCH_JUDGEMENT_PROMPT +from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT diff --git a/graphgen/templates/coreference_resolution.py b/graphgen/templates/coreference_resolution.py index b29394ad..bc03e671 100644 --- a/graphgen/templates/coreference_resolution.py +++ b/graphgen/templates/coreference_resolution.py @@ -1,4 +1,3 @@ -# pylint: disable=C0301 TEMPLATE_ZH: str = """请根据参考文本识别并消解文本中的指代词,明确每个代词所指代的具体实体,并直接输出消解后的文本。 -示例- @@ -16,7 +15,8 @@ 输出: """ -TEMPLATE_EN: str = """Please identify and resolve the pronouns in the reference text, specify the specific entities referred to by each pronoun, and directly output the resolved text. +TEMPLATE_EN: str = """Please identify and resolve the pronouns in the reference text, \ +specify the specific entities referred to by each pronoun, and directly output the resolved text. -Example- Input: @@ -33,7 +33,4 @@ Output: """ -COREFERENCE_RESOLUTION_TEMPLATE = { - "en": TEMPLATE_EN, - "zh": TEMPLATE_ZH -} +COREFERENCE_RESOLUTION_PROMPT = {"en": TEMPLATE_EN, "zh": TEMPLATE_ZH} diff --git a/graphgen/templates/description_rephrasing.py b/graphgen/templates/description_rephrasing.py index a0e38012..5c4dc861 100644 --- a/graphgen/templates/description_rephrasing.py +++ b/graphgen/templates/description_rephrasing.py @@ -19,7 +19,7 @@ ################ -Real Data- ################ -Input: +Input: {input_sentence} ################ Please directly output the rewritten sentence without any additional information. 
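The prompt dictionaries in this template package are re-keyed from "English"/"Chinese" to the language codes "en"/"zh" (see the `COREFERENCE_RESOLUTION_PROMPT` change above and the `DESCRIPTION_REPHRASING_PROMPT` hunk below), and the rewritten `graphgen/templates/__init__.py` re-exports them from the new `generation`, `kg`, and `extraction` subpackages. A minimal caller sketch under those assumptions — `build_atomic_prompt` is a hypothetical helper, and `detect_main_language` returning "en"/"zh" is inferred from the removed traversal code rather than confirmed by this patch:

```python
# Illustrative sketch only -- not part of this patch.
from graphgen.templates import ATOMIC_GENERATION_PROMPT
from graphgen.utils import detect_main_language


def build_atomic_prompt(chunk_text: str) -> str:
    # Assumed to return the same "en"/"zh" codes the prompt dicts are now keyed by.
    lang = detect_main_language(chunk_text)
    # The language code indexes the dict directly, replacing the old
    # 'Chinese' if lang == 'zh' else 'English' mapping step.
    return ATOMIC_GENERATION_PROMPT[lang].format(context=chunk_text)
```

Keying the dictionaries by the detector's own codes drops the extra English/Chinese mapping step that the removed `traverse_graph.py` operators carried around.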
@@ -110,11 +110,11 @@ DESCRIPTION_REPHRASING_PROMPT= { - "English": { + "en": { "ANTI_TEMPLATE": ANTI_TEMPLATE_EN, "TEMPLATE": TEMPLATE_EN }, - "Chinese": { + "zh": { "ANTI_TEMPLATE": ANTI_TEMPLATE_ZH, "TEMPLATE": TEMPLATE_ZH } diff --git a/graphgen/templates/extraction/__init__.py b/graphgen/templates/extraction/__init__.py new file mode 100644 index 00000000..70f1369a --- /dev/null +++ b/graphgen/templates/extraction/__init__.py @@ -0,0 +1 @@ +from .schema_guided_extraction import SCHEMA_GUIDED_EXTRACTION_PROMPT diff --git a/graphgen/templates/extraction/schema_guided_extraction.py b/graphgen/templates/extraction/schema_guided_extraction.py new file mode 100644 index 00000000..e96c3886 --- /dev/null +++ b/graphgen/templates/extraction/schema_guided_extraction.py @@ -0,0 +1,70 @@ +TEMPLATE_EN = """You are an expert at extracting information from text based on a given schema. +Extract relevant information about {field} from a given contract document according to the provided schema. + +Instructions: +1. Carefully read the entire document provided at the end of this prompt. +2. Extract the relevant information. +3. Present your findings in JSON format as specified below. + +Important Notes: +- Extract only relevant information. +- Consider the context of the entire document when determining relevance. +- Do not be verbose, only respond with the correct format and information. +- Some docs may have multiple relevant excerpts -- include all that apply. +- Some questions may have no relevant excerpts -- just return "". +- Do not include additional JSON keys beyond the ones listed here. +- Do not include the same key multiple times in the JSON. +- Use English for your response. + +Expected JSON keys and explanation of what they are: +{schema_explanation} + +Expected format: +{{ + "key1": "value1", + "key2": "value2", + ... +}} + +{examples} + +Document to extract from: +{text} +""" + +TEMPLATE_ZH = """你是一个擅长根据给定的模式从文本中提取信息的专家。 +根据提供的模式,从合同文件中提取与{field}相关的信息。 +操作说明: +1. 仔细阅读本提示末尾提供的整份文件。 +2. 提取相关信息。 +3. 按照下面指定的JSON格式呈现你的发现。 + +重要注意事项: +- 仅提取相关信息。 +- 在确定相关性时,考虑整份文件的上下文。 +- 不要冗长,只需以正确的格式和信息进行回应。 +- 有些文件可能有多个相关摘录——请包含所有适用的内容。 +- 有些问题可能没有相关摘录——只需返回""。 +- 不要在JSON中包含除列出的键之外的其他键。 +- 不要多次包含同一个键。 +- 使用中文回答。 + +预期的JSON键及其说明: +{schema_explanation} + +预期格式: +{{ + "key1": "value1", + "key2": "value2", + ... +}} + +{examples} +要提取的文件: +{text} +""" + +SCHEMA_GUIDED_EXTRACTION_PROMPT = { + "en": TEMPLATE_EN, + "zh": TEMPLATE_ZH, +} diff --git a/graphgen/templates/extraction/schemas/legal_contract.json b/graphgen/templates/extraction/schemas/legal_contract.json new file mode 100644 index 00000000..00b8baca --- /dev/null +++ b/graphgen/templates/extraction/schemas/legal_contract.json @@ -0,0 +1,48 @@ +{ + "type": "object", + "name": "legal contract", + "description": "A legal contract for leasing property.", + "properties": { + "leased_space": { + "type": "string", + "description": "Description of the space that is being leased." + }, + "lessee": { + "type": "string", + "description": "The lessee's name (and possibly address)." + }, + "lessor": { + "type": "string", + "description": "The lessor's name (and possibly address)." + }, + "signing_date": { + "type": "string", + "description": "The date the contract was signed." + }, + "start_date": { + "type": "string", + "description": "The start date of the lease." + }, + "end_date": { + "type": "string", + "description": "The end date of the lease." + }, + "term_of_payment": { + "type": "string", + "description": "Description of the payment terms." 
+ }, + "designated_use": { + "type": "string", + "description": "Designated use of the property being leased." + }, + "extension_period": { + "type": "string", + "description": "Description of the extension options for the lease." + }, + "expiration_date_of_lease": { + "type": "string", + "description": "The expiration date of the lease." + } + }, + "required": ["lessee", "lessor", "start_date", "end_date"] +} diff --git a/graphgen/templates/generation/__init__.py b/graphgen/templates/generation/__init__.py new file mode 100644 index 00000000..22c3797a --- /dev/null +++ b/graphgen/templates/generation/__init__.py @@ -0,0 +1,6 @@ +from .aggregated_generation import AGGREGATED_GENERATION_PROMPT +from .atomic_generation import ATOMIC_GENERATION_PROMPT +from .cot_generation import COT_GENERATION_PROMPT +from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT +from .omics_qa_generation import OMICS_QA_GENERATION_PROMPT +from .vqa_generation import VQA_GENERATION_PROMPT diff --git a/graphgen/templates/answer_rephrasing.py b/graphgen/templates/generation/aggregated_generation.py similarity index 83% rename from graphgen/templates/answer_rephrasing.py rename to graphgen/templates/generation/aggregated_generation.py index a33e9d9e..305064e7 100644 --- a/graphgen/templates/answer_rephrasing.py +++ b/graphgen/templates/generation/aggregated_generation.py @@ -1,7 +1,7 @@ -TEMPLATE_CONTEXT_EN: str = """---Role--- - +# pylint: disable=C0301 +ANSWER_REPHRASING_CONTEXT_EN: str = """---Role--- You are an NLP expert responsible for generating a logically structured and coherent rephrased version of the TEXT based on ENTITIES and RELATIONSHIPS provided below. You may refer to the original text to assist in generating the rephrased version, but ensure that the final output text meets the requirements. -Use {language} as output language. +Use English as output language. ---Goal--- To generate a version of the text that is rephrased and conveys the same meaning as the original entity and relationship descriptions, while: @@ -50,13 +50,11 @@ """ -TEMPLATE_CONTEXT_ZH: str = """---角色--- - +ANSWER_REPHRASING_CONTEXT_ZH: str = """---角色--- 你是一位NLP专家,负责根据下面提供的实体和关系生成逻辑结构清晰且连贯的文本重述版本。你可以参考原始文本辅助生成,但需要确保最终输出的文本符合要求。 -使用{language}作为输出语言。 +使用中文作为输出语言。 ---目标--- - 生成文本的重述版本,使其传达与原始实体和关系描述相同的含义,同时: 1. 遵循清晰的逻辑流和结构 2. 建立适当的因果关系 @@ -100,10 +98,9 @@ """ -TEMPLATE_EN: str = """---Role--- - +ANSWER_REPHRASING_EN: str = """---Role--- You are an NLP expert responsible for generating a logically structured and coherent rephrased version of the TEXT based on ENTITIES and RELATIONSHIPS provided below. -Use {language} as output language. +Use English as output language. ---Goal--- To generate a version of the text that is rephrased and conveys the same meaning as the original entity and relationship descriptions, while: @@ -147,13 +144,11 @@ """ -TEMPLATE_ZH: str = """---角色--- - +ANSWER_REPHRASING_ZH: str = """---角色--- 你是一位NLP专家,负责根据下面提供的实体和关系生成逻辑结构清晰且连贯的文本重述版本。 -使用{language}作为输出语言。 +使用中文作为输出语言。 ---目标--- - 生成文本的重述版本,使其传达与原始实体和关系描述相同的含义,同时: 1. 遵循清晰的逻辑流和结构 2. 建立适当的因果关系 @@ -206,14 +201,33 @@ Rephrased Text: """ +QUESTION_GENERATION_EN: str = """The answer to a question is provided. Please generate a question that corresponds to the answer. 
+ +################ +Answer: +{answer} +################ +Question: +""" + +QUESTION_GENERATION_ZH: str = """下面提供了一个问题的答案,请生成一个与答案对应的问题。 + +################ +答案: +{answer} +################ +问题: +""" -ANSWER_REPHRASING_PROMPT= { - "English": { - "TEMPLATE": TEMPLATE_EN + REQUIREMENT_EN, - "CONTEXT_TEMPLATE": TEMPLATE_CONTEXT_EN + REQUIREMENT_EN +AGGREGATED_GENERATION_PROMPT = { + "en": { + "ANSWER_REPHRASING": ANSWER_REPHRASING_EN + REQUIREMENT_EN, + "ANSWER_REPHRASING_CONTEXT": ANSWER_REPHRASING_CONTEXT_EN + REQUIREMENT_EN, + "QUESTION_GENERATION": QUESTION_GENERATION_EN, + }, + "zh": { + "ANSWER_REPHRASING": ANSWER_REPHRASING_ZH + REQUIREMENT_ZH, + "ANSWER_REPHRASING_CONTEXT": ANSWER_REPHRASING_CONTEXT_ZH + REQUIREMENT_ZH, + "QUESTION_GENERATION": QUESTION_GENERATION_ZH, }, - "Chinese": { - "TEMPLATE": TEMPLATE_ZH + REQUIREMENT_ZH, - "CONTEXT_TEMPLATE": TEMPLATE_CONTEXT_ZH + REQUIREMENT_ZH - } } diff --git a/graphgen/templates/generation/atomic_generation.py b/graphgen/templates/generation/atomic_generation.py new file mode 100644 index 00000000..141c40e4 --- /dev/null +++ b/graphgen/templates/generation/atomic_generation.py @@ -0,0 +1,32 @@ +# pylint: disable=C0301 +TEMPLATE_EN: str = """You are given a text passage. Your task is to generate a question and answer (QA) pair based on the content of that text. +The answer should be accurate and directly derived from the text. Make sure the QA pair is relevant to the main theme or important details of the given text. +For example: +Question: What is the effect of overexpressing the BG1 gene on grain size and development? +Answer: Overexpression of the BG1 gene leads to significantly increased grain size, demonstrating its role in grain development. + +Question: What role does TAC4 play in the gravitropism of rice shoots? +Answer: TAC4 is a key regulator of gravitropism in rice shoots, promoting the bending of shoots towards the gravity vector. + +Here is the text passage you need to generate a QA pair for: +{context} +""" + +TEMPLATE_ZH: str = """给定一个文本段落。你的任务是根据该文本的内容生成一个问答(QA)对。 +答案应准确且直接从文本中得出。确保QA对与给定文本的主题或重要细节相关。 +例如: +问题:过表达BG1基因对谷粒大小和发育有什么影响? +答案:BG1基因的过表达显著增加了谷粒大小,表明其在谷物发育中的作用。 + +问题:TAC4在水稻茎的重力性状中扮演什么角色? +答案:TAC4是水稻茎重力性状的关键调节因子,促进茎向重力矢量弯曲。 + +以下是你需要为其生成QA对的文本段落: +{context} +""" + + +ATOMIC_GENERATION_PROMPT = { + "en": TEMPLATE_EN, + "zh": TEMPLATE_ZH, +} diff --git a/graphgen/templates/generation/cot_generation.py b/graphgen/templates/generation/cot_generation.py new file mode 100644 index 00000000..e179a704 --- /dev/null +++ b/graphgen/templates/generation/cot_generation.py @@ -0,0 +1,196 @@ +COT_GENERATION_ZH = """根据给定的知识图谱原始信息及已生成的推理路径,产出一条符合模板要求、可直接用于下游训练或推理的 CoT 数据。\ +CoT(Chain-of-Thought,思维链)指在回答复杂问题时,把中间推理步骤一步一步显式写出来,使推理过程透明、可追溯,而不是直接给出最终答案。 + +-输入格式- +[Entities:] +(实体名:实体描述) +... + +[Relationships:] +(来源实体)-[关系描述]->(目标实体) +... + +[Question and Reasoning Path:] +(问题) +(推理路径) + +-输出要求- +1. 每一步只完成一个不可分割的子任务,并用自然语言衔接,但是要避免生硬的连接词。 +2. 使用中文。 +3. 不要使用有序列表或编号。 +4. 请直接给出答案,不要生成无关信息。 + +-真实数据- +输入: +[Entities:]: +{entities} + +[Relationships:]: +{relationships} + +[Question:]: +{question} + +[Reasoning_Template:]: +{reasoning_template} + +输出: + +""" + +COT_GENERATION_EN = """Given the raw knowledge graph information and the provided reasoning-path, \ +produce one Chain-of-Thought (CoT) sample that strictly follows the template \ +and can be directly used for downstream training or inference. 
+CoT (Chain-of-Thought) means that when answering a complex question, the intermediate reasoning steps are \ +explicitly written out one by one, making the reasoning process transparent and traceable instead of giving \ +only the final answer. + +-Input Format- +[Entities:]: +(ENTITY_NAME: ENTITY_DESCRIPTION) +... + +[Relationships:]: +(ENTITY_SOURCE)-[RELATIONSHIP_DESCRIPTION]->(ENTITY_TARGET) +... + +[Question and Reasoning Path:]: +(QUESTION) +(REASONING_PATH) + +-Output Requirements- +1. Each step completes a single, indivisible sub-task and is naturally connected, avoiding abrupt transition words. +2. Use English. +3. Do not use ordered lists or numbering. +4. Do not generate extraneous information, just provide the answer. + +-Real Data- +Input: +[Entities:]: +{entities} + +[Relationships:]: +{relationships} + +[Question:]: +{question} + +[Reasoning_Template:]: +{reasoning_template} + +Output: +""" + +COT_TEMPLATE_DESIGN_ZH = """你是一位“元推理架构师”。你的任务不是回答问题,\ +而是根据给定的知识图谱中的实体和关系的名称以及描述信息,设计一条可复用、可泛化的 CoT 推理路径模板。\ + +-步骤- +1. 实体识别 +- 准确地识别[Entities:]章节中的实体信息,包括实体名、实体描述信息。 +- 实体信息的一般格式为: +(实体名:实体描述) + +2. 关系识别 +- 准确地识别[Relationships:]章节中的关系信息,包括来源实体名、目标实体名、关系描述信息。 +- 关系信息的一般格式为: +(来源实体名)-[关系描述]->(目标实体名) + +3. 图结构理解 +- 正确地将关系信息中的来源实体名与实体信息关联。 +- 根据提供的关系信息还原出图结构。 + +4. 问题设计 +- 围绕知识图谱所表达的“核心主题”设计一个问题。 +- 问题必须能在图谱内部通过实体、关系或属性直接验证;避免主观判断。 +- 问题应该能够模型足够的思考,充分利用图谱中的实体和关系,避免过于简单或无关的问题。 + +5. 推理路径生成 +- 根据问题设计一个**可被后续模型直接执行的推理蓝图**。 +- 保持步骤最小化:每一步只解决一个“不可分割”的子问题。 + +-约束条件- +1. 不要在回答中描述你的思考过程,直接给出回复,只给出问题和推理路径设计,不要生成无关信息。 +2. 如果提供的描述信息相互矛盾,请解决矛盾并提供一个单一、连贯的逻辑。 +3. 避免使用停用词和过于常见的词汇。 +4. 不要出现具体数值或结论,不要出现“识别实体”、“识别关系”这类无意义的操作描述。 +5. 使用中文作为输出语言。 +6. 输出格式为: +问题: +推理路径设计: + +-真实数据- +输入: +[Entities:]: +{entities} + +[Relationships:]: +{relationships} + +输出: +""" + + +COT_TEMPLATE_DESIGN_EN = """You are a “meta-reasoning architect”. \ +Your task is NOT to answer the question, but to design a reusable, generalizable CoT reasoning-path \ +template based solely on the names and descriptions of entities and \ +relationships in the provided knowledge graph. + +- Steps - +1. Entity Recognition +- Accurately recognize entity information in the [Entities:] section, including entity names and descriptions. +- The general formats for entity information are: +(ENTITY_NAME: ENTITY_DESCRIPTION) + +2. Relationship Recognition +- Accurately recognize relationship information in the [Relationships:] section, including source_entity_name, target_entity_name, and relationship descriptions. +- The general formats for relationship information are: +(SOURCE_ENTITY_NAME)-[RELATIONSHIP_DESCRIPTION]->(TARGET_ENTITY_NAME) + +3. Graph Structure Understanding +- Correctly associate the source entity name in the relationship information with the entity information. +- Reconstruct the graph structure based on the provided relationship information. + +4. Question Design +- Design a question around the "core theme" expressed by the knowledge graph. +- The question must be verifiable directly within the graph through entities, relationships, or attributes; avoid subjective judgments. +- The question should allow the model to think sufficiently, fully utilizing the entities and relationships in the graph, avoiding overly simple or irrelevant questions. + +5. Reasoning-Path Design +- Output a **blueprint that any later model can directly execute**. +- Keep steps minimal: each step solves one indivisible sub-problem. + + +- Constraints - +1. Do NOT describe your thinking; output only the reasoning-path design. +2. 
If the provided descriptions are contradictory, resolve conflicts and provide a single coherent logic. +3. Avoid using stop words and overly common words. +4. Do not include specific numerical values or conclusions, \ +and DO NOT describing meaningless operations like "Identify the entity" or "Identify the relationship". +5. Use English as the output language. +6. The output format is: +Question: +Reasoning-Path Design: + +Please summarize the information expressed by the knowledge graph based on the following [Entities:] and [Relationships:] provided. + +- Real Data - +Input: +[Entities:]: +{entities} + +[Relationships:]: +{relationships} + +Output: +""" + +COT_GENERATION_PROMPT = { + "en": { + "COT_GENERATION": COT_GENERATION_EN, + "COT_TEMPLATE_DESIGN": COT_TEMPLATE_DESIGN_EN, + }, + "zh": { + "COT_GENERATION": COT_GENERATION_ZH, + "COT_TEMPLATE_DESIGN": COT_TEMPLATE_DESIGN_ZH, + }, +} diff --git a/graphgen/templates/multi_hop_generation.py b/graphgen/templates/generation/multi_hop_generation.py similarity index 95% rename from graphgen/templates/multi_hop_generation.py rename to graphgen/templates/generation/multi_hop_generation.py index dad2ee36..73857ebb 100644 --- a/graphgen/templates/multi_hop_generation.py +++ b/graphgen/templates/generation/multi_hop_generation.py @@ -1,5 +1,4 @@ # pylint: disable=C0301 - TEMPLATE_ZH: str = """请基于以下知识子图生成多跳推理问题和答案。你将获得一个知识子图,其中包含一系列实体、关系和事实。你的任务是提出一个问题,该问题需要经过多次推理才能回答。问题的答案应该是从给定的知识子图中推断出来的。确保问题的难度适中,需要多步推理才能回答。 例如: @@ -54,7 +53,4 @@ Output the generated question and answer directly, please do not copy the example question and answer directly, and do not provide irrelevant information. """ -MULTI_HOP_GENERATION_PROMPT = { - "English": TEMPLATE_EN, - "Chinese": TEMPLATE_ZH -} +MULTI_HOP_GENERATION_PROMPT = {"en": TEMPLATE_EN, "zh": TEMPLATE_ZH} diff --git a/graphgen/templates/generation/omics_qa_generation.py b/graphgen/templates/generation/omics_qa_generation.py new file mode 100644 index 00000000..1d356e62 --- /dev/null +++ b/graphgen/templates/generation/omics_qa_generation.py @@ -0,0 +1,99 @@ +# pylint: disable=C0301 +OMICS_QA_TEMPLATE_EN: str = """You are a senior computational biologist specializing in multi-omics data analysis (genomics, transcriptomics, proteomics). Your task is to generate logically coherent, verifiable and non-hallucinated question-answer pairs for the given biological sample described by the provided ENTITIES and RELATIONSHIPS. +Use English as the output language. + +---Objectives--- +Create multiple sets of omics-centric QA pairs that satisfy the following: +1. Only ask about objectively existing facts in the provided data (e.g., gene names, sequence information, functional annotations, regulatory elements, structural features, experimental metadata, etc.). Avoid subjective or speculative questions. +2. Ensure that each question has a single, clear and verifiable answer that can be directly confirmed from the given entities/relationships. +3. Questions should cover diverse aspects: sequence, structure, function, interactions, regulation, experimental annotations, etc. +4. Avoid repetitive questions; each question must be unique and meaningful. +5. Use concise, unambiguous language; do not invent information beyond the provided data. + +---Instructions--- +1. Carefully analyse the supplied ENTITIES and RELATIONSHIPS to identify: + - Biological entities (genes, proteins, RNA molecules, regulatory elements, pathways, etc.) 
+ - Sequence information (DNA sequences, RNA sequences, protein sequences) + - Functional annotations (gene function, protein function, RNA function, biological processes) + - Structural features (chromosomal location, genomic coordinates, domain structures, etc.) + - Regulatory relationships (transcription, translation, regulation, interaction) + - Experimental metadata (database IDs, organism, experimental methods, etc.) +2. Organise information logically: + - Start with sequence/primary structure information + - Proceed to functional annotations and biological roles + - Include regulatory relationships and interactions + - End with experimental context and metadata +3. Maintain scientific accuracy and consistent nomenclature (standard gene names, sequence identifiers, etc.). +4. Review each QA pair to guarantee logical consistency and absence of hallucination. + +################ +-ENTITIES- +################ +{entities} + +################ +-RELATIONSHIPS- +################ +{relationships} +################ +Directly output the generated QA pairs below. Do NOT copy any example questions, and do NOT include extraneous text. + +Question: +Answer: + +Question: +Answer: + +""" + +OMICS_QA_TEMPLATE_ZH: str = """你是一位资深的多组学数据计算生物学家(基因组学、转录组学、蛋白质组学)。你的任务是根据下述提供的实体与关系,为给定的生物样本生成逻辑连贯、可验证、无幻觉的中英双语问答对(这里仅输出中文)。 +使用中文作为输出语言。 + +---目标--- +创建多组以组学数据为中心的问答对,满足: +1. 仅询问数据中客观存在的事实(如基因名称、序列信息、功能注释、调控元件、结构特征、实验元数据等),避免主观或推测性问题。 +2. 每个问题必须有单一、明确且可直接验证的答案,答案必须能从给定实体/关系中直接确认。 +3. 问题需覆盖:序列、结构、功能、相互作用、调控、实验注释等多个维度,确保多样性与全面性。 +4. 避免重复提问,每个问题都独特且有意义。 +5. 语言简洁、无歧义,严禁编造超出给定数据的信息。 + +---说明--- +1. 仔细分析提供的实体与关系,识别: + - 生物实体(基因、蛋白质、RNA分子、调控元件、通路等) + - 序列信息(DNA序列、RNA序列、蛋白质序列) + - 功能注释(基因功能、蛋白质功能、RNA功能、生物学过程) + - 结构特征(染色体位置、基因组坐标、结构域等) + - 调控关系(转录、翻译、调控、相互作用) + - 实验元数据(数据库ID、生物体、实验方法等) +2. 按逻辑顺序组织信息: + - 从序列/一级结构信息入手 + - 再到功能注释和生物学作用 + - 包括调控关系和相互作用 + - 最后到实验背景和元数据 +3. 保持科学准确性,使用统一命名规范(标准基因名、序列标识符等)。 +4. 检查每对问答,确保逻辑一致且无幻觉。 + +################ +-实体- +################ +{entities} + +################ +-关系- +################ +{relationships} +################ +请直接在下方输出生成的问答对,不要复制任何示例,不要输出无关内容。 + +问题: <问题1> +答案: <答案1> + +问题: <问题2> +答案: <答案2> + +""" + +OMICS_QA_GENERATION_PROMPT = { + "en": OMICS_QA_TEMPLATE_EN, + "zh": OMICS_QA_TEMPLATE_ZH, +} diff --git a/graphgen/templates/generation/vqa_generation.py b/graphgen/templates/generation/vqa_generation.py new file mode 100644 index 00000000..4826be0e --- /dev/null +++ b/graphgen/templates/generation/vqa_generation.py @@ -0,0 +1,104 @@ +# pylint: disable=C0301 +TEMPLATE_EN: str = """You are a senior VQA data engineer. Your task is to generate logically coherent, verifiable and non-hallucinated question-answer pairs for the given multi-modal samples. +Use English as the output language. + +---Objectives--- +Create multiple sets of VQA question-answer pairs that satisfy the following: +1. Only ask about objectively existing facts in the given data, avoiding subjective or ambiguous questions. +2. Ensure that each question has a clear and verifiable answer, avoiding questions with no answer or uncertainty. +3. Questions should cover various aspects of both image and text content, ensuring diversity and comprehensiveness. +4. Avoid repetitive questions, ensuring that each question is unique and meaningful. +5. Use clear and concise language, avoiding complex or ambiguous wording. + +---Instructions--- +1. 
Carefully analyze the provided entities and relationships to identify: + - Key concepts and their hierarchical relationships + - Temporal sequences and time order + - Cause-and-effect relationships + - Dependencies between different elements +2. Organize the information into a logical sequence by: + - Starting with foundational concepts + - Gradually building up to more complex relationships + - Grouping related ideas together + - Creating clear transitions between sections +3. Maintain the following when generating question-answer pairs: + - Logical flow + - Clear connections between concepts + - Appropriate context and background + - Coherent narrative structure +4. Review and refine the question-answer pairs to ensure: + - Overall logical consistency + - Clear cause-and-effect relationships + +################ +-Entities- +################ +{entities} +################ +-Relationships- +################ +{relationships} +################ +Directly output the generated questions and answers, please do not directly copy the example questions and answers, and do not provide irrelevant information. +Here is the response format you should follow: +Question: +Answer: + +Question: +Answer: + +""" + +TEMPLATE_ZH: str = """---角色--- +你是一位资深 VQA 数据工程师。你需要为给定的多模态样本生成逻辑连贯、可验证、无幻觉的问答对。 +使用中文作为输出语言。 + +---目标--- +创建多组 VQA 问答对,满足: +1. 仅询问给定数据中客观存在的事实,避免主观或模糊的问题。 +2. 确保每个问题都有明确且可验证的答案,避免无答案或不确定的问题。 +3. 问题应涵盖图像和文本内容的各个方面,确保多样性和全面性。 +4. 避免重复问题,确保每个问题都是独特且有意义的。 +5. 使用清晰简洁的语言,避免复杂或含糊的措辞。 + +---说明--- +1. 仔细分析提供的实体和关系,以识别: + - 关键概念及其层级关系 + - 时间序列和时间顺序 + - 因果关系 + - 不同元素之间的依赖关系 +2. 通过以下方式将信息组织成逻辑顺序: + - 从基础概念开始 + - 逐步建立更复杂的关系 + - 将相关的想法分组在一起 + - 在各部分之间创建清晰的过渡 +3. 生成问答对时保持: + - 逻辑流畅 + - 概念之间的清晰联系 + - 适当的上下文和背景 + - 连贯的叙述结构 +4. 检查和完善问答对以确保: + - 整体逻辑一致性 + - 清晰的因果关系 + +################ +-实体- +################ +{entities} + +################ +-关系- +################ +{relationships} +################ +直接输出生成的问题和答案,请不要直接复制示例问题和答案,不要输出无关内容。 +以下是你应该遵循的响应格式: +问题: <问题1> +答案: <答案1> + +问题: <问题2> +答案: <答案2> + +""" + +VQA_GENERATION_PROMPT = {"en": TEMPLATE_EN, "zh": TEMPLATE_ZH} diff --git a/graphgen/templates/kg/__init__.py b/graphgen/templates/kg/__init__.py new file mode 100644 index 00000000..e39c1408 --- /dev/null +++ b/graphgen/templates/kg/__init__.py @@ -0,0 +1,5 @@ +from .kg_extraction import KG_EXTRACTION_PROMPT +from .kg_summarization import KG_SUMMARIZATION_PROMPT +from .mm_kg_extraction import MMKG_EXTRACTION_PROMPT +from .omics_kg_extraction import OMICS_KG_EXTRACTION_PROMPT +from .protein_kg_extraction import PROTEIN_KG_EXTRACTION_PROMPT diff --git a/graphgen/templates/kg_extraction.py b/graphgen/templates/kg/kg_extraction.py similarity index 98% rename from graphgen/templates/kg_extraction.py rename to graphgen/templates/kg/kg_extraction.py index 8d98bb95..930cbb99 100644 --- a/graphgen/templates/kg_extraction.py +++ b/graphgen/templates/kg/kg_extraction.py @@ -1,10 +1,9 @@ # pylint: disable=C0301 - TEMPLATE_EN: str = """You are an NLP expert, skilled at analyzing text to extract named entities and their relationships. -Goal- Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities. -Use {language} as output language. +Use English as output language. -Steps- 1. Identify all entities. For each identified entity, extract the following information: @@ -23,7 +22,7 @@ 3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. 
These should capture the overarching ideas present in the document. Format the content-level key words as ("content_keywords"{tuple_delimiter}) -4. Return output in {language} as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter. +4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter. 5. When finished, output {completion_delimiter} @@ -85,7 +84,7 @@ -目标- 给定一个实体类型列表和可能与列表相关的文本,从文本中识别所有这些类型的实体,以及这些实体之间所有的关系。 -使用{language}作为输出语言。 +使用中文作为输出语言。 -步骤- 1. 识别所有实体。对于每个识别的实体,提取以下信息: @@ -93,7 +92,7 @@ - entity_type:以下类型之一:[{entity_types}] - entity_summary:实体的属性与活动的全面总结 将每个实体格式化为("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) - + 2. 从步骤1中识别的实体中,识别所有(源实体,目标实体)对,这些实体彼此之间*明显相关*。 对于每对相关的实体,提取以下信息: - source_entity:步骤1中识别的源实体名称 @@ -189,12 +188,12 @@ IF_LOOP_ZH: str = """看起来可能仍然遗漏了一些实体和关系。如果仍有实体和关系需要添加,请回答YES | NO。""" KG_EXTRACTION_PROMPT: dict = { - "English": { + "en": { "TEMPLATE": TEMPLATE_EN, "CONTINUE": CONTINUE_EN, "IF_LOOP": IF_LOOP_EN, }, - "Chinese": { + "zh": { "TEMPLATE": TEMPLATE_ZH, "CONTINUE": CONTINUE_ZH, "IF_LOOP": IF_LOOP_ZH, @@ -205,6 +204,5 @@ "completion_delimiter": "<|COMPLETE|>", "entity_types": "concept, date, location, keyword, organization, person, event, work, nature, artificial, \ science, technology, mission, gene", - "language": "English", }, } diff --git a/graphgen/templates/kg_summarization.py b/graphgen/templates/kg/kg_summarization.py similarity index 86% rename from graphgen/templates/kg_summarization.py rename to graphgen/templates/kg/kg_summarization.py index 7cf75180..3e7cb6cb 100644 --- a/graphgen/templates/kg_summarization.py +++ b/graphgen/templates/kg/kg_summarization.py @@ -3,7 +3,7 @@ Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions. If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. Make sure it is written in third person, and include the entity names so we the have full context. -Use {language} as output language. +Use English as output language. ####### -Data- @@ -18,7 +18,7 @@ 请将所有这些描述整合成一个综合描述。确保包含所有描述中收集的信息。 如果提供的描述是矛盾的,请解决这些矛盾并提供一个连贯的总结。 确保以第三人称写作,并包含实体名称,以便我们有完整的上下文。 -使用{language}作为输出语言。 +使用中文作为输出语言。 ####### -数据- @@ -30,14 +30,9 @@ KG_SUMMARIZATION_PROMPT = { - "Chinese": { - "TEMPLATE": TEMPLATE_ZH - }, - "English": { - "TEMPLATE": TEMPLATE_EN - }, + "zh": {"TEMPLATE": TEMPLATE_ZH}, + "en": {"TEMPLATE": TEMPLATE_EN}, "FORMAT": { - "language": "English", "tuple_delimiter": "<|>", "record_delimiter": "##", "completion_delimiter": "<|COMPLETE|>", diff --git a/graphgen/templates/kg/mm_kg_extraction.py b/graphgen/templates/kg/mm_kg_extraction.py new file mode 100644 index 00000000..28327175 --- /dev/null +++ b/graphgen/templates/kg/mm_kg_extraction.py @@ -0,0 +1,131 @@ +# pylint: disable=C0301 +TEMPLATE_EN: str = """You are an expert in multi-modal data analysis and knowledge graph construction. Your task is to extract named entities and relationships from a given multi-modal data chunk and its accompanying text. + +-Objective- +Given a multi-modal data chunk (e.g., image, table, formula, etc. + accompanying text), construct a knowledge graph centered around the "central multi-modal entity": +- The central entity must be the image/table/formula itself (e.g., image-c71ef797e99af81047fbc7509609c765). 
+- Related entities and relationships must be extracted from the accompanying text. +- Only retain edges directly connected to the central entity, forming a star-shaped graph. +Use English as the output language. + +-Steps- +1. Identify the unique central multi-modal entity and recognize all text entities directly related to the central entity from the accompanying text. + For the central entity, extract the following information: + - entity_name: Use the unique identifier of the data chunk (e.g., image-c71ef797e99af81047fbc7509609c765). + - entity_type: Label according to the type of data chunk (image, table, formula, etc.). + - entity_summary: A brief description of the content of the data chunk and its role in the accompanying text. + For each entity recognized from the accompanying text, extract the following information: + - entity_name: The name of the entity, capitalized + - entity_type: One of the following types: [{entity_types}] + - entity_summary: A comprehensive summary of the entity's attributes and activities + Format each entity as ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. From the entities identified in Step 1, recognize all (source_entity, target_entity) pairs that are *obviously related* to each other. + For each pair of related entities, extract the following information: + - source_entity: The name of the source entity identified in Step 1 + - target_entity: The name of the target entity identified in Step 1 + - relationship_summary: Explain why you think the source entity and target entity are related to each other + Format each relationship as ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. Return the output list of all entities and relationships identified in Steps 1 and 2 in English. Use **{record_delimiter}** as the list separator. + +4. Upon completion, output {completion_delimiter} + +################ +-Example- +################ +Multi-modal data chunk type: image +Multi-modal data chunk unique identifier: image-c71ef797e99af81047fbc7509609c765 +Accompanying text: The Eiffel Tower is an iconic structure in Paris, France, designed by Gustave Eiffel and completed in 1889. It stands 324 meters tall and is one of the tallest structures in the world. The Eiffel Tower is located on the banks of the Seine River and attracts millions of visitors each year. It is not only an engineering marvel but also an important symbol of French culture. 
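A note on the star-graph constraint above: because only edges touching the central multi-modal entity are kept, downstream code may want a defensive filter before writing edges into the graph. A minimal sketch, assuming relationships arrive as (source, target, summary) triples; the helper name is illustrative and not part of this PR:

```python
def keep_star_edges(
    central_id: str, relationships: list[tuple[str, str, str]]
) -> list[tuple[str, str, str]]:
    """Keep only relationships that touch the central multi-modal entity.

    Illustrative sketch, e.g. central_id = "image-c71ef797e99af81047fbc7509609c765".
    """
    return [rel for rel in relationships if central_id in (rel[0], rel[1])]
```

In the expected output below, every relationship already uses the image identifier as its source, so such a filter would pass them all through unchanged.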
+################ +Output: +("entity"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"image"{tuple_delimiter}"This is an image showcasing the iconic structure in Paris, France, the Eiffel Tower, highlighting its full height of 324 meters along with the riverside scenery, symbolizing both engineering and cultural significance"){record_delimiter} +("entity"{tuple_delimiter}"Eiffel Tower"{tuple_delimiter}"landmark"{tuple_delimiter}"The Eiffel Tower is an iconic structure in Paris, France, designed by Gustave Eiffel and completed in 1889, standing 324 meters tall, located on the banks of the Seine River, attracting millions of visitors each year"){record_delimiter} +("entity"{tuple_delimiter}"Paris, France"{tuple_delimiter}"location"{tuple_delimiter}"Paris, France is the capital of France, known for its rich historical and cultural heritage and as the location of the Eiffel Tower"){record_delimiter} +("entity"{tuple_delimiter}"Gustave Eiffel"{tuple_delimiter}"person"{tuple_delimiter}"Gustave Eiffel is a renowned French engineer who designed and built the Eiffel Tower"){record_delimiter} +("entity"{tuple_delimiter}"Seine River"{tuple_delimiter}"location"{tuple_delimiter}"The Seine River is a major river flowing through Paris, France, with the Eiffel Tower located on its banks"){completion_delimiter} +("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Eiffel Tower"{tuple_delimiter}"The image showcases the iconic structure, the Eiffel Tower"){record_delimiter} +("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Paris, France"{tuple_delimiter}"The image's background is Paris, France, highlighting the geographical location of the Eiffel Tower"){record_delimiter} +("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Gustave Eiffel"{tuple_delimiter}"The Eiffel Tower in the image was designed by Gustave Eiffel"){record_delimiter} +("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Seine River"{tuple_delimiter}"The image showcases the scenery of the Eiffel Tower located on the banks of the Seine River"){completion_delimiter} +################ + +-Real Data- +Multi-modal data chunk type: {chunk_type} +Multi-modal data chunk unique identifier: {chunk_id} +Accompanying text: {chunk_text} +################ +Output: +""" + +TEMPLATE_ZH: str = """你是一个多模态数据分析和知识图谱构建专家。你的任务是从给定的多模态数据块及其伴随文本中抽取命名实体与关系。 + +-目标- +给定一个多模态数据块(例如图像、表格、公式等 + 伴随文本),构建以「中心多模态实体」为核心的知识图: +- 中心实体必须是图像/表格/公式本身(如 image-c71ef797e99af81047fbc7509609c765)。 +- 相关实体和关系必须从伴随文本中抽取。 +- 只保留与中心实体直接相连的边,形成星型图。 +使用中文作为输出语言。 + +-步骤- +1. 确定唯一的中心多模态实体,从伴随文本中识别所有与中心实体直接相关的文本实体。 + 对于中心实体,提取以下信息: + - entity_name:使用数据块的唯一标识符(如 image-c71ef797e99af81047fbc7509609c765)。 + - entity_type:根据数据块类型(图像、表格、公式等)进行标注。 + - entity_summary:简要描述数据块的内容和其在伴随文本中的作用。 + 对于从伴随文本中识别的每个实体,提取以下信息: + - entity_name:实体的名称,首字母大写 + - entity_type:以下类型之一:[{entity_types}] + - entity_summary:实体的属性与活动的全面总结 + 将每个实体格式化为("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. 从步骤1中识别的实体中,识别所有(源实体,目标实体)对,这些实体彼此之间*明显相关*。 + 对于每对相关的实体,提取以下信息: + - source_entity:步骤1中识别的源实体名称 + - target_entity:步骤1中识别的目标实体名称 + - relationship_summary:解释为什么你认为源实体和目标实体彼此相关 + 将每个关系格式化为("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. 以中文返回步骤1和2中识别出的所有实体和关系的输出列表。使用**{record_delimiter}**作为列表分隔符。 + +4. 
完成后,输出{completion_delimiter} + +################ +-示例- +################ +多模态数据块类型:image +多模态数据块唯一标识符:image-c71ef797e99af81047fbc7509609c765 +伴随文本:埃菲尔铁塔是法国巴黎的标志性结构,由古斯塔夫·埃菲尔设计并于1889年建成。它高324米,是世界上最高的建筑之一。埃菲尔铁塔位于塞纳河畔,吸引了数百万游客前来参观。它不仅是工程学的奇迹,也是法国文化的重要象征。 +################ +输出: +("entity"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"image"{tuple_delimiter}"这是一张展示法国巴黎标志性建筑的图像,主体为埃菲尔铁塔,呈现其324米高度的全貌与河畔景观,具有工程与文化双重象征意义"){record_delimiter} +("entity"{tuple_delimiter}"埃菲尔铁塔"{tuple_delimiter}"landmark"{tuple_delimiter}"埃菲尔铁塔是法国巴黎的标志性结构,由古斯塔夫·埃菲尔设计并于1889年建成,高324米,是世界上最高的建筑之一,位于塞纳河畔,吸引了数百万游客前来参观"){record_delimiter} +("entity"{tuple_delimiter}"法国巴黎"{tuple_delimiter}"location"{tuple_delimiter}"法国巴黎是法国的首都,以其丰富的历史文化遗产和作为埃菲尔铁塔所在地而闻名"){record_delimiter} +("entity"{tuple_delimiter}"古斯塔夫·埃菲尔"{tuple_delimiter}"person"{tuple_delimiter}"古斯塔夫·埃菲尔是法国著名的工程师,设计并建造了埃菲尔铁塔"){record_delimiter} +("entity"{tuple_delimiter}"塞纳河"{tuple_delimiter}"location"{tuple_delimiter}"塞纳河是流经法国巴黎的重要河流,埃菲尔铁塔位于其畔"){completion_delimiter} +("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"埃菲尔铁塔"{tuple_delimiter}"图像展示了埃菲尔铁塔这一标志性建筑"){record_delimiter} +("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"法国巴黎"{tuple_delimiter}"图像背景为法国巴黎,突显了埃菲尔铁塔的地理位置"){record_delimiter} +("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"古斯塔夫·埃菲尔"{tuple_delimiter}"图像中的埃菲尔铁塔是由古斯塔夫·埃菲尔设计的"){record_delimiter} +("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"塞纳河"{tuple_delimiter}"图像展示了埃菲尔铁塔位于塞纳河畔的景观"){completion_delimiter} +################ + +-真实数据- +多模态数据块类型: {chunk_type} +多模态数据块唯一标识符: {chunk_id} +伴随文本: {chunk_text} +################ +输出: +""" + + +MMKG_EXTRACTION_PROMPT: dict = { + "en": TEMPLATE_EN, + "zh": TEMPLATE_ZH, + "FORMAT": { + "tuple_delimiter": "<|>", + "record_delimiter": "##", + "completion_delimiter": "<|COMPLETE|>", + "entity_types": "concept, date, location, keyword, organization, person, event, work, nature, artificial, \ +science, technology, mission, gene", + }, +} diff --git a/graphgen/templates/kg/omics_kg_extraction.py b/graphgen/templates/kg/omics_kg_extraction.py new file mode 100644 index 00000000..d105dd38 --- /dev/null +++ b/graphgen/templates/kg/omics_kg_extraction.py @@ -0,0 +1,209 @@ +# pylint: disable=C0301 +TEMPLATE_EN: str = """You are a bioinformatics expert, skilled at analyzing biological sequences (DNA, RNA, protein) and their metadata to extract biological entities and their relationships. + +-Goal- +Given a biological sequence chunk (DNA, RNA, or protein) along with its metadata, identify all relevant biological entities and their relationships. +Use English as output language. + +-Steps- +1. Identify all biological entities. For each identified entity, extract the following information: +- entity_name: Name of the entity (gene name, protein name, RNA name, domain name, etc.), capitalized +- entity_type: One of the following types: [{entity_types}] +- entity_summary: Comprehensive summary of the entity's biological function, structure, or properties +Format each entity as ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *biologically related* to each other. 
+For each pair of related entities, extract the following information: +- source_entity: name of the source entity, as identified in step 1 +- target_entity: name of the target entity, as identified in step 1 +- relationship_summary: explanation of the biological relationship (e.g., encodes, transcribes, translates, interacts, regulates, homologous_to, located_in, etc.) +Format each relationship as ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. Identify high-level key words that summarize the main biological concepts, functions, or themes. +Format the content-level key words as ("content_keywords"{tuple_delimiter}) + +4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter. + +5. When finished, output {completion_delimiter} + +################ +-Examples- +################ +-Example 1- +Sequence Metadata: +################ +molecule_type: DNA +database: NCBI +id: NG_033923 +gene_name: BRCA1 +gene_description: BRCA1 DNA repair associated +organism: Homo sapiens +gene_type: protein-coding +chromosome: 17 +genomic_location: 43044295-43125483 +function: BRCA1 is a tumor suppressor gene involved in DNA repair +sequence_chunk: ATGCGATCGATCGATCG... (first 500bp of BRCA1 gene) +################ +Output: +("entity"{tuple_delimiter}"BRCA1"{tuple_delimiter}"gene"{tuple_delimiter}"BRCA1 is a protein-coding tumor suppressor gene located on chromosome 17 in humans, involved in DNA repair mechanisms."){record_delimiter} +("entity"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"organism"{tuple_delimiter}"Human species, the organism in which BRCA1 gene is found."){record_delimiter} +("entity"{tuple_delimiter}"chromosome 17"{tuple_delimiter}"location"{tuple_delimiter}"Chromosome 17 is the chromosomal location of the BRCA1 gene in humans."){record_delimiter} +("entity"{tuple_delimiter}"DNA repair"{tuple_delimiter}"biological_process"{tuple_delimiter}"DNA repair is a biological process in which BRCA1 is involved as a tumor suppressor."){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"BRCA1 is a gene found in Homo sapiens."){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"chromosome 17"{tuple_delimiter}"BRCA1 is located on chromosome 17 in the human genome."){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"DNA repair"{tuple_delimiter}"BRCA1 is involved in DNA repair processes as a tumor suppressor gene."){record_delimiter} +("content_keywords"{tuple_delimiter}"tumor suppressor, DNA repair, genetic disease, cancer genetics"){completion_delimiter} + +-Example 2- +Sequence Metadata: +################ +molecule_type: RNA +database: RNAcentral +id: URS0000000001 +rna_type: miRNA +description: hsa-let-7a-1 microRNA +organism: Homo sapiens +related_genes: ["LIN28", "HMGA2"] +sequence_chunk: CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG +################ +Output: +("entity"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"rna"{tuple_delimiter}"hsa-let-7a-1 is a microRNA (miRNA) found in Homo sapiens, involved in gene regulation."){record_delimiter} +("entity"{tuple_delimiter}"LIN28"{tuple_delimiter}"gene"{tuple_delimiter}"LIN28 is a gene 
related to hsa-let-7a-1 microRNA, involved in RNA processing and development."){record_delimiter} +("entity"{tuple_delimiter}"HMGA2"{tuple_delimiter}"gene"{tuple_delimiter}"HMGA2 is a gene related to hsa-let-7a-1 microRNA, involved in chromatin structure and gene expression."){record_delimiter} +("entity"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"organism"{tuple_delimiter}"Human species, the organism in which hsa-let-7a-1 is found."){record_delimiter} +("entity"{tuple_delimiter}"microRNA"{tuple_delimiter}"rna_type"{tuple_delimiter}"MicroRNA is a type of small non-coding RNA involved in post-transcriptional gene regulation."){record_delimiter} +("relationship"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"hsa-let-7a-1 is a microRNA found in Homo sapiens."){record_delimiter} +("relationship"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"LIN28"{tuple_delimiter}"hsa-let-7a-1 microRNA is related to LIN28 gene, potentially regulating its expression."){record_delimiter} +("relationship"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"HMGA2"{tuple_delimiter}"hsa-let-7a-1 microRNA is related to HMGA2 gene, potentially regulating its expression."){record_delimiter} +("relationship"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"microRNA"{tuple_delimiter}"hsa-let-7a-1 belongs to the microRNA class of RNA molecules."){record_delimiter} +("content_keywords"{tuple_delimiter}"microRNA, gene regulation, post-transcriptional control, RNA processing"){completion_delimiter} + +-Example 3- +Sequence Metadata: +################ +molecule_type: protein +database: UniProt +id: P01308 +protein_name: Insulin +organism: Homo sapiens +function: ["Regulates glucose metabolism", "Hormone signaling"] +sequence_chunk: MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN +################ +Output: +("entity"{tuple_delimiter}"Insulin"{tuple_delimiter}"protein"{tuple_delimiter}"Insulin is a protein hormone in Homo sapiens that regulates glucose metabolism and hormone signaling."){record_delimiter} +("entity"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"organism"{tuple_delimiter}"Human species, the organism in which Insulin is produced."){record_delimiter} +("entity"{tuple_delimiter}"glucose metabolism"{tuple_delimiter}"biological_process"{tuple_delimiter}"Glucose metabolism is a biological process regulated by Insulin."){record_delimiter} +("entity"{tuple_delimiter}"hormone signaling"{tuple_delimiter}"biological_process"{tuple_delimiter}"Hormone signaling is a biological process in which Insulin participates as a signaling molecule."){record_delimiter} +("relationship"{tuple_delimiter}"Insulin"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"Insulin is a protein produced in Homo sapiens."){record_delimiter} +("relationship"{tuple_delimiter}"Insulin"{tuple_delimiter}"glucose metabolism"{tuple_delimiter}"Insulin regulates glucose metabolism in the body."){record_delimiter} +("relationship"{tuple_delimiter}"Insulin"{tuple_delimiter}"hormone signaling"{tuple_delimiter}"Insulin participates in hormone signaling pathways."){record_delimiter} +("content_keywords"{tuple_delimiter}"hormone, metabolism, glucose regulation, signaling pathway"){completion_delimiter} + +################ +-Real Data- +################ +Entity_types: {entity_types} +Sequence Metadata: {metadata_text} +Sequence Chunk: {sequence_chunk} +################ +Output: +""" + + +TEMPLATE_ZH: str = """你是一个生物信息学专家,擅长分析生物序列(DNA、RNA、蛋白质)及其元数据,提取生物实体及其关系。 + +-目标- 
+给定一个生物序列片段(DNA、RNA或蛋白质)及其元数据,识别所有相关的生物实体及其关系。 +使用中文作为输出语言。 + +-步骤- +1. 识别所有生物实体。对于每个识别的实体,提取以下信息: + - entity_name:实体的名称(基因名、蛋白质名、RNA名、功能域名等),首字母大写 + - entity_type:以下类型之一:[{entity_types}] + - entity_summary:实体生物学功能、结构或属性的全面总结 + 将每个实体格式化为("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. 从步骤1中识别的实体中,识别所有(源实体,目标实体)对,这些实体彼此之间*在生物学上相关*。 + 对于每对相关的实体,提取以下信息: + - source_entity:步骤1中识别的源实体名称 + - target_entity:步骤1中识别的目标实体名称 + - relationship_summary:生物学关系的解释(例如:编码、转录、翻译、相互作用、调控、同源、位于等) + 将每个关系格式化为("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. 识别总结主要生物学概念、功能或主题的高级关键词。 + 将内容级关键词格式化为("content_keywords"{tuple_delimiter}) + +4. 以中文返回步骤1和2中识别出的所有实体和关系的输出列表。使用**{record_delimiter}**作为列表分隔符。 + +5. 完成后,输出{completion_delimiter} + +################ +-示例- +################ +-示例 1- +序列元数据: +################ +molecule_type: DNA +database: NCBI +id: NG_033923 +gene_name: BRCA1 +gene_description: BRCA1 DNA repair associated +organism: Homo sapiens +gene_type: protein-coding +chromosome: 17 +genomic_location: 43044295-43125483 +function: BRCA1 is a tumor suppressor gene involved in DNA repair +sequence_chunk: ATGCGATCGATCGATCG... (BRCA1基因的前500bp) +################ +输出: +("entity"{tuple_delimiter}"BRCA1"{tuple_delimiter}"gene"{tuple_delimiter}"BRCA1是位于人类17号染色体上的蛋白质编码肿瘤抑制基因,参与DNA修复机制。"){record_delimiter} +("entity"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"organism"{tuple_delimiter}"人类,BRCA1基因所在的生物体。"){record_delimiter} +("entity"{tuple_delimiter}"17号染色体"{tuple_delimiter}"location"{tuple_delimiter}"17号染色体是BRCA1基因在人类基因组中的位置。"){record_delimiter} +("entity"{tuple_delimiter}"DNA修复"{tuple_delimiter}"biological_process"{tuple_delimiter}"DNA修复是BRCA1作为肿瘤抑制基因参与的生物学过程。"){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"BRCA1是在人类中发现的基因。"){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"17号染色体"{tuple_delimiter}"BRCA1位于人类基因组的17号染色体上。"){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"DNA修复"{tuple_delimiter}"BRCA1作为肿瘤抑制基因参与DNA修复过程。"){record_delimiter} +("content_keywords"{tuple_delimiter}"肿瘤抑制, DNA修复, 遗传疾病, 癌症遗传学"){completion_delimiter} + +################ +-真实数据- +################ +实体类型:{entity_types} +序列元数据:{metadata_text} +序列片段:{sequence_chunk} +################ +输出: +""" + + +CONTINUE_EN: str = """MANY entities and relationships were missed in the last extraction. \ +Add them below using the same format: +""" + +CONTINUE_ZH: str = """很多实体和关系在上一次的提取中可能被遗漏了。请在下面使用相同的格式添加它们:""" + +IF_LOOP_EN: str = """It appears some entities and relationships may have still been missed. \ +Answer YES | NO if there are still entities and relationships that need to be added. 
+""" + +IF_LOOP_ZH: str = """看起来可能仍然遗漏了一些实体和关系。如果仍有实体和关系需要添加,请回答YES | NO。""" + +OMICS_KG_EXTRACTION_PROMPT: dict = { + "en": { + "TEMPLATE": TEMPLATE_EN, + "CONTINUE": CONTINUE_EN, + "IF_LOOP": IF_LOOP_EN, + }, + "zh": { + "TEMPLATE": TEMPLATE_ZH, + "CONTINUE": CONTINUE_ZH, + "IF_LOOP": IF_LOOP_ZH, + }, + "FORMAT": { + "tuple_delimiter": "<|>", + "record_delimiter": "##", + "completion_delimiter": "<|COMPLETE|>", + "entity_types": "gene, rna, protein, organism, location, biological_process, rna_type, protein_domain, \ +mutation, pathway, disease, function, structure", + }, +} diff --git a/graphgen/templates/kg/protein_kg_extraction.py b/graphgen/templates/kg/protein_kg_extraction.py new file mode 100644 index 00000000..16e9c868 --- /dev/null +++ b/graphgen/templates/kg/protein_kg_extraction.py @@ -0,0 +1,144 @@ +# pylint: disable=C0301 +TEMPLATE_EN: str = """You are an expert in protein science and knowledge-graph construction. +Your task is to extract a star-shaped knowledge graph centered on **a single protein** mentioned in the given text. + +-Goal- +Given free-text that discusses one or more proteins, identify: +1. The **central protein** (the first-mentioned protein or the protein explicitly indicated by the user). +2. All entities that are **directly related** to this central protein. +3. All relationships that **directly link** those entities to the central protein (star edges). + +Use English as the output language. Please provide only the result without any extra explanations. + +-Steps- +1. Identify the **central protein entity** and all **directly-related entities** from the text. + For the **central protein**, extract: + - entity_name: use the full name or UniProt ID if given; capitalized. + - entity_type: always `protein`. + - entity_summary: concise description of its main biological role, location, or significance in the text. + + For each **directly-related entity**, extract: + - entity_name: capitalized. + - entity_type: one of [{entity_types}]. + - entity_summary: comprehensive summary of its attributes/activities **as stated in the text**. + + Format each entity as + ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. From the entities found in Step 1, list every **(central protein → related entity)** pair that is **clearly related**. + For each pair extract: + - source_entity: the **central protein** name. + - target_entity: the related entity name. + - relationship_summary: short explanation of how the central protein is connected to this entity **according to the text**. + + Format each relationship as + ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. Output a single list of all entities and relationships from Steps 1–2, using **{record_delimiter}** as the delimiter. + +4. Finish by printing {completion_delimiter} + +################ +-Example- +################ +Text: +################ +The tumor-suppressor protein p53 is a transcription factor that responds to DNA damage. +Phosphorylation of p53 by ATM kinase at serine-15 enhances its stability. +MDM2, an E3 ubiquitin ligase, negatively regulates p53 via ubiquitination. 
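The expected output below uses the shared delimiter settings (tuple `<|>`, record `##`, completion `<|COMPLETE|>`) defined in the FORMAT dicts of these extraction prompts. As a rough sketch of how such output can be split back into attribute lists: `split_string_by_multi_markers` is the helper this PR re-exports from `graphgen.utils`, while `parse_records` itself is illustrative and not part of the diff.

```python
from graphgen.utils import split_string_by_multi_markers

TUPLE_DELIM, RECORD_DELIM, COMPLETION_DELIM = "<|>", "##", "<|COMPLETE|>"


def parse_records(raw_output: str) -> list[list[str]]:
    """Turn delimiter-formatted LLM output into attribute lists, e.g.
    ['relationship', 'p53', 'MDM2', 'MDM2 ubiquitinates p53, ...']."""
    raw_output = raw_output.replace(COMPLETION_DELIM, "")
    records = []
    for record in split_string_by_multi_markers(raw_output, [RECORD_DELIM]):
        record = record.strip().strip("()")  # drop the surrounding parentheses
        attrs = split_string_by_multi_markers(record, [TUPLE_DELIM])
        records.append([attr.strip().strip('"') for attr in attrs])
    return records
```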
+################ +Output: +("entity"{tuple_delimiter}"p53"{tuple_delimiter}"protein"{tuple_delimiter}"Tumor-suppressor transcription factor that responds to DNA damage and is regulated by post-translational modifications."){record_delimiter} +("entity"{tuple_delimiter}"ATM kinase"{tuple_delimiter}"protein"{tuple_delimiter}"Protein kinase that phosphorylates p53 at serine-15, thereby enhancing p53 stability."){record_delimiter} +("entity"{tuple_delimiter}"serine-15"{tuple_delimiter}"site"{tuple_delimiter}"Phosphorylation site on p53 that is targeted by ATM kinase."){record_delimiter} +("entity"{tuple_delimiter}"MDM2"{tuple_delimiter}"protein"{tuple_delimiter}"E3 ubiquitin ligase that negatively regulates p53 through ubiquitination."){record_delimiter} +("entity"{tuple_delimiter}"DNA damage"{tuple_delimiter}"concept"{tuple_delimiter}"Cellular stress signal that activates p53-mediated transcriptional response."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"ATM kinase"{tuple_delimiter}"ATM kinase phosphorylates p53, enhancing its stability."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"serine-15"{tuple_delimiter}"p53 is phosphorylated at serine-15 by ATM kinase."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"MDM2"{tuple_delimiter}"MDM2 ubiquitinates p53, negatively regulating its activity."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"DNA damage"{tuple_delimiter}"p53 acts as a sensor-transcription factor in response to DNA damage."){completion_delimiter} + +################ +-Real Data- +Entity_types: {entity_types} +Text: {input_text} +################ +Output: +""" + + +TEMPLATE_ZH: str = """您是蛋白质科学与知识图谱构建专家。 +任务:从给定文本中抽取以**一个中心蛋白质**为核心的星型知识图谱。 + +-目标- +文本可能提及一个或多个蛋白质,请: +1. 确定**中心蛋白质**(文本首个提及或用户指定的蛋白)。 +2. 识别所有与中心蛋白**直接相关**的实体。 +3. 仅保留**中心蛋白→相关实体**的直接关系(星型边)。 + +使用中文输出。请直接给出结果,不要增加额外说明。 + +-步骤- +1. 确定**中心蛋白质实体**及所有**直接相关实体**。 + 对于**中心蛋白质**: + - entity_name:全名或UniProt ID,首字母大写。 + - entity_type:固定为`protein`。 + - entity_summary:简述其在文中的生物学功能、定位或意义。 + + 对于每个**直接相关实体**: + - entity_name:首字母大写。 + - entity_type:可选类型[{entity_types}]。 + - entity_summary:全面总结其在文中与中心蛋白相关的属性/活动。 + + 格式:("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. 在步骤1的实体中,列出所有**(中心蛋白→相关实体)**的明显关系对。 + 每对提取: + - source_entity:中心蛋白名称。 + - target_entity:相关实体名称。 + - relationship_summary:简要说明文中二者如何直接关联。 + + 格式:("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. 将步骤1–2的所有实体与关系合并为单列表,用**{record_delimiter}**分隔。 + +4. 
输出结束标记{completion_delimiter} + +################ +-示例- +################ +文本: +################ +肿瘤抑制蛋白p53是一种转录因子,可响应DNA损伤。ATM激酶在第15位丝氨酸磷酸化p53,增强其稳定性。E3泛素连接酶MDM2通过泛素化负调控p53。 +################ +输出: +("entity"{tuple_delimiter}"p53"{tuple_delimiter}"protein"{tuple_delimiter}"肿瘤抑制转录因子,能感知DNA损伤并通过翻译后修饰被调控。"){record_delimiter} +("entity"{tuple_delimiter}"ATM激酶"{tuple_delimiter}"protein"{tuple_delimiter}"蛋白激酶,在丝氨酸-15位点磷酸化p53,从而提高其稳定性。"){record_delimiter} +("entity"{tuple_delimiter}"丝氨酸-15"{tuple_delimiter}"site"{tuple_delimiter}"p53上被ATM激酶靶向的磷酸化位点。"){record_delimiter} +("entity"{tuple_delimiter}"MDM2"{tuple_delimiter}"protein"{tuple_delimiter}"E3泛素连接酶,通过泛素化负调控p53。"){record_delimiter} +("entity"{tuple_delimiter}"DNA损伤"{tuple_delimiter}"concept"{tuple_delimiter}"细胞内应激信号,可激活p53介导的转录应答。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"ATM激酶"{tuple_delimiter}"ATM激酶磷酸化p53,增强其稳定性。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"丝氨酸-15"{tuple_delimiter}"p53在该位点被ATM激酶磷酸化。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"MDM2"{tuple_delimiter}"MDM2对p53进行泛素化,负向调控其活性。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"DNA损伤"{tuple_delimiter}"p53作为感受器-转录因子响应DNA损伤。"){completion_delimiter} + +################ +-真实数据- +实体类型:{entity_types} +文本:{input_text} +################ +输出: +""" + + +PROTEIN_KG_EXTRACTION_PROMPT: dict = { + "en": TEMPLATE_EN, + "zh": TEMPLATE_ZH, + "FORMAT": { + "tuple_delimiter": "<|>", + "record_delimiter": "##", + "completion_delimiter": "<|COMPLETE|>", + "entity_types": "protein, gene, site, modification, pathway, disease, drug, organism, tissue, cell_line, " + "experiment, technology, concept, location, organization, person, mission, science", + }, +} diff --git a/graphgen/templates/question_generation.py b/graphgen/templates/question_generation.py index d9ca9128..e75bf169 100644 --- a/graphgen/templates/question_generation.py +++ b/graphgen/templates/question_generation.py @@ -1,47 +1,5 @@ # pylint: disable=C0301 -TEMPLATE_SINGLE_EN: str = """The answer to a question is provided. Please generate a question that corresponds to the answer. -################ -Answer: -{answer} -################ -Question: -""" - -TEMPLATE_SINGLE_ZH: str = """下面提供了一个问题的答案,请生成一个与答案对应的问题。 - -################ -答案: -{answer} -################ -问题: -""" - -TEMPLATE_SINGLE_QA_EN: str = """You are given a text passage. Your task is to generate a question and answer (QA) pair based on the content of that text. -The answer should be accurate and directly derived from the text. Make sure the QA pair is relevant to the main theme or important details of the given text. -For example: -Question: What is the effect of overexpressing the BG1 gene on grain size and development? -Answer: Overexpression of the BG1 gene leads to significantly increased grain size, demonstrating its role in grain development. - -Question: What role does TAC4 play in the gravitropism of rice shoots? -Answer: TAC4 is a key regulator of gravitropism in rice shoots, promoting the bending of shoots towards the gravity vector. - -Here is the text passage you need to generate a QA pair for: -{doc} -""" - -TEMPLATE_SINGLE_QA_ZH: str = """给定一个文本段落。你的任务是根据该文本的内容生成一个问答(QA)对。 -答案应准确且直接从文本中得出。确保QA对与给定文本的主题或重要细节相关。 -例如: -问题:过表达BG1基因对谷粒大小和发育有什么影响? -答案:BG1基因的过表达显著增加了谷粒大小,表明其在谷物发育中的作用。 - -问题:TAC4在水稻茎的重力性状中扮演什么角色? 
-答案:TAC4是水稻茎重力性状的关键调节因子,促进茎向重力矢量弯曲。 - -以下是你需要为其生成QA对的文本段落: -{doc} -""" # TODO: 修改这里的prompt TEMPLATE_MULTI_EN = """You are an assistant to help read a article and then rephrase it in a question answering format. The user will provide you with an article with its content. You need to generate a paraphrase of the same article in question and answer format with one tag of "Question: ..." followed by "Answer: ...". Remember to keep the meaning and every content of the article intact. @@ -66,13 +24,9 @@ QUESTION_GENERATION_PROMPT = { "English": { - "SINGLE_TEMPLATE": TEMPLATE_SINGLE_EN, - "SINGLE_QA_TEMPLATE": TEMPLATE_SINGLE_QA_EN, - "MULTI_TEMPLATE": TEMPLATE_MULTI_EN + "MULTI_TEMPLATE": TEMPLATE_MULTI_EN, }, "Chinese": { - "SINGLE_TEMPLATE": TEMPLATE_SINGLE_ZH, - "SINGLE_QA_TEMPLATE": TEMPLATE_SINGLE_QA_ZH, - "MULTI_TEMPLATE": TEMPLATE_MULTI_ZH - } + "MULTI_TEMPLATE": TEMPLATE_MULTI_ZH, + }, } diff --git a/graphgen/templates/search_judgement.py b/graphgen/templates/search_judgement.py index ca9e7e12..06837ac8 100644 --- a/graphgen/templates/search_judgement.py +++ b/graphgen/templates/search_judgement.py @@ -1,23 +1,23 @@ # pylint: disable=C0301 TEMPLATE: str = """-Goal- -Please select the most relevant search result for the given entity. -The name and description of the entity are provided. The search results are provided as a list. -Please select the most relevant search result from the list. If none of the search results are relevant, please select 'None of the above'. +Please select the most relevant searcher result for the given entity. +The name and description of the entity are provided. The searcher results are provided as a list. +Please select the most relevant searcher result from the list. If none of the searcher results are relevant, please select 'None of the above'. Steps: 1. Read the name and description of the entity. -2. Read the search results. For each search result, compare it with the entity name and description to determine if it is relevant. +2. Read the searcher results. For each searcher result, compare it with the entity name and description to determine if it is relevant. -3. Select the most relevant search result from the list. If none of the search results are relevant, select 'None of the above'. +3. Select the most relevant searcher result from the list. If none of the searcher results are relevant, select 'None of the above'. 4. Output your selection directly, please do not provide any additional information. 
################ -Examples- ################ -{examples} +{input_examples} ################ -Real Data- diff --git a/graphgen/utils/__init__.py b/graphgen/utils/__init__.py index 932f8df1..ec118816 100644 --- a/graphgen/utils/__init__.py +++ b/graphgen/utils/__init__.py @@ -1,9 +1,22 @@ -from .log import logger, set_logger, parse_log -from .loop import create_event_loop -from .format import (pack_history_conversations, split_string_by_multi_markers, - handle_single_entity_extraction, handle_single_relationship_extraction, - load_json, write_json) -from .hash import compute_content_hash, compute_args_hash -from .detect_lang import detect_main_language, detect_if_chinese from .calculate_confidence import yes_no_loss_entropy +from .detect_lang import detect_if_chinese, detect_main_language +from .device import pick_device +from .format import ( + handle_single_entity_extraction, + handle_single_relationship_extraction, + load_json, + pack_history_conversations, + split_string_by_multi_markers, + write_json, +) +from .hash import ( + compute_args_hash, + compute_content_hash, + compute_dict_hash, + compute_mm_hash, +) from .help_nltk import NLTKHelper +from .log import CURRENT_LOGGER_VAR, logger, set_logger +from .loop import create_event_loop +from .run_concurrent import run_concurrent +from .wrap import async_to_sync_method diff --git a/graphgen/utils/calculate_confidence.py b/graphgen/utils/calculate_confidence.py index 1b596d91..9a055cc2 100644 --- a/graphgen/utils/calculate_confidence.py +++ b/graphgen/utils/calculate_confidence.py @@ -1,34 +1,41 @@ import math -from typing import List -from graphgen.models.llm.topk_token_model import Token +from typing import Dict, List + +from graphgen.bases.datatypes import Token + def preprocess_tokens(tokens: List[Token]) -> List[Token]: """Preprocess tokens for calculating confidence.""" tokens = [x for x in tokens if x.prob > 0] return tokens + def joint_probability(tokens: List[Token]) -> float: """Calculate joint probability of a list of tokens.""" tokens = preprocess_tokens(tokens) logprob_sum = sum(x.logprob for x in tokens) return math.exp(logprob_sum / len(tokens)) + def min_prob(tokens: List[Token]) -> float: """Calculate the minimum probability of a list of tokens.""" tokens = preprocess_tokens(tokens) return min(x.prob for x in tokens) + def average_prob(tokens: List[Token]) -> float: """Calculate the average probability of a list of tokens.""" tokens = preprocess_tokens(tokens) return sum(x.prob for x in tokens) / len(tokens) + def average_confidence(tokens: List[Token]) -> float: """Calculate the average confidence of a list of tokens.""" tokens = preprocess_tokens(tokens) confidence = [x.prob / sum(y.prob for y in x.top_candidates[:5]) for x in tokens] return sum(confidence) / len(tokens) + def yes_no_loss(tokens_list: List[List[Token]], ground_truth: List[str]) -> float: """Calculate the loss for yes/no question.""" losses = [] @@ -41,14 +48,112 @@ def yes_no_loss(tokens_list: List[List[Token]], ground_truth: List[str]) -> floa losses.append(token.prob) return sum(losses) / len(losses) -def yes_no_loss_entropy(tokens_list: List[List[Token]], ground_truth: List[str]) -> float: + +def _normalize_yes_no(tokens: List[Token]) -> Dict[str, float]: + """ + Mapping yes/no synonyms to their probabilities and normalizing. 
+ For example, given tokens with probabilities: + - "yes" (0.6) + - "yeah" (0.2) + - "no" (0.1) + - "nope" (0.1) + The function will return: + {"yes": 0.8, "no": 0.2} + Among them, "yes" and "yeah" are synonyms for "yes", + while "no" and "nope" are synonyms for "no". + If no "yes" or "no" synonyms are present, it will be judged as uncertain. + An uncertain result will also be considered as opposite to the ground truth. + """ + yes_syno = { + # English yes synonyms + "yes", + "yeah", + "yea", + "yep", + "yup", + "yay", + "ya", + "yah", + "sure", + "certainly", + "absolutely", + "definitely", + "exactly", + "indeed", + "right", + "correct", + "true", + "t", + "1", + # Chinese yes synonyms + "是", + "对", + "好的", + "行", + "可以", + "没错", + "当然", + "确实", + "正确", + "真", + "对的", + } + no_syno = { + # English no synonyms + "no", + "nope", + "nop", + "nah", + "naw", + "na", + "negative", + "never", + "not", + "false", + "f", + "0", + # Chinese no synonyms + "不", + "不是", + "没有", + "错", + "不对", + "不行", + "不能", + "否", + "假的", + } + + yes_prob = 0.0 + no_prob = 0.0 + uncertain_prob = 0.0 + for tok in tokens: + t = tok.text.lower().strip() + if t in yes_syno: + yes_prob += tok.prob + elif t in no_syno: + no_prob += tok.prob + else: + uncertain_prob += tok.prob + + total = yes_prob + no_prob + uncertain_prob + + return { + "yes": yes_prob / total, + "no": no_prob / total, + "uncertain": uncertain_prob / total, + } + + +def yes_no_loss_entropy( + tokens_list: List[List[Token]], ground_truth: List[str] +) -> float: """Calculate the loss for yes/no question using entropy.""" losses = [] - for i, tokens in enumerate(tokens_list): - token = tokens[0] - assert token.text.lower() in ["yes", "no"] - if token.text == ground_truth[i]: - losses.append(-math.log(token.prob)) - else: - losses.append(-math.log(1 - token.prob)) + for toks, gt in zip(tokens_list, ground_truth): + dist = _normalize_yes_no(toks) + gt = gt.lower() + assert gt in {"yes", "no"} + prob_correct = dist[gt] + losses.append(-math.log(prob_correct)) return sum(losses) / len(losses) diff --git a/graphgen/utils/detect_lang.py b/graphgen/utils/detect_lang.py index c34ddac4..944ccb2a 100644 --- a/graphgen/utils/detect_lang.py +++ b/graphgen/utils/detect_lang.py @@ -1,40 +1,41 @@ def detect_main_language(text): """ - 识别文本的主要语言 + Detect the main language of the text, 'zh' for Chinese, 'en' for English :param text: :return: """ assert isinstance(text, str) + def is_chinese_char(char): - return '\u4e00' <= char <= '\u9fff' + return "\u4e00" <= char <= "\u9fff" def is_english_char(char): return char.isascii() and char.isalpha() - # 去除空格和标点符号 - text = ''.join(char for char in text if char.strip()) + text = "".join(char for char in text if char.strip()) chinese_count = sum(1 for char in text if is_chinese_char(char)) english_count = sum(1 for char in text if is_english_char(char)) total = chinese_count + english_count if total == 0: - return 'en' + return "en" chinese_ratio = chinese_count / total if chinese_ratio >= 0.5: - return 'zh' - return 'en' + return "zh" + return "en" + def detect_if_chinese(text): """ - 判断文本是否包含有中文 + Detect if the text contains any Chinese characters :param text: :return: """ assert isinstance(text, str) - return any('\u4e00' <= char <= '\u9fff' for char in text) + return any("\u4e00" <= char <= "\u9fff" for char in text) diff --git a/graphgen/utils/device.py b/graphgen/utils/device.py new file mode 100644 index 00000000..1e5d8642 --- /dev/null +++ b/graphgen/utils/device.py @@ -0,0 +1,44 @@ +import shutil +import subprocess +import 
sys + + +def pick_device() -> str: + """Return the best available device string for MinerU.""" + # 1. NVIDIA GPU + if shutil.which("nvidia-smi") is not None: + try: + # check if there's any free GPU memory + out = subprocess.check_output( + [ + "nvidia-smi", + "--query-gpu=memory.free", + "--format=csv,noheader,nounits", + ], + text=True, + ) + if any(int(line) > 0 for line in out.strip().splitlines()): + return "cuda:0" + except Exception: # pylint: disable=broad-except + pass + + # 2. Apple Silicon + if sys.platform == "darwin" and shutil.which("sysctl"): + try: + brand = subprocess.check_output( + ["sysctl", "-n", "machdep.cpu.brand_string"], text=True + ) + if "Apple" in brand: + return "mps" + except Exception: # pylint: disable=broad-except + pass + + # 3. Ascend NPU + if shutil.which("npu-smi") is not None: + try: + subprocess.check_call(["npu-smi", "info"], stdout=subprocess.DEVNULL) + return "npu" + except Exception: # pylint: disable=broad-except + pass + + return "cpu" diff --git a/graphgen/utils/format.py b/graphgen/utils/format.py index 0a0c101d..1f0675f1 100644 --- a/graphgen/utils/format.py +++ b/graphgen/utils/format.py @@ -1,16 +1,17 @@ -import re -import os -import json import html - +import json +import os +import re from typing import Any + def pack_history_conversations(*args: str): roles = ["user", "assistant"] return [ {"role": roles[i % 2], "content": content} for i, content in enumerate(args) ] + def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]: """Split a string by multiple markers""" if not markers: @@ -18,6 +19,7 @@ def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str] results = re.split("|".join(re.escape(marker) for marker in markers), content) return [r.strip() for r in results if r.strip()] + # Refer the utils functions of the official GraphRAG implementation: # https://github.com/microsoft/graphrag def clean_str(input: Any) -> str: @@ -30,6 +32,7 @@ def clean_str(input: Any) -> str: # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result) + async def handle_single_entity_extraction( record_attributes: list[str], chunk_key: str, @@ -50,9 +53,11 @@ async def handle_single_entity_extraction( "source_id": entity_source_id, } + def is_float_regex(value): return bool(re.match(r"^[-+]?[0-9]*\.?[0-9]+$", value)) + async def handle_single_relationship_extraction( record_attributes: list[str], chunk_key: str, @@ -72,12 +77,14 @@ async def handle_single_relationship_extraction( "source_id": edge_source_id, } + def load_json(file_name): if not os.path.exists(file_name): return None with open(file_name, encoding="utf-8") as f: return json.load(f) + def write_json(json_obj, file_name): if not os.path.exists(os.path.dirname(file_name)): os.makedirs(os.path.dirname(file_name), exist_ok=True) diff --git a/graphgen/utils/hash.py b/graphgen/utils/hash.py index bf93ec5f..04ba96e7 100644 --- a/graphgen/utils/hash.py +++ b/graphgen/utils/hash.py @@ -1,7 +1,28 @@ from hashlib import md5 + def compute_args_hash(*args): return md5(str(args).encode()).hexdigest() + def compute_content_hash(content, prefix: str = ""): return prefix + md5(content.encode()).hexdigest() + + +def compute_mm_hash(item, prefix: str = ""): + if item.get("type") == "text" and item.get("text"): + content = item["text"].strip() + elif item.get("type") == "image" and item.get("img_path"): + content = f"image:{item['img_path']}" + elif item.get("type") == "table" 
and item.get("table_body"): + content = f"table:{item['table_body']}" + elif item.get("type") == "equation" and item.get("text"): + content = f"equation:{item['text']}" + else: + content = str(item) + return prefix + md5(content.encode()).hexdigest() + + +def compute_dict_hash(d: dict, prefix: str = ""): + items = tuple(sorted(d.items())) + return prefix + md5(str(items).encode()).hexdigest() diff --git a/graphgen/utils/log.py b/graphgen/utils/log.py index 32b9bac6..e29e994e 100644 --- a/graphgen/utils/log.py +++ b/graphgen/utils/log.py @@ -1,32 +1,101 @@ +import contextvars import logging +import os +from logging.handlers import RotatingFileHandler +from typing import Any -logger = logging.getLogger("graphgen") +from rich.logging import RichHandler -def set_logger(log_file: str, log_level: int = logging.INFO, if_stream: bool = True): - logger.setLevel(log_level) - formatter = logging.Formatter( - "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - ) +def set_logger( + log_file: str, + name: str, + file_level: int = logging.DEBUG, + console_level: int = logging.INFO, + *, + if_stream: bool = True, + max_bytes: int = 50 * 1024 * 1024, # 50 MB + backup_count: int = 5, + force: bool = False, +): + + current_logger = logging.getLogger(name) + if current_logger.hasHandlers() and not force: + return current_logger + + if force: + current_logger.handlers.clear() - file_handler = logging.FileHandler(log_file, mode='w') - file_handler.setLevel(log_level) - file_handler.setFormatter(formatter) + current_logger.setLevel( + min(file_level, console_level) + ) # Set to the lowest level to capture all logs + current_logger.propagate = False - stream_handler = None + if log_file: + os.makedirs(os.path.dirname(log_file), exist_ok=True) if if_stream: - stream_handler = logging.StreamHandler() - stream_handler.setLevel(log_level) - stream_handler.setFormatter(formatter) + console = RichHandler( + level=console_level, show_path=False, rich_tracebacks=True + ) + console.setFormatter(logging.Formatter("%(message)s")) + current_logger.addHandler(console) + + file_handler = RotatingFileHandler( + log_file, + maxBytes=max_bytes, + backupCount=backup_count, + encoding="utf-8", + ) + file_handler.setLevel(file_level) + file_handler.setFormatter( + logging.Formatter( + "[%(asctime)s] %(levelname)s [%(name)s:%(filename)s:%(lineno)d] %(message)s", + datefmt="%y-%m-%d %H:%M:%S", + ) + ) + current_logger.addHandler(file_handler) + return current_logger + + +CURRENT_LOGGER_VAR = contextvars.ContextVar("current_logger") + + +def get_current_logger() -> logging.Logger: + current_logger = CURRENT_LOGGER_VAR.get() + if not current_logger: + raise RuntimeError("No logger is set in the current context.") + return current_logger + + +class ContextAwareLogger: + @staticmethod + def _get_logger() -> logging.Logger: + return get_current_logger() + + def debug(self, msg: object, *args: Any, **kwargs: Any) -> None: + self._get_logger().debug(msg, *args, **kwargs) + + def info(self, msg: object, *args: Any, **kwargs: Any) -> None: + self._get_logger().info(msg, *args, **kwargs) + + def warning(self, msg: object, *args: Any, **kwargs: Any) -> None: + self._get_logger().warning(msg, *args, **kwargs) + + def error(self, msg: object, *args: Any, **kwargs: Any) -> None: + self._get_logger().error(msg, *args, **kwargs) + + def exception(self, msg: object, *args: Any, **kwargs: Any) -> None: + self._get_logger().exception(msg, *args, **kwargs) + + def critical(self, msg: object, *args: Any, **kwargs: Any) -> None: + 
self._get_logger().critical(msg, *args, **kwargs) + + def log(self, level: int, msg: object, *args: Any, **kwargs: Any) -> None: + self._get_logger().log(level, msg, *args, **kwargs) - if not logger.handlers: - logger.addHandler(file_handler) - if if_stream and stream_handler: - logger.addHandler(stream_handler) + def __getattr__(self, name: str) -> Any: + return getattr(self._get_logger(), name) -def parse_log(log_file: str): - with open(log_file, "r", encoding='utf-8') as f: - lines = f.readlines() - return lines +logger = ContextAwareLogger() diff --git a/graphgen/utils/loop.py b/graphgen/utils/loop.py index 5f12fa5b..f0ab7dfd 100644 --- a/graphgen/utils/loop.py +++ b/graphgen/utils/loop.py @@ -1,9 +1,10 @@ import asyncio +from typing import Tuple from .log import logger -def create_event_loop() -> asyncio.AbstractEventLoop: +def create_event_loop() -> Tuple[asyncio.AbstractEventLoop, bool]: """ Ensure that there is always an event loop available. @@ -11,18 +12,25 @@ def create_event_loop() -> asyncio.AbstractEventLoop: it creates a new event loop and sets it as the current event loop. Returns: - asyncio.AbstractEventLoop: The current or newly created event loop. + Tuple[asyncio.AbstractEventLoop, bool]: The event loop and a flag + indicating if we created it (True) or it was already running (False). """ try: - # Try to get the current event loop - current_loop = asyncio.get_event_loop() - if current_loop.is_closed(): - raise RuntimeError("Event loop is closed.") - return current_loop - + # Try to get the running event loop (Python 3.7+) + running_loop = asyncio.get_running_loop() + # If we get here, there's already a running loop + return running_loop, False except RuntimeError: - # If no event loop exists or it is closed, create a new one - logger.info("Creating a new event loop in main thread.") - new_loop = asyncio.new_event_loop() - asyncio.set_event_loop(new_loop) - return new_loop + # No running loop, try to get the current event loop + try: + current_loop = asyncio.get_event_loop() + if current_loop.is_closed(): + raise RuntimeError("Event loop is closed.") from None + # Loop exists but not running, we can use it + return current_loop, False + except RuntimeError: + # No event loop exists, create a new one + logger.info("Creating a new event loop in main thread.") + new_loop = asyncio.new_event_loop() + asyncio.set_event_loop(new_loop) + return new_loop, True diff --git a/graphgen/utils/run_concurrent.py b/graphgen/utils/run_concurrent.py new file mode 100644 index 00000000..8315b953 --- /dev/null +++ b/graphgen/utils/run_concurrent.py @@ -0,0 +1,122 @@ +import asyncio +from typing import Awaitable, Callable, List, Optional, TypeVar, Union + +import gradio as gr +from tqdm.asyncio import tqdm as tqdm_async + +from graphgen.utils.log import logger + +from .loop import create_event_loop + +T = TypeVar("T") +R = TypeVar("R") + + +def run_concurrent( + coro_fn: Callable[[T], Awaitable[R]], + items: List[T], + *, + desc: str = "processing", + unit: str = "item", + progress_bar: Optional[gr.Progress] = None, + save_interval: int = 0, + save_callback: Optional[Callable[[List[R], int], None]] = None, + max_concurrent: Optional[int] = None, +) -> Union[List[R], Awaitable[List[R]]]: + """ + Run coroutines concurrently with optional periodic saving. 
+ This function can be used in both sync and async contexts: + - In sync context: returns List[R] directly + - In async context: returns Awaitable[List[R]] (use with 'await') + :return: List of results (in sync context) or coroutine (in async context) + """ + async def _run_all(): + if not items: + return [] + # Use semaphore to limit concurrent tasks if max_concurrent is specified + semaphore = asyncio.Semaphore(max_concurrent) if max_concurrent is not None and max_concurrent > 0 else None + + async def run_with_semaphore(item: T) -> R: + """Wrapper to apply semaphore if needed.""" + if semaphore: + async with semaphore: + return await coro_fn(item) + else: + return await coro_fn(item) + + # Create tasks with concurrency limit + if max_concurrent is not None and max_concurrent > 0: + # Use semaphore-controlled wrapper + tasks = [asyncio.create_task(run_with_semaphore(it)) for it in items] + else: + # Original behavior: create all tasks at once + tasks = [asyncio.create_task(coro_fn(it)) for it in items] + + completed_count = 0 + results = [] + pending_save_results = [] + pbar = tqdm_async(total=len(items), desc=desc, unit=unit) + + for future in asyncio.as_completed(tasks): + try: + result = await future + results.append(result) + if save_interval > 0 and save_callback is not None: + pending_save_results.append(result) + except Exception as e: # pylint: disable=broad-except + logger.exception("Task failed: %s", e) + # even if failed, record it to keep results consistent with tasks + results.append(e) + + completed_count += 1 + pbar.update(1) + + if progress_bar is not None: + progress = completed_count / len(items) + progress_bar(progress, desc=f"{desc} ({completed_count}/{len(items)})") + + # Periodic save + if save_interval > 0 and save_callback is not None and completed_count % save_interval == 0: + try: + # Filter out exceptions before saving + valid_results = [res for res in pending_save_results if not isinstance(res, Exception)] + save_callback(valid_results, completed_count) + pending_save_results = [] # Clear after saving + logger.info("Saved intermediate results: %d/%d completed", completed_count, len(items)) + except Exception as e: + logger.warning("Failed to save intermediate results: %s", e) + + pbar.close() + + if progress_bar is not None: + progress_bar(1.0, desc=f"{desc} (completed)") + + # Save remaining results if any + if save_interval > 0 and save_callback is not None and pending_save_results: + try: + valid_results = [res for res in pending_save_results if not isinstance(res, Exception)] + save_callback(valid_results, completed_count) + logger.info("Saved final intermediate results: %d completed", completed_count) + except Exception as e: + logger.warning("Failed to save final intermediate results: %s", e) + + # filter out exceptions + return [res for res in results if not isinstance(res, Exception)] + + # Check if we're in an async context (event loop is running) + try: + _ = asyncio.get_running_loop() + # If we're in an async context, return the coroutine directly + # The caller should use 'await run_concurrent(...)' + return _run_all() + except RuntimeError: + # No running loop, we can create one and run until complete + if not items: + return [] + loop, created = create_event_loop() + try: + return loop.run_until_complete(_run_all()) + finally: + # Only close the loop if we created it + if created: + loop.close() diff --git a/graphgen/utils/wrap.py b/graphgen/utils/wrap.py new file mode 100644 index 00000000..9689cea6 --- /dev/null +++ b/graphgen/utils/wrap.py @@ 
-0,0 +1,21 @@ +from functools import wraps +from typing import Any, Callable + +from .loop import create_event_loop + + +def async_to_sync_method(func: Callable) -> Callable: + @wraps(func) + def wrapper(self, *args, **kwargs) -> Any: + loop, created = create_event_loop() + try: + if loop.is_running(): + raise RuntimeError( + "Cannot use async_to_sync_method when event loop is already running." + ) + return loop.run_until_complete(func(self, *args, **kwargs)) + finally: + if created: + loop.close() + + return wrapper diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..2c6b870e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,12 @@ +[tool.black] +line-length = 88 # black 默认 88 +include = '\.pyi?$' + +[tool.isort] +profile = "black" # 一键适配 black +line_length = 88 # 与 black 保持一致 +multi_line_output = 3 # black 偏好的括号换行风格 +include_trailing_comma = true +force_grid_wrap = 0 +use_parentheses = true +ensure_newline_before_comments = true \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 00000000..55b033e9 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1 @@ +pytest \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index ab329cb5..44079ab5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,8 +12,30 @@ nltk jieba plotly pandas -gradio>=5.25.0 -gradio-i18n==0.3.0 +gradio==5.44.1 kaleido pyyaml langcodes +requests +fastapi +trafilatura +aiohttp +socksio +pydantic +ray==2.52.1 + +leidenalg +igraph +python-louvain + +# storage +rocksdict + +# KG +rdflib + +# Bioinformatics +biopython + +# For visualization +matplotlib diff --git a/resources/examples/chunked_demo.json b/resources/examples/chunked_demo.json deleted file mode 100644 index ad7219a3..00000000 --- a/resources/examples/chunked_demo.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - [ - {"content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"} - ], - [ - {"content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"} - ], - [ - {"content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. 
Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."} - ], - [ - {"content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} - ] -] diff --git a/resources/examples/raw_demo.jsonl b/resources/examples/raw_demo.jsonl deleted file mode 100644 index 024559a6..00000000 --- a/resources/examples/raw_demo.jsonl +++ /dev/null @@ -1,4 +0,0 @@ -{"content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"} -{"content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"} -{"content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."} -{"content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). 
Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} diff --git a/resources/images/flow.png b/resources/images/flow.png deleted file mode 100644 index 9800a7b0..00000000 Binary files a/resources/images/flow.png and /dev/null differ diff --git a/resources/images/logo.png b/resources/images/logo.png deleted file mode 100644 index 1acc963d..00000000 Binary files a/resources/images/logo.png and /dev/null differ diff --git a/resources/nltk_data/corpora/stopwords/chinese b/resources/nltk_data/corpora/stopwords/chinese deleted file mode 100644 index 0873a903..00000000 --- a/resources/nltk_data/corpora/stopwords/chinese +++ /dev/null @@ -1,841 +0,0 @@ -一 -一下 -一些 -一切 -一则 -一天 -一定 -一方面 -一旦 -一时 -一来 -一样 -一次 -一片 -一直 -一致 -一般 -一起 -一边 -一面 -万一 -上下 -上升 -上去 -上来 -上述 -上面 -下列 -下去 -下来 -下面 -不一 -不久 -不仅 -不会 -不但 -不光 -不单 -不变 -不只 -不可 -不同 -不够 -不如 -不得 -不怕 -不惟 -不成 -不拘 -不敢 -不断 -不是 -不比 -不然 -不特 -不独 -不管 -不能 -不要 -不论 -不足 -不过 -不问 -与 -与其 -与否 -与此同时 -专门 -且 -两者 -严格 -严重 -个 -个人 -个别 -中小 -中间 -丰富 -临 -为 -为主 -为了 -为什么 -为什麽 -为何 -为着 -主张 -主要 -举行 -乃 -乃至 -么 -之 -之一 -之前 -之后 -之後 -之所以 -之类 -乌乎 -乎 -乘 -也 -也好 -也是 -也罢 -了 -了解 -争取 -于 -于是 -于是乎 -云云 -互相 -产生 -人们 -人家 -什么 -什么样 -什麽 -今后 -今天 -今年 -今後 -仍然 -从 -从事 -从而 -他 -他人 -他们 -他的 -代替 -以 -以上 -以下 -以为 -以便 -以免 -以前 -以及 -以后 -以外 -以後 -以来 -以至 -以至于 -以致 -们 -任 -任何 -任凭 -任务 -企图 -伟大 -似乎 -似的 -但 -但是 -何 -何况 -何处 -何时 -作为 -你 -你们 -你的 -使得 -使用 -例如 -依 -依照 -依靠 -促进 -保持 -俺 -俺们 -倘 -倘使 -倘或 -倘然 -倘若 -假使 -假如 -假若 -做到 -像 -允许 -充分 -先后 -先後 -先生 -全部 -全面 -兮 -共同 -关于 -其 -其一 -其中 -其二 -其他 -其余 -其它 -其实 -其次 -具体 -具体地说 -具体说来 -具有 -再者 -再说 -冒 -冲 -决定 -况且 -准备 -几 -几乎 -几时 -凭 -凭借 -出去 -出来 -出现 -分别 -则 -别 -别的 -别说 -到 -前后 -前者 -前进 -前面 -加之 -加以 -加入 -加强 -十分 -即 -即令 -即使 -即便 -即或 -即若 -却不 -原来 -又 -及 -及其 -及时 -及至 -双方 -反之 -反应 -反映 -反过来 -反过来说 -取得 -受到 -变成 -另 -另一方面 -另外 -只是 -只有 -只要 -只限 -叫 -叫做 -召开 -叮咚 -可 -可以 -可是 -可能 -可见 -各 -各个 -各人 -各位 -各地 -各种 -各级 -各自 -合理 -同 -同一 -同时 -同样 -后来 -后面 -向 -向着 -吓 -吗 -否则 -吧 -吧哒 -吱 -呀 -呃 -呕 -呗 -呜 -呜呼 -呢 -周围 -呵 -呸 -呼哧 -咋 -和 -咚 -咦 -咱 -咱们 -咳 -哇 -哈 -哈哈 -哉 -哎 -哎呀 -哎哟 -哗 -哟 -哦 -哩 -哪 -哪个 -哪些 -哪儿 -哪天 -哪年 -哪怕 -哪样 -哪边 -哪里 -哼 -哼唷 -唉 -啊 -啐 -啥 -啦 -啪达 -喂 -喏 -喔唷 -嗡嗡 -嗬 -嗯 -嗳 -嘎 -嘎登 -嘘 -嘛 -嘻 -嘿 -因 -因为 -因此 -因而 -固然 -在 -在下 -地 -坚决 -坚持 -基本 -处理 -复杂 -多 -多少 -多数 -多次 -大力 -大多数 -大大 -大家 -大批 -大约 -大量 -失去 -她 -她们 -她的 -好的 -好象 -如 -如上所述 -如下 -如何 -如其 -如果 -如此 -如若 -存在 -宁 -宁可 -宁愿 -宁肯 -它 -它们 -它们的 -它的 -安全 -完全 -完成 -实现 -实际 -宣布 -容易 -密切 -对 -对于 -对应 -将 -少数 -尔后 -尚且 -尤其 -就 -就是 -就是说 -尽 -尽管 -属于 -岂但 -左右 -巨大 -巩固 -己 -已经 -帮助 -常常 -并 -并不 -并不是 -并且 -并没有 -广大 -广泛 -应当 -应用 -应该 -开外 -开始 -开展 -引起 -强烈 -强调 -归 -当 -当前 -当时 -当然 -当着 -形成 -彻底 -彼 -彼此 -往 -往往 -待 -後来 -後面 -得 -得出 -得到 -心里 -必然 -必要 -必须 -怎 -怎么 -怎么办 -怎么样 -怎样 -怎麽 -总之 -总是 -总的来看 -总的来说 -总的说来 -总结 -总而言之 -恰恰相反 -您 -意思 -愿意 -慢说 -成为 -我 -我们 -我的 -或 -或是 -或者 -战斗 -所 -所以 -所有 -所谓 -打 -扩大 -把 -抑或 -拿 -按 -按照 -换句话说 -换言之 -据 -掌握 -接着 -接著 -故 -故此 -整个 -方便 -方面 -旁人 -无宁 -无法 -无论 -既 -既是 -既然 -时候 -明显 -明确 -是 -是否 -是的 -显然 -显著 -普通 -普遍 -更加 -曾经 -替 -最后 -最大 -最好 -最後 -最近 -最高 -有 -有些 -有关 -有利 -有力 -有所 -有效 -有时 -有点 -有的 -有着 -有著 -望 -朝 -朝着 -本 -本着 -来 -来着 -极了 -构成 -果然 -果真 -某 -某个 -某些 -根据 -根本 -欢迎 -正在 -正如 -正常 -此 -此外 -此时 -此间 -毋宁 
-每 -每个 -每天 -每年 -每当 -比 -比如 -比方 -比较 -毫不 -没有 -沿 -沿着 -注意 -深入 -清楚 -满足 -漫说 -焉 -然则 -然后 -然後 -然而 -照 -照着 -特别是 -特殊 -特点 -现代 -现在 -甚么 -甚而 -甚至 -用 -由 -由于 -由此可见 -的 -的话 -目前 -直到 -直接 -相似 -相信 -相反 -相同 -相对 -相对而言 -相应 -相当 -相等 -省得 -看出 -看到 -看来 -看看 -看见 -真是 -真正 -着 -着呢 -矣 -知道 -确定 -离 -积极 -移动 -突出 -突然 -立即 -第 -等 -等等 -管 -紧接着 -纵 -纵令 -纵使 -纵然 -练习 -组成 -经 -经常 -经过 -结合 -结果 -给 -绝对 -继续 -继而 -维持 -综上所述 -罢了 -考虑 -者 -而 -而且 -而况 -而外 -而已 -而是 -而言 -联系 -能 -能否 -能够 -腾 -自 -自个儿 -自从 -自各儿 -自家 -自己 -自身 -至 -至于 -良好 -若 -若是 -若非 -范围 -莫若 -获得 -虽 -虽则 -虽然 -虽说 -行为 -行动 -表明 -表示 -被 -要 -要不 -要不是 -要不然 -要么 -要是 -要求 -规定 -觉得 -认为 -认真 -认识 -让 -许多 -论 -设使 -设若 -该 -说明 -诸位 -谁 -谁知 -赶 -起 -起来 -起见 -趁 -趁着 -越是 -跟 -转动 -转变 -转贴 -较 -较之 -边 -达到 -迅速 -过 -过去 -过来 -运用 -还是 -还有 -这 -这个 -这么 -这么些 -这么样 -这么点儿 -这些 -这会儿 -这儿 -这就是说 -这时 -这样 -这点 -这种 -这边 -这里 -这麽 -进入 -进步 -进而 -进行 -连 -连同 -适应 -适当 -适用 -逐步 -逐渐 -通常 -通过 -造成 -遇到 -遭到 -避免 -那 -那个 -那么 -那么些 -那么样 -那些 -那会儿 -那儿 -那时 -那样 -那边 -那里 -那麽 -部分 -鄙人 -采取 -里面 -重大 -重新 -重要 -鉴于 -问题 -防止 -阿 -附近 -限制 -除 -除了 -除此之外 -除非 -随 -随着 -随著 -集中 -需要 -非但 -非常 -非徒 -靠 -顺 -顺着 -首先 -高兴 -是不是 diff --git a/resources/nltk_data/corpora/stopwords/english b/resources/nltk_data/corpora/stopwords/english deleted file mode 100644 index 1280aa3b..00000000 --- a/resources/nltk_data/corpora/stopwords/english +++ /dev/null @@ -1,179 +0,0 @@ -i -me -my -myself -we -our -ours -ourselves -you -you're -you've -you'll -you'd -your -yours -yourself -yourselves -he -him -his -himself -she -she's -her -hers -herself -it -it's -its -itself -they -them -their -theirs -themselves -what -which -who -whom -this -that -that'll -these -those -am -is -are -was -were -be -been -being -have -has -had -having -do -does -did -doing -a -an -the -and -but -if -or -because -as -until -while -of -at -by -for -with -about -against -between -into -through -during -before -after -above -below -to -from -up -down -in -out -on -off -over -under -again -further -then -once -here -there -when -where -why -how -all -any -both -each -few -more -most -other -some -such -no -nor -not -only -own -same -so -than -too -very -s -t -can -will -just -don -don't -should -should've -now -d -ll -m -o -re -ve -y -ain -aren -aren't -couldn -couldn't -didn -didn't -doesn -doesn't -hadn -hadn't -hasn -hasn't -haven -haven't -isn -isn't -ma -mightn -mightn't -mustn -mustn't -needn -needn't -shan -shan't -shouldn -shouldn't -wasn -wasn't -weren -weren't -won -won't -wouldn -wouldn't diff --git a/resources/nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt b/resources/nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt deleted file mode 100644 index b38862b8..00000000 --- a/resources/nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt +++ /dev/null @@ -1,156 +0,0 @@ -ct -m.j -t -a.c -n.h -ms -p.a.m -dr -pa -p.m -u.k -st -dec -u.s.a -lt -g.k -adm -p -h.m -ga -tenn -yr -sen -n.c -j.j -d.h -s.g -inc -vs -s.p.a -a.t -n -feb -sr -jan -s.a.y -n.y -col -g.f -c.o.m.b -d -ft -va -r.k -e.f -chg -r.i -a.g -minn -a.h -k -n.j -m -l.f -f.j -gen -i.m.s -s.a -aug -j.p -okla -m.d.c -ltd -oct -s -vt -r.a -j.c -ariz -w.w -b.v -ore -h -w.r -e.h -mrs -cie -corp -w -n.v -a.d -r.j -ok -. . 
-e.m -w.c -ill -nov -u.s -prof -conn -u.s.s.r -mg -f.g -ph.d -g -calif -messrs -h.f -wash -tues -sw -bros -u.n -l -wis -mr -sep -d.c -ave -e.l -co -s.s -reps -c -r.t -h.c -r -wed -a.s -v -fla -jr -r.h -c.v -m.b.a -rep -a.a -e -c.i.t -l.a -b.f -j.b -d.w -j.k -ala -f -w.va -sept -mich -n.m -j.r -l.p -s.c -colo -fri -a.m -g.d -kan -maj -ky -a.m.e -n.d -t.j -cos -nev \ No newline at end of file diff --git a/resources/nltk_data/tokenizers/punkt_tab/english/collocations.tab b/resources/nltk_data/tokenizers/punkt_tab/english/collocations.tab deleted file mode 100644 index c7032b7f..00000000 --- a/resources/nltk_data/tokenizers/punkt_tab/english/collocations.tab +++ /dev/null @@ -1,37 +0,0 @@ -##number## international -##number## rj -##number## commodities -##number## cooper -b stewart -##number## genentech -##number## wedgestone -i toussie -##number## pepper -j fialka -o ludcke -##number## insider -##number## aes -i magnin -##number## credit -##number## corrections -##number## financing -##number## henley -##number## business -##number## pay-fone -b wigton -b edelman -b levine -##number## leisure -b smith -j walter -##number## pegasus -##number## dividend -j aron -##number## review -##number## abreast -##number## who -##number## letters -##number## colgate -##number## cbot -##number## notable -##number## zimmer \ No newline at end of file diff --git a/resources/nltk_data/tokenizers/punkt_tab/english/ortho_context.tab b/resources/nltk_data/tokenizers/punkt_tab/english/ortho_context.tab deleted file mode 100644 index a33a3648..00000000 --- a/resources/nltk_data/tokenizers/punkt_tab/english/ortho_context.tab +++ /dev/null @@ -1,20366 +0,0 @@ -coverage 36 -springsteen 4 -sandoz 4 -board-room 32 -eastman 6 -enter 32 -procedure 36 -blowing 32 -competition 46 -sinai 4 -t-word 4 -entangled 32 -clines 12 -seasonally 34 -firing 32 -legal-trade 32 -disruptive 32 -wagner 12 -soften 32 -ex-grenfell 4 -cementing 32 -threats 32 -sleepers 32 -bulgarian 4 -him 96 -citing 42 -phelps-stokes 4 -oglesby 12 -dieter 4 -showman 32 -half-hour 32 -deliveries 32 -monetary 110 -counters 32 -hill 12 -exceed 32 -miscreants 32 -poisoning 32 -neutral 32 -allocated 32 -angry 32 -registers 32 -chose 32 -cardinal 4 -also 110 -magnin 12 -vernacular 32 -rebound 32 -penicillin 32 -behave 36 -restrained 32 -diver 36 -journalists 34 -pops 32 -x-ray 4 -mansion 32 -overhauling 32 -merely 32 -julian 2 -veer 32 -fueled 36 -affordable 32 -amos-lee 4 -longstanding 32 -two-part 32 -warrants 36 -text 32 -aetna 10 -recollections 32 -entirely 32 -washing 32 -mathematical 32 -improvement 32 -reexamined 32 -therefore 34 -month 38 -mover 32 -agreement 36 -shoppers 32 -arabian 4 -cyclical 32 -creepers 32 -solicitation 32 -sculptures 32 -troupe 36 -break-even 32 -expressively 32 -indications 34 -kremlin 4 -czech 4 -darts 4 -worksheets 32 -secondary 40 -diplomat 32 -prescriptions 32 -cream 32 -devote 32 -repertoire 32 -berton 4 -waiting 32 -sculptured 32 -energy-related 32 -amplify 32 -apologetic 32 -indelible 32 -piggyback 32 -dwarfism 32 -respectively 32 -ahp 12 -bis 4 -wkr 8 -bit 32 -blew 32 -pgulf 8 -property-casualty 32 -banded 32 -interviewed 32 -expatriate 32 -branches 32 -superintendent 32 -stinginess 32 -bid 36 -big 46 -enigmatic 32 -attacked 32 -corporate 110 -southeastern 4 -one-fourth 32 -jacobs 4 -entertaining 32 -sbo 12 -sharpest 32 -sbc 4 -pest-resistant 32 -beating 32 -berkowitz 4 -serious 36 -skepticism 32 -shelf 32 -revisions 32 -kravis 4 -ellsworth 4 -intimidate 32 -pearson 6 -swarming 32 -bureaucracy 32 -pmi 8 -final 34 
-speculative 32 -violence 36 -chocolates 44 -purists 2 -shapiro 12 -distillates 32 -saatchi 6 -temptation 32 -immigration 36 -drooping 32 -walk 32 -publishers 6 -waxing 32 -assembles 32 -reimbursed 32 -peculiar 32 -butter 32 -anywhere 32 -hoots 32 -cavalier 4 -banxquote 4 -turkish 4 -buried 34 -contributed 32 -intermittent 32 -omniscient 32 -appalled 32 -moved 32 -electromagnetism 32 -fancy 32 -sorts 36 -uncertainty 32 -front 36 -mcneill 4 -plainly 32 -firmly 32 -relations 38 -plasma 32 -enduring 32 -sayer 4 -stumbling 32 -exit 36 -non-refundable 32 -regan 12 -refrigerators 32 -pieter 4 -recipes 32 -doubtful 32 -sue 36 -undercut 32 -patent 44 -liberalization 32 -alvin 4 -progress 38 -brothers 36 -leaves 36 -revert 32 -lover 34 -handmade 32 -deceiving 32 -kessler 12 -ropak 6 -baytown 4 -attacks 32 -juncture 4 -martha 4 -questioning 32 -loved 32 -wiedemann 8 -acquire 36 -falsified 32 -corrigan 4 -romanticized 32 -stealing 32 -intel 12 -recruited 32 -calculating 32 -berger 4 -quantify 32 -gifts 32 -exuding 32 -bigger 32 -evaluate 32 -distributor 32 -restraining 34 -highlights 40 -madness 32 -budget/tax 32 -popularizing 32 -leningrad 6 -bread-and-butter 32 -groups 36 -leaders 34 -amadou-mahtar 2 -non-profit 32 -ivan 4 -wicklow 4 -logo 32 -devoting 32 -teach 32 -luis 4 -bigger-than-expected 32 -credentials 32 -conglomerate 32 -talmudic 4 -banning 2 -steelmaker 40 -ingratiate 32 -anguish 32 -handelman 4 -clymer 8 -acid-rain 96 -your 42 -concluded 32 -atco 8 -calmly 32 -pleads 32 -restart 32 -swimming 32 -boyish 32 -batchelder 12 -japan-style 4 -westport 4 -nominate 32 -perverse 32 -petrochemicals 32 -sunday 4 -trafficking 32 -epidemic 32 -curtailed 32 -rosenbaum 4 -exist 32 -submarine 32 -stepping 32 -lighthearted 32 -then-standard 32 -backwater 32 -religion 32 -plow 32 -colman 4 -nuclei 32 -plot 32 -full-hearted 32 -visit 38 -oriented 32 -abilities 32 -defeated 32 -ddb 6 -horse 32 -halts 32 -mistakes 32 -descended 32 -requesting 32 -bain 14 -reserved 32 -carolina 4 -allocates 32 -speeches 32 -horst 4 -ollmann 4 -russia 4 -basket 32 -wordperfect 4 -seating 32 -unrealized 32 -mistaken 32 -thoroughly 32 -mdest 12 -oldest 32 -mich. 4 -telephone-utility 32 -depot 4 -methodically 32 -talcs 32 -d.c. 
4 -jason 2 -implicate 32 -models 36 -sorely 32 -barbecue 38 -triton 14 -bias 32 -subatomic 32 -modell 12 -window 32 -self-indulgent 32 -adelman 4 -complicate 32 -favorite 32 -can-sealing 32 -harvest 32 -boil 32 -warner 12 -miti 14 -tax-law 32 -welton 4 -commodity-chip 32 -student-loan 32 -dearly 32 -fiance 32 -restricting 34 -pick 32 -album 32 -topping 32 -hussmann 4 -fabulously 32 -razzmatazz 32 -skeptic 32 -withstand 32 -interfered 32 -plaza 4 -assertion 32 -e.f.ihutton 4 -diversify 32 -patents 32 -tack 32 -poor 38 -devising 32 -baldness 32 -commented 32 -cooperated 32 -personal-computer 32 -necessity 32 -vintage 10 -entitles 32 -hardymon 8 -elusiveness 32 -outgoing 34 -complacent 32 -keflex 4 -adding 34 -punitive 36 -hoechst 4 -percentage-point 32 -safety 36 -flannigan 4 -brakes 32 -collective 32 -mexican 6 -cours-la-reine 4 -frustrated 34 -anatomy 8 -distributes 32 -daring 32 -overpaid 32 -monty 4 -phaseout 32 -nationally 34 -moonlight 4 -primer 4 -surfeit 32 -distractions 32 -billet 4 -back-to-basics 32 -writ 32 -complying 32 -telephoned 32 -fence 32 -hearing 40 -billed 32 -drabness 32 -balding 32 -glamorous 32 -telephones 32 -adapted 32 -campaigner 32 -rubbed 32 -mandate 32 -economy 100 -breast 32 -spectra 4 -sealed 32 -achievements 32 -odds 32 -keyboard 32 -ambassador 32 -keener 32 -lyonnais 4 -registering 32 -ghetto 4 -apply 32 -bronx 4 -sogevalor 4 -hung 36 -sign 44 -plate 32 -sigh 32 -niles 8 -molten 32 -moves 36 -free-fall 32 -landmark 32 -raton 4 -accrediting 36 -investigations 32 -single-state 32 -disarray 32 -mccann-erickson 4 -pensions 32 -consumer-products 32 -horns 32 -bondholders 32 -plugged 32 -earner 32 -investment 102 -uni.a 12 -smallest 32 -ordinary 32 -ffod 4 -connecting 32 -snack 32 -iron-bending 32 -norman 14 -telco 14 -hmos 4 -repurchases 32 -kilowatt-hour 32 -advertise 32 -outfits 32 -chopin 4 -choreographers 32 -oliver 14 -emirates 4 -bestowed 32 -brink 32 -prize 36 -assuredly 32 -modification 32 -campanies 32 -beckoning 32 -princeton 12 -managing 32 -rebutted 32 -soaring 32 -veteran 32 -loudly 32 -tarrytown 4 -green 44 -flagging 32 -greek 6 -intricate 32 -georgia 6 -unprotected 32 -attain 32 -impromptu 32 -naive 32 -francs 32 -greene 4 -rigid 32 -requiring 32 -circulatory 32 -context 32 -million-dollar-a-year 32 -france 14 -greens 36 -browns 4 -accent 32 -pictures 36 -greer 4 -totals 40 -cosby 12 -peabody 12 -sapiens 32 -wafer 32 -program 34 -swelling 32 -cohesive 32 -prison 36 -recycling 32 -size 32 -quitting 32 -hot-strip 32 -troika 32 -strengthened 32 -hanff 4 -tavris 4 -unissued 32 -mental 32 -oscher 4 -fearsome 32 -arguello 4 -pre-game 32 -would 102 -packing 32 -schoenberg 4 -butler 12 -considering 34 -threshold 32 -persuasion 4 -lessons 32 -cordis 14 -unwind 32 -evades 32 -mascot 32 -joked 32 -telecom 14 -french 14 -thorns 4 -pre-selected 32 -constance 12 -jackman 4 -logical 32 -pressing 32 -competent 32 -bluhm 12 -scenario 32 -sold 100 -sprees 32 -incredible 32 -charities 32 -rigorous 32 -auto-receivables 64 -regains 32 -gradual 32 -dark-skinned 32 -consultations 32 -endurance 32 -s.a.y. 
8 -drug-delivery 32 -japanese-style 4 -platinum 34 -insignificant 36 -moderate-income 32 -whatsoever 32 -sweaty 32 -interviewing 32 -raged 4 -myth 32 -household 38 -drawbacks 32 -bluhdorn 4 -twisted 32 -nicotine-stained 32 -conventions 32 -shoots 32 -place 36 -data 46 -pre-boesky 32 -lenses 32 -pre-reagan 32 -unnamed 32 -weaken 32 -experience 32 -dqu 8 -weaker 32 -kleinwort 6 -interest-rate 32 -horizon 32 -sole 32 -tweedledum 4 -commuted 32 -automatic-focus 32 -tension 32 -plows 32 -climbed 36 -poaching 32 -disregard 32 -ginger-haired 32 -flimsy 34 -moorman 8 -acknowledging 40 -aids-related 4 -kremlins 32 -invent 32 -sleeping 36 -koryagin 12 -low-profile 32 -niche 36 -climber 32 -moralists 32 -mcqueen 12 -efforts 102 -ribs 32 -overdraft 32 -swenson 8 -persistently 32 -appointed 32 -plasminogen 32 -cato 4 -harvey 12 -imported 32 -ammonium 32 -constructive 32 -coaxing 32 -progresses 32 -neurosurgical 32 -birds 32 -nazis 4 -offspring 32 -television 102 -donoghue 4 -weldon 8 -cliche 32 -documentation 32 -points 32 -dettmar 4 -half-dozen 32 -bor 8 -prefers 32 -underscores 32 -tranquilizer 32 -boy 32 -proving 34 -diplomacy 32 -rollbacks 32 -tomasson 12 -bob 38 -adult 32 -bon 4 -fuel 36 -microchips 40 -bragg 12 -traditions 32 -nigger 32 -supplant 32 -hope 36 -hutton 14 -probation 32 -adaptation 32 -sdi 4 -co-produced 32 -overreaction 32 -gillis 4 -friedman 12 -archaeologists 32 -quadrupled 40 -gamble 36 -intentional 32 -enemy 36 -where 38 -evenly 32 -emotionally 32 -paltry 32 -experimenting 32 -notification 32 -recapture 32 -underscored 32 -milling 36 -wanb 8 -finch 4 -tuesday 4 -wang 14 -efac 4 -anti-consumer 32 -ryan 4 -forbes 12 -subpeona 32 -banana-republic 32 -want 36 -lacroix 12 -skeptical 32 -weather 32 -single-a-1 32 -probable 32 -single-a-3 32 -generously 32 -exclusive 32 -striving 32 -variously 32 -practically 32 -hunters 36 -tire-worker 32 -aerospace 108 -passengers 32 -impression 32 -spate 32 -learns 32 -industrialized 32 -viruses 32 -juvenile 36 -tien 4 -four-year 32 -holyoke 4 -mexicans 6 -frydenlund 12 -co-founder 32 -payment 36 -weapon 32 -fairer 32 -boomlet 32 -top-grade 32 -currency-stabilization 32 -amendment 36 -number-crunching 32 -direct-sales 32 -sketch 32 -newspaper 32 -pricked 32 -culbro 4 -hands 32 -been 36 -volatile 32 -solvents 32 -refute 32 -incensed 32 -handy 40 -giving 40 -rate 108 -remove 32 -simple 96 -valid 32 -compensation 32 -simply 32 -usual 36 -funeral 36 -backer 32 -carving 32 -ambient 4 -costumes 32 -war-vintage 4 -write-in 32 -redemptions 4 -fared 32 -yugoslavia 4 -geographic 32 -chance 32 -gamse 4 -tabor 12 -self-evaluation 32 -double-a 32 -webb 12 -microchip 32 -spinach 32 -tdy 12 -displays 32 -reelection 32 -items 32 -resurgence 32 -shielding 32 -charity-ball 32 -rubric 32 -meanwhile 46 -installment 32 -rhi 4 -dls 8 -nasty 34 -mx 4 -singing 36 -prostitution 32 -clarity 32 -loan 100 -ml 8 -mecca 32 -close-knit 32 -pecora 4 -u.s.-canadian 4 -luv 8 -bioscience 36 -accepting 32 -load 46 -vast 32 -delfzijl 4 -frx 4 -mug 32 -construction 44 -restricted 32 -carlucci 4 -reduce 36 -epton 4 -remark 32 -peggy 4 -graphics 36 -instance 32 -clemenza 4 -echoed 32 -non-ibm 32 -hospitalization 32 -boogie 32 -cowbells 32 -requirement 36 -jeff 6 -roadways 32 -coca-cola 14 -mishima 4 -men 36 -graphic 6 -spoiler 32 -enteprises 4 -m2 4 -m3 6 -m1 4 -m. 
12 -short-term 38 -debt-backed 32 -ludcke 8 -gypsum 36 -manna 32 -admiral 36 -perform 32 -blundered 32 -fillings 32 -songs 32 -dosage 32 -associated 38 -nashville 14 -truckloads 32 -wound 32 -reorganized 96 -ohio 4 -ranches 32 -sends 32 -hanging 34 -europe 14 -reading 38 -alden 12 -conventional-mortgage 32 -profound 32 -success 32 -medium-duty 32 -parts 100 -thoughout 32 -periodic 32 -five-country 32 -something-for-everyone 32 -salvaging 32 -sumo 4 -scrambling 32 -gleam 32 -long-distance 98 -resembles 32 -vandenburg 4 -unsolicited 32 -seafood 32 -envelopes 32 -color 36 -contraceptive 32 -respond 32 -anyhow 32 -sings 32 -ampersand 4 -extinguishment 32 -farm 46 -imperial 36 -salad 32 -terrace 4 -fare 32 -prides 32 -russell 6 -farming 32 -ahe 12 -memorably 32 -accuses 32 -howes 4 -monitor 36 -brazil 14 -flamingo 4 -cyprus 14 -artillery 32 -waited 32 -memorable 32 -reed 12 -surmises 32 -operational 32 -architectures 32 -pr 4 -accused 32 -contrary 32 -shave 32 -no-tax 32 -replaced 32 -annually 34 -swing 40 -r.i. 4 -private-ownership 32 -toughen 36 -celanese 4 -schlender 8 -blends 32 -tougher 32 -unaffiliated 32 -vira 4 -claudio 4 -expansion 32 -alma 36 -cyclops 14 -moore 12 -saxton 12 -anna 4 -revolving 32 -hostage 32 -fringe 32 -exacerbate 32 -specialists 32 -yellows 32 -watches 32 -capitalized 32 -greeted 32 -stands 32 -pettee 12 -signed 96 -longer 36 -el-barajneh 32 -habits 32 -socialists 4 -misused 32 -smuggled 32 -dreamed 32 -reptiles 32 -appointees 32 -watched 32 -mollusks 32 -reducing 36 -supervise 32 -disrupting 32 -tax-favored 32 -ireland 14 -liabilities 32 -oskar 4 -amenities 32 -afternoons 32 -beginners 4 -fail 36 -box-office 32 -unambiguous 32 -wounded 32 -comptroller 44 -trays 32 -lingus 4 -scrutinizes 32 -below 36 -incompetent 32 -yells 32 -mhp 8 -isbell 4 -hardgoods 4 -interprets 32 -caught 32 -attractively 32 -accosted 32 -insisted 32 -mhc 12 -wolverine 4 -trailing 32 -outpaced 32 -coursed 32 -forming 32 -inability 32 -vergennes 4 -suzdal 4 -miami 14 -steady 32 -compromising 32 -shore-grown 32 -sullivan 12 -chemical-physics 32 -ethnicity 32 -boardroom 4 -hull 32 -standard 46 -predicted 32 -well-informed 32 -rivals 36 -stutters 32 -choreography 32 -income 36 -parks 32 -gerlach 8 -oslo 10 -agriculture 36 -fairfield 12 -telex 32 -premium 32 -infringed 32 -franchised 32 -life-insurance 32 -espionage 32 -bur 8 -belgian 4 -contending 32 -heinike 4 -sharky 32 -bellwether 96 -fining 32 -violate 32 -ambiguities 32 -deflator 32 -paragraphing 4 -forgiven 32 -schrader 12 -amazement 32 -p. 4 -eugenia 4 -ltv 4 -particularly 34 -industrial-systems 32 -film-making 32 -recoup 32 -embarrass 32 -semester 32 -dramatized 32 -co-managing 32 -imports 102 -shine 32 -swung 40 -so-called 32 -pre-recorded 32 -infected 32 -icx 8 -icy 32 -laundering 32 -supply-side 32 -bailing 32 -scrutinize 36 -transmission 32 -leisure 40 -dramatizes 32 -bay-based 4 -camouflage 32 -icn 14 -upon 32 -previous 34 -truth 32 -bearish 32 -off 36 -pianist 2 -swallow 32 -plenty 32 -leaked 32 -campus 36 -remaining 32 -hype 32 -delayed 32 -lt. 
4 -grousing 32 -hunter 36 -write-downs 32 -nano 4 -processor 32 -sedona 4 -opportunities 32 -neared 32 -frontrunners 32 -wrath 32 -laurie 4 -poison-pill 32 -commandos 32 -duthie 4 -drug-industry 32 -engine 36 -conversion 32 -bargain-hunting 32 -strained 32 -dependency 32 -sun-baked 32 -boczek 4 -promote 36 -little 46 -juan 4 -berrill 4 -home-dish 32 -ruled 32 -policies 36 -deadpan 32 -inflicted 32 -affable 32 -carlough 8 -ruler 32 -rules 36 -seventh 34 -melt-through 32 -tapping 32 -aircruisers 6 -gulf 46 -borrowing 32 -stings 32 -schuster 4 -protection 36 -prime-3 4 -prime-1 4 -refinanced 32 -insulated 32 -kneale 4 -those 46 -family-life 32 -conversation 32 -aesthetic 32 -weaver 12 -ah 2 -personal 32 -decorated 32 -boys 32 -sleep 36 -rustler 4 -grace 46 -reneged 32 -abalone-lover 32 -broker-adviser 32 -strengths 32 -halls 32 -re-elected 32 -plaintiffs 32 -discretionary 32 -vfc 8 -substantive 32 -decentralize 32 -australasia 4 -pen 32 -periods 32 -aspect 32 -granting 32 -ride 36 -bury 32 -polish 36 -imminent 32 -incredulous 32 -cooperate 32 -trust 100 -pep 40 -statistics 44 -w.va. 4 -danvers 4 -feasible 32 -refining 32 -odor 32 -rats 32 -frost 32 -securties 4 -f-20 4 -frost-retarding 32 -kamin 4 -deeds 32 -charles 14 -japan 14 -classic 36 -dissident 32 -pre-approved 32 -guarantee 32 -airline 40 -disproportionately 32 -loves 32 -playing 40 -losing 32 -food-processing 32 -live 38 -sustain 32 -incessant 32 -backup 32 -visiting 32 -theme 36 -exempts 32 -incontrovertible 32 -transworld 6 -wps 8 -filling 96 -prudent 32 -abandoning 32 -implications 32 -shackled 32 -loophole 32 -treading 32 -innkeeper 32 -fig 32 -victimized 32 -entrenched 32 -helicopter 32 -staunchly 32 -cujo 4 -fin 4 -jog 32 -camouflaging 32 -fit 32 -venal 32 -abound 36 -ocean 40 -confidence 36 -fis 8 -fir 36 -complex 36 -space-launch 32 -wants 96 -pastore 4 -fix 32 -movies 108 -little-known 32 -cheap-labor 32 -cbot 12 -fourth-quarter 110 -litigation 32 -refreshing 32 -relman 12 -up&l 6 -year-round 32 -austin 6 -suitcase 32 -mideast 4 -cboe 12 -chips 100 -dioxide 32 -u.s.-italian 4 -treated 32 -stringers 32 -funneled 32 -bestowal 32 -dreams 36 -circulated 32 -recalculated 32 -privy 32 -easier-to-use 32 -mid-teens 32 -oncor 2 -theoretical 32 -translates 32 -acceleration 40 -centre 4 -enchant 32 -margin 32 -much 102 -academicians 32 -medicine 36 -challenge 32 -react 36 -housing-finance 32 -admit 36 -appearances 32 -rebuttal 32 -shaib 4 -distortions 32 -bombed 32 -riled 32 -volume 46 -great 46 -alienating 32 -interest-only 32 -tightly 32 -territory 32 -mirror 36 -chart 32 -standards 36 -toughened 4 -hell 36 -objective 32 -motorist 32 -natchez 4 -intemperate 32 -advising 32 -held 44 -udf 4 -brewers 32 -redress 32 -ticketed 32 -reverses 32 -coloreds 32 -two 110 -tremendously 32 -help 36 -fargo 4 -journal/europe 4 -lemon 32 -ingersoll 4 -fledgling 32 -substances 32 -boveri 4 -rebels 36 -antics 32 -tfb 8 -big-time 32 -lovastatin 34 -halcion 4 -swell 32 -competence 32 -bashful 32 -bromberg 12 -upturn 32 -abreast 40 -budweiser 4 -t.pwf 4 -afflicted 32 -reached 38 -someday 32 -delaying 32 -trusts 32 -influenced 32 -overdrafts 32 -citizen 36 -directors 110 -squish 32 -magic 36 -large-screen 32 -eased 34 -reads 32 -indeed 42 -middlemen 32 -insider 46 -generate 32 -listened 32 -disposed 32 -determination 32 -arentsen 8 -express-parcel 32 -save 36 -builder 32 -wyden 4 -guns 32 -helpless 32 -campeau 4 -gunn 12 -settlements 36 -portions 36 -midst 36 -seemed 32 -all-natural 32 -recruiter 32 -monoclonal 32 -marketplace 32 -telegram 32 -raking 32 
-czyrek 12 -columbia-based 4 -rejects 36 -workman 36 -three-hour 32 -due 108 -fine 38 -stacks 32 -sentimental 32 -ala.-based 4 -dun 12 -softwood 96 -grisdela 8 -bowl 4 -pga 8 -consumers 38 -elders 36 -legislators 32 -freezing 32 -walter 14 -steam 32 -pgn 8 -housing 102 -baby-boom 32 -calamari 32 -refuted 32 -financing 44 -reimbursement 32 -odds-makers 32 -steak 4 -deleted 32 -d 44 -quickie 32 -langdon 4 -fierce 32 -enhanced 32 -takeover-trading 32 -population 100 -unabashedly 32 -supplement 32 -levin 14 -marketer 32 -intimately 32 -equally 34 -digger 4 -disadvantage 32 -monica 4 -jacqmin 8 -peterson 4 -jean-pierre 4 -warplanes 32 -broad-based 32 -burdines 4 -t.pow 12 -starts 36 -lyric 32 -corn 36 -weekday 32 -injected 32 -three-cent 32 -lifting 34 -suit 44 -dependence 96 -core 32 -processed 32 -hauck 4 -j.p. 14 -machine-tool 32 -critic 32 -lingo 32 -paths 32 -slots 32 -high-level 32 -dank 32 -molecule 32 -steel-production 32 -vowing 32 -passenger 32 -corp 4 -indexed 32 -painful 34 -carey 4 -cause 32 -carew 8 -did 38 -weinberg 8 -reorganizations 32 -r.k. 4 -across 36 -indexes 36 -helping 32 -mcgraw-edison 4 -fifth 38 -snowstorm 32 -cared 32 -isgur 4 -jersey 4 -rohstoff 4 -kills 32 -ferns 32 -well-managed 32 -watson 12 -streeter 4 -floury 32 -medicines 32 -renovation 32 -scores 38 -scorer 32 -substitute 32 -tomatoes 32 -alicia 4 -monarch 14 -convincing 32 -municipalities 32 -movie 36 -scored 32 -seattle 12 -temper 32 -result 32 -corrugated 32 -creep 32 -creer 4 -lens 32 -enhances 36 -biotechnology 36 -occupied 34 -ordinance 32 -cqx 4 -stint 32 -doubted 32 -ramshackle 32 -delicately 32 -non-family 32 -energy 46 -urgently 32 -dutton 12 -archly 32 -creek 4 -goose 32 -defections 32 -improves 32 -backlash 32 -stanton 4 -splashy 32 -specimens 32 -texcel 10 -auto 102 -tendency 32 -scripting 32 -in-state 32 -polystyrene 32 -arc 8 -rte 10 -relocate 32 -employees 38 -blue-eyed 32 -improved 32 -eyeing 32 -reviving 32 -one-quarter 32 -entregrowth 6 -deteriorate 32 -mastermind 32 -narration 32 -ranging 32 -flooding 32 -reappeared 32 -malfitano 12 -forsythe 12 -swirly 32 -accordance 32 -maladies 32 -whr 8 -share-trading 32 -mature 32 -first-ever 32 -up-and-coming 32 -specialization 32 -senate 14 -pretty 32 -electric 110 -programs 36 -solids 32 -fined 32 -angolan 6 -showed 36 -ramada 12 -unseated 32 -yen-dollar 32 -constituencies 32 -frontier 36 -counts 96 -applause 32 -watch 38 -straits 4 -thou 4 -bare 2 -fines 32 -syntech 14 -introduction 38 -ceiling 36 -oreffice 8 -rye 4 -light-years 32 -westin 6 -ovens 32 -shares 46 -giveaway 32 -laura 4 -large-denomination 32 -small-time 32 -provisional 32 -copyrights 32 -individuals 34 -vs. 
32 -toes 32 -safe 32 -shared 32 -congregations 32 -rubel 8 -discourage 32 -peking 8 -fiefdom 32 -diversifications 32 -unraveling 32 -luxury 32 -retrench 32 -aircraft 100 -bistro 32 -rubes 4 -search 44 -dive 32 -mercurial 32 -endorsing 32 -scotland 4 -wrested 32 -madonna-meets-the-duchess-of-windsor 4 -lachenbruch 4 -judd 4 -problems 40 -hynes 12 -eurodollar 14 -vortmann 8 -x-%chg 32 -retooled 32 -tinged 32 -illegality 32 -redesignation 32 -polaroid 14 -soul 32 -refusal 32 -silently 32 -motivated 32 -tailored 32 -arrives 32 -herwitz 4 -sour 32 -dimensionless 32 -soup 36 -aaron 4 -muller 12 -hear 32 -post-it 4 -lawmakers 34 -dense 32 -basing 36 -truck-assembly 32 -daisy 4 -boldly 32 -delays 40 -termed 32 -brokerage 98 -christian 4 -controversies 32 -comic 32 -kuan 4 -usair 14 -cline 8 -crankshaft 32 -shamrock 14 -compensate 32 -salamon 4 -minicomputers 32 -contradictory 32 -shedding 32 -provincial 32 -lear-siegler 4 -cooker 32 -handkerchief 32 -rel 12 -located 32 -explain 32 -genenchem 4 -controllers 32 -reactivated 32 -boards 32 -untraveled 32 -devastating 32 -renamed 32 -microscopic 32 -swedish 14 -cheese 32 -newsman 32 -inexpensive 32 -rescue 32 -royce 4 -seidman 12 -pathology 32 -mechanicsburg 8 -torrid 32 -inescapable 32 -northeast 36 -shattered 32 -emerson 14 -honeywell 6 -assure 32 -anglican 4 -springfield 4 -panasonic 4 -chamberlain 12 -midcon 4 -friend 32 -less-powerful 32 -appraisal 36 -void 32 -strides 32 -large 36 -fruits 32 -holiday-shortened 32 -non-voting 32 -overheated 32 -ricupero 4 -emporia 12 -unfriendly 40 -advances 38 -estimates 96 -breaded 32 -seattle-based 4 -fugitive 32 -estimated 34 -advanced 46 -furthering 32 -wellington 8 -litan 8 -captured 32 -educators 32 -hydraulic 32 -countries 36 -westinghouse 4 -manages 32 -manager 96 -genicom 6 -detectors 32 -needn 32 -environments 32 -formula 32 -inspectors 32 -replenished 4 -tiaa-cref 4 -diminutive 32 -element 32 -midday 32 -contractor 32 -fantastic 32 -public-works 32 -negotiator 32 -preface 32 -throes 32 -marines 4 -privatization 32 -salute 32 -rodeo 36 -james 14 -associations 32 -spielberg 12 -limbo 32 -exercising 32 -chalmers 4 -hollis 8 -placement 36 -appeasing 32 -smiths 4 -fanfare 32 -wry 32 -roosevelt 4 -butcher 36 -punjab 4 -iroquois 4 -billy 4 -humphrey 4 -foolproof 32 -incorrect 32 -gadhafi 12 -mess 32 -housewife 32 -income-producing 32 -language 32 -textron 10 -newly 32 -pervading 32 -toiling 32 -westamerica 14 -post 36 -pair 32 -kwon 8 -disagreeable 32 -subsides 32 -gangster 32 -foreseeable 32 -exiting 32 -hides 32 -scrupulous 32 -myself 32 -pose 36 -spook 32 -delicacy 32 -toxic-waste 32 -graceful 32 -oversaw 32 -pasadena 12 -phenomenal 32 -satirist 32 -gallery 36 -erase 32 -parent 100 -morella 4 -liquidation 32 -severance 32 -casually 32 -conduits 32 -shortening 32 -fallacies 32 -practices 36 -filming 32 -lady 36 -tags 32 -dictatorial 32 -breakers 32 -calif. 
4 -dubroc 4 -sponsor 32 -tunnel 32 -occurring 32 -afterward 32 -verdict 32 -breakfast 36 -damages 32 -danse 32 -tenancy 32 -positioned 32 -massaged 32 -course 32 -who 108 -fortress 32 -content 32 -adamant 32 -constituting 32 -yuppie 36 -industrial-talc 32 -erich 8 -enforces 32 -practicing 32 -bookkeeping 32 -fired 32 -overcall 32 -commissars 32 -fraudulent 32 -goal 36 -loynd 8 -unit-trust 32 -supervisor 32 -striking 34 -variation 32 -goat 32 -macedo 4 -enforced 32 -promised 32 -symbols 32 -defuse 32 -psychiatrists 32 -courier-journal 8 -therapy 32 -sandwich 32 -schwartz 4 -hinton 4 -guyford 8 -ian 4 -suffering 32 -wrongful-discharge 32 -hubs 32 -redoute 4 -contingency 32 -presidente 4 -calculated 32 -thin 32 -chiles 12 -semiconductor-manufacturing 32 -sick 12 -bucks 32 -this 110 -wrongdoers 32 -wilfred 4 -bulging 32 -glasnost 2 -baritone 32 -hoover-dempsey 4 -emergence 32 -responses 32 -baltimore-washington 8 -flag 32 -recommendations 32 -institutional 102 -fca 4 -union-busting 32 -fires 64 -schroder 4 -crazy 38 -rudin 4 -cologne 4 -flak 32 -mom-and-pop 32 -getting 38 -flat 36 -provoking 32 -barring 36 -craze 32 -intuitively 32 -optional 32 -crickets 32 -target-company 32 -establishes 32 -defrauded 32 -transaction 36 -threatening 32 -feeling 32 -blossoms 32 -southwestern 32 -unjustified 32 -freeman 12 -alexandria 4 -beginnings 32 -polonsky 4 -affiliate 32 -nonpublic 32 -unlikely 32 -cherry 8 -kotlowitz 4 -flamboyant 32 -scoring 32 -passages 32 -all-knowing 32 -t.fcy 4 -protests 32 -ashore 32 -loewi 4 -character 32 -neutralist 32 -silversmiths 32 -impedes 32 -chillicothe 4 -wage-price 32 -bovine 32 -pa. 4 -chromed 32 -plus 36 -insurer 36 -ambiguous 32 -non-financial 32 -maybe 38 -turnout 34 -delta 14 -joints 32 -muscle 32 -sure-fire 32 -blunt 36 -nursery 4 -air-cargo 32 -spoiling 32 -taxpayers 36 -lt 8 -overregulation 32 -trident 4 -bright 32 -drastic 32 -usable 32 -hazardous 32 -atico 14 -begin 96 -chaotic 32 -le 34 -different 40 -specially 32 -multibillion-dollar 32 -deregulating 36 -steering 32 -provider 32 -lo 32 -abbreviation 32 -waste-water 32 -staley 8 -oiler 32 -pac 4 -prodigious 32 -mammoth 36 -kaplan 12 -assisting 32 -horse-racing 32 -doldrums 36 -dicey 32 -theory 32 -car-assembly 96 -big-college 32 -pat 4 -pas 4 -par 32 -funny 32 -pap 4 -question 32 -cook 4 -pleasurable 32 -pay 36 -supermerchant 32 -dissemination 32 -thrilled 32 -poland 6 -trading 46 -misunderstood 4 -coupled 42 -generators 36 -ambulatory 32 -currency-exchange 32 -thriller 32 -vegetable 32 -now-dormant 32 -merchant-banking 32 -various 34 -eye 32 -lawsuits 38 -seneca 4 -superannuation 4 -purposes 32 -reimposition 32 -unleashing 32 -bodies 32 -rouged 32 -f-100 4 -grieux 4 -deliberate 32 -passing 32 -quirky 32 -looming 32 -baldrige 4 -drums 32 -meritless 32 -cnn 4 -compulsory 32 -collectively 32 -falconbridge 4 -runoff 32 -yelled 32 -hormone-treated 32 -bros 4 -lindley 4 -laureate 32 -victory 32 -checking 32 -match 32 -action-forcing 32 -well-capitalized 32 -recent 42 -sanford 12 -arrested 38 -hand-wringing 32 -burry 8 -frantic 32 -residents 32 -fun 36 -pared 4 -residential 34 -neronian 4 -rko 4 -capture 32 -deadlines 32 -clients 36 -fourteen 2 -testified 32 -redeem 36 -pre-trial 32 -hunt 44 -have 100 -simultaneously 32 -testifies 32 -baked 32 -sagging 32 -iraqis 4 -one-cent-a-share 32 -prevost 4 -deux 4 -mestizo 32 -rail-car 32 -rican 4 -breasts 32 -abington 4 -outdo 32 -generals 32 -staffer 32 -sumitomo 4 -savvy 32 -still 42 -gracefully 32 -contributing 34 -rather 38 -negatives 32 -rewritten 32 -public-sector 32 
-agency 46 -nicest 32 -players 36 -wabc-tv 4 -inconclusive 32 -expected 36 -oresman 8 -obliquely 32 -innovation 32 -csx 14 -northrup 4 -getty 12 -calny 14 -dryden 8 -stimulants 32 -purcell 4 -near-infrared-spectroscopy 32 -brings 32 -descendant 32 -rifadin 4 -forth 32 -treats 32 -f.j. 4 -dirks 4 -cen 8 -treaty 36 -weekly-average 32 -roughly 32 -demonstration 32 -expense-reduction 32 -mcgaw 4 -montera 4 -maneuver 32 -rethink 32 -malted 32 -overshot 32 -mulroney 12 -prolong 32 -educate 32 -wars 36 -realities 32 -entrance 32 -discoloring 32 -wary 40 -of 126 -mcgegan 4 -enlightened 32 -ward 44 -water 36 -research 102 -clemens 8 -hedging 32 -warm 32 -warn 32 -montero 4 -home-mortgage 32 -contracted 40 -baer 4 -mamelodi 4 -reproductive 32 -pastel 32 -males 32 -pledging 8 -asian-based 4 -descending 32 -entrepeneurial 32 -walker 4 -poindexter 12 -computer 38 -on 110 -zealously 32 -portion 32 -seized 32 -plastic 32 -nashua 12 -indisputable 32 -uncomfortably 32 -coordinated 32 -promotes 32 -promoter 32 -soldering 36 -exceptional 32 -sell 36 -scrawled 32 -uncomfortable 32 -michael 14 -stockyards 4 -gorki 4 -confections 32 -keynes 8 -attack 40 -attach 32 -sgic 4 -intentionally 32 -alexanders 4 -prohibited 32 -plunges 4 -ashton 4 -bridesmaid 32 -under 110 -constructed 32 -fire 36 -suspense 32 -pool 32 -nabisco 4 -useful 32 -plunged 36 -involving 44 -atlanta 14 -belding 4 -tact 32 -review 44 -lessened 32 -chases 32 -rosenfeld 4 -proliferating 32 -tent-meeting 32 -establishments 32 -response 32 -went 96 -discotheques 32 -vehicle-leasing 40 -private 34 -skimp 32 -preventive 36 -flurry 32 -floyd 6 -sir 46 -chased 32 -cruise 36 -distributing 32 -wyomissing 8 -change 36 -wallace 12 -convertibles 4 -educational 36 -yacht 4 -six-to 32 -sayad 4 -arnold 14 -publicizing 32 -weathered 32 -levittowns 4 -swamp 32 -practical 32 -customer 32 -tortuously 32 -crematories 32 -julius 6 -anthropologists 32 -reagan-proposed 4 -downsizing 32 -less-upholstered 32 -disgrace 40 -arrival 32 -nonferrous 32 -public-broadcasting 32 -speak 32 -tripled 32 -contemporary 32 -dilute 32 -genres 32 -maintains 36 -spear 32 -theatricality 32 -thicker 32 -distinguish 32 -vow 32 -spoke 32 -seems 32 -ky.-based 4 -once-thriving 32 -economists 46 -independent 34 -employee-benefits 32 -curacao-registered 4 -buchanan 12 -ailing 36 -ef-18a 4 -payoffs 32 -closet 32 -washington 14 -lipman 4 -filthy 4 -fine-tuned 32 -tomihiro 4 -aryan 4 -kong-based 4 -representative 36 -dubbed 32 -upham 4 -missions 32 -execute 32 -leakage 32 -neighbor 32 -renters 32 -discriminatory 32 -general-purpose 32 -atlasair 4 -pharmacy 4 -nine 38 -smoke 32 -colony 32 -nassau 4 -controversial 32 -external 32 -leanest 32 -protest 36 -unannounced 32 -covert 36 -worldwide 36 -confesses 32 -pacemaker 96 -pullman 12 -ecology 4 -likens 32 -lawsuit 32 -prognosticators 32 -brown-forman 4 -palestine 4 -conclusive 32 -distiller 32 -stereos 32 -hampshire 4 -lock 32 -missouri 4 -consciences 32 -reaction 32 -project 36 -c.v. 4 -counterpoint 32 -dependent 32 -roux 12 -macintyre 4 -clyde 4 -etched 32 -keep 36 -fresno 4 -choosing 32 -atom 36 -arson 32 -installing 32 -tycoons 32 -directors-and-officers 32 -possessed 32 -nsc-directed 4 -upscale 32 -c. 
6 -co-head 32 -symmetrical 32 -thursday/sunday 4 -garcia 8 -municipal 36 -actors 32 -channell 12 -likenot 32 -perspective 36 -yuppies 32 -botlek 4 -two-year 32 -apartheid 32 -laying 32 -debacle 32 -major-league 32 -graft-vs.-host 32 -alumni 32 -cropping 32 -midrange 32 -richard 14 -left-wing 32 -intensive 32 -causes 32 -cq 8 -worker 36 -suskind 12 -e-6a 4 -call 34 -neatly 32 -cz 8 -mid-1960s 32 -berth 32 -co-artistic 32 -oriental 4 -exotic 32 -worked 32 -ms. 14 -believing 32 -parry 8 -agenda 32 -cl 8 -well-received 32 -filled 32 -revision 32 -setting 32 -majorities 32 -complications 32 -compounded 32 -thiokol 4 -molders 6 -diving 34 -pmsc 4 -activator 32 -following 42 -door-to-door-delivery 32 -rekindle 32 -deterrent 32 -jurors 32 -i`ve 4 -wpob 4 -nasa 2 -downey 4 -much-larger 32 -partners 36 -inhabit 32 -pandora 4 -caused 32 -p.a.m. 4 -henry 14 -marred 32 -golden 44 -arbitrage 36 -inflict 32 -acquiescent 32 -orben 4 -ginning 32 -tuxedos 32 -pre-dated 32 -bedford 12 -deteriorated 32 -fortifying 32 -midgets 32 -b.a.it 8 -hagan 8 -pessimism 32 -nonexistent 32 -bottlers 32 -death 38 -checkers 4 -transportations 4 -arrange 32 -simi 4 -portland 14 -fiber 32 -baucus 12 -flindt 4 -recorded 32 -alasdair 4 -black-and-white 32 -gutzwiller 8 -ratio 32 -ogorodnikov 4 -counting 34 -arriving 32 -elimination 32 -underwrite 32 -drinks 32 -determine 32 -curtailment 32 -recorder 32 -money-market 4 -israelis 4 -osha 4 -glazer 12 -cash 38 -high-tech 96 -consensus 32 -lurked 32 -mitsubishi 4 -cycle 32 -weston 4 -drain 32 -stirred 32 -winegardner 12 -wonders 32 -bollinger 10 -concerned 32 -anti-japanese 32 -vanoff 4 -purchasers 34 -toga 32 -cultural 32 -shipbuilding 36 -philosophy 32 -masse 32 -financial-aid 32 -hayden 4 -priced 98 -societal 32 -elsinore 2 -continuous 32 -doorstep 32 -shirt 32 -ski 32 -miniseries 32 -malvern 4 -trained 34 -sporadic 32 -themselves 32 -valentine 12 -unsold 32 -goings-on 32 -disproportionate 32 -salvo 32 -raiders 32 -viewers 38 -blame 34 -corresponds 32 -eastbound 32 -enga 4 -bombs 32 -gameshow 8 -notified 96 -price-slashing 32 -things 34 -repeatedly 32 -robins 14 -tentatively 96 -linkup 6 -unknowable 32 -creditors 36 -iran 6 -kolber 8 -communications-based 32 -hammered 32 -gets 44 -succession 32 -angle 32 -cameras 38 -low-sulfur 32 -deandome 4 -service-company 32 -record-keeping 32 -caw 4 -leads 96 -snowy 32 -jarring 32 -bayh 4 -iraq 6 -untenable 32 -crows 32 -serves 32 -regular 40 -pre-payment 32 -fremantle 4 -norris 4 -minute 32 -jamie 2 -motives 32 -aids-treatment 4 -deny 32 -bradstreet 4 -elmwood 8 -ak-47 4 -tra 4 -non-oil 36 -out-of-court 32 -bs 8 -heir 32 -screen 36 -bt 4 -two-thirds 32 -separation 32 -by 126 -nondurable-goods 32 -benched 32 -economist 40 -tumbling 32 -mile 36 -bb 4 -ba 12 -bg 4 -be 108 -satisfaction 32 -bk 8 -mill 36 -leased 32 -interstate 46 -bn 8 -milk 32 -hve 8 -clever 32 -monarchs 32 -plumbing 32 -sticker 32 -friction 32 -pro-rata 32 -est 12 -segundo 4 -mckanic 4 -co-op 32 -flexi-van 4 -sending 36 -anti-climactic 32 -vatican 4 -optic 32 -partly 34 -controversy 32 -regrouping 32 -brio 8 -orbiting 32 -supporters 96 -shafts 32 -take 102 -per-minute 32 -slight 32 -minds 32 -sandinista 4 -rim 4 -rio 12 -car-parts 32 -vastness 32 -undetermined 32 -mcmxxv 4 -radio-television 4 -abalone 38 -confirming 32 -kerley 8 -beaver 4 -standstill 32 -distribute 32 -raburn 4 -arms-sales 32 -city-council 32 -crummy 32 -accidental 32 -space 36 -panicky 32 -bethlehem 12 -credit-research 32 -prime 38 -reimposed 32 -slash 32 -exhibitions 32 -pipeline 32 -woodville 8 
-splintered 32 -monopoles 34 -representations 32 -mid-march 32 -best-managed 32 -movements 32 -third-largest 32 -binding 32 -private-property 32 -dislocated 32 -awarded 32 -everyday 32 -hoyt 4 -universities 96 -mortgage 36 -natan 4 -palates 32 -drilling 36 -additive 32 -incremental 32 -boozer 32 -driven 32 -puat 4 -federalism 32 -argue 32 -enrile 4 -mouths 32 -pennzoil 14 -they 46 -portable 44 -regulate 32 -doors 32 -minichain 32 -females 34 -archaic 32 -gloom 4 -stalked 32 -eight-month-old 32 -them 32 -then 126 -setbacks 32 -shoes 32 -orderly 32 -maghdousheh 4 -blond 34 -director-generalship 32 -culver 4 -protectionism 32 -merged 32 -positive 32 -trifle 32 -prophets 32 -underpaid 32 -chevrefeuille 4 -clarinetist 32 -capability 32 -lobbyists 32 -negligible 32 -persist 32 -perversely 32 -yardstick 32 -sacco 4 -classics 32 -deserve 32 -guest 32 -bacha 12 -muscular 32 -foldessy 8 -development 36 -freezes 32 -assistants 32 -cardholder 4 -imagery 32 -introduce 36 -andre 6 -stopping 34 -thick 32 -brackets 32 -asics 4 -arrangement 32 -dominion 14 -desks 32 -imposition 96 -slope 4 -escrow 32 -renault 8 -credibility 32 -superintendents 32 -brilliant 32 -zink 12 -measured 32 -blessing 32 -sent 32 -diego-based 4 -keating 12 -darkly 32 -doctor 32 -coins 32 -apple 6 -relish 32 -organizational 32 -speculates 32 -confection 32 -haunting 32 -mistake 32 -send 32 -pre-agreement 32 -uranium 32 -substitutes 32 -compulsion 32 -cathy 4 -rebellion 32 -trustee 32 -moderately 32 -esther 4 -kaye 2 -details 34 -motors 36 -low-margin 32 -knowledgeable 32 -occidental 2 -increasingly 38 -engage 32 -infectious 36 -twist 32 -jewel 32 -ivory-tower 2 -liberal 36 -distinctly 32 -enjoined 32 -irritating 32 -backdrop 32 -elementary 32 -entire 32 -towing 32 -database 32 -farce 32 -coordinate 32 -hubert 4 -cattle-on-feed 32 -tends 32 -sen. 6 -karen 4 -tests 46 -spinoff 36 -soundness 32 -benton 8 -seven-page 32 -bored 32 -low-light 32 -morton 14 -looks 44 -whittaker 6 -jawbone 32 -innovative 32 -hunkered 32 -horrified 32 -invested 32 -movement 36 -gutsy 32 -investigators 46 -elgie 4 -unforgivable 32 -escalating 32 -dimona 4 -resale 4 -quicker 32 -terse 32 -sara 8 -exterior 32 -amalgam 32 -nomination 40 -debts 36 -chisholm 12 -godiva 4 -substituted 32 -risk-taking 32 -ci 8 -iosif 4 -walhalla 4 -selas 12 -saliva 36 -dolls 32 -ardently 32 -diaper 32 -negotiations 32 -over-the-counter 32 -stakes 36 -dolly 4 -dreaded 32 -created 34 -defend 36 -accountant 32 -anticipating 34 -wilmette 4 -perceptible 32 -fhp 4 -shelter 32 -extracting 32 -aired 32 -alleging 32 -four-way 32 -detailing 32 -subdued 32 -airs 32 -popularize 32 -shopping 36 -relatively 32 -bowing 32 -cosmetic 32 -motels 32 -vied 32 -baseless 32 -brash 32 -porter 4 -redefine 32 -embraced 32 -gatherings 32 -b.f. 
4 -fallacy 32 -grain 32 -asbestos-related 32 -gourmets 36 -pcs 4 -rectify 32 -cro-magnons 4 -embittered 32 -reinterpretation 32 -stockholder 32 -funded 32 -governments 32 -legitimize 32 -mordant 32 -appeared 32 -franklin 12 -ambivalence 32 -pave 32 -infants 32 -thrifts 40 -surfacing 32 -niches 32 -routine 32 -routing 32 -ribaminol 34 -refined 32 -turn 32 -victims 36 -essays 32 -issuance 32 -earings 32 -hostile 40 -turf 32 -landmarks 32 -launch 32 -financial-futures 32 -repeated 34 -early 106 -concentrates 32 -picked 32 -overexposure 32 -properly 32 -caramels 32 -likely 36 -simian 32 -lebanon 4 -restore 32 -spear-throwing 32 -lust 32 -depth 32 -mentor 4 -concentrated 32 -germ-warfare 32 -chess 32 -read 44 -chest 32 -ern 4 -surge 36 -bachelor 32 -real 46 -refrain 32 -rows 32 -innkeepers 32 -ya 32 -oestreich 12 -n.d. 4 -reap 32 -liggio 4 -investigation 100 -firmed 32 -anti-sandinista 32 -cane 36 -dail 4 -voting 36 -trudeau 4 -admirers 32 -scintillation 32 -inmates 32 -relinquish 32 -crack 32 -erg 4 -vilify 32 -higgins 4 -gladly 32 -technologic 4 -mothers 32 -intensity 32 -seven-state 32 -blocked 32 -maximizing 32 -teri 6 -unacceptable 32 -trend 32 -bombers 32 -repulsed 32 -micheal 2 -subscription 32 -dorfman 8 -oh-god-if-i-pull-this-one-off-i 4 -lest 32 -less 38 -cocolat 4 -numbering 32 -non-technical 32 -tourist-drawing 32 -post-colonial 32 -gesture 32 -developed-country 32 -organizes 32 -organizer 32 -lesa 4 -strip-searched 32 -quotes 32 -resolution 34 -renewed 32 -gambler 32 -lines 44 -first-rate 32 -four-wheel-drive 32 -gratitude 34 -massoud 4 -gambled 4 -free-wheeling 32 -executes 32 -quoted 32 -targeted 32 -wasteful 32 -fiery 32 -cops 32 -survey 36 -abuses 32 -parkway 4 -exam-preparation 32 -successes 36 -attendants 32 -windfalls 32 -brace 4 -personal-income 32 -mind 32 -bcl 8 -hoffmann 4 -health-products 32 -bcw 8 -bcv 8 -constituents 32 -duty 100 -cephalosporin 32 -cambridge 4 -bcy 4 -info 4 -brushed 32 -moderate 32 -x 12 -cinematic 32 -hobbies 32 -airing 32 -default 32 -hambrecht 4 -conservancy 32 -alix 12 -five-point 32 -terror 2 -south 44 -breeds 32 -bailed 32 -atuwi 4 -quarterlies 32 -procedural 32 -heirs 32 -psychological 32 -distinctive 32 -likes 32 -england-based 4 -alive 32 -teneff 12 -liked 32 -one-man 32 -fixed-income 2 -unfilled 32 -outerspace 32 -stock-index 32 -ediger 8 -criminal-sentencing 32 -lumber 102 -dismissals 32 -obsessive 32 -primarily 32 -benito 4 -ships 32 -econometric-forecasting 32 -occupation 32 -downturns 32 -luftwaffe 4 -kearney 12 -macmillan 14 -deployed 32 -achievement 36 -fertility 32 -photovoltaic 32 -bolling 4 -actually 38 -baseball 32 -strife 32 -reports 110 -magazines 36 -darwinism 4 -bolsters 32 -verge 32 -manuverings 32 -dial 32 -conspiring 32 -refuses 36 -throw 32 -bailey 4 -factions 36 -processes 32 -checks 36 -rabbits 4 -enex 10 -inhalable 32 -fending 32 -rah 12 -stalling 32 -fleets 32 -handle 32 -mailings 32 -join 36 -eastern 46 -unhappy 40 -accelerate 32 -witching 32 -reference 36 -belove 4 -disparities 32 -redford 4 -marshall 2 -toyoy 4 -frederick 14 -main 36 -fallout 32 -mail 36 -dollar-holders 32 -mair 8 -invaders 32 -nutrition 32 -myg 8 -outgrowths 32 -marshals 96 -payout 44 -comprehend 32 -precipitating 32 -collections 32 -antipathy 32 -cost-effective 32 -testifying 2 -wonderful 32 -quietly 32 -tbs 8 -philhellene 32 -provocative 32 -npt 8 -teachers 46 -honesty 32 -wildlife 4 -typically 34 -punishing 32 -gilgore 12 -capital-intensive 32 -thrombolytic 32 -scattered 32 -wide-body 32 -import 32 -squads 32 -sunny 32 -dillard 4 -obstruction 36 
-ieoc 6 -shell 32 -dangerously 32 -free-trade 32 -sluggish 34 -all-cash 32 -asked 98 -molds 32 -downtown 32 -function 36 -heaven 32 -slack-jawed 32 -deal-making 32 -blood-clot 32 -housewives 32 -galactic 32 -normal 36 -presently 32 -batteries 32 -svb 4 -nevada-las 4 -significant 32 -briefcases 32 -shies 32 -cocktail 32 -tenderness 32 -redeemable 32 -unwieldy 32 -hog 34 -contacted 32 -based 46 -cherished 32 -teicher 4 -february 4 -extraparliamentary 32 -bases 32 -acquisition-minded 32 -immediately 32 -rides 32 -takeover 44 -midtown 32 -endorse 32 -notations 32 -hovers 32 -alluding 32 -sachs 4 -fool 36 -relating 32 -advisers 36 -ic 14 -sachy 8 -white 46 -food 46 -noncaloric 32 -squashed 32 -racking 32 -earnings 46 -impressive 32 -wriv 4 -warring 4 -dinner 44 -karachi 4 -drafting 32 -swinging 32 -frankfurt 12 -protections 32 -lachica 4 -lovable 32 -isaacs 8 -evident 32 -largest-selling 32 -eliminated 32 -condom 32 -camino 4 -individually 32 -december 14 -locate 32 -worsened 32 -building 38 -strife-torn 32 -negotiators 32 -composition 32 -samsung 4 -modify 32 -retailing 40 -truly 32 -photocopier 32 -year-end 34 -consumer 46 -manufacturers 110 -market-by-market 32 -owe 32 -benchmark 32 -superior 36 -production-quota 32 -three-judge 32 -containing 32 -consumed 32 -rover 6 -warrant 96 -abuse 32 -gobain 4 -broadcasting 38 -overstylized 32 -oblivious 32 -higher 38 -americano 4 -two-week 32 -pugh 8 -dalkon 4 -imsi 4 -atkinson 4 -ambj 8 -iq 4 -airbus 6 -rated 32 -differing 32 -profession 32 -alleviate 32 -americans 4 -otter 32 -starring 32 -grass-roots 32 -throwing 32 -covey 32 -drams 4 -crzy 8 -cholesterol-lowering 32 -forestalls 4 -cover 36 -guideline 32 -announces 32 -develop 32 -erroneous 32 -sesit 8 -drama 32 -wessel 4 -irresponsible 32 -champ 32 -basics 32 -unionizing 32 -festering 32 -profit-taking 36 -ether 32 -everybody 38 -t3100 4 -bailly 4 -shiraz 4 -soonest 32 -radioactivity 32 -inaccuracies 32 -houses 32 -crucial 36 -archibald 4 -deducting 32 -stride 32 -readable 32 -mason 14 -disappointment 32 -namely 2 -largest 100 -housed 32 -cameron 4 -opposing 32 -circulating 32 -gorbachev 14 -startled 32 -cptc 4 -videotape 32 -blair 12 -agreed-upon 32 -consolidations 32 -agreeing 32 -tribute 32 -justin 4 -terrence 4 -antonio 4 -antonin 4 -government-owned 32 -three-member 32 -blacklisting 32 -difficulties 32 -accountability 32 -bothering 32 -advent 32 -leggett 8 -uncriticized 32 -re-regulating 32 -anastasio 4 -substantial 32 -mcgraw-hill 6 -notify 32 -appearing 32 -fate 32 -classmate 32 -brighter 36 -partial 32 -reliability 32 -entities 32 -emotions 32 -financiere 14 -sneak 32 -home-equity 34 -companions 32 -bechtel 4 -answered 32 -vern 4 -stalin 4 -discomforts 32 -keeping 38 -kapor 12 -fold 32 -pullouts 32 -purer 32 -drivers 32 -perron 4 -anti-managua 32 -unemployment 46 -netx 4 -reevaluate 32 -ness 4 -very 32 -christensen 14 -coronation 36 -uncover 32 -alberto 4 -untaxed 32 -logicon 4 -rakoff 8 -headline 32 -fund-raisers 32 -especially 34 -salt 44 -debra 4 -spreadsheets 36 -roland 4 -clevite 12 -fulfill 32 -thorny 32 -plo 4 -subway 32 -beast 32 -tightened 32 -hilton 6 -sale 44 -resource 36 -downturn 32 -reassessing 32 -popularity 32 -guides 32 -conference 36 -speaks 32 -greg 4 -forum 46 -elite 36 -unfunded 32 -guided 32 -communicate 32 -could 108 -alltime 32 -saddle 36 -historical 32 -patricia 6 -grew 40 -orion 14 -activities 36 -conceivably 32 -committed 32 -committee 38 -motion-picture 32 -bundesbank 4 -negotiates 32 -tark 4 -referees 32 -mets 8 -wrote 36 -listening 32 -ruling 44 -lefler 8 
[serialized resource data omitted: a long, line-wrapped run of word → integer-flag pairs (e.g. "manufacturing 108", "upkeep 32", "miller 14"), which appears to be a pretrained tokenizer/tagger lexicon bundled with the repository's resource files; the raw dump is not human-readable and is elided here]
32 -diagnosed 32 -n. 4 -regimes 32 -colt 6 -ship-to-stock 32 -shocked 32 -gruenberg 4 -frenzy 32 -met-coil 2 -courthouse 32 -twilight 32 -tishman 4 -salisbury 4 -annuity 36 -composing 32 -usx 12 -guenther 4 -rock 38 -occasions 32 -challengers 34 -usa 12 -r.a. 2 -envoy 32 -pre-school 32 -speculator 32 -kip 4 -earners 32 -harnessing 32 -col. 14 -remained 32 -frivolous 32 -cares 36 -provision 32 -decay 32 -homely 2 -impressed 32 -five-member 32 -un-american 4 -antwerp 4 -discovered 32 -massacre 36 -greek-turkish 4 -prepares 32 -demand 102 -bicycle 32 -nws 4 -injury 32 -deregulated 32 -measles 32 -deposit-taking 32 -conditions 32 -hansen 12 -subpoenas 32 -mid-1950s 32 -deferred-compensation 32 -quickly 36 -kan. 4 -thermal 32 -prescription 32 -avant-garde 32 -bmy 4 -tax-driven 32 -assent 32 -none 36 -prediction 32 -development-park 32 -regime 32 -wait 32 -sitting 34 -wouldn 38 -resents 32 -cfp 4 -consented 32 -wrestlers 32 -bonner 4 -signficant 32 -glitches 32 -double 32 -cries 32 -vonder 8 -originality 32 -amps 4 -separately 42 -mechanical 32 -names 44 -martine 4 -shipping 34 -ports 32 -diversions 4 -one-hour 32 -lebanese 6 -councils 32 -expropriation 32 -showing 32 -educating 32 -flourishes 32 -recital 32 -fuehrer 4 -omitted 100 -crushed 32 -tito 4 -jung/brannen 4 -unfolds 32 -stated 32 -stearns 12 -flexibility 32 -rule 36 -denounce 32 -hoffmann-la 4 -carrots 32 -heartland 36 -conjuring 32 -sankey 8 -volatility 32 -twice 36 -niskanen 4 -columbia 4 -stringent 32 -boulevard 6 -reinventing 32 -treasured 32 -violent 32 -ranking 32 -pretax 32 -failing 32 -coaching 32 -herman 4 -named 102 -detainee 32 -two-way 36 -fundamentalists 32 -urgent-care 32 -naysayers 32 -covertly 32 -desert 36 -guesses 32 -spectacle 40 -disembowel 32 -boardwalk 32 -syndicated 32 -unsealed 32 -rubicam 4 -sucked 32 -consolidating 32 -pulling 32 -robert 14 -compression 32 -resolving 32 -totaled 40 -miniscribe 6 -electrons 32 -syndicates 32 -addition 32 -regulating 32 -fowler 12 -center 36 -remarks 32 -worthless 32 -autumn 32 -nu 8 -payments 36 -s&p 14 -rally 36 -prevails 4 -gastropod 32 -sacrificed 32 -kb 4 -lehigh 6 -dorin 4 -lumbermen 36 -viacom 14 -dunce 32 -friends 38 -marietta 4 -breeders 32 -govett 4 -furnishing 32 -doomed 32 -heng 8 -crystal 32 -industrywide 32 -parcels 32 -luxembourg 4 -doilies 32 -austerity 32 -retention 32 -conglomerates 32 -touchstones 32 -fringes 32 -harassment 32 -sprint 4 -stemming 32 -pistol 32 -crates 32 -appalling 32 -teamed 32 -detention 32 -insulation 32 -chipmakers 36 -legions 32 -gridlock 32 -spring 36 -laughlin 4 -pzl 4 -disappointed 32 -strength 32 -autonomously 32 -vaux 4 -winchester 4 -abrahamson 12 -francesc 4 -purveyors 32 -tel 4 -lamboley 4 -special-interest 32 -burlington 4 -ill-considered 32 -sergeant 32 -device 36 -a-6e 4 -copper 36 -litigated 32 -omcm 4 -cuisine 32 -languages 32 -selections 4 -target 44 -calls 100 -prevailing 32 -kagan 4 -dealings 32 -composite 36 -epilepsy 36 -assitance 32 -cd-v. 
32 -tick 32 -underfinanced 32 -introduces 32 -grasping 32 -communications 46 -hub 36 -experiences 32 -fruit 36 -divided 32 -east-west 36 -suitable 32 -abdication 32 -indict 32 -fascinating 32 -designed 32 -caesarean 4 -strong 46 -ingested 32 -inched 32 -districts 32 -demonstrations 32 -asian-americans 4 -supervised 32 -designer 32 -components 32 -mansfield 4 -mll 8 -supervises 32 -business 110 -e-flats 4 -inches 32 -mater 36 -transportation 46 -billion-plus 32 -altogether 40 -tcr 8 -glasses 32 -injustices 32 -fascinated 32 -disagree 32 -gtch 8 -calculate 32 -doll 4 -entertainers 32 -valuing 32 -avionics 32 -mnco 12 -scarcity 4 -boosted 96 -profit-margin 32 -contadora 4 -omega 4 -evoking 32 -harassed 32 -missile-defense 32 -scholarly 32 -gallon 32 -yield-driven 32 -sisb 4 -horses 36 -surrender 32 -reunify 32 -displeasure 32 -wgn 4 -slowdown 32 -emphasizing 32 -observing 32 -lend 32 -aided 36 -fling 32 -fair 36 -chambers 32 -monroe 4 -joseph 14 -aiden 2 -sharon 14 -celebrities 32 -b 44 -stability 32 -aides 38 -expressing 32 -brunei 14 -flint 36 -cathleen 2 -occurrence 32 -explanations 32 -citicorp 6 -suppose 32 -ecuador 4 -montgomery 12 -publications 36 -cpas 4 -seller 32 -microbes 32 -power-train 32 -computers 108 -lasser 12 -bollenbacher 4 -divers 32 -marine 38 -falsgraf 4 -airmach 4 -rockwell 14 -leinberger 12 -technical 36 -a.s. 4 -boon 32 -emery 4 -lower 46 -legislative 32 -stiffer 32 -eurodollars 8 -momentum 32 -newman 4 -incentives 32 -r. 14 -nazionale 4 -hotels 36 -wurtman 12 -tax-act 32 -toyed 32 -privately 32 -macy 12 -exhausted 32 -m.d.c. 4 -reveres 32 -noses 32 -cherokee 4 -billiard 32 -remedy 32 -one-way 32 -tear 32 -jerrold 4 -kitchen 32 -norrett 4 -hassenfeld 8 -chairmen 32 -fiercely 32 -devaluing 32 -crnr 4 -ridge 4 -prudential-bache 4 -springs 36 -recycle 32 -team 32 -buddy 32 -took 96 -howorth 12 -sinking-fund 32 -sins 32 -and/or 32 -dreary 32 -camper 32 -robotics 36 -populated 32 -semiannual 32 -middle-income 32 -mosquito 4 -winnings 38 -toyco 4 -contest 32 -wed. 10 -widespread 32 -capabilities 32 -large-scale 32 -barricaded 32 -family-planning 32 -respects 32 -ginnie 14 -receding 32 -unrated 32 -telq 8 -succeeds 96 -contributors 34 -feiner 8 -non-german 32 -isotope 32 -checkup 32 -subtracts 32 -seizures 36 -sne 8 -screams 32 -venezuela 4 -controls 100 -public-relations 32 -thinks 32 -w.r. 2 -part 46 -oversubscription 32 -scenarios 32 -outcome 32 -attorney 108 -rare 36 -fled 32 -jailed 32 -self-interest 32 -brakeman 32 -catered 32 -comprises 96 -amity 4 -wove 32 -t.pwa 8 -singer 38 -batschari 12 -modernizing 32 -conflict 32 -mailroom 32 -dole 12 -oecd 12 -reaping 32 -memorial 36 -auto-focus 40 -cassandras 4 -cray 4 -justifies 32 -flee 32 -subjourneymen 32 -spark 32 -overweight 32 -perpetuating 32 -hamel 4 -justified 32 -entices 32 -advance 36 -kingpin 32 -spare 32 -bluth 4 -theodore 6 -inestimable 32 -f. 
4 -cliff 4 -junior 34 -distributors 36 -comedies 32 -understandings 32 -concede 32 -overwhelmingly 32 -tirelessly 32 -jointly 36 -efficient 32 -secret-payments 32 -tell 36 -rodgers 8 -wy 8 -kaul 4 -exists 32 -repairs 32 -battled 32 -personality 32 -liquidity 32 -seizure-prone 32 -refund 36 -pressured 32 -gears 32 -tianjin 4 -remanded 32 -readonly 32 -full-service 32 -disturbed 32 -receivables 32 -compete 32 -truce 32 -audio/video 12 -truck 36 -rudolfo 4 -pressures 40 -out 38 -kiev 4 -sa-7 4 -referee 32 -inherit 32 -rgc 8 -theater 36 -purchases 40 -greyerz 4 -compromise 32 -doubling 32 -ellen 6 -withstood 32 -kiel 4 -maclean 4 -premise 32 -traffic 36 -ring 44 -affidavits 32 -hours 32 -soybeans 36 -lowest-cost 32 -manifesto 32 -arrearages 32 -pioneering 32 -authoritatively 32 -quaintly 32 -unthinkable 32 -convertible-debt 32 -nose 32 -kendrick 4 -balk 32 -ball 44 -balm 32 -emerging 32 -kramer 14 -gmfanuc 10 -brewing 36 -questioned 32 -sneer 32 -targeting 32 -wider 36 -optimists 32 -shearson 14 -dedicated 32 -most-active 32 -boosting 32 -meeting-date 32 -removal 32 -proponent 32 -smaller 42 -opera 36 -temporarily 32 -portrayal 32 -down-and-out 32 -shrinking 32 -ruinous 32 -gilliam 4 -resubmit 32 -awe 32 -generosity 32 -fats 2 -blow 36 -guarini 8 -sonic 4 -censures 4 -recruiting 32 -road 36 -forbid 32 -syrian-backed 4 -souls 32 -headway 36 -dallas-based 4 -biologists 32 -publicist 32 -censured 32 -reprieve 32 -kahn 14 -described 32 -bed 32 -plutonium-producing 32 -baxter 14 -expansion-minded 32 -hewn 32 -aerobics 2 -facilitate 32 -replies 32 -recover 32 -tale 32 -revived 32 -raging 32 -hangars 40 -vault 32 -theatrical 32 -morrison 4 -rankles 32 -breathlessly 32 -armand 2 -image 32 -auditor 40 -trivial 32 -thermos 32 -udayan 4 -obscenity 32 -privee 36 -personal-recognizance 32 -redskin 4 -county 38 -trillion 32 -henkel 12 -wearing 32 -nightclub 32 -clinic 36 -freighter 32 -mordechai 4 -pulpit 32 -finally 34 -programmable 32 -business-type 32 -criticize 32 -bonuses 32 -cyl 8 -stud 36 -often-violent 32 -essay 32 -reuters 14 -renominated 32 -independently 32 -pocahontas 4 -leadership 32 -murphy 12 -advice 32 -comeback 36 -discriminate 32 -fourth-biggest 32 -relentlessly 32 -cm&m 4 -health-care 32 -subcommittee 36 -irancontra 4 -fractured 32 -bud 32 -brook 4 -victoria 4 -display 32 -foods 36 -alike 32 -wooded 32 -disbursed 32 -commrcl 8 -won 108 -woo 32 -properties 36 -sympathetic 32 -metallurgical 32 -although 46 -essential 32 -wooden 32 -buy 36 -govmt 12 -chicago 14 -schoolyard 32 -united 46 -higginbotham 4 -choose 32 -drifted 32 -light 36 -quick-delivery 8 -intermediate 36 -spotting 32 -concretely 32 -closing 32 -endearing 32 -whitney 4 -wachtell 4 -and 126 -resort 32 -spoofs 32 -stripping 32 -coalitions 32 -chair 32 -boosters 32 -bookseller 32 -intensified 32 -foundation 38 -marriage 32 -sakharov 4 -mujahedeen 4 -bank-holding 32 -yielded 32 -premiums 32 -escape 32 -weizhou 4 -lufthansa 6 -whose 36 -kalmus 12 -siege 32 -bank-affiliated 32 -merits 32 -parade 32 -merrill 14 -unify 36 -anglia 8 -loeb 4 -strawberry 32 -mind-set 32 -rotating 32 -raids 36 -lauderdale 12 -vending 32 -ex-military 32 -dune 32 -clothes 32 -bowman 32 -ousting 32 -allegedly 32 -medically 36 -receives 44 -neighbors 32 -moratorium 32 -bmws 4 -floppy 32 -retrovir 4 -halter 8 -continents 32 -allocating 32 -arraignment 32 -club 36 -govern 32 -japanese 14 -lytton 4 -unchanged 40 -uacia 8 -appears 36 -disgorge 32 -meetings 32 -ojay 8 -stipulation 32 -nrc 12 -however 46 -similar 32 -accumulating 32 -alphabet 32 -discount 44 -ruins 
32 -ojai 4 -shape 32 -proclaim 32 -foremen 32 -linden 4 -court-sanctioned 32 -wrongful 32 -irresistible 32 -circulation 32 -ax 32 -stopped 32 -stricharchuk 4 -require 32 -ar 4 -as 110 -ap 4 -skirt 32 -fumes 32 -at 110 -au 34 -inclination 32 -gabriel 4 -procedures 32 -an 46 -tamper 32 -al 6 -outer 32 -ab 4 -ac 4 -one-third 32 -ag 12 -ad 34 -hybrid 32 -pl 4 -hypocrisy 32 -foote 4 -amateurs 32 -unspeakable 32 -unrest 32 -since 110 -harvests 32 -aut 4 -spurted 32 -diagnosis 32 -kudos 32 -sex 32 -order-inflow 32 -beautifully 32 -workforce 32 -privilege 32 -remarkable 32 -meatpacking 32 -outboard 42 -piercing 32 -laurels 32 -self-regulatory 32 -yitzhak 4 -slides 32 -multiple-unit 32 -negatively 32 -derive 32 -veered 32 -requested 32 -a-340 4 -uniform 32 -bargain 36 -arched 32 -father 36 -interior 34 -rein 32 -overdose 32 -confiscatory 32 -chooses 32 -plummeted 32 -bernhard 4 -ex-spouse 4 -khoo-controlled 4 -hectic 32 -lawn-care 32 -basil 32 -demanding 32 -pirrie 8 -offender 32 -right 34 -obviously 36 -obscenely 32 -collins 4 -philip 14 -lesuer 8 -knowledge 40 -businesses-newspaper 32 -onus 32 -crusade 32 -marcom 12 -bltb 8 -colline 4 -saig 8 -dispatched 32 -offended 32 -inns 4 -consisted 32 -treasury 46 -chevrolet 12 -oppose 32 -organized 32 -rifles 32 -follow-up 32 -proposes 44 -first-generation 32 -decrease 32 -million 100 -counterpart 96 -expectation 32 -l 4 -antony 4 -margins 36 -proposed 104 -disclosure 38 -obscure 32 -dirt 32 -calumny 32 -gerald 6 -montfrooy 4 -scheme 36 -businessses 32 -sec 14 -parker 4 -ascent 32 -detect 36 -dire 32 -regulatory 36 -fake 32 -atomic 36 -emulate 32 -regulators 38 -bitterness 32 -ignore 34 -combining 32 -parked 32 -floundered 32 -induce 32 -chains 36 -gen. 14 -steadfast 32 -odd 32 -flamingos 32 -resign 36 -atcor 12 -gates 46 -gunmen 32 -rochester 4 -redirecting 32 -refiners 64 -politburo 32 -mohawk 4 -imf 4 -particles 32 -fromer 8 -maxsaver-type 4 -smoking 32 -silly 2 -peaceful 32 -lugano 4 -joao 4 -joan 6 -dickerson 8 -sills 4 -assessment 36 -studio 34 -opposed 36 -reincorporate 32 -iron 36 -puget 12 -hard-to-please 32 -trampled 32 -opposes 32 -underwent 32 -washed 32 -goebbels 4 -sisters 4 -base 36 -doyle 12 -outperform 32 -chocolate-truffle 32 -gene 12 -guatemala 4 -immuno-modulator 32 -tunes 32 -gena 4 -comes 36 -pronouncements 32 -plausible 32 -creditworthy 32 -bass 12 -hiked 32 -philipp 4 -gdp 4 -tuned 32 -worrying 96 -nonverbal 32 -notch 32 -institutionalized 32 -options 36 -placebo 32 -unattainable 32 -handcuffs 32 -regret 32 -beginning 102 -readily 32 -crossed 32 -washes 32 -black 36 -exciting 32 -lancaster 4 -grandfather 32 -bringing 32 -co-author 32 -securing 32 -lunches 32 -awhile 32 -signaling 34 -satisfy 32 -runs 32 -worker-training 32 -proof 32 -bubbles 32 -philadelphia-based 4 -africa 4 -bought 100 -calculates 32 -noticing 32 -frankfurt-am-main 4 -neurological 32 -tangled 32 -outnumbered 32 -soria 4 -walder 8 -throws 32 -dispose 32 -soviet-turkish 4 -dual 4 -apcar 8 -kenneth 6 -muckrakers 32 -useless 36 -genes 32 -vowed 32 -travel-related 32 -elixirs 32 -financier 36 -tigrs 4 -korea 4 -cleaning 32 -programming 32 -exasperation 32 -unbroken 32 -lynch 4 -fences 32 -alexander 4 -lapsed 32 -kennett 4 -townships 32 -mass-produced 32 -boehringer 6 -salaried 32 -a-1-plus 4 -prisons 32 -shoving 32 -dormitory 32 -tax-exempt 32 -civil 44 -appalachia 4 -unpromising 32 -images 32 -campuses 32 -classified 32 -rotan 4 -turkeys 32 -trillions 32 -motorists 32 -ideal 34 -exceeded 32 -alpha 4 -zinberg 8 -pc 4 -empathy 32 -intractable 32 -involuntary 32 
-editorial-features 32 -jesuit 4 -half-price 32 -psychiatric 36 -bloomingdale 4 -copyright 32 -bullion 2 -sarre 4 -purse-snatching 32 -camping 32 -industry/government/residents 32 -biden 12 -wayne-gossard 8 -upsetting 32 -races 32 -beverly 12 -cannot 32 -professional-management 32 -low-hanging 32 -flemming 4 -tractor 36 -companies 102 -well-financed 32 -revenge 32 -settings 32 -puffs 32 -rank 36 -soars 8 -hatch 12 -stockholm 12 -marsh 12 -proxies 32 -jon 6 -joachim-ernst 4 -channel 38 -dog 36 -biographies 32 -song 36 -takes 44 -don 46 -really 38 -southmark 14 -greats 32 -sons 4 -dot 32 -dow 14 -wallenberg 12 -faltered 32 -prohibiting 32 -bat-lovers 32 -rationing 32 -rooted 32 -taken 102 -westchester 4 -jobless 36 -catalog 32 -preliminary 32 -bans 32 -ordering 32 -hydromatic 32 -walton 12 -mrmt 8 -moody 14 -platforms 32 -cia 12 -kurds 4 -stumps 32 -bang 36 -band 32 -hunger 32 -inform 32 -problem-free 32 -pay-as-we-go 32 -restraint 32 -centerpiece 32 -incomes 32 -borrus 4 -no-smoking 32 -scotto 4 -their 38 -overtures 32 -lloyd 4 -kuwait 4 -wide-eyed 32 -conservatives 36 -bellevue 12 -reputations 32 -rising 34 -samuel 14 -wermiel 4 -reyna 8 -world-renowned 32 -cper 4 -interpret 32 -coups 32 -opera-comique 4 -won-lost 32 -cs1 4 -passive 32 -placements 32 -during 110 -ingenious 32 -bars 32 -bungling 32 -drugged 32 -twelve-meter 2 -achieved 32 -narrates 32 -bramham 4 -began 100 -fastest-growing 32 -taxable 32 -chafing 32 -fullerton 4 -amerada 4 -funding 36 -historic 32 -assignments 32 -labs 4 -gil 4 -fmc 4 -lebaron 4 -catalytic 32 -without 98 -budget-minded 32 -prestigious 32 -syracuse 4 -agreeable 32 -wallich 4 -inspect 32 -rutledge 8 -hypocritical 32 -daniel 14 -afinp 8 -nominee 32 -hitler 4 -harmfully 32 -inefficient 32 -aleman 12 -imperative 32 -hangout 32 -income-tax 32 -same 32 -wildest 32 -arms-sale 32 -olvr 8 -feet 32 -fees 38 -shaping 36 -francis 12 -unique 32 -antediluvian 32 -upbeat 32 -burdened 32 -ni 8 -hundreds 32 -separate 32 -anathema 32 -popular 36 -arbitrager 36 -passports 32 -ratification 32 -attract 36 -piper 4 -aplenty 32 -volcker 12 -transmissions 36 -pipes 32 -mornings 32 -speed 32 -extinguished 32 -co-producer 32 -reopening 36 -stemmed 32 -speculators 32 -ice 36 -continental 14 -princess 4 -broad 36 -aquifers 32 -skiing 32 -endeavor 32 -three-week 32 -klm 4 -kentucky 6 -handgun 32 -worry 36 -teacher-graduates 32 -salary 44 -bko 8 -arraigned 32 -ruffled 36 -contortionists 32 -rookie 32 -lion 32 -disclosing 32 -win 40 -year 46 -fleeing 32 -heritage 38 -sweepstakes 32 -supposedly 32 -sobriety 2 -african 4 -asarco 2 -shimbun 4 -illinois 6 -conjunction 32 -multibusiness 32 -noticeably 32 -lavenders 32 -sugar 46 -helen 4 -seizing 32 -amhowitz 8 -laf 4 -cty 8 -hawaii 4 -fetched 32 -revolted 32 -cooperative 32 -small-lot 40 -seward 12 -multinational 32 -suitor 36 -ailment 32 -attitude 32 -dropped 32 -lobbyist 32 -issuers 32 -disciple 32 -sectors 36 -ms-dos 4 -combination 32 -deposits 32 -mainstay 32 -measuring 32 -rulings 32 -resorted 32 -real-life 32 -trace 32 -never 38 -chang 4 -mediating 32 -goldman-kidder 4 -liable 32 -aggiornamento 32 -stafford 8 -j.b. 
4 -mentz 4 -interesting 32 -ensuring 32 -chant 4 -relied 32 -moreover 42 -relief 34 -norwood 12 -unanimity 32 -boasted 32 -scuttle 32 -movers 4 -least 32 -georgine 8 -designers 32 -curiosity 32 -willems 4 -contained 32 -foreclosed 32 -recommend 32 -civilian 32 -lap 32 -cemetery 40 -anyway 32 -stature 32 -cities 38 -renominate 32 -abruptly 96 -franey 4 -dose 32 -mcgraw 4 -shoeshine 32 -sherwin-williams 4 -settling 34 -safeguard 32 -value 44 -datron 4 -vulnerability 32 -carter 12 -casting 32 -format 32 -headlines 34 -coke 4 -contras 4 -claude 8 -cartel 32 -aback 32 -thrift 36 -marckesano 12 -kingwood 8 -headlined 32 -houari 4 -rendered 32 -popularized 32 -sadly 32 -work-practice 32 -larouche 6 -starting 32 -reluctant 32 -detroit 12 -parking-lot 32 -leapfrogged 32 -overwhelms 32 -axes 32 -small-to-mid-sized 32 -fraud 36 -academic 32 -which 34 -underpricing 32 -one-stop 32 -fomon 12 -outspoken 32 -amusements 4 -stipulate 32 -corner 32 -hmt 12 -campesinas 32 -vows 40 -barrier 32 -hacienda 4 -segments 32 -atpwi 4 -mercantile 6 -derided 32 -sitcoms 32 -remains 100 -holliston 4 -flanked 32 -recommended 32 -examination 32 -metropolis 8 -respected 32 -first-time 32 -letter 44 -helene 4 -pollutants 32 -unintelligent 32 -expressed 32 -rumford 4 -amounts 32 -dramas 32 -finances 32 -selective 32 -differ 32 -expresses 36 -limerick 32 -evidently 2 -l-dopa 4 -repaired 32 -table 36 -meaningful 32 -vivaldi 4 -expelled 32 -backdrops 32 -maturities 32 -tips 32 -pakistani 4 -discusses 4 -faced 34 -principal 40 -serial 32 -superb 32 -brooks 4 -disappear 32 -intervene 32 -rogaine 4 -pragmatic 36 -cuckoo 4 -altered 8 -baby 36 -high-speed 32 -mkt 4 -rekindled 32 -ground-based 32 -notably 32 -bolder 32 -irish 6 -unjust 32 -hendry 12 -pointedly 32 -relevant 32 -reigning 32 -navy 46 -cocaine 32 -tilley 12 -mccarran-ferguson 4 -rioting 32 -general 46 -businessmen 34 -coincided 32 -pclb 8 -eldorado 6 -small-business 32 -hispanic 4 -simplifying 32 -delftaland 4 -skimpy 32 -loaned 32 -coincides 32 -view 36 -caterers 6 -hopkins 12 -arming 34 -added 38 -blanket 32 -degenerate 32 -connection 32 -bones 36 -performed 32 -pointe 32 -lenin 4 -self-dealing 32 -augustine 12 -downgrade 32 -cats 36 -tax-deductible 32 -dante 4 -lyndon 4 -blockade 36 -faces 36 -solved 32 -bronchial 32 -khoo-shearson 4 -wholeheartedly 32 -decorative 32 -untypical 32 -outweighed 32 -syndication 36 -unluckiest 32 -adams 12 -pioneered 32 -picks 36 -adjust 32 -milpitas 4 -latest-quarter 32 -parkinson 4 -ostensible 32 -pandemonium 32 -disk 40 -nominations 32 -heroes 32 -dish 34 -yoshihara 12 -earnest 32 -reader 32 -velasco 4 -czar 32 -immunex 10 -government-guaranteed 32 -plowed 32 -carrier 46 -abated 32 -disagreement 32 -searle 14 -kaiserslautern 12 -insist 32 -ormrod 4 -angiographic 32 -shake-ups 32 -singleton 12 -abolishing 32 -postwar 32 -chula 6 -ibm 14 -hatton 12 -person 32 -hangs 32 -long-misunderstood 32 -conceptual 32 -david 14 -michel 6 -gerstner 4 -uneventful 32 -ted 14 -ssax 12 -tea 64 -compensating 32 -ten 34 -davis 12 -penalties 36 -federal-assisted 32 -viewing 32 -steel-equipment 32 -filene 4 -saddened 32 -agaricus 32 -anytime 32 -rushes 32 -dulles 4 -ceased 32 -becomes 36 -listen 32 -seaman 32 -independent-production 32 -via 104 -listed 32 -iaea 4 -rebounded 32 -arms-control 96 -mother-of-pearl 32 -trajectories 32 -swaziland 4 -bubble 32 -famous 32 -reach 36 -steamship 36 -conditioning 32 -litton 6 -plastics 36 -requests 36 -supplementary 32 -interim 36 -withdrawing 32 -forget 34 -raytheon 10 -saul 4 -errickson 8 -norfolk 14 -sematech 4 
-oppenheimer 12 -rushed 32 -psychologists 34 -vainly 32 -memotec 14 -awaited 32 -moscow 14 -phrase 32 -demurred 32 -euphoria 32 -euphoric 32 -thorp 12 -mayer 4 -draining 32 -retreating 32 -pools 32 -synergies 32 -loud 32 -uic 4 -rightfully 32 -maintain 32 -indirectly 32 -flourish 32 -incident 32 -shrugged 32 -stuffed 32 -restriction 32 -needs 32 -grease 32 -tarrant 2 -province 32 -committees 32 -colleen 2 -various-purpose 32 -colgate 12 -five-day 32 -pickets 32 -outstripped 32 -rico 4 -rich 36 -input 32 -requires 32 -rice 44 -firms 46 -haider 12 -finazzo 8 -rica 4 -vibrant 32 -mid-year 32 -required 32 -satisfactory 32 -unrestricted 32 -underlying 32 -responds 32 -forged 32 -sibling 32 -versions 32 -then-nsc 32 -polite 32 -interpretations 34 -siconolfi 4 -permissible 32 -paunches 32 -note 106 -entailed 32 -injunction 32 -stayed 32 -perseverance 32 -masterly 32 -establishment 32 -figure 32 -grips 32 -penetration 32 -telephone 38 -access 100 -all-seeing 32 -emotion 4 -flynn 8 -phillip 6 -germain 4 -overturned 32 -fishing 32 -decompression 32 -coloratura 32 -firestone 4 -type 32 -accuracy 32 -hemming 12 -stand 32 -near 36 -overthrow 32 -miss 38 -april 4 -one-for-one 32 -takeovers 36 -fox 44 -medical 110 -editing 32 -symbol 36 -rockwool 4 -babies 32 -bbc-brown 2 -differential 32 -psc 4 -maker 100 -makes 36 -weddings 32 -clear-cut 32 -honeysuckle 4 -iowa 4 -aet 8 -choleric 32 -vocabulary 32 -sweetener 40 -circumvent 32 -second-biggest 32 -conception 32 -aversion 32 -stunt 32 -people 38 -rhythm 32 -crybabies 32 -gift 32 -criteria 32 -remembrance 4 -theorist 32 -immune-deficiency 32 -vastola 8 -stung 32 -wildfire 32 -highs 32 -downside 32 -narrow 32 -syndrome 32 -treasures 32 -lykes 4 -sticks 32 -azt 6 -fastest 32 -need 32 -unconfirmed 32 -homeland 32 -ever-changing 32 -cows 32 -singlehandedly 32 -dragnet 32 -material-service 32 -cantata 4 -mcalister 4 -portsmouth 4 -convulse 32 -subscriber 32 -patch 4 -directive 32 -abject 32 -booked 32 -ingredients 32 -despaired 32 -jackie 4 -files 40 -emphasized 32 -sure 38 -tacky 32 -reynolds 14 -nwa 4 -charging 40 -framework 32 -tragic 32 -carver 32 -photograph 32 -unheroic 32 -ebbed 32 -sonja 4 -scholar 4 -customary 32 -furious 32 -interferred 32 -inventor 32 -devise 36 -robot 64 -cash-laden 32 -apt 32 -tennessee 4 -engines 32 -oscars 4 -affords 32 -reserve 38 -palace 36 -abrogation 32 -willfulness 32 -inspection 40 -bullock 4 -scream 32 -teledyne 14 -messages 32 -vbnd 4 -consultant 32 -hans 4 -sophistication 32 -self-taught 32 -maintaining 34 -arcana 32 -hand 36 -hang 32 -four-city 32 -plagued 32 -entreaties 32 -plush 32 -cookie 4 -sideshow 32 -furtherance 32 -rivers 4 -awoke 32 -anatolia 4 -affairs 36 -schubert 4 -aspire 32 -habit 32 -asbestosis 32 -calan 4 -giuliani 12 -prodding 32 -triple-a 32 -industrialist 32 -paralysis 32 -bills 36 -sometime 34 -drained 32 -chops 32 -staked 32 -denver-based 4 -referring 40 -depressing 32 -needles 32 -acceptable 32 -bacterium 32 -honor 34 -mortgages 32 -founding 32 -slap 32 -timbre 32 -inefficiencies 32 -crackpot 32 -universe 36 -mortgaged 32 -plethora 32 -personalities 32 -airport 36 -competing 42 -divests 32 -plum 32 -high-rate 32 -solely 32 -second-front-page 32 -mikva 4 -release 36 -financially 32 -extravagant 32 -chilled 32 -smasher 36 -vacuum 32 -malcontents 32 -realize 32 -vancouver 12 -semans 8 -incomparable 32 -pullout 32 -pemberton 4 -fyffe 4 -painstakingly 32 -notion 32 -enters 8 -triggering 32 -unacceptably 32 -ariz. 
4 -threaten 32 -license 36 -skimming 32 -acronyms 32 -bats 32 -bundesrat 4 -threatens 32 -bother 32 -get 36 -stein 12 -athletic 32 -realtors 32 -conversing 32 -understate 32 -traced 32 -hiking 32 -discloses 32 -anti-missile 32 -well-turned-out 32 -zug 4 -mcdermott 4 -inventory/sales 8 -revenue 110 -sentenced 32 -vincent 4 -swaps 32 -yardsticks 32 -marlin 4 -fissures 32 -criminal 36 -winds 32 -rural 32 -five-ton 32 -doesn 98 -coil 32 -stances 32 -buyout 40 -sprawling 32 -evolved 32 -masterson 8 -vessels 32 -jagry 4 -meta 4 -bloomington 4 -faceless 32 -above 34 -male-pattern 32 -insecure 32 -u 12 -plaguing 32 -universally 32 -disparity 32 -unseen 32 -emanuel 2 -step-up 32 -effects 32 -house-backed 4 -occurred 32 -hover 32 -martin 14 -supportive 32 -enrollments 32 -renegotiated 32 -scandal 44 -anti-tank 32 -ritual 32 -subsequent 32 -ivanov 4 -grumbled 32 -presley 4 -tricks 32 -warren 14 -acted 32 -hosts 32 -edible 32 -amicable 32 -congressman 32 -tricky 32 -confrontation 32 -instruct 32 -heartening 32 -cs-1 6 -allergies 32 -bankruptcy-law 32 -stuffy 32 -thoughtful 8 -unglamorous 32 -solution 32 -auditors 32 -surprise 32 -afford 32 -influenza 32 -ambitious 32 -dc-9 8 -workaday 32 -more-expensive 32 -well-placed 32 -survived 32 -top-notch 32 -discussion 32 -surgical/critical 4 -visual 32 -lower-level 32 -forgo 32 -raves 32 -survives 32 -neil 6 -amal 4 -game 36 -smoot-hawley 4 -eco 4 -overt 32 -celebratory 32 -grosz 4 -patients 34 -pittsburgh 12 -surveyed 96 -notes 100 -animation 32 -patrick 14 -rock-and-roll 32 -specified 32 -suppress 32 -installments 32 -practice 32 -doubtless 32 -highly 32 -prolongs 32 -york 4 -u.s.-based 4 -aspartame 36 -slashed 32 -modeled 32 -drive 36 -veggies 32 -incorporating 32 -partner 96 -republican 4 -drahuschak 4 -distaste 32 -slashes 32 -quotation 32 -renaissance 36 -bribery 32 -constraints 32 -leveling 32 -hume 4 -pennsylvania 14 -zero-sum 32 -tobin 4 -niels 4 -harmless 32 -lenox 4 -taurus 4 -productivity-related 32 -cramming 32 -probes 32 -overstates 32 -bricks-and-mortar 32 -intellectuals 34 -cave 4 -damped 32 -miranda 4 -handcuffing 4 -todd 10 -forefront 32 -meantime 34 -corporations 98 -quarries 32 -redoubling 32 -natural-gas 32 -salespeople 32 -detained 32 -depression 4 -van 44 -cupertino 4 -days 36 -ousts 8 -anti-aircraft 32 -constantly 32 -piedmont 14 -occupational 36 -credible 32 -draft 32 -contagion 32 -deciding 32 -khan 4 -considers 96 -light-duty 32 -maturing 34 -concessions 32 -slipping 32 -well-suited 32 -sympathy 32 -variable 32 -pierre 12 -smelling 32 -chinese 4 -offending 32 -e.f. 
6 -before 110 -incumbent 32 -realty 4 -depositors 32 -firefighters 32 -administrations 32 -foreign-relations 32 -taxi-bureau 32 -challenged 32 -awkward 32 -modulator 32 -factories 36 -joll 4 -forward 32 -vienna 12 -khoo-related 4 -challenger 32 -rodime 6 -intensifying 32 -mann 12 -auction-market 32 -deemed 32 -canda 12 -watchfully 32 -pitch 32 -posner-controlled 4 -marwick 4 -restructures 8 -assembly 96 -preferred 34 -satisfied 32 -assemble 36 -apparently 34 -overcapacity 32 -industrials 44 -inclusion 32 -seasons 32 -stimulus 32 -conclusions 32 -reinforce 32 -dissent 32 -faintest 32 -innovations 32 -nuk 4 -accumulate 32 -disciplines 32 -profit 46 -edgar 12 -favoring 32 -guard 36 -taverns 32 -assistant 36 -crash 32 -thinner 4 -arch-rival 32 -stock-held 32 -originally 40 -networks 32 -franchise 32 -probe 36 -shook 32 -neutrinos 32 -examined 32 -nike 4 -wondrous 32 -accompany 32 -innocuous 32 -dwarfed 32 -contract-suspension 32 -impulse 32 -examiner 32 -examines 32 -alumnus 32 -dissatisfied 32 -tavern 4 -brannock 12 -harvested 32 -regional 32 -rare-book 32 -twenty-first 4 -clare 4 -kinard 12 -hessians 4 -console 32 -diluted 32 -swallowing 32 -mathematics 32 -halmi 4 -acts 44 -misuse 32 -bracher 12 -ogilvy 14 -merchandise 96 -potential 32 -volga 4 -roundup 4 -dilutes 32 -soil 32 -ven 4 -unexpired 32 -squeeze 32 -ried 12 -easter 4 -contend 36 -robinson 12 -imperialism 32 -kilobytes 32 -swift 40 -behavior 32 -inner-city 32 -tipped 32 -hurry 32 -yvette 2 -angus 2 -accrual 32 -quite 38 -alice 14 -umbrella 32 -suffer 32 -bankruptcies 40 -banner 32 -parking-brake 32 -knifepoint 4 -feedlots 32 -case-by-case 32 -reits 4 -altar 32 -on-time 32 -banned 32 -leisure-time 32 -surfaced 32 -hard-nosed 32 -quits 4 -annihilation 32 -roddy 4 -first-class 32 -sylvan 14 -unruffled 32 -dr. 
14 -mid-level 32 -professionsals 32 -correction 32 -perforce 32 -altering 32 -retaliatory 32 -memory-enhancing 32 -fibreboard 6 -rolled 32 -clause 32 -bancroft 14 -syrup 32 -imprisoned 32 -prevent 32 -strongly 32 -referendum 32 -t.td 8 -interferes 32 -corporation 32 -regulator 32 -davidson 4 -aims 32 -guitar 32 -instructing 32 -uneasiness 32 -car-rental 32 -divisions 32 -discussing 36 -adjudicated 32 -thirteen 2 -tourism 36 -oop 4 -remembers 32 -suisse 4 -brrs 4 -regulations 32 -hhs 6 -underestimates 32 -destroying 32 -inherited 32 -administrators 48 -narrowest 32 -psychologist 32 -piasio 4 -appoints 8 -enterprising 4 -agip 4 -three-man 32 -sumitomo-group 4 -european 14 -characterizes 32 -battipaglia 4 -lech 4 -taxes 44 -brobeck 12 -damaged 36 -viewpoint 40 -mulders 12 -end-of-year 32 -swissair 4 -over-the-air 32 -laing 12 -taxed 32 -agio 4 -unable 32 -post-cruzado 32 -humans 32 -millions 32 -weight 32 -zickler 4 -fateful 32 -goldwater 8 -minerals 36 -swim 32 -jay 6 -objections 32 -preeminent 32 -asinof 4 -morse 14 -clowning 32 -just 46 -commute 32 -daniels 4 -constitution 44 -merchant 32 -vernon 14 -alan 6 -insuring 2 -glue 32 -noncompt 32 -caldwell 4 -grapefruit 32 -appreciably 32 -reservists 32 -jan 4 -bells 36 -u.s.-london 8 -mit 4 -loans 44 -property 38 -highway 36 -customized 32 -largest-airline 32 -belly 32 -balanced 32 -excluding 42 -mix 32 -indirect 32 -amassing 32 -jenrette 4 -mid 32 -distrusted 32 -inadequate 32 -payne 12 -min 4 -overblown 32 -mim 4 -balances 32 -earth 36 -debtors 36 -langoni 12 -nichols 4 -ally 32 -workable 32 -garnishing 32 -crafty 32 -vendor 32 -rumor 32 -hoard 32 -printing 36 -hesston 14 -india 14 -carnation 4 -baker 14 -returning 32 -co-equal 32 -coors 6 -initiated 32 -follows 40 -embarrassing 32 -drew 44 -manhattan 4 -protesters 32 -classification 32 -payable 32 -hobhouse 4 -fatalities 4 -beverage 36 -dakota 12 -dreg 32 -heretofore 40 -roger 12 -inventors 32 -pyf 8 -undertaken 32 -chapter 36 -glut 32 -imaginative 32 -worries 36 -sculptor 32 -xp 6 -allied 14 -groupings 32 -frazzano 12 -longer-term 32 -peacetime 32 -similarly 42 -bizarre 32 -sluggishness 32 -librettist 32 -worried 32 -owned-and-operated 32 -infinet 6 -weekend 32 -authorized 32 -minutes 38 -generically 32 -manuel 6 -precious 42 -evading 32 -short-covering 32 -fancier 32 -instant 40 -eisenhardt 8 -pathways 32 -cautiously 32 -marcheschi 12 -intelligence 102 -bhc 8 -diebold 8 -ignoring 32 -frequently 32 -recyclables 32 -advocates 34 -census 12 -earmarked 32 -tx 12 -unsafe 32 -rabbit 36 -kingsborough 4 -emergency 32 -sharpened 32 -tt 8 -tv 12 -molding 32 -th 4 -r-revised 16 -awed 32 -tl 4 -to 126 -fighter 32 -tandem 4 -hammett 12 -expensive 32 -tight 32 -phil 4 -deduct 32 -bashers 4 -idiot 32 -sorting 32 -arthur 4 -biology 32 -field-goals-allowed 32 -presses 36 -when 110 -waste 32 -boasts 32 -exclusionary 32 -referrals 32 -asbestos-containing 32 -steptoe 4 -child 36 -chile 6 -aging 32 -high-income 32 -midwestern 32 -chill 4 -cartoons 4 -sprung 32 -hitherto 32 -swayed 32 -easy-to-use 32 -nonpartisan 32 -opens 32 -owners 100 -happening 32 -frankenstein 4 -first-phase 32 -deposed 32 -t. 4 -rebuffed 32 -suffers 32 -packaging 36 -columnist 32 -pounded 32 -suing 32 -folded 32 -allergist 32 -non-public 32 -derived 32 -firings 32 -aggressive 32 -suspected 32 -elbtf 4 -barriers 32 -semantical 32 -kupinski 4 -else 32 -raoul 4 -derives 32 -towels 32 -felony 32 -transforming 32 -treasuries 36 -felons 32 -sqn 8 -physicists 32 -mines 36 -peck 12 -contacts 32 -poliakoff 12 -strip 32 -texaco 14 -n.y. 
4 -critics 34 -subtracting 32 -owens-illinois 4 -supply 36 -predatory 32 -snow 32 -autobiography 32 -streams 32 -lynne 6 -zaibatsu 32 -giacomo 4 -reebok 14 -august 6 -radio 36 -bsbx 12 -animated 32 -protestant 4 -rumored 32 -lytle 12 -gothic 4 -whizzing 32 -beech 6 -depreciation 36 -judith 4 -bdg 4 -situated 32 -researcher 32 -deficiency 32 -clonidine-hcl 32 -rescheduled 32 -northern 46 -eat 36 -ear 32 -wheaton 4 -eileen 4 -parish 4 -predictable 32 -eaf 4 -institutes 36 -vary 32 -criers 32 -handles 32 -explored 32 -booming 32 -bedroom 32 -devotion 32 -bashing 32 -literary 32 -hastily 32 -mtc 8 -sets 44 -seminary 32 -quest 38 -specifics 40 -gentles 4 -mcgann 4 -objectives 32 -thank-yous 32 -surround 32 -hence 34 -quiet 36 -storrs 4 -super 4 -feb 12 -raikes 12 -password 32 -established 32 -seth 4 -kaiser 14 -full-time 32 -exchanged 32 -innocent 36 -jefferies 4 -affirm 36 -non-food 32 -intellectual 32 -exchanges 32 -prospered 32 -introducing 32 -fosters 32 -stupid 36 -manon 4 -vacations 34 -jeep 36 -mrgo 4 -vicious 32 -edison 4 -beryl 6 -brutal 32 -unchecked 32 -whammy 32 -dofor 12 -semi-obscene 32 -honest 32 -ryder 4 -corrupt 32 -four 46 -per-share 42 -quit 36 -utp 8 -wrlt-fm 4 -unfounded 32 -deductibility 32 -strategically 32 -hall 36 -jamail 12 -criminals 32 -riverside 4 -coup 36 -implicated 32 -clay 6 -panels 32 -slashing 32 -leery 32 -hitachi 14 -nicaraguan 4 -consolidated 46 -sparrow 4 -halt 36 -concealing 32 -hitlin 4 -adapt 32 -forrest 4 -journalistic 32 -gottesman 12 -comparisons 32 -yankee 4 -sentiment 32 -applying 32 -asai 8 -coercion 32 -ervin 12 -export-import 12 -sympathize 32 -redd 12 -gulfstream 4 -ehrman 4 -celtic 6 -standing 32 -axp 12 -rate-of-return 32 -endure 32 -doel 4 -invariably 34 -consumer-electronics 32 -self-policing 32 -hiatus 32 -economics 36 -hoare 4 -decline 36 -hard-currency 32 -pilots 46 -joanne 4 -abrams 4 -preventing 32 -dixons 14 -trumpet-filled 32 -curfew-long 32 -contact 32 -minimum 32 -fischbach 6 -facility 32 -harsher 32 -information 100 -winnick 4 -yellow 4 -gaz 4 -dairy 32 -freeze 32 -jardine 4 -one-week 32 -slv 8 -male 32 -traumas 32 -gas 36 -microcosm 32 -greatest 32 -preferring 32 -revitalize 36 -imposed 96 -generation 36 -california-santa 4 -endangering 32 -secrets 32 -notice 36 -accounting 36 -lawfully 32 -accusations 32 -slr 4 -nsm 4 -bleeding 32 -gentiles 4 -knut 2 -multilayer 32 -hustle 32 -porky 4 -stop-loss 32 -bailout 32 -mismanagement 32 -antitrust-law 32 -newspapers 36 -remainder 32 -ranalli 8 -mylan 14 -comment 32 -lodging 32 -mccaw 4 -margaret 4 -mouse 32 -jenny 4 -assign 32 -martial 32 -commend 32 -forster 12 -reinstate 32 -genentech 14 -transplant 32 -health 38 -invalid 32 -surprised 32 -will 110 -broadly 32 -transparently 32 -crude-steel 32 -wilf 8 -cfps 4 -copolymer 4 -chicago-based 4 -peak 32 -av-8b 4 -peat 36 -monteverdi 6 -arranged 32 -undo 32 -surprises 32 -formulates 32 -furnaces 32 -olivier 6 -registry 32 -playful 32 -vocational 32 -nssi 4 -mr. 14 -pannill 4 -thomson 4 -shooting 34 -andover 4 -co-plaintiffs 2 -thrusting 32 -malls 32 -responsibile 32 -misunderstanding 32 -embattled 32 -l. 
4 -disbanded 32 -manifold 32 -shad 12 -karma 32 -sensed 32 -hodel 12 -unconstrained 32 -underwritings 32 -declaration 32 -abhorrence 32 -catalyzed 32 -valley 4 -rises 36 -shaw 12 -condoned 32 -seemingly 36 -extending 32 -survival 40 -busines 32 -sweden 14 -prudential 4 -graceland 4 -thinly 32 -perplexed 36 -kelton 12 -aggressively 32 -single-day 4 -contrasting 32 -mid-1990s 32 -mrk 8 -borrow 32 -restated 32 -trundles 32 -mrs 8 -home-grown 32 -beds 32 -expedited 32 -pilot 32 -wide-bodies 32 -treason 32 -growl 32 -grown 32 -breckinridge 2 -expanded 32 -former 110 -subsidizes 32 -moss 4 -runnin 4 -insures 32 -citizens 110 -subsidized 32 -hinted 32 -apprehensive 32 -readings 32 -acerbic 32 -funnier 32 -broadman 4 -highest-paid 32 -unobtainable 32 -pronounced 32 -pool-playing 32 -six-year 32 -yesterday 106 -freddie 14 -impoverished 32 -liquidating 32 -split 38 -influential 32 -simplify 4 -costing 4 -buoy 32 -indicators 36 -replace 32 -gen 4 -covent 4 -inquirer 4 -feared 32 -benighted 32 -moments 32 -amused 32 -octave 32 -spaciousness 32 -park 36 -stripped 32 -anticipate 32 -criticizing 32 -premature 32 -closer 36 -abhors 32 -foundations 32 -benefactors 32 -rainer 2 -pioneers 32 -tendering 32 -pfennig 32 -microwave 36 -photos 32 -submitting 32 -almys 12 -sake 32 -half-decade 32 -insured 32 -procter 4 -creature 32 -qualified 32 -palm-sized 32 -loyal 32 -acceded 32 -street 44 -outright 32 -physician 32 -characterized 32 -f-111 12 -pgloy 4 -executives 102 -visitor 32 -fruitful 32 -papandreou 12 -compatible 32 -job 36 -intraday 32 -series 36 -verifying 32 -slumping 32 -signal 46 -wigton 12 -hp 4 -alfred 6 -page-one 32 -anglin 4 -defection 32 -systems 44 -lipper 14 -w. 4 -creeps 32 -aquino 6 -businessman 32 -durables 32 -socialist 36 -impatiens 32 -x-rays 4 -suggested 32 -caps 32 -contemplating 32 -ps 4 -views 36 -unspecified 32 -niceties 32 -engen 4 -acres 32 -ponnet 4 -intravenous 36 -few 38 -spurgeon 8 -strunk 8 -shortfall 32 -swept 32 -admittedly 32 -undertreatment 32 -pre-election 32 -attractiveness 32 -wx 4 -implication 32 -accommodated 32 -managed 32 -describes 32 -wu 8 -glemp 4 -lip 32 -climb 32 -covenants 32 -actual 36 -stimulated 32 -policy-making 32 -press 36 -lio 8 -cruz 12 -helane 2 -stems 32 -we 46 -parallelism 32 -sprinkel 12 -bred 32 -ticking 32 -lie 32 -savannah 4 -acquiesced 32 -la 46 -stratified 32 -zones 32 -sentence 32 -remedies 32 -benefits 36 -woodside 4 -studiously 32 -acronym 32 -cardillo 4 -each 46 -eliminate 32 -dipping 32 -soady 8 -shish 2 -praise 32 -clamoring 32 -oak 4 -knight-ridder 4 -analyses 32 -lure 32 -sea 46 -ousey 4 -aiding 34 -catering 32 -see 38 -overreacting 32 -wasteland 32 -fixed-rate 32 -candidate 96 -assets 110 -parochial 32 -wealth 32 -accob 8 -firepower 32 -semantics 36 -received 96 -replacing 32 -set 44 -unlawfully 32 -fertilizers 32 -dirty 32 -horseshoe 32 -lk 4 -survivors 32 -harbingers 32 -ascribed 32 -lurk 4 -bally 14 -wholesalers 40 -emily 4 -adieu 32 -damage 36 -clabir 14 -lake-front 32 -together 34 -numerous 34 -populations 32 -merion 8 -consecutive 32 -eisner 12 -finds 36 -cynr 4 -jeanne 4 -house 46 -shoulders 32 -proposal 44 -palmer 4 -patterns 32 -efh 8 -motion 36 -opponent 32 -ideas 36 -soothe 32 -equipment 36 -kebab 32 -export 32 -questions 32 -conflicting 32 -southern 46 -asset- 32 -angiographics 32 -jacob 4 -gary 14 -avondale 2 -infections 32 -citations 32 -systematic 32 -virus 40 -comply 32 -harmful 32 -forstmann 4 -foray 32 -micro 4 -brashness 32 -skew 32 -pan 14 -pants 32 -malongo 4 -radical 32 -repairmen 32 -clock 32 -fault 32 -persio 
4 -protecting 32 -mechanically 32 -austria 4 -angolans 4 -league 36 -preservers 32 -misapplied 32 -remind 32 -participating 34 -checkoff 4 -gencorp 14 -pacific 14 -merge 32 -low-priced 32 -denominated 32 -data-base 32 -subordinated 32 -crises 32 -disadvantaged 32 -bibles 4 -uglich 4 -circles 32 -single-family 34 -emigrated 32 -nuclear 108 -propped 32 -circled 32 -brethren 32 -critically 32 -footwear 32 -subordinates 32 -disgraceful 32 -mpi 4 -knitting 36 -rinehart 4 -media 36 -ponnelle 12 -prevalent 32 -victim 32 -helps 32 -foreigners 38 -rowan 4 -unveils 12 -formations 32 -underweighted 32 -denounced 32 -performer 32 -vrdolyak 12 -spill 36 -craziness 32 -bout 32 -claimant 32 -briefly 32 -prompting 32 -stalls 8 -undercutting 32 -portfolio 32 -aspects 34 -karr 8 -seven-member 32 -wrongdoing 32 -dublin 12 -wall-to-wall 32 -brought 36 -extremist 32 -plainclothes 32 -innovate 32 -offerings 36 -subsidizing 32 -chevy 4 -priority 32 -schulz-koehn 4 -pegasus 14 -discharge 32 -engaged 32 -swapped 96 -r.h. 6 -nora 4 -marpac 4 -nameplate 32 -substitution 32 -hertz 6 -seconds 32 -wesley 6 -dogs 32 -vacancy 96 -contender 32 -crooks 12 -outfitted 32 -inextricably 32 -eroding 32 -came 40 -editor 36 -comanche 4 -carbide 4 -nervous 32 -quick-reaction 32 -followerfish 32 -craft 32 -harrisburg 8 -amid 110 -contended 32 -calculators 32 -upset 32 -camp 36 -giants 36 -toilets 32 -simultaneous 32 -reinforcing 32 -diana 4 -cost 32 -suspending 32 -boca 12 -stone 46 -facilitating 32 -electrocardiogram 32 -pacman 4 -fast 44 -doublespeak 32 -contrasted 32 -methods 36 -proved 32 -two-story 32 -lets 32 -nicaragua 6 -debtor-country 32 -lifeless 32 -baryshnikov 4 -chilton 12 -pervasive 32 -elbit 4 -grenada 4 -co-managed 32 -permits 32 -investigating 32 -port 32 -suter 12 -erdman 4 -leominster 4 -owns 96 -jules 4 -prankster 32 -looked 32 -saudi 4 -endeavors 32 -co-manager 32 -invade 32 -avoided 36 -requirements 32 -alnt 8 -bnk 4 -emi 4 -install 32 -tancredo 4 -manne 4 -randy 4 -emerge 36 -delinquent 4 -disappearance 32 -suburb 32 -deficit 44 -mimicking 32 -sutezo 4 -shafer 8 -retton 4 -willcox 8 -you-know-what 32 -turbo 32 -broadcaster 32 -lincoln 14 -recall 36 -anthony 6 -cleveland-based 4 -extended-stay 32 -supplemented 32 -patient 32 -violets 32 -unusually 32 -kelley 4 -koester 8 -assembling 32 -contrasts 32 -conglomerateur 32 -central 44 -scholarship 32 -obscene 32 -erratic 32 -pistols 32 -conveyed 32 -cpq 12 -extends 44 -dresser 4 -dresses 32 -cpt 6 -unload 32 -hci 8 -louis 14 -jerusalem 4 -miny 8 -low-income 32 -perquisites 32 -dressed 32 -revolutionary 32 -huffy 32 -signals 32 -rationed 32 -duplicate 32 -hike 32 -cph 4 -mountain 36 -carpenter 12 -genetski 12 -bradt 8 -arco 14 -inadequately 32 -averaging 32 -attribute 32 -confounded 32 -liebowitz 4 -employs 32 -brady 4 -intensely 32 -sorbonne 4 -buyer 34 -airplane 32 -lexington 8 -strenuous 32 -guyon 4 -rationale 32 -holders 102 -volz 8 -investors 46 -primary 36 -penobscot 12 -photographically 32 -advertising 110 -micron 4 -euan 8 -settled 32 -wine 100 -wind 36 -wing 36 -primark 12 -hardening 32 -parlow 12 -wins 36 -group 46 -argonaut 4 -publish 32 -international 110 -healey 4 -monopole 32 -newsprint 36 -turnaround 36 -cadre 32 -retail-entertainment 32 -broderbund 4 -double-a/a-1-plus 32 -mend 32 -peers 36 -beckett 4 -ventilation 32 -indianapolis-based 4 -mscc 4 -georgetown 4 -waters 32 -short-maturity 32 -forbidden 32 -menu 32 -unlawful 32 -monopoly 32 -news 38 -nios 4 -sailors 32 -stolen 32 -differently 32 -nickname 32 -maquilas 42 -overseeing 32 -represent 32 
-confirmation 40 -survivor 32 -jewelry 32 -elects 4 -steel-fabrication 32 -households 96 -squawk 32 -brent 6 -reacting 32 -long-term 34 -combipress 6 -denison 14 -resulted 32 -jurists 32 -fraction 32 -puzzles 32 -feistritzer 4 -union 46 -revved 32 -otherwise-flattering 32 -cornetti 4 -farmington 4 -schooling 32 -machine 36 -abdul-haadee 4 -puzzled 32 -mine 32 -vague 32 -scared 36 -abbe 4 -iowa-based 4 -lakes 32 -olenick 4 -seemala 6 -repaid 36 -deductions 32 -bottler 32 -bottles 32 -airways 4 -spur-of-the-moment 32 -employ 32 -entrepreneurial 32 -ciba-geigy 4 -jean 6 -repair 36 -screamed 32 -cigarette 36 -well-connected 32 -dahl 4 -fiancee 32 -valuable 32 -company-owned 32 -wealthier 32 -bridges 4 -keenan 12 -otis 4 -collaborated 32 -air-freight 64 -drill-bit 32 -ruffle-front 32 -condominium 32 -meets 32 -recordings 32 -den.a 8 -sacrifice 4 -editors 32 -duck 32 -nev.-based 4 -ribavirin 32 -respect 32 -hairs 32 -evokes 32 -brentwood 4 -herald 4 -gathered 32 -adults 32 -networking 36 -independence 32 -clifton 2 -differences 34 -isolated 36 -plunge 36 -money-transfer 32 -scaled-back 32 -southeast 14 -frequent 34 -ahonoora 4 -wheeler-dealers 2 -spirited 32 -chosen 32 -scares 32 -linda 12 -serviceable 32 -somebody 36 -metropolitan 44 -twin-engine 32 -immediate 32 -ads 36 -swamping 32 -ida 4 -approaches 32 -consultants 36 -unfazed 32 -deaths 32 -bikinis 32 -wriv-tv 2 -eons 32 -loucks 8 -everything 36 -recreating 32 -vice 108 -straw 32 -silk 32 -vulnerable 32 -ex-cia 32 -locations 32 -resins 32 -looms 36 -financement 4 -smashing 32 -unravel 32 -cary 4 -math 32 -casseb 4 -black-market 32 -excessively 32 -herculean 32 -care 100 -card 38 -trucking 32 -antidote 32 -alarming 32 -shortages 32 -carl 6 -semiconductor 110 -higher-than-anticipated 32 -method 32 -planners 36 -involved 36 -moines 4 -sprinkles 32 -declared 96 -controller 32 -prevents 32 -drescher 4 -u.s.-dominated 4 -pastures 4 -muster 32 -psychologically 32 -bylaws 32 -jacksonville 4 -limiteds 32 -superiority 96 -sprinkled 32 -low-budget 32 -departure 36 -vaginal 32 -compunction 32 -ethical 32 -muted 32 -protected 32 -projection 32 -steelmakers 32 -spanier 8 -squirms 32 -contain 32 -overturn 32 -pore 32 -correspondent 32 -josephthal 4 -median 32 -slapping 32 -stevenson 4 -attributes 32 -sleeves 32 -guts 32 -smelters 32 -disappearing 32 -linked 32 -realigned 32 -di 44 -newsstands 32 -endo 8 -northrop 14 -extreme 32 -phillips 14 -sort 32 -steppes 8 -steeper 32 -affirmed 32 -whereabouts 32 -foster 44 -invite 32 -decker 4 -sport-fishing 32 -bland 32 -john 14 -parents 38 -hazlitt 4 -guardian 32 -lackluster 32 -enjoying 32 -do 38 -recast 32 -dividing 32 -notorious 32 -transfering 32 -lebow 4 -preparing 32 -entered 32 -stabilization 36 -arrived 32 -straws 32 -marty 6 -ottone 4 -commendable 32 -auschwitz 4 -marts 32 -lining 32 -batter 8 -giudici 4 -practical-joking 32 -honduras 4 -copycat 4 -ability 100 -autonomous 32 -upgrade 36 -de 100 -inouye 4 -shipper 32 -post-big 32 -thursday 6 -institutions 100 -multiply 32 -objected 32 -guano 32 -hodgepodge 32 -settle 44 -wayne 8 -inception 32 -unclear 32 -high-energy 32 -printable 32 -sweetner 8 -shield 36 -front-runner 32 -gregory 12 -traders 46 -maj. 
4 -riddled 32 -usw 4 -high-flier 32 -bethesda 4 -job-training 32 -parnes 4 -sias 8 -sociological 32 -deprive 32 -metro 14 -vitro 2 -riddles 32 -bouquet 32 -underwriters 38 -apparent 32 -voters 34 -jerome 4 -subcontractors 32 -margo 2 -supplanting 32 -overcome 32 -sighting 32 -well-known 32 -cnvx 4 -wdc 8 -description 32 -george 14 -t.npi 4 -left-wingers 32 -uep 8 -jaded 32 -athens 12 -fenner 4 -forties 32 -fortier 4 -fulfilled 32 -tie 32 -obligatory 32 -well-versed 32 -unaware 32 -extravagance 32 -holdings 44 -skill 32 -sales 110 -new$ 4 -barnett 12 -maryinsky 4 -prospective 32 -bankrupt 32 -bah 2 -mingle 32 -inflated 32 -prosecute 32 -literally 32 -use 36 -subsidy 32 -exactly 32 -symptom 32 -withheld 32 -richer 36 -d.h. 4 -motown 4 -calendar 36 -millennia 32 -india-rubber 4 -supersaver 32 -victor 14 -refinances 32 -now-defunct 32 -cautious 40 -enforcement 36 -subside 32 -fearon 4 -consumer-goods 2 -instances 32 -midlands 4 -skipped 32 -bill 44 -pinto 4 -tastes 36 -suggestions 32 -hard-charging 32 -elegant 32 -homework 32 -goldhammer 4 -cornerstone 32 -taylor 12 -millar 4 -sup 4 -fianna 4 -suggestion 32 -thousand 32 -exchange 102 -nippon 14 -larger 32 -noncompetitive 32 -recognizance 32 -star-topped 32 -satisfying 32 -repay 32 -almost-certain 32 -industry-financed 32 -reconsideration 32 -stray 32 -pork 38 -meg 4 -sokolow 12 -middleman 32 -deregulation 36 -barometer 32 -marquee 32 -waived 32 -flatly 32 -both 46 -pfe 12 -iverson 4 -bipartisan 34 -epidemiology 32 -dmbk 8 -humphreys 4 -offering 100 -article 32 -higher-than-expected 32 -fodder 32 -source 42 -shot 32 -expense 32 -gatsby 4 -ill-advised 32 -bottomed 32 -jean-louis 4 -punished 32 -wacky 32 -ferry 32 -skilled 32 -ries 4 -stock-cash 32 -five-year 32 -illustration 32 -carry-forward 32 -resigned 108 -adopted 40 -betty-jo 2 -bomb 32 -object 32 -guero 32 -user-charge 32 -burgee 4 -six-cylinder 32 -kinburn 4 -backing 32 -combustion 32 -perfect 32 -vanderbilt 12 -maloof 4 -saturn 6 -commanders 32 -renunciation 32 -sasso 4 -ill-designed 32 -bauer 12 -hug 40 -buildings 36 -hud 2 -administering 32 -bunches 32 -goodwin 4 -entrenching 32 -thorough 32 -iscs 4 -wsj 4 -gerhard 4 -hut 4 -unfortunate 32 -hawkeye 14 -passion 32 -shp 8 -announcements 32 -shadowy 32 -non-discretionary 32 -currently 42 -co-production 32 -only 38 -initially 96 -exposed 32 -ado 32 -wharton 14 -telecasts 32 -tropical 32 -bottoms 32 -bank-secrecy 32 -weights 32 -linerboard 38 -mills 36 -bls 4 -pierce 44 -resume 36 -recollection 32 -consolidation 32 -loosens 32 -commonplace 32 -supports 32 -mckinley 8 -pause 32 -cited 36 -futility 32 -robin 4 -cardin 4 -changed 36 -chieftains 32 -high-end 32 -m&a 4 -beaubien 4 -bendel 4 -stifles 32 -alabama 4 -cites 96 -elliott 4 -massacres 32 -similarities 32 -bombarded 32 -admittance 32 -leader 36 -utah-based 4 -donning 32 -secretaries 32 -taiwan 4 -nationality 32 -spreadsheet 32 -transitional 32 -realists 32 -guidance 32 -blessed 32 -demure 32 -dismissing 36 -stunning 32 -entrepreneur 32 -cheerful 32 -arab 4 -work-places 32 -thievery 32 -complication 32 -cry 36 -reckoned 32 -complicating 32 -coasts 32 -sobbing 32 -expressionist 4 -asking 32 -residence 4 -peregrine 32 -premeditation 32 -mighty 32 -tones 32 -sylvania 4 -shimon 4 -frail 32 -lks 8 -resourceful 32 -switched 32 -mysteriously 32 -toned 32 -rts 4 -mightn 32 -hose 32 -sensors 32 -three-ring 32 -sustaining 32 -zoltan 4 -lanes 32 -resentment 32 -utilities 46 -adv 8 -overaddiction 32 -tinier 32 -slower 32 -forcing 32 -along 42 -alone 36 -rousseau 4 -spilled 32 -miserable 32 -gaffe 32 
-slowed 32 -peugeot 8 -newsletter 32 -schumer 4 -swooping 32 -wasn 44 -occidential 8 -employment 36 -stifling 32 -career 36 -afghanistan 4 -canada 14 -movie-studio 32 -dropout 32 -s.p.a. 4 -secretly 32 -may 110 -nunn 12 -enjoyed 32 -seeped 32 -shops 36 -frazier 4 -brookfield 8 -doing 40 -exe 4 -moffett 4 -norsk 14 -observance 32 -units 102 -transporation 4 -kisses 32 -hinduism 4 -gaughan 8 -blumenthal 4 -excesses 32 -involves 32 -jumblatt 4 -sounding 32 -local 38 -distanced 32 -royster 12 -u.s.-trained 4 -river 4 -pains 32 -demonic 32 -oxford 4 -glory 32 -unilever 4 -paint 32 -saharan 4 -bartels 4 -sufficiently 32 -databases 32 -child-bearing 32 -acknowledgement 32 -richest 32 -connotation 32 -cuff 12 -geochemistry 32 -skidded 32 -brotherhood 4 -informant 32 -recorders 32 -cheating 96 -supervision 32 -spreckels 4 -rebs 4 -tables 32 -retired 34 -carolee 2 -roth 12 -seemann 8 -kyushu 4 -suffered 32 -falcons 32 -ga. 4 -mirth 32 -dearborn 6 -crowne 4 -warsaw 4 -dana 4 -all-irish 32 -classicist 32 -becor 12 -deposited 32 -trickling 32 -brand-name 32 -galen 4 -orate 32 -prospering 32 -defraud 32 -includes 36 -paribas 14 -visitors 34 -armed 36 -halcyon 32 -orders 36 -special-interest-group 32 -beane 12 -shakeout 32 -premises 32 -bernie 4 -tubing 32 -double-a-minus 32 -confined 32 -burden 32 -test 36 -loosely 32 -chapstick 4 -withholding 32 -pays 32 -stylistic 32 -louisiana-pacific 14 -warp 32 -enlist 32 -fa-18 4 -ballpark 32 -arguments 32 -averted 32 -stainless 32 -shipyard 36 -interruptions 32 -grossed 32 -abt 14 -lenders 32 -misappropriation 32 -charlie 4 -gingerbread 32 -bbec 8 -abc 14 -sickness 32 -aba 12 -sahara 4 -resultant 32 -abf 8 -oxy 8 -tinkering 32 -soliciting 32 -thrust 32 -eligible 32 -commodore 4 -innovating 32 -bbc 4 -braverman 12 -avez 32 -feels 32 -bbk 4 -jumps 32 -veiled 32 -chimpanzees 32 -medieval 32 -jour 32 -legal-ethics 32 -martini 4 -combat 32 -leak 32 -training 36 -lybrand 4 -coney 4 -outsells 32 -refer 32 -rang 32 -welcome 32 -glass-steagall 4 -turns 40 -impede 32 -offense 32 -successful 32 -unisys 4 -supplemental 32 -urge 36 -state-owned 32 -kinda 4 -gang 32 -wilmington 12 -safeway 4 -unwelcome 32 -abner 4 -good 38 -crosby 12 -lafarge 14 -weeklong 32 -nights 32 -intimacy 32 -airliners 32 -blackboard 32 -kansas 4 -rubens 4 -contribution 32 -then-secret 32 -older 32 -granite 32 -high 46 -policy-setting 32 -ballets 34 -entiche 4 -well-orchestrated 32 -day-to-day 32 -humanity 32 -buford 8 -thunberg 4 -bathers 32 -babson 8 -day-care 34 -dutch 4 -inquiries 32 -encased 32 -ought 32 -clashed 32 -druse 4 -automotive 32 -begetters 32 -gaudy 32 -examinations 32 -unreleased 32 -chg. 
[Data hunk: several thousand removed lexicon entries, one per line, each pairing a lowercased token with an integer value (e.g. "-acquirers 32", "-gemayel 4", "-prof. 6").]
4 -southfield 8 -insurance 44 -supervisory 32 -hanoi 4 -swear 32 -projecting 32 -observes 32 -observer 36 -wavering 32 -cozied 32 -comanches 4 -bollerer 4 -sadder 2 -c.o.m.b. 6 -billing 32 -victories 32 -lowry 4 -observed 32 -bourj 4 -unresolved 32 -open-market 32 -anti-dumping 42 -stoves 32 -price 110 -collateral 32 -sneaker 32 -moniker 32 -shark 36 -frenzied 32 -blustery 32 -persuasive 32 -bristol-myers 4 -expropriated 32 -spurned 32 -sharp 44 -pranksterism 32 -ethics 36 -contingent 32 -resources 36 -rua 4 -aids 38 -absolutely 32 -smiled 32 -draftsman 32 -recipients 32 -bloom 4 -custom 32 -newsweek 6 -monteith 8 -struts 32 -blood 32 -shifted 32 -solicitor 32 -analysts 46 -scant 32 -lighting 32 -syria 4 -simon 12 -amend 32 -memorandum 32 -jurisdictional 32 -fri. 2 -nevertheless 34 -didn 32 -highest 32 -severe 32 -heal 32 -decisions 32 -full-sized 32 -skyscraper 32 -single-digit 32 -bottlenecks 32 -financed 96 -second 46 -aviv 4 -post-split 32 -guessing 32 -woodcuts 32 -armin 4 -season 32 -stever 4 -ernst 6 -hot-blow 32 -schwarz 4 -light-filled 32 -gained 96 -lead-recycling 32 -three-day 32 -utilization 32 -cloud 32 -gartner 4 -executor 32 -spurt 32 -treadway 4 -affiliates 36 -birmingham 14 -clout 32 -marshal 32 -problem-loan 32 -tvla 8 -chicago-area 4 -defense 108 -implies 32 -indictment 32 -bertrand 4 -critical 36 -salaries 38 -lavender 32 -caliber 32 -bloodstock 32 -buick 8 -somehow 42 -schwartzman 4 -leaking 36 -bigwig 32 -allows 96 -marche 32 -nervousness 34 -tune 32 -arose 32 -inscrutable 32 -wound-healing 32 -imagine 32 -seventh-largest 32 -consumer-group 32 -hal 14 -okla.-based 4 -castings 32 -tails 32 -carrying 32 -developing 32 -digest 4 -convoluted 32 -had 108 -froze 32 -condoms 32 -hay 4 -staggers 32 -insult 32 -frankel 4 -out-of-favor 32 -has 108 -toadstool 32 -hereditary 32 -refused 32 -dishwashers 32 -transamerica 4 -sleazy 32 -speakers 32 -managements 32 -healy 12 -k-12 4 -carroll 8 -staff 46 -telecommunications 46 -failure 36 -supervisors 32 -kit 32 -endangered 32 -flooded 32 -redistribution 32 -attempts 34 -dhl 4 -allegations 34 -strenuously 32 -endanger 32 -halsor 8 -penetrate 32 -re 36 -above-average 32 -resolve 100 -rj 10 -implementing 32 -gilts 32 -kempe 4 -t.j. 2 -newmark 14 -mathematically 32 -jamaica 4 -rt 4 -clutch 32 -weinger 12 -andrews 4 -centrifuge 32 -propping 32 -dosages 32 -doubly 32 -ponce 4 -savin 4 -market-oriented 32 -panicked 2 -lentivirus 4 -would-be 32 -mortages 32 -talent 32 -principals 32 -outlawing 32 -emits 32 -debated 32 -rid 32 -kilmer 4 -variability 32 -technician 32 -walkie-talkies 32 -philosophical 32 -considerably 32 -clerk 32 -charged 36 -imbroglio 4 -voicing 32 -liberate 32 -metall 6 -tighten 32 -winner 32 -one-time 32 -appalachian 4 -consumer-oriented 32 -monday 14 -considerable 32 -telecasting 32 -share-for-share 32 -charges 102 -novel 32 -kfv 4 -needing 32 -maid 32 -pharmaceutical 36 -a.t. 
4 -maneuvers 32 -smr 4 -sms 4 -balloons 40 -bypass 32 -posted 100 -vans 36 -marvels 32 -norwegian 6 -return 36 -nonunion 36 -healthy 32 -plaintive 32 -illegitimate 32 -disguised 32 -smf 12 -outstanding 32 -shelves 32 -portfolios 32 -bicentennial 32 -roommate 32 -rapid-fire 32 -chronology 36 -clues 32 -nurse 36 -shulman 12 -valve 32 -disgruntled 32 -quell 32 -flora 2 -gte 14 -radio-controlled 32 -f 12 -shoring 32 -week 38 -chimpanzee 32 -semi-annual 32 -shuffling 32 -formless 32 -daiwa 4 -acqusitions 32 -shelley 4 -intentions 32 -think 32 -ballyhooed 4 -thing 36 -gainers 34 -andreas 4 -unitek 4 -alliance 36 -hesitate 32 -raphael 4 -range 32 -lacks 32 -chain 36 -marilyn 6 -tchaikovsky 4 -disagreements 32 -rebel 36 -colleague 32 -colodny 12 -likud 4 -deliberately 32 -kinetic 32 -mayor 38 -r.j. 2 -faring 32 -convergent 4 -someone 34 -printemps 6 -invitation 32 -latitude 32 -paranoid 32 -cleveland-cliffs 10 -paranoia 32 -network-news 32 -thank 32 -boats 32 -conjures 32 -framingham 4 -arbitrage-trading 32 -dataquest 4 -toyoo 4 -baffled 32 -objection 32 -protective 32 -blackstone 4 -whichever 32 -bartlett 4 -buying 44 -uncharacteristically 32 -gathering 32 -klein 12 -rubber 44 -d-ram 4 -isles 4 -specifications 32 -fellas 32 -depositary 36 -reverted 32 -factual 32 -unguaranteed 2 -curtain 36 -consternation 32 -lately 34 -greenberg 12 -idolized 32 -publicized 32 -a-discounted 64 -dethrone 32 -jasinowski 12 -instrumental 32 -irritations 32 -bowery 4 -delaware 14 -somewhat 32 -infancy 32 -arbiter 4 -maxxum 4 -buoyed 32 -beesley 12 -ottoman 4 -accident 32 -legitimate 36 -haughty 32 -comparable 32 -unveil 32 -lonsdale 4 -mellifluously 32 -industries 108 -designing 32 -liberation 4 -initial 42 -orleans 4 -regulation 38 -allots 32 -ava 4 -howls 32 -surpassed 32 -ventron 4 -informed 34 -anti-ulcer 32 -monday-night 4 -mcgee 12 -slated 44 -legislation 40 -generating 32 -indulgence 32 -thus 42 -location 32 -continuation 32 -interleukin-2 32 -erect 32 -stamp 32 -interview 36 -eastdil 4 -anguished 32 -ups 6 -uniformly 32 -puffy 32 -wanted 32 -furor 32 -wallboard 36 -hughes 14 -hobor 12 -centronics 14 -insurers 36 -female 32 -sniff 32 -door 32 -delano 4 -baroque 4 -workings 32 -kiss 40 -disdaining 32 -upj 8 -spoken 32 -sculpture 4 -unsupported 32 -bulgarians 4 -ensures 32 -jones-irwin 4 -swamped 32 -baptista 12 -galoob 4 -mutuals 32 -new 46 -collapse 36 -investigate 32 -interest 110 -masterpieces 32 -revolve 32 -easier 32 -second-fiddle 32 -pso 8 -paying 36 -conflict-of-interest 32 -spinning 34 -rendering 32 -balloon 32 -psd 8 -complexities 32 -orchard 4 -supporting 32 -coming 46 -oakland 4 -regardless 42 -predicting 32 -revise 32 -lardner 4 -noon 32 -shortly 34 -resigning 32 -auto-company 32 -slumped 32 -big-bucks 32 -salzburg 4 -correspondents 32 -banking 108 -making 44 -caleb 4 -plantations 32 -commensurate 32 -endemic 32 -embroiled 32 -trying 34 -reputed 32 -money-supply 32 -multimillion-dollar 32 -usefulness 32 -colonel 32 -macfadden-bartell 4 -retailers 40 -discovery 32 -inquire 64 -fung 12 -fund 36 -route 36 -kodak 14 -coherence 32 -forge 32 -ameritech 4 -moammar 12 -forgn 12 -flashing 32 -slew 32 -honoring 32 -leaped 32 -jerk 32 -kemmons 6 -ncnb 4 -tastiest 32 -catastrophe 32 -paulette 4 -variety 32 -linchpin 32 -undoubtedly 34 -barclays 4 -tenure 32 -mobilize 4 -anderson 12 -montreal 12 -editor-in-chief 32 -bajarin 12 -flaky 32 -comparative 32 -perils 32 -subcommittees 32 -specializing 32 -price-fixing 32 -c 36 -grabbing 32 -estes 8 -schools 36 -westbound 32 -elizabeth 4 -offensive 32 -fights 36 
-double-digit 32 -beni 4 -nine-to-10-month 32 -headed 96 -instability 32 -north 44 -amr 14 -fertilizer 32 -thought 32 -hydro 4 -launched 98 -chaos 32 -amy 4 -amf 4 -gaining 32 -amd 4 -foals 2 -tcf 4 -amc 14 -ama 8 -bakes 4 -amo 12 -launches 36 -societe 4 -swings 36 -fast-food 32 -automation 32 -barrage 32 -inquiry 36 -exemption 32 -puts 32 -hadlock 4 -globalization 32 -seats 46 -takeda 4 -patrons 32 -prepared 32 -explosives 32 -ridgefield 8 -latam 12 -powerhouse 32 -dallas-fort 4 -dorm 6 -vidal 4 -tax-overhaul 32 -dingell 12 -ltd. 4 -octavia 4 -proportional 32 -possibility 32 -profits 38 -customizing 32 -double-a-minus/a-1-plus 32 -gecm 4 -fslic 4 -freely 32 -recreational 32 -alliant 4 -soldiers 32 -adorjan 8 -pole-vaulter 32 -hays 2 -fitzwater 12 -lazarus 4 -long-run 32 -inference 32 -freezeframe 4 -t-lymphotropic 4 -prominently 32 -toll 36 -freeport-mcmoran 4 -infringe 32 -acceptance 32 -conservation 36 -told 36 -record 44 -far-ranging 32 -tagliabue 4 -brick 32 -hanged 32 -accepts 40 -cofide 12 -liquor 34 -barry 12 -videos 32 -spell 32 -telequest 14 -milwaukee 12 -collected 32 -belief 32 -run-down 32 -definitive 34 -cci 12 -keyboards 32 -ambassadors 32 -bank 110 -ccb 12 -ccc 4 -intrigue 32 -cce 12 -evaporation 32 -mining 46 -provinces 32 -politician 32 -mound 32 -billionaire 32 -awake 32 -successor 32 -half-step 32 -duds 32 -identical 32 -waiving 32 -declare 32 -mount 36 -irritants 32 -wales 4 -taste 32 -association 36 -dispute 36 -updated 32 -ticket-purchase 32 -increasing 100 -builds 32 -windhaven 4 -graze 32 -directorial 32 -balls 32 -inexplicably 32 -centuries-old 32 -addicts 32 -inexplicable 32 -sea-land 4 -editorial-page 32 -updates 32 -soar 32 -refinance 32 -soap 32 -symposium 32 -willingness 32 -learn 32 -norm 32 -ever 34 -stately 32 -consequential 32 -editorial 36 -break-up 32 -anticipated 32 -over 110 -peanuts 32 -warns 32 -tasks 36 -adam 4 -remote 32 -technological 34 -adverse 32 -anticipates 32 -statue 32 -paris 12 -nomura 4 -settles 44 -hca 12 -gradison 4 -shindig 32 -erria 4 -third-ranked 32 -edelman 12 -reservations 36 -crude-oil 32 -palestinians 4 -mainstream 32 -biting 32 -thinking 32 -needham 4 -entertained 32 -restrictions 32 -contributions 32 -anaheim 12 -helpful 32 -memo 32 -thunderous 32 -classes 32 -possible 36 -was 108 -farmer-borrowers 32 -roped 32 -postpone 32 -unesco 4 -known 34 -subcabinet-level 32 -counterparts 32 -preserving 32 -basement 32 -pickens 12 -discounts 32 -latent 32 -effort 36 -minneapolis-based 4 -third-highest 32 -s.g. 4 -stardom 32 -possibly 34 -disconnect 32 -quarters 32 -iranian-syrian 4 -founders 32 -deficit-reduction 32 -itek 4 -mobile-home 32 -abalone-processing 32 -nations 36 -item 32 -pete 4 -brusqueness 32 -guerrillas 32 -hiccuped 32 -assertions 34 -barrios 4 -equity 36 -discussions 32 -daylong 32 -u.s.-soviet 4 -mid-40s 32 -minoxidil 32 -devices 36 -overabundance 32 -faulted 32 -hyper-reactivity 32 -jaws 32 -lyle 4 -unsuccessful 32 -waddell 8 -testify 32 -shifting 32 -cooperman 12 -speidell 4 -intention 32 -krug 12 -aln 8 -regarded 32 -audits 36 -glaser 4 -fourth 38 -motel 32 -apollo 4 -oversubscribed 32 -toothpaste 32 -small-appliance 32 -columbia-registered 4 -polypropylene 32 -spree 32 -diverting 32 -e-estimated 64 -pliable 32 -cashing 32 -therapeutic 32 -ldcs 4 -cabinet 32 -flounder 32 -antibody 36 -neo-darwinists 32 -burger 12 -active 32 -knows 36 -might 100 -feb. 
14 -safety-related 32 -weighty 32 -implored 32 -instructive 32 -violated 96 -mad 4 -slips 32 -mac 4 -rarity 32 -professing 32 -depositor 32 -deathbed 32 -parasitical 32 -max 12 -germany 12 -measures 32 -ain 32 -hungary 14 -arkin 4 -ridiculous 32 -germans 6 -s.a. 4 -tax 110 -demonstrate 32 -brand 32 -canadian 14 -tar 4 -it 110 -tap 32 -on-site 34 -kidneys 32 -middletown 8 -tan 4 -remember 34 -veress 4 -centigrade 4 -lewis 12 -duration 32 -appeals 44 -triple 32 -segment 32 -laced 32 -menus 32 -photo-processing 32 -difficult 32 -occupy 32 -long-awaited 32 -seamen 100 -inducements 32 -placate 32 -mass-manufactured 2 -degrees 32 -sidestep 32 -assistance 100 -eager 32 -animals 32 -amplifications 4 -incomplete 32 -persuade 32 -doctors 46 -desmond 6 -calgary 12 -immigrant 32 -tucson 4 -reallocation 32 -chairs 32 -securities-law 32 -retail 110 -board 110 -ave. 4 -schumann 4 -short-haul 32 -block 44 -redmond 4 -generated 32 -retain 32 -elma 4 -tommaso 8 -catholic 4 -says 110 -information-driven 32 -calvert 4 -fuzziness 32 -atmosphere 32 -denationalized 32 -resuscitate 32 -satellite-beamed 32 -talked 32 -f.g. 4 -jeans 34 -legends 32 -merit 32 -food-store 32 -raw-material 36 -acquistion 32 -ledger 32 -umm 4 -peace 36 -katz 4 -assembled 32 -six-packs 32 -bach 4 -tariff 32 -preston 12 -ballet 36 -speyer 4 -eurobonds 12 -specifically 42 -kids 32 -prohibition 32 -donations 32 -prior 106 -northwest 4 -fast-moving 32 -soon 98 -per-capita 10 -represented 32 -back 46 -pertinent 32 -monsanto 6 -labib 4 -preparation 32 -sandra 4 -long-form 32 -forest-products 32 -jose 4 -lancet 4 -rehnquist 4 -rubin 4 -converted 32 -foot 32 -ducks 32 -easton 4 -capital-spending 32 -wallow 32 -bedside 32 -dumez 6 -dilemma 36 -shortage 36 -meddlesome 32 -overhead 32 -heated 32 -stock 110 -afraid 32 -slacks 32 -six-figure 32 -wheelchair 32 -outskirts 32 -clouded 32 -pay-fone 14 -fanatics 32 -softdrink 32 -clev 4 -tasted 32 -savage 44 -capital-gains 32 -regulates 32 -boxes 32 -highways 32 -office-temporary 32 -bing 4 -objectionable 32 -hospital 46 -kaufman 14 -drawings 32 -nordmann 12 -outbursts 32 -self-consciously 32 -top-secret 32 -hastings 2 -deterioration 32 -regulated 32 -status 32 -conspicuous 32 -again 38 -oct. 
6 -justices 40 -virtually 32 -high-limit 32 -tractors 32 -stereo 32 -artitragers 32 -backfired 32 -dated 32 -proper 32 -vital 36 -propel 32 -gourmet 36 -level 100 -malt 32 -dates 32 -bids 34 -deer 32 -belted 32 -structure 32 -blur 32 -boat 36 -index 36 -ogil 4 -fleming 4 -covington 4 -bahamas 4 -doorbells 32 -ate 32 -blue 38 -bronze 32 -messianic 32 -cat-and-mouse 32 -hazards 32 -government-sponsored 34 -appealed 32 -impetus 32 -everyone 46 -pace 36 -provisions 34 -trilling 32 -draw 32 -spiced 32 -reassure 36 -pack 32 -intricacy 32 -renegade 32 -raked 32 -brokers 34 -dram 4 -bidu 4 -welfare 32 -drag 32 -neco 12 -harder 32 -neck 32 -maurier 12 -simpson 12 -superstitions 38 -container 36 -death-penalty 32 -burbank 4 -vindication 32 -reversed 32 -strategic-arms 32 -czm 8 -fully 34 -containment 32 -effective 32 -feed 36 -ineffective 32 -overbuilding 32 -compressor 32 -official 108 -confess 32 -single-engine 32 -bureaucratically 32 -yelling 32 -paratungstate 32 -berlin 4 -ourselves 32 -fpa 8 -why 38 -architects 36 -aware 32 -award 44 -whites 32 -rayner 8 -increased 42 -bti 8 -stronger-than-expected 40 -introductory 32 -reproduce 32 -defended 32 -hyped 32 -rotondo 12 -scandal-ridden 32 -increases 44 -carlo 4 -permit 32 -line 36 -bty 4 -margoshes 4 -garret 4 -illuminate 32 -away 36 -emphasize 36 -global 34 -small-minded 32 -discussed 32 -drake 4 -nearing 32 -liters 32 -deflated 32 -ga.-based 4 -cohens 4 -traveling 32 -cure 32 -profilic 32 -auto-plant 32 -garza 4 -blithe 32 -janet 4 -alternating 32 -palestinian 4 -balkanized 4 -significantly 32 -handsome 32 -model 36 -diego 4 -wake 36 -intones 32 -facilities 100 -unreasonable 32 -finders 32 -raising 44 -belgium 6 -refurbishing 32 -doron 4 -one-tenth 32 -joining 32 -presumptions 32 -ore.-based 4 -television-type 32 -superstations 32 -zico 6 -already 98 -broadcasts 32 -subscriber-line 32 -halliburton 12 -cheers 32 -ribbons 32 -versatile 38 -rope-like 32 -charge 40 -cynthia 4 -routinely 32 -beat 32 -domestic-appliance 32 -summit 42 -volunteered 32 -phased 32 -chry-plym 8 -underwritten 32 -sox 4 -incentive 36 -chartered 36 -sow 32 -ekofisk 4 -heightening 32 -represents 32 -son 34 -pumped 32 -tadeusz 8 -thrown 32 -vocal 36 -kimball 8 -loquacity 32 -catalyst 4 -reno 12 -made 44 -battery-powered 32 -leaned 32 -concluding 32 -bristol 4 -broadcastmail 32 -durenberger 4 -adrar 4 -orleans-style 4 -theresa 8 -government-recognized 32 -apostles 32 -reser 12 -chart-guided 32 -jamming 32 -teeth 36 -into 96 -woodhead 8 -stockpiles 32 -underwear 32 -censor 32 -calif-based 4 -intc 12 -including 34 -roadside 32 -conceived 32 -transformers 4 -tons 32 -nostalgia 32 -conversely 2 -judiciously 32 -poisonous 32 -tony 4 -tong 4 -tone 40 -kouril 12 -construed 32 -existing 32 -plaid 32 -coveted 32 -considered 34 -beseech 32 -frontal 32 -athletes 32 -u.s.-backed 4 -bendjedid 4 -reversion 32 -onetime 32 -memos 36 -cross 44 -airports 32 -hits 32 -goetabanken 4 -persists 32 -eroded 32 -n.j.-based 4 -alarmed 32 -product-liability 32 -misappropriated 32 -uncertain 32 -runners 32 -lowering 34 -assigned 34 -virtual 32 -mcauliffe 8 -ceo 4 -foreign-debt 32 -clothing 32 -algeria 12 -aviator 32 -waukegan 8 -creation 32 -liquid 32 -weighted 32 -cents 32 -imasco 4 -afghan 4 -stablilizers 32 -defense-electronics 32 -tyranny 32 -waves 32 -rust 36 -advest 6 -lee 14 -mercury 4 -crisanti 4 -semi-negative 32 -heart-shaped 32 -juries 32 -representation 32 -unleaded 34 -calder 4 -perot 4 -harris 14 -hint 32 -celestre 8 -governing 36 -silent 36 -animator 32 -laptop 32 -congressmen 32 -outgrowth 
32 -adequately 32 -fluidly 32 -abortions 36 -clucks 32 -munoz 12 -supplying 32 -cities/abc 4 -rapport 32 -amsterdam 8 -romance 34 -folly 32 -hasbro 4 -novels 36 -meteoric 32 -give 44 -statesmanlike 32 -pages 32 -urgings 32 -lindner 8 -blitz 32 -hardest 2 -guinea 32 -gonzalez 4 -kidder 14 -clear 44 -old-fashioned 32 -belt 36 -prove 34 -gain 44 -dangling 32 -reproducing 32 -kevin 6 -shotgun 32 -bell 46 -bramco 4 -nondurable 40 -atfc 8 -outcry 32 -clean 36 -tools 36 -television-newsroom 4 -supersedes 32 -reinhardt 4 -santa 12 -houghton 4 -political 98 -unified 36 -pounds 32 -accustomed 32 -presence 32 -achieving 34 -escalate 4 -unimaginable 32 -managers 36 -wits 32 -coffee 46 -mistress 32 -resorts 44 -marketeering 32 -intends 32 -kohlberg 4 -high-intensity 32 -polk 4 -trade 108 -excludes 32 -polo 36 -parody 32 -armageddon 4 -excluded 32 -stresses 32 -noisy 32 -clubs 32 -outlook 36 -absolve 32 -pipe 32 -chagrined 34 -pacts 36 -guards 32 -abrasives 32 -guaranty 4 -drusilla 6 -mailbox 32 -computer-aided 32 -authorization 32 -interbank 36 -excise 32 -accomplishments 32 -jerry 4 -alderman 4 -hootch 32 -whoop 32 -sported 32 -feds 32 -operationally 32 -suggest 32 -impelled 32 -withered 32 -tripling 32 -dinette 32 -libya 4 -chattanooga 8 -sailer 8 -accelerator 32 -coolidge 12 -dollars 96 -lorimar-telepictures 8 -disagreeing 32 -killing 32 -nazi 4 -arturo 4 -farther 32 -toyota 8 -fundamentals 32 -democrats 14 -slogan 32 -allow 32 -climbing 32 -freres 4 -aegean 4 -routes 32 -nicholas 4 -mom 32 -convenient 32 -abdicating 32 -vous 36 -fascination 32 -routed 4 -stringfellow 4 -analysis 32 -dubofsky 4 -corps 32 -priest 32 -lessening 32 -jungle 32 -fort 12 -himself 32 -reflect 96 -latest 44 -deterring 32 -chirac 4 -appoint 32 -backs 32 -perfection 32 -ford 14 -zooming 32 -problem 40 -gnp 4 -form 44 -exported 32 -caution 32 -wheeling 12 -qualify 36 -class 36 -fork 32 -boutique 34 -unsharklike 32 -ravages 32 -federal 110 -shake 32 -blustein 4 -emigrate 32 -integon 6 -shaky 32 -peace-loving 32 -silo 4 -screamers 32 -sufficient 34 -do-it-yourself 34 -astonishing 32 -seekers 32 -stimulating 32 -around 44 -selwyn 4 -danube 4 -fondy 8 -lean 34 -married 36 -guardia 4 -rudani 4 -consumption 98 -expansion-related 32 -leon 6 -khoo 4 -smuggling 32 -edmund 4 -intermediary 32 -adair 12 -anything 32 -paving 32 -intertwined 32 -cheaper 36 -congreso 4 -fonda 4 -ing 4 -cynical 32 -journalist 32 -ind 4 -oil 110 -rest 36 -bearing 32 -keran 4 -pih 4 -inn 4 -windfall 32 -ink 32 -mcentee 8 -ini 4 -fails 44 -headaches 32 -foreign 46 -divest 32 -int 4 -ins 4 -secord 12 -chemical 44 -embarked 32 -babylonian 2 -motorbiking 32 -journalism 32 -devaluation 32 -baghdad 4 -uncollectable 32 -dnb 4 -man 36 -pig 32 -verified 32 -barakat 4 -writing 32 -fungi 36 -mar 4 -islam 4 -informative 32 -forefathers 32 -withdrawals 32 -submersible 96 -arctic 4 -hoping 34 -some 110 -catskills 4 -close 44 -rial 8 -llx 4 -damp 32 -reopen 32 -presumably 34 -alcohol-related 32 -silver 36 -australian-based 4 -stun 4 -faster-growing 32 -circus 32 -pronouncing 32 -figuring 32 -pierced 32 -rentals 32 -fixed 32 -annuities 32 -residual 32 -staged 32 -labor 46 -cng 4 -southwest 14 -experimental 32 -bode 32 -excel 6 -susan 6 -affects 32 -eisenstadt 8 -discontinuing 32 -covertible 4 -bonus 32 -madcap 10 -expertise 32 -professional 38 -disappears 32 -uniforms 32 -zehli 4 -bogeymen 32 -overproducing 32 -yields 34 -flights 36 -laszlo 4 -confidential 34 -fasteners 32 -boycotts 32 -essentially 32 -assignment 32 -places 34 -mid-may 32 -well-done 32 -trimmed 96 -nov. 
14 -lurks 32 -climax 32 -clara 4 -curb 36 -snowstorms 32 -s 36 -discretion 32 -adm. 6 -list 32 -asserting 32 -insolvency 32 -unexpected 32 -exponent 32 -transports 32 -yourself 32 -lacy 38 -reagan 14 -wildly 32 -corruption 32 -impact 36 -singapore-based 4 -flx 4 -fly 32 -pont 4 -frey 12 -doctorate 32 -hereabouts 32 -pretoria 4 -perhaps 46 -drum-shaped 32 -suburban 32 -flu 32 -mainframes 32 -offered 96 -clossey 4 -aged 32 -disorders 32 -mysteries 32 -averaged 32 -sneakers 32 -anatoly 4 -tighter 32 -vista-based 4 -spontaneously 32 -detrick 4 -aftershocks 32 -proximity 32 -truffles 38 -memory-chip 32 -fundamentally 32 -hatcher 4 -sorry 32 -pursued 32 -layoff 32 -tire 46 -letters 40 -microphone 32 -outside 102 -b-week 16 -levels 36 -hollywood 4 -mascots 32 -thrived 32 -regrettable 32 -multiplied 32 -afonso 4 -storming 32 -morin 4 -goldsmith 12 -nato 4 -carrington 4 -ziemer 8 -branford 4 -restitution 32 -results 58 -mmr 8 -returns 44 -drummer-athletes 32 -kleenex 4 -reporting 32 -nutritious 32 -decade 32 -declassified 32 -aficionados 32 -fence-sitting 32 -goetzl 12 -absent 32 -grabs 32 -washington-based 6 -news-service 32 -nickeling 32 -elected 100 -proponents 34 -roughed 32 -impeding 32 -teacher 32 -teaches 32 -critique 32 -gangster-and-love 32 -effigy 32 -subjects 32 -inclined 32 -cotton 44 -sensibility 32 -irons 32 -myl 8 -irony 32 -unfair 32 -tronic 4 -cowboy 32 -feature 32 -squad 32 -kicking 32 -jaap 4 -spin 32 -committing 32 -neves 4 -erratically 4 -slowest 32 -saturated 32 -raw-materials 32 -nuclear-weapons 32 -hesitant 32 -iranians 4 -scrap 32 -reduced 96 -damon 4 -diceon 14 -nurtured 32 -sonnenblick 4 -mae 4 -apparatus 32 -organizing 32 -sacrosanct 32 -reduces 36 -two-decade-old 32 -sachs/kidder 4 -sufferers 32 -full-scale 32 -mediation 32 -commitment 96 -sat 32 -hired 32 -accommodations 32 -saw 32 -european-looking 4 -narrative 32 -dependable 32 -greenmail 32 -started 32 -bleak 32 -say 100 -sad 42 -incurring 32 -sag 32 -rabushka 12 -lampooned 32 -fiasco 36 -sam 12 -san 12 -sao 4 -sai 8 -examining 32 -preview 32 -wear 32 -paramount 36 -finely 32 -trouble 36 -tcw 8 -s.c. 
4 -nebulous 32 -skeptics 34 -whinney 4 -two-month 32 -illegal 32 -tubes 32 -deficiencies 32 -temperature 32 -weak 38 -stoicism 32 -industrie 4 -undershoot 32 -surpluses 32 -accomplishes 32 -starter 36 -discouraged 32 -ratepayers 32 -humphries 8 -footlick 4 -attracts 32 -crisis 36 -from 110 -x-rayed 4 -errors 32 -christianity 4 -inflationary 32 -carry-forwards 32 -assisted 34 -riots 32 -compare 32 -objects 32 -french-sounding 4 -train 32 -spawn 32 -kiichi 4 -diapers 32 -daffynition 8 -forwarders 32 -pix 4 -roots 32 -saudia 4 -bathing 32 -percent 40 -moonlighting 4 -sunbathing 32 -preoccupied 32 -refers 32 -skims 32 -frisch 12 -disputed 36 -sean 4 -saudis 4 -constitutes 32 -decorous 32 -conceal 32 -cacophonous 32 -machinery 38 -scoff 32 -introspective 32 -teradata 8 -logistics 32 -ammunition 32 -refuse 32 -francisville 8 -paralyzed 32 -write-down 32 -baltimore-based 6 -diverted 40 -japanese-designed 4 -lazard 4 -pessimistic 36 -unpleasant 32 -gousha 12 -vigor 32 -acquaint 32 -naval 40 -matrimony 32 -major 46 -egyptian 4 -biggest 38 -gringo 32 -western 46 -billion 36 -knowingly 32 -choate 4 -evenhandedly 32 -tax-cutting 32 -heyday 32 -further 46 -absorbing 4 -explicitly 32 -potatoes 32 -dollar-denominated 32 -counsels 32 -bankers 102 -fibers 32 -finesse 32 -diagrams 32 -coffee-growing 32 -believes 96 -nyt 8 -carnivorous 32 -listeners 32 -stadiums 32 -underestimate 32 -nyn 8 -recyclers 32 -phyb 8 -precipitous 32 -standby 32 -nye 4 -finkielstain 12 -stoughton 8 -retaliation 96 -believed 32 -cards 36 -commit 32 -bent 32 -motive 32 -crow 36 -et 32 -inexperienced 32 -immigrants 32 -revenue-raising 32 -campaigns 36 -adviser 36 -witty 32 -panache 2 -bend 36 -cabinet-level 32 -obligations 32 -concentration 32 -first-ranked 32 -world-wide 104 -complaints 32 -swearingen 4 -kline 4 -corporately 32 -appointee 32 -siegels 4 -nest 36 -topped 32 -pakistanis 4 -couldn 36 -streamline 32 -a-330 4 -rumors 32 -parenting 32 -franyo 4 -op-ed 32 -coal-burning 32 -unmitigated 32 -assume 34 -celebrity-laden 32 -adolescents 32 -rockford 6 -frequency 32 -t.nor 4 -witchlike 32 -inspected 32 -sellers 32 -mosle 4 -ious 4 -online 4 -made-for-television 32 -heroic 32 -marina 4 -agent 32 -fidel 4 -readied 32 -vierdanck 4 -hausfeld 12 -mcdonald 12 -graduates 32 -celebrate 32 -harpsichord 32 -gentlemanly 32 -jolted 32 -shantytown 32 -ministry 36 -fide 32 -governmental 32 -graduated 32 -engineer 32 -wholesale 36 -cash-and-stock 32 -thumb 32 -ballistic 32 -m.b.a. 4 -deliverable 32 -robitussin 4 -well-paid 32 -harden 36 -even 46 -tennis-racquets 32 -milburn 4 -iss 40 -coin 32 -sucralose 34 -seniority 32 -depicting 32 -torrent 32 -three-quarters 32 -discord 32 -return-on-investment 32 -beach 36 -obtained 32 -resent 32 -vodavi 4 -ata 4 -heileman 8 -mandated 32 -matched 32 -britain 14 -excessive 32 -inflation 46 -weakness 32 -advises 32 -tacit 32 -tcoma 4 -lotteries 32 -liquids 32 -swingers 4 -ransom 32 -griswold 8 -unshaken 32 -investigative 32 -formal 32 -dabbling 32 -college 46 -advised 32 -precision 32 -divestitures 32 -reasonable 32 -downgrading 32 -jockey 36 -wstf 8 -homeowner 32 -garrasi 4 -byzantine 4 -litigators 32 -red-haired 32 -hangover 32 -subpoena 32 -elvis 4 -side 100 -carajas 4 -pre-apprentices 32 -struggles 32 -authorities 98 -anti-lawyer 32 -compulsively 32 -alberta 4 -costly 34 -struggling 32 -florio 8 -ok. 
4 -attache 32 -pedal 32 -vigorous 32 -neanderthals 4 -reportedly 32 -senator 104 -yutaka 4 -carted 32 -publication 32 -securities-services 32 -catastrophic 32 -misguided 32 -fee-for-service 32 -furthermore 34 -abbes 4 -cswc 4 -unforeseen 32 -harmed 32 -steel-ingot 32 -accrue 32 -accost 32 -white-collar 32 -lsi 12 -terrorism 32 -sweet 36 -dancing 32 -elkay 4 -stages 32 -regions 40 -swiss-based 4 -uk 4 -uh 32 -formerly 98 -ut 4 -anti-black 32 -thatcher 4 -us 34 -up 46 -terrorist 32 -j&l 4 -expression 32 -non-recoverable 32 -taffeta 32 -jonathan 4 -litchfield 4 -h.m. 4 -disconnected 32 -matters 36 -oki 4 -rusch 4 -chunk 32 -chamber 36 -limitations 32 -leverage 32 -whims 32 -dixieland 4 -adequate 32 -analyst 100 -attachments 32 -friendships 32 -confrontationism 32 -rugged 32 -bamberger 12 -computervision 14 -maintenance 32 -melamed 12 -multifamily 32 -soviets 4 -decisive 32 -investment-bank 64 -hires 32 -undertook 32 -academia 32 -outline 36 -video 38 -rosenthal 4 -economies 32 -biocraft 10 -widening 44 -falsify 32 -holiday 46 -leahy 12 -confectionery 32 -meierfeld 8 -cooking 32 -gtx 8 -dryburgh 8 -spigot 32 -nearly 46 -distasteful 32 -hamburger 32 -crop 32 -bart 4 -preoccupies 32 -installations 32 -heightened 32 -headlining 32 -blissful 32 -questioner 32 -squander 32 -dispersed 32 -soldier 32 -edged 32 -enacted 32 -superscope 4 -cgp 4 -freed 32 -reasonably 32 -clarence 2 -bohai 4 -indignities 32 -amos 12 -cge 4 -edges 4 -shaken 32 -peelings 32 -amok 32 -dancers 32 -sheen 4 -dominating 32 -foreign-policy 32 -hire 32 -freer 32 -none-too-subtle 32 -kra 8 -centurion 4 -chileans 4 -lafayette 8 -station 36 -education 46 -nauslar 12 -decides 32 -christmas 4 -overproduction 32 -negotiated 32 -g. 4 -cie. 6 -discover 32 -rohatyn 4 -vaccines 32 -decided 36 -conciliatory 32 -accurate 32 \ No newline at end of file diff --git a/resources/nltk_data/tokenizers/punkt_tab/english/sent_starters.txt b/resources/nltk_data/tokenizers/punkt_tab/english/sent_starters.txt deleted file mode 100644 index 478fd833..00000000 --- a/resources/nltk_data/tokenizers/punkt_tab/english/sent_starters.txt +++ /dev/null @@ -1,39 +0,0 @@ -most -he -since -so -both -these -it -nevertheless -this -indeed -however -instead -under -similarly -some -though -while -when -in -despite -although -nonetheless -thus -there -if -the -nor -separately -moreover -but -they -yet -many -according -sales -among -meanwhile -even -i \ No newline at end of file diff --git a/scripts/generate.sh b/scripts/generate.sh deleted file mode 100644 index be0bee9b..00000000 --- a/scripts/generate.sh +++ /dev/null @@ -1 +0,0 @@ -python3 -m graphgen.generate --config_file graphgen/configs/graphgen_config.yaml --output_dir cache/ diff --git a/scripts/judge.sh b/scripts/judge.sh deleted file mode 100644 index f6fc134e..00000000 --- a/scripts/judge.sh +++ /dev/null @@ -1,2 +0,0 @@ -python3 -m graphgen.judge --input cache \ - --output cache/output/new_graph.graphml \ diff --git a/setup.py b/setup.py index 3dee7f8b..2b30f103 100644 --- a/setup.py +++ b/setup.py @@ -1,30 +1,31 @@ +# pylint: skip-file import os from setuptools import find_packages, setup pwd = os.path.dirname(__file__) -version_file = 'graphgen/version.py' +version_file = "graphgen/_version.py" def readme(): - with open(os.path.join(pwd, 'README.md'), encoding='utf-8') as f: + with open(os.path.join(pwd, "README.md"), encoding="utf-8") as f: content = f.read() return content def get_version(): - with open(os.path.join(pwd, version_file), 'r') as f: - exec(compile(f.read(), version_file, 'exec')) - 
return locals()['__version__'] + with open(os.path.join(pwd, version_file), "r") as f: + exec(compile(f.read(), version_file, "exec")) + return locals()["__version__"] def read_requirements(): lines = [] - with open('requirements.txt', 'r') as f: + with open("requirements.txt", "r") as f: for line in f.readlines(): - if line.startswith('#'): + if line.startswith("#"): continue - if 'textract' in line: + if "textract" in line: continue if len(line) > 0: lines.append(line) @@ -33,32 +34,27 @@ def read_requirements(): install_packages = read_requirements() -if __name__ == '__main__': +if __name__ == "__main__": setup( - name='graphg', + name="graphg", version=get_version(), - url='https://github.com/open-sciencelab/GraphGen', - description= # noqa E251 - 'GraphGen: Enhancing Supervised Fine-Tuning for LLMs with Knowledge-Driven Synthetic Data Generation', # noqa E501 + url="https://github.com/open-sciencelab/GraphGen", + description="GraphGen: Enhancing Supervised Fine-Tuning for LLMs with Knowledge-Driven Synthetic Data Generation", long_description=readme(), - long_description_content_type='text/markdown', - author='open-sciencelab', - author_email='open-sciencelab@pjlab.org.cn', + long_description_content_type="text/markdown", + author="open-sciencelab", + author_email="open-sciencelab@pjlab.org.cn", packages=find_packages(exclude=["models"]), - package_data={ - 'GraphGen': ['configs/*'] - }, + package_data={"GraphGen": ["configs/*"]}, include_package_data=True, install_requires=install_packages, classifiers=[ - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Programming Language :: Python :: 3.12', - 'Intended Audience :: Developers', - 'Intended Audience :: Education', - 'Intended Audience :: Science/Research', + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", ], - entry_points={'console_scripts': ['graphgen=graphgen.generate:main']}, + entry_points={"console_scripts": ["graphg=graphgen.generate:main"]}, ) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/e2e_tests/__init__.py b/tests/e2e_tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/e2e_tests/conftest.py b/tests/e2e_tests/conftest.py new file mode 100644 index 00000000..39cc4100 --- /dev/null +++ b/tests/e2e_tests/conftest.py @@ -0,0 +1,63 @@ +import json +import os +import subprocess +from pathlib import Path + + +def run_generate_test(tmp_path: Path, config_name: str): + """ + Run the generate test with the given configuration file and temporary path. + + Args: + tmp_path: pytest temporary path + config_name: configuration file name (e.g. 
"atomic_config.yaml") + + Returns: + tuple: (run_folder, json_files[0]) + """ + repo_root = Path(__file__).resolve().parents[2] + os.chdir(repo_root) + + config_path = repo_root / "graphgen" / "configs" / config_name + output_dir = tmp_path / "output" + output_dir.mkdir(parents=True, exist_ok=True) + + result = subprocess.run( + [ + "python", + "-m", + "graphgen.generate", + "--config_file", + str(config_path), + "--output_dir", + str(output_dir), + ], + capture_output=True, + text=True, + check=False, + ) + assert result.returncode == 0, f"Script failed with error: {result.stderr}" + + data_root = output_dir / "data" / "graphgen" + assert data_root.exists(), f"{data_root} does not exist" + run_folders = sorted(data_root.iterdir(), key=lambda p: p.name, reverse=True) + assert run_folders, f"No run folders found in {data_root}" + run_folder = run_folders[0] + + config_saved = run_folder / "config.yaml" + assert config_saved.exists(), f"{config_saved} not found" + + json_files = list(run_folder.glob("*.json")) + assert json_files, f"No JSON output found in {run_folder}" + + log_files = list(run_folder.glob("*.log")) + assert log_files, "No log file generated" + + with open(json_files[0], "r", encoding="utf-8") as f: + data = json.load(f) + assert ( + isinstance(data, list) and len(data) > 0 + ), "JSON output is empty or not a list" + + return run_folder, json_files[0] + diff --git a/tests/e2e_tests/test_generate_aggregated.py b/tests/e2e_tests/test_generate_aggregated.py new file mode 100644 index 00000000..faebf3ac --- /dev/null +++ b/tests/e2e_tests/test_generate_aggregated.py @@ -0,0 +1,7 @@ +from pathlib import Path + +from .conftest import run_generate_test + + +def test_generate_aggregated(tmp_path: Path): + run_generate_test(tmp_path, "aggregated_config.yaml") diff --git a/tests/e2e_tests/test_generate_atomic.py b/tests/e2e_tests/test_generate_atomic.py new file mode 100644 index 00000000..26e47532 --- /dev/null +++ b/tests/e2e_tests/test_generate_atomic.py @@ -0,0 +1,7 @@ +from pathlib import Path + +from .conftest import run_generate_test + + +def test_generate_atomic(tmp_path: Path): + run_generate_test(tmp_path, "atomic_config.yaml") diff --git a/tests/e2e_tests/test_generate_cot.py b/tests/e2e_tests/test_generate_cot.py new file mode 100644 index 00000000..b1ee74d9 --- /dev/null +++ b/tests/e2e_tests/test_generate_cot.py @@ -0,0 +1,7 @@ +from pathlib import Path + +from .conftest import run_generate_test + + +def test_generate_cot(tmp_path: Path): + run_generate_test(tmp_path, "cot_config.yaml") diff --git a/tests/e2e_tests/test_generate_multi_hop.py b/tests/e2e_tests/test_generate_multi_hop.py new file mode 100644 index 00000000..709f5918 --- /dev/null +++ b/tests/e2e_tests/test_generate_multi_hop.py @@ -0,0 +1,7 @@ +from pathlib import Path + +from .conftest import run_generate_test + + +def test_generate_multi_hop(tmp_path: Path): + run_generate_test(tmp_path, "multi_hop_config.yaml") diff --git a/tests/e2e_tests/test_generate_vqa.py b/tests/e2e_tests/test_generate_vqa.py new file mode 100644 index 00000000..796bc286 --- /dev/null +++ b/tests/e2e_tests/test_generate_vqa.py @@ -0,0 +1,7 @@ +from pathlib import Path + +from .conftest import run_generate_test + + +def test_generate_vqa(tmp_path: Path): + run_generate_test(tmp_path, "vqa_config.yaml") diff --git a/tests/integration_tests/__init__.py b/tests/integration_tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration_tests/models/llm/api/test_http_client.py 
b/tests/integration_tests/models/llm/api/test_http_client.py
new file mode 100644
index 00000000..d2996d1c
--- /dev/null
+++ b/tests/integration_tests/models/llm/api/test_http_client.py
@@ -0,0 +1,143 @@
+# pylint: disable=protected-access
+import math
+
+import pytest
+
+from graphgen.models.llm.api.http_client import HTTPClient
+
+
+class DummyTokenizer:
+    def encode(self, text: str):
+        # simple tokenization: split on spaces
+        return text.split()
+
+
+class _MockResponse:
+    def __init__(self, data):
+        self._data = data
+
+    def raise_for_status(self):
+        return None
+
+    async def json(self):
+        return self._data
+
+
+class _PostCtx:
+    def __init__(self, data):
+        self._resp = _MockResponse(data)
+
+    async def __aenter__(self):
+        return self._resp
+
+    async def __aexit__(self, exc_type, exc, tb):
+        return False
+
+
+class MockSession:
+    def __init__(self, data):
+        self._data = data
+        self.closed = False
+
+    def post(self, *args, **kwargs):
+        return _PostCtx(self._data)
+
+    async def close(self):
+        self.closed = True
+
+
+class DummyLimiter:
+    def __init__(self):
+        self.calls = []
+
+    async def wait(self, *args, **kwargs):
+        self.calls.append((args, kwargs))
+
+
+@pytest.mark.asyncio
+async def test_generate_answer_records_usage_and_uses_limiters():
+    # arrange
+    data = {
+        "choices": [{"message": {"content": "Hello world!"}}],
+        "usage": {"prompt_tokens": 3, "completion_tokens": 2, "total_tokens": 5},
+    }
+    client = HTTPClient(model="m", base_url="http://test")
+    client._session = MockSession(data)
+    client.tokenizer = DummyTokenizer()
+    client.system_prompt = "sys"
+    client.temperature = 0.0
+    client.top_p = 1.0
+    client.max_tokens = 10
+    client.filter_think_tags = lambda s: s.replace("<think>", "").replace(
+        "</think>", ""
+    )
+    rpm = DummyLimiter()
+    tpm = DummyLimiter()
+    client.rpm = rpm
+    client.tpm = tpm
+    client.request_limit = True
+
+    # act
+    out = await client.generate_answer("hi", history=["u1", "a1"])
+
+    # assert
+    assert out == "Hello world!"
+ assert client.token_usage[-1] == { + "prompt_tokens": 3, + "completion_tokens": 2, + "total_tokens": 5, + } + assert len(rpm.calls) == 1 + assert len(tpm.calls) == 1 + + +@pytest.mark.asyncio +async def test_generate_topk_per_token_parses_logprobs(): + # arrange + # create two token items with top_logprobs + data = { + "choices": [ + { + "logprobs": { + "content": [ + { + "token": "A", + "logprob": math.log(0.6), + "top_logprobs": [ + {"token": "A", "logprob": math.log(0.6)}, + {"token": "B", "logprob": math.log(0.4)}, + ], + }, + { + "token": "B", + "logprob": math.log(0.2), + "top_logprobs": [ + {"token": "B", "logprob": math.log(0.2)}, + {"token": "C", "logprob": math.log(0.8)}, + ], + }, + ] + } + } + ] + } + client = HTTPClient(model="m", base_url="http://test") + client._session = MockSession(data) + client.tokenizer = DummyTokenizer() + client.system_prompt = None + client.temperature = 0.0 + client.top_p = 1.0 + client.max_tokens = 10 + client.topk_per_token = 2 + + # act + tokens = await client.generate_topk_per_token("hi", history=[]) + + # assert + assert len(tokens) == 2 + # check probabilities and top_candidates + assert abs(tokens[0].prob - 0.6) < 1e-9 + assert abs(tokens[1].prob - 0.2) < 1e-9 + assert len(tokens[0].top_candidates) == 2 + assert tokens[0].top_candidates[0].text == "A" + assert tokens[0].top_candidates[1].text == "B" diff --git a/tests/integration_tests/models/llm/api/test_ollama_client.py b/tests/integration_tests/models/llm/api/test_ollama_client.py new file mode 100644 index 00000000..b20bc44c --- /dev/null +++ b/tests/integration_tests/models/llm/api/test_ollama_client.py @@ -0,0 +1,91 @@ +# pylint: disable=redefined-outer-name +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from graphgen.models import OllamaClient + + +# ----------------- fixture ----------------- +@pytest.fixture +def mock_ollama_pkg(): + """ + mock ollama + """ + ollama_mock = MagicMock() + ollama_mock.AsyncClient = AsyncMock + with patch.dict("sys.modules", {"ollama": ollama_mock}): + yield ollama_mock + + +@pytest.fixture +def ollama_client(mock_ollama_pkg) -> OllamaClient: + """ + Returns a default-configured OllamaClient with client.chat mocked + """ + cli = OllamaClient(model="gemma3", base_url="http://test:11434") + cli.tokenizer = MagicMock() + cli.tokenizer.encode = MagicMock(side_effect=lambda x: x.split()) + cli.client.chat = AsyncMock( + return_value={ + "message": {"content": "hi from ollama"}, + "prompt_eval_count": 10, + "eval_count": 5, + } + ) + return cli + + +@pytest.mark.asyncio +async def test_generate_answer_basic(ollama_client: OllamaClient): + ans = await ollama_client.generate_answer("hello") + assert ans == "hi from ollama" + ollama_client.client.chat.assert_awaited_once() + call = ollama_client.client.chat.call_args + assert call.kwargs["model"] == "gemma3" + assert call.kwargs["messages"][-1]["content"] == "hello" + assert call.kwargs["stream"] is False + + +@pytest.mark.asyncio +async def test_generate_answer_with_history(ollama_client: OllamaClient): + hist = [{"role": "user", "content": "prev"}] + await ollama_client.generate_answer("now", history=hist) + msgs = ollama_client.client.chat.call_args.kwargs["messages"] + assert msgs[-2]["content"] == "prev" + assert msgs[-1]["content"] == "now" + + +@pytest.mark.asyncio +async def test_token_usage_recorded(ollama_client: OllamaClient): + await ollama_client.generate_answer("test") + assert len(ollama_client.token_usage) == 1 + assert ollama_client.token_usage[0]["prompt_tokens"] == 
10 + assert ollama_client.token_usage[0]["completion_tokens"] == 5 + assert ollama_client.token_usage[0]["total_tokens"] == 15 + + +@pytest.mark.asyncio +async def test_rpm_tpm_limiter_called(ollama_client: OllamaClient): + ollama_client.request_limit = True + with patch.object(ollama_client.rpm, "wait", AsyncMock()) as rpm_mock, patch.object( + ollama_client.tpm, "wait", AsyncMock() + ) as tpm_mock: + + await ollama_client.generate_answer("limited") + rpm_mock.assert_awaited_once_with(silent=True) + tpm_mock.assert_awaited_once_with( + ollama_client.max_tokens + len("limited".split()), silent=True + ) + + +def test_import_error_when_ollama_missing(): + with patch.dict("sys.modules", {"ollama": None}): + with pytest.raises(ImportError, match="Ollama SDK is not installed"): + OllamaClient() + + +@pytest.mark.asyncio +async def test_generate_inputs_prob_not_implemented(ollama_client: OllamaClient): + with pytest.raises(NotImplementedError): + await ollama_client.generate_inputs_prob("any") diff --git a/tests/integration_tests/models/llm/local/test_hf_wrapper.py b/tests/integration_tests/models/llm/local/test_hf_wrapper.py new file mode 100644 index 00000000..ae23ce11 --- /dev/null +++ b/tests/integration_tests/models/llm/local/test_hf_wrapper.py @@ -0,0 +1,43 @@ +from unittest.mock import MagicMock + +import pytest + +from graphgen.models.llm.local.hf_wrapper import HuggingFaceWrapper + + +@pytest.fixture(autouse=True) +def mock_hf(monkeypatch): + mock_tokenizer = MagicMock() + mock_tokenizer.pad_token = None + mock_tokenizer.eos_token = "" + mock_tokenizer.eos_token_id = 0 + mock_tokenizer.decode.return_value = "hello" + mock_tokenizer.encode.return_value = [1, 2, 3] + monkeypatch.setattr( + "graphgen.models.llm.local.hf_wrapper.AutoTokenizer.from_pretrained", + lambda *a, **kw: mock_tokenizer, + ) + + mock_model = MagicMock() + mock_model.device = "cpu" + mock_model.generate.return_value = MagicMock( + __getitem__=lambda s, k: [0, 1, 2, 3], shape=(1, 4) + ) + mock_model.eval.return_value = None + monkeypatch.setattr( + "graphgen.models.llm.local.hf_wrapper.AutoModelForCausalLM.from_pretrained", + lambda *a, **kw: mock_model, + ) + + monkeypatch.setattr( + "graphgen.models.llm.local.hf_wrapper.torch.no_grad", MagicMock() + ) + + return mock_tokenizer, mock_model + + +@pytest.mark.asyncio +async def test_generate_answer(): + wrapper = HuggingFaceWrapper("fake-model") + result = await wrapper.generate_answer("hi") + assert isinstance(result, str) diff --git a/tests/integration_tests/models/partitioner/test_bfs_partitioner.py b/tests/integration_tests/models/partitioner/test_bfs_partitioner.py new file mode 100644 index 00000000..48558cc1 --- /dev/null +++ b/tests/integration_tests/models/partitioner/test_bfs_partitioner.py @@ -0,0 +1,81 @@ +import tempfile + +import pytest + +from graphgen.bases.datatypes import Community +from graphgen.models import BFSPartitioner, NetworkXStorage + + +@pytest.mark.asyncio +async def test_empty_graph(): + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage(working_dir=tmpdir, namespace="empty") + partitioner = BFSPartitioner() + communities = await partitioner.partition(storage, max_units_per_community=5) + assert communities == [] + + +@pytest.mark.asyncio +async def test_single_node(): + nodes = [("A", {"desc": "alone"})] + edges = [] + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage(working_dir=tmpdir, namespace="single_node") + + for nid, ndata in nodes: + await storage.upsert_node(nid, ndata) + for src, 
tgt, edata in edges: + await storage.upsert_edge(src, tgt, edata) + + partitioner = BFSPartitioner() + communities: list[Community] = await partitioner.partition( + storage, max_units_per_community=5 + ) + assert len(communities) == 1 + assert communities[0].nodes == ["A"] + assert communities[0].edges == [] + + +@pytest.mark.asyncio +async def test_small_graph(): + """ + 0 - 1 - 2 + | | | + 3 - 4 - 5 + 6 nodes & 7 edges, max_units=4 => at least 3 communities + """ + nodes = [(str(i), {"desc": f"node{i}"}) for i in range(6)] + edges = [ + ("0", "1", {"desc": "e01"}), + ("1", "2", {"desc": "e12"}), + ("0", "3", {"desc": "e03"}), + ("1", "4", {"desc": "e14"}), + ("2", "5", {"desc": "e25"}), + ("3", "4", {"desc": "e34"}), + ("4", "5", {"desc": "e45"}), + ] + + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage(working_dir=tmpdir, namespace="small_graph") + + for nid, ndata in nodes: + await storage.upsert_node(nid, ndata) + for src, tgt, edata in edges: + await storage.upsert_edge(src, tgt, edata) + + partitioner = BFSPartitioner() + communities: list[Community] = await partitioner.partition( + storage, max_units_per_community=4 + ) + + assert len(communities) <= 5 + + all_nodes = set() + all_edges = set() + for c in communities: + assert len(c.nodes) + len(c.edges) <= 4 + all_nodes.update(c.nodes) + all_edges.update(c.edges) + + assert all_nodes == {str(i) for i in range(6)} + assert len(all_edges) == 7 diff --git a/tests/integration_tests/models/partitioner/test_dfs_partitioner.py b/tests/integration_tests/models/partitioner/test_dfs_partitioner.py new file mode 100644 index 00000000..536e139a --- /dev/null +++ b/tests/integration_tests/models/partitioner/test_dfs_partitioner.py @@ -0,0 +1,89 @@ +import tempfile + +import pytest + +from graphgen.bases.datatypes import Community +from graphgen.models import DFSPartitioner, NetworkXStorage + + +@pytest.mark.asyncio +async def test_empty_graph(): + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage( + working_dir=tmpdir, + namespace="empty", + ) + partitioner = DFSPartitioner() + communities = await partitioner.partition(storage, max_units_per_community=5) + assert communities == [] + + +@pytest.mark.asyncio +async def test_single_node(): + nodes = [("A", {"desc": "alone"})] + edges = [] + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage( + working_dir=tmpdir, + namespace="single_node", + ) + + for nid, ndata in nodes: + await storage.upsert_node(nid, ndata) + for src, tgt, edata in edges: + await storage.upsert_edge(src, tgt, edata) + + partitioner = DFSPartitioner() + communities: list[Community] = await partitioner.partition( + storage, max_units_per_community=5 + ) + assert len(communities) == 1 + assert communities[0].nodes == ["A"] + assert communities[0].edges == [] + + +@pytest.mark.asyncio +async def test_small_graph(): + """ + 0 - 1 - 2 + | | | + 3 - 4 - 5 + 6 nodes & 7 edges, max_units=4 => 3 communities + """ + nodes = [(str(i), {"desc": f"node{i}"}) for i in range(6)] + edges = [ + ("0", "1", {"desc": "e01"}), + ("1", "2", {"desc": "e12"}), + ("0", "3", {"desc": "e03"}), + ("1", "4", {"desc": "e14"}), + ("2", "5", {"desc": "e25"}), + ("3", "4", {"desc": "e34"}), + ("4", "5", {"desc": "e45"}), + ] + + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage( + working_dir=tmpdir, + namespace="small_graph", + ) + + for nid, ndata in nodes: + await storage.upsert_node(nid, ndata) + for src, tgt, edata in edges: + await storage.upsert_edge(src, 
tgt, edata) + + partitioner = DFSPartitioner() + + communities: list[Community] = await partitioner.partition( + storage, max_units_per_community=4 + ) + + assert len(communities) <= 5 + all_nodes = set() + all_edges = set() + for c in communities: + assert len(c.nodes) + len(c.edges) <= 4 + all_nodes.update(c.nodes) + all_edges.update(c.edges) + assert all_nodes == {str(i) for i in range(6)} + assert len(all_edges) == 7 diff --git a/tests/integration_tests/models/partitioner/test_ece_partitioner.py b/tests/integration_tests/models/partitioner/test_ece_partitioner.py new file mode 100644 index 00000000..93fc2ea1 --- /dev/null +++ b/tests/integration_tests/models/partitioner/test_ece_partitioner.py @@ -0,0 +1,202 @@ +import tempfile + +import pytest + +from graphgen.bases.datatypes import Community +from graphgen.models import ECEPartitioner, NetworkXStorage + + +@pytest.mark.asyncio +async def test_ece_empty_graph(): + """ECE partitioning on an empty graph should return an empty community list.""" + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage(working_dir=tmpdir, namespace="empty") + partitioner = ECEPartitioner() + communities = await partitioner.partition( + storage, max_units_per_community=5, unit_sampling="random" + ) + assert communities == [] + + +@pytest.mark.asyncio +async def test_ece_single_node(): + """A single node must be placed in exactly one community under any edge-sampling strategy.""" + nodes = [("A", {"desc": "alone", "length": 10, "loss": 0.1})] + + for strategy in ("random", "min_loss", "max_loss"): + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage( + working_dir=tmpdir, namespace=f"single_{strategy}" + ) + for nid, ndata in nodes: + await storage.upsert_node(nid, ndata) + + partitioner = ECEPartitioner() + communities: list[Community] = await partitioner.partition( + storage, max_units_per_community=5, unit_sampling=strategy + ) + assert len(communities) == 1 + assert communities[0].nodes == ["A"] + assert communities[0].edges == [] + + +@pytest.mark.asyncio +async def test_ece_small_graph_random(): + """ + 2x3 grid graph: + 0 — 1 — 2 + | | | + 3 — 4 — 5 + 6 nodes & 7 edges, max_units=4 => at least 3 communities expected with random sampling. 
+ """ + nodes = [(str(i), {"desc": f"node{i}", "length": 10}) for i in range(6)] + edges = [ + ("0", "1", {"desc": "e01", "loss": 0.1, "length": 5}), + ("1", "2", {"desc": "e12", "loss": 0.2, "length": 5}), + ("0", "3", {"desc": "e03", "loss": 0.3, "length": 5}), + ("1", "4", {"desc": "e14", "loss": 0.4, "length": 5}), + ("2", "5", {"desc": "e25", "loss": 0.5, "length": 5}), + ("3", "4", {"desc": "e34", "loss": 0.6, "length": 5}), + ("4", "5", {"desc": "e45", "loss": 0.7, "length": 5}), + ] + + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage(working_dir=tmpdir, namespace="small_random") + for nid, ndata in nodes: + await storage.upsert_node(nid, ndata) + for src, tgt, edata in edges: + await storage.upsert_edge(src, tgt, edata) + + partitioner = ECEPartitioner() + communities: list[Community] = await partitioner.partition( + storage, max_units_per_community=4, unit_sampling="random" + ) + + # Basic integrity checks + all_nodes = set() + all_edges = set() + for c in communities: + assert len(c.nodes) + len(c.edges) <= 4 + all_nodes.update(c.nodes) + all_edges.update((u, v) if u < v else (v, u) for u, v in c.edges) + assert all_nodes == {str(i) for i in range(6)} + assert len(all_edges) == 7 + + +@pytest.mark.asyncio +async def test_ece_small_graph_min_loss(): + """ + Same grid graph, but using min_loss sampling. + Edges with lower loss should be preferred during community expansion. + """ + nodes = [ + (str(i), {"desc": f"node{i}", "length": 10, "loss": i * 0.1}) for i in range(6) + ] + edges = [ + ("0", "1", {"desc": "e01", "loss": 0.05, "length": 5}), + ("1", "2", {"desc": "e12", "loss": 0.10, "length": 5}), + ("0", "3", {"desc": "e03", "loss": 0.15, "length": 5}), + ("1", "4", {"desc": "e14", "loss": 0.20, "length": 5}), + ("2", "5", {"desc": "e25", "loss": 0.25, "length": 5}), + ("3", "4", {"desc": "e34", "loss": 0.30, "length": 5}), + ("4", "5", {"desc": "e45", "loss": 0.35, "length": 5}), + ] + + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage(working_dir=tmpdir, namespace="small_min") + for nid, ndata in nodes: + await storage.upsert_node(nid, ndata) + for src, tgt, edata in edges: + await storage.upsert_edge(src, tgt, edata) + + partitioner = ECEPartitioner() + communities: list[Community] = await partitioner.partition( + storage, max_units_per_community=4, unit_sampling="min_loss" + ) + + all_nodes = set() + all_edges = set() + for c in communities: + assert len(c.nodes) + len(c.edges) <= 4 + all_nodes.update(c.nodes) + all_edges.update((u, v) if u < v else (v, u) for u, v in c.edges) + assert all_nodes == {str(i) for i in range(6)} + assert len(all_edges) == 7 + + +@pytest.mark.asyncio +async def test_ece_small_graph_max_loss(): + """ + Same grid graph, but using max_loss sampling. + Edges with higher loss should be preferred during community expansion. 
+ """ + nodes = [ + (str(i), {"desc": f"node{i}", "length": 10, "loss": (5 - i) * 0.1}) + for i in range(6) + ] + edges = [ + ("0", "1", {"desc": "e01", "loss": 0.35, "length": 5}), + ("1", "2", {"desc": "e12", "loss": 0.30, "length": 5}), + ("0", "3", {"desc": "e03", "loss": 0.25, "length": 5}), + ("1", "4", {"desc": "e14", "loss": 0.20, "length": 5}), + ("2", "5", {"desc": "e25", "loss": 0.15, "length": 5}), + ("3", "4", {"desc": "e34", "loss": 0.10, "length": 5}), + ("4", "5", {"desc": "e45", "loss": 0.05, "length": 5}), + ] + + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage(working_dir=tmpdir, namespace="small_max") + for nid, ndata in nodes: + await storage.upsert_node(nid, ndata) + for src, tgt, edata in edges: + await storage.upsert_edge(src, tgt, edata) + + partitioner = ECEPartitioner() + communities: list[Community] = await partitioner.partition( + storage, max_units_per_community=4, unit_sampling="max_loss" + ) + + all_nodes = set() + all_edges = set() + for c in communities: + assert len(c.nodes) + len(c.edges) <= 4 + all_nodes.update(c.nodes) + all_edges.update((u, v) if u < v else (v, u) for u, v in c.edges) + assert all_nodes == {str(i) for i in range(6)} + assert len(all_edges) == 7 + + +@pytest.mark.asyncio +async def test_ece_max_tokens_limit(): + """Ensure max_tokens_per_community is respected.""" + # node id -> data + node_data = {"A": {"length": 3000}, "B": {"length": 3000}, "C": {"length": 3000}} + # edge list + edges = [("A", "B", {"loss": 0.1, "length": 2000})] + + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage(working_dir=tmpdir, namespace="token_limit") + for nid, ndata in node_data.items(): + await storage.upsert_node(nid, ndata) + for src, tgt, edata in edges: + await storage.upsert_edge(src, tgt, edata) + + partitioner = ECEPartitioner() + communities: list[Community] = await partitioner.partition( + storage, + max_units_per_community=10, + max_tokens_per_community=5000, # 1 node (3000) + 1 edge (2000) = 5000 + unit_sampling="random", + ) + + # With a 5000-token budget we need at least two communities + assert len(communities) >= 2 + + # helper: quick edge lookup + edge_lens = {(u, v): d["length"] for u, v, d in edges} + edge_lens.update({(v, u): d["length"] for u, v, d in edges}) # undirected + + for c in communities: + node_tokens = sum(node_data[n]["length"] for n in c.nodes) + edge_tokens = sum(edge_lens[e] for e in c.edges) + assert node_tokens + edge_tokens <= 5000 diff --git a/tests/integration_tests/models/reader/test_mineru_parser.py b/tests/integration_tests/models/reader/test_mineru_parser.py new file mode 100644 index 00000000..c30863f8 --- /dev/null +++ b/tests/integration_tests/models/reader/test_mineru_parser.py @@ -0,0 +1,41 @@ +import os +from pathlib import Path + +from graphgen.models.reader.pdf_reader import MinerUParser + + +def test_check_bin(): + """Ensure mineru CLI is available.""" + MinerUParser() + + +def test_parse_pdf(): + """Parse a real PDF and verify basic structure.""" + repo_root = Path(__file__).resolve().parents[4] + + sample_pdf = os.path.join(repo_root, "resources", "input_examples", "pdf_demo.pdf") + parser = MinerUParser() + blocks = parser.parse_pdf(sample_pdf, device="cpu", method="auto") + + assert isinstance(blocks, list) + assert blocks, "At least one block expected" + + text_blocks = [b for b in blocks if b.get("type") == "text"] + assert text_blocks, "No text block found" + + first = text_blocks[0] + assert "text" in first + assert isinstance(first["content"], 
str) + assert first["content"].strip(), "Empty text content" + + +def test_empty_pdf(tmp_path: Path) -> None: + """Gracefully handle blank PDF.""" + empty = tmp_path / "empty.pdf" + empty.write_bytes(b"%PDF-1.4\n%%EOF\n") # syntactically valid, no content + + parser = MinerUParser() + blocks = parser.parse_pdf(empty, device="cpu") + + # Empty list or list with empty text block are both acceptable + assert isinstance(blocks, list) diff --git a/tests/integration_tests/models/splitter/test_character_splitter.py b/tests/integration_tests/models/splitter/test_character_splitter.py new file mode 100644 index 00000000..5d41547d --- /dev/null +++ b/tests/integration_tests/models/splitter/test_character_splitter.py @@ -0,0 +1,30 @@ +import pytest + +from graphgen.models.splitter.character_splitter import CharacterSplitter + + +@pytest.mark.parametrize( + "text,chunk_size,chunk_overlap,expected", + [ + ( + "This is a test.\n\nThis is only a test.\n\nIn the event of an actual emergency...", + 25, + 5, + [ + "This is a test.", + "This is only a test.", + "In the event of an actual emergency...", + ], + ), + ], +) +def test_character_splitter(text, chunk_size, chunk_overlap, expected): + splitter = CharacterSplitter( + separator="\n\n", + is_separator_regex=False, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + keep_separator=False, + ) + chunks = splitter.split_text(text) + assert chunks == expected diff --git a/tests/integration_tests/models/splitter/test_markdown_splitter.py b/tests/integration_tests/models/splitter/test_markdown_splitter.py new file mode 100644 index 00000000..8d02e1b2 --- /dev/null +++ b/tests/integration_tests/models/splitter/test_markdown_splitter.py @@ -0,0 +1,40 @@ +from graphgen.models.splitter.markdown_splitter import MarkdownTextRefSplitter + + +def test_split_markdown_structures(): + md = ( + "# Header1\n\n" + "Some introduction here.\n\n" + "## Header2\n\n" + "```python\nprint('hello')\n```\n" + "Paragraph under code block.\n\n" + "***\n" + "### Header3\n\n" + "More text after horizontal rule.\n\n" + "#### Header4\n\n" + "Final paragraph." + ) + + splitter = MarkdownTextRefSplitter( + chunk_size=120, + chunk_overlap=0, + keep_separator=True, + is_separator_regex=True, + ) + chunks = splitter.split_text(md) + assert len(chunks) > 1 + + for chk in chunks: + assert len(chk) <= 120 + + assert any("## Header2" in c for c in chunks) + assert any("***" in c for c in chunks) + assert any("```" in c for c in chunks) + + +def test_split_size_less_than_single_char(): + """Edge case: chunk_size is smaller than any segment; should still return the original text.""" + short = "# A\n\nB" + splitter = MarkdownTextRefSplitter(chunk_size=1, chunk_overlap=0) + chunks = splitter.split_text(short) + assert "".join(chunks) == short diff --git a/tests/integration_tests/models/splitter/test_recursive_character_splitter.py b/tests/integration_tests/models/splitter/test_recursive_character_splitter.py new file mode 100644 index 00000000..7d104f3e --- /dev/null +++ b/tests/integration_tests/models/splitter/test_recursive_character_splitter.py @@ -0,0 +1,49 @@ +from graphgen.models.splitter.recursive_character_splitter import ( + ChineseRecursiveTextSplitter, + RecursiveCharacterSplitter, +) + + +def test_split_english_paragraph(): + text = ( + "Natural language processing (NLP) is a subfield of linguistics, computer science, " + "and artificial intelligence. It focuses on the interaction between computers and " + "humans through natural language. 
The ultimate objective of NLP is to read, decipher, " + "understand, and make sense of human languages in a manner that is valuable.\n\n" + "Most NLP techniques rely on machine learning." + ) + + splitter = RecursiveCharacterSplitter( + chunk_size=150, + chunk_overlap=0, + keep_separator=True, + is_separator_regex=False, + ) + chunks = splitter.split_text(text) + + assert len(chunks) > 1 + for chk in chunks: + assert len(chk) <= 150 + + +def test_split_chinese_with_punctuation(): + text = ( + "自然语言处理是人工智能的重要分支。它研究能实现人与计算机之间用自然语言" + "进行有效通信的各种理论和方法!融合语言学、计算机科学、数学于一体?" + "近年来,深度学习极大推动了NLP的发展;Transformer、BERT、GPT等模型层出不穷," + ",,,甚至出现了多模态大模型。\n\n" + "未来,NLP 将继续向通用人工智能迈进。" + ) + + splitter = ChineseRecursiveTextSplitter( + chunk_size=60, + chunk_overlap=0, + keep_separator=True, + is_separator_regex=True, + ) + chunks = splitter.split_text(text) + + assert len(chunks) > 1 + for chk in chunks: + assert len(chk) <= 60 + assert "\n\n\n" not in chk diff --git a/tests/integration_tests/test_engine.py b/tests/integration_tests/test_engine.py new file mode 100644 index 00000000..6a389e42 --- /dev/null +++ b/tests/integration_tests/test_engine.py @@ -0,0 +1,78 @@ +import pytest + +from graphgen.engine import Context, Engine, op + +engine = Engine(max_workers=2) + + +def test_simple_dag(capsys): + """Verify the DAG A->B/C->D execution results and print order.""" + ctx = Context() + + @op("A") + def op_a(self, ctx): + print("Running A") + ctx.set("A", 1) + + @op("B", deps=["A"]) + def op_b(self, ctx): + print("Running B") + ctx.set("B", ctx.get("A") + 1) + + @op("C", deps=["A"]) + def op_c(self, ctx): + print("Running C") + ctx.set("C", ctx.get("A") + 2) + + @op("D", deps=["B", "C"]) + def op_d(self, ctx): + print("Running D") + ctx.set("D", ctx.get("B") + ctx.get("C")) + + # Explicitly list the nodes to run; avoid relying on globals(). + ops = [op_a, op_b, op_c, op_d] + engine.run(ops, ctx) + + # Assert final results. + assert ctx["A"] == 1 + assert ctx["B"] == 2 + assert ctx["C"] == 3 + assert ctx["D"] == 5 + + # Assert print order: A must run before B and C; D must run after B and C. 
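+ # B and C both depend only on A, so with Engine(max_workers=2) they may be scheduled
+ # concurrently and print in either relative order; only "A first" and "D last" are asserted.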
+ captured = capsys.readouterr().out.strip().splitlines() + assert "Running A" in captured + assert "Running B" in captured + assert "Running C" in captured + assert "Running D" in captured + + a_idx = next(i for i, line in enumerate(captured) if "Running A" in line) + b_idx = next(i for i, line in enumerate(captured) if "Running B" in line) + c_idx = next(i for i, line in enumerate(captured) if "Running C" in line) + d_idx = next(i for i, line in enumerate(captured) if "Running D" in line) + + assert a_idx < b_idx + assert a_idx < c_idx + assert d_idx > b_idx + assert d_idx > c_idx + + +def test_cyclic_detection(): + """A cyclic dependency should raise ValueError.""" + ctx = Context() + + @op("X", deps=["Y"]) + def op_x(self, ctx): + pass + + @op("Y", deps=["X"]) + def op_y(self, ctx): + pass + + ops = [op_x, op_y] + with pytest.raises(ValueError, match="Cyclic dependencies"): + engine.run(ops, ctx) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/uv.lock b/uv.lock new file mode 100644 index 00000000..a02a6a37 --- /dev/null +++ b/uv.lock @@ -0,0 +1,3 @@ +version = 1 +revision = 3 +requires-python = ">=3.10" diff --git a/webui/app.py b/webui/app.py index 7e6f4ebb..98b02601 100644 --- a/webui/app.py +++ b/webui/app.py @@ -1,27 +1,26 @@ +import json import os import sys -import json import tempfile +from importlib.resources import files -import pandas as pd import gradio as gr - -from gradio_i18n import Translate, gettext as _ - -from base import GraphGenParams -from test_api import test_api_connection -from cache_utils import setup_workspace, cleanup_workspace -from count_tokens import count_tokens - -# pylint: disable=wrong-import-position -root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +import ray +from dotenv import load_dotenv + +from graphgen.engine import Engine +from graphgen.operators import operators +from graphgen.utils import CURRENT_LOGGER_VAR, set_logger +from webui.base import WebuiParams +from webui.i18n import Translate +from webui.i18n import gettext as _ +from webui.test_api import test_api_connection +from webui.utils import cleanup_workspace, count_tokens, preview_file, setup_workspace + +root_dir = files("webui").parent sys.path.append(root_dir) -from graphgen.graphgen import GraphGen -from graphgen.models import OpenAIModel, Tokenizer, TraverseStrategy -from graphgen.models.llm.limitter import RPM, TPM -from graphgen.utils import set_logger - +load_dotenv() css = """ .center-row { @@ -32,220 +31,213 @@ """ -def init_graph_gen(config: dict, env: dict) -> GraphGen: - # Set up working directory - log_file, working_dir = setup_workspace(os.path.join(root_dir, "cache")) - - set_logger(log_file, if_stream=False) - graph_gen = GraphGen( - working_dir=working_dir - ) - - # Set up LLM clients - graph_gen.synthesizer_llm_client = OpenAIModel( - model_name=env.get("SYNTHESIZER_MODEL", ""), - base_url=env.get("SYNTHESIZER_BASE_URL", ""), - api_key=env.get("SYNTHESIZER_API_KEY", ""), - request_limit=True, - rpm= RPM(env.get("RPM", 1000)), - tpm= TPM(env.get("TPM", 50000)), - ) - - graph_gen.trainee_llm_client = OpenAIModel( - model_name=env.get("TRAINEE_MODEL", ""), - base_url=env.get("TRAINEE_BASE_URL", ""), - api_key=env.get("TRAINEE_API_KEY", ""), - request_limit=True, - rpm= RPM(env.get("RPM", 1000)), - tpm= TPM(env.get("TPM", 50000)), - ) - - graph_gen.tokenizer_instance = Tokenizer( - config.get("tokenizer", "cl100k_base")) - - strategy_config = config.get("traverse_strategy", {}) - graph_gen.traverse_strategy = 
TraverseStrategy( - qa_form=config.get("qa_form"), - expand_method=strategy_config.get("expand_method"), - bidirectional=strategy_config.get("bidirectional"), - max_extra_edges=strategy_config.get("max_extra_edges"), - max_tokens=strategy_config.get("max_tokens"), - max_depth=strategy_config.get("max_depth"), - edge_sampling=strategy_config.get("edge_sampling"), - isolated_node_strategy=strategy_config.get("isolated_node_strategy"), - loss_strategy=str(strategy_config.get("loss_strategy")) - ) +def _get_partition_params(params: WebuiParams): + method = params.partition_method + if method == "dfs": + return { + "max_units_per_community": params.dfs_max_units, + } + if method == "bfs": + return { + "max_units_per_community": params.bfs_max_units, + } + if method == "leiden": + return { + "max_size": params.leiden_max_size, + "use_lcc": params.leiden_use_lcc, + "random_seed": params.leiden_random_seed, + } + # ece + return { + "max_units_per_community": params.ece_max_units, + "min_units_per_community": params.ece_min_units, + "max_tokens_per_community": params.ece_max_tokens, + "unit_sampling": params.ece_unit_sampling, + } - return graph_gen # pylint: disable=too-many-statements -def run_graphgen(params, progress=gr.Progress()): - def sum_tokens(client): - return sum(u["total_tokens"] for u in client.token_usage) - - config = { - "if_trainee_model": params.if_trainee_model, - "input_file": params.input_file, - "tokenizer": params.tokenizer, - "qa_form": params.qa_form, - "web_search": False, - "quiz_samples": params.quiz_samples, - "traverse_strategy": { - "bidirectional": params.bidirectional, - "expand_method": params.expand_method, - "max_extra_edges": params.max_extra_edges, - "max_tokens": params.max_tokens, - "max_depth": params.max_depth, - "edge_sampling": params.edge_sampling, - "isolated_node_strategy": params.isolated_node_strategy, - "loss_strategy": params.loss_strategy +def run_graphgen(params: WebuiParams, progress=gr.Progress()): + # 1. Setup Workspace + log_file, working_dir = setup_workspace(os.path.join(root_dir, "cache")) + driver_logger = set_logger(log_file, "GraphGeb", if_stream=True) + CURRENT_LOGGER_VAR.set(driver_logger) + + # 2. Setup Environment Variables for Ray Actors/LLM Init + # The refactored code relies on env vars in graphgen/common/init_llm.py + os.environ["SYNTHESIZER_BACKEND"] = "openai_api" # Assuming OpenAI compatible API + os.environ["SYNTHESIZER_BASE_URL"] = params.synthesizer_url + os.environ["SYNTHESIZER_API_KEY"] = params.api_key + os.environ["SYNTHESIZER_MODEL"] = params.synthesizer_model + os.environ["RPM"] = str(params.rpm) + os.environ["TPM"] = str(params.tpm) + os.environ["TOKENIZER_MODEL"] = params.tokenizer + + if params.if_trainee_model: + os.environ["TRAINEE_BACKEND"] = "openai_api" + os.environ["TRAINEE_BASE_URL"] = params.trainee_url + os.environ["TRAINEE_API_KEY"] = params.trainee_api_key + os.environ["TRAINEE_MODEL"] = params.trainee_model + + # 3. 
Construct Pipeline Configuration (DAG) + nodes = [ + { + "id": "read", + "op_name": "read", + "type": "source", + "dependencies": [], + "params": { + "input_path": [params.upload_file], + }, }, - "chunk_size": params.chunk_size, - } - - env = { - "SYNTHESIZER_BASE_URL": params.synthesizer_url, - "SYNTHESIZER_MODEL": params.synthesizer_model, - "TRAINEE_BASE_URL": params.trainee_url, - "TRAINEE_MODEL": params.trainee_model, - "SYNTHESIZER_API_KEY": params.api_key, - "TRAINEE_API_KEY": params.trainee_api_key, - "RPM": params.rpm, - "TPM": params.tpm, - } + { + "id": "chunk", + "op_name": "chunk", + "type": "map_batch", + "dependencies": ["read"], + "execution_params": {"replicas": 1}, + "params": { + "chunk_size": params.chunk_size, + "chunk_overlap": params.chunk_overlap, + }, + }, + { + "id": "build_kg", + "op_name": "build_kg", + "type": "map_batch", + "dependencies": ["chunk"], + "execution_params": {"replicas": 1, "batch_size": 128}, + }, + ] + + last_node_id = "build_kg" + + # Optional: Quiz and Judge + if params.if_trainee_model: + nodes.append( + { + "id": "quiz", + "op_name": "quiz", + "type": "aggregate", # QuizService uses aggregate in config + "dependencies": ["build_kg"], + "execution_params": {"replicas": 1, "batch_size": 128}, + "params": { + "quiz_samples": params.quiz_samples, + "concurrency_limit": 200, + }, + } + ) - # Test API connection - test_api_connection(env["SYNTHESIZER_BASE_URL"], - env["SYNTHESIZER_API_KEY"], env["SYNTHESIZER_MODEL"]) - if config['if_trainee_model']: - test_api_connection(env["TRAINEE_BASE_URL"], - env["TRAINEE_API_KEY"], env["TRAINEE_MODEL"]) + nodes.append( + { + "id": "judge", + "op_name": "judge", + "type": "map_batch", + "dependencies": ["quiz"], + "execution_params": {"replicas": 1, "batch_size": 128}, + } + ) + last_node_id = "judge" + + # Node: Partition + nodes.append( + { + "id": "partition", + "op_name": "partition", + "type": "aggregate", # PartitionService uses aggregate + "dependencies": [last_node_id], + "params": { + "method": params.partition_method, + "method_params": _get_partition_params(params), + }, + } + ) - # Initialize GraphGen - graph_gen = init_graph_gen(config, env) - graph_gen.clear() + # Node: Generate + nodes.append( + { + "id": "generate", + "op_name": "generate", + "type": "map_batch", + "dependencies": ["partition"], + "execution_params": {"replicas": 1, "batch_size": 128}, + "params": { + "method": params.mode, + "data_format": params.data_format, + }, + } + ) - graph_gen.progress_bar = progress + config = {"global_params": {"working_dir": working_dir}, "nodes": nodes} try: - # Load input data - file = config['input_file'] - if isinstance(file, list): - file = file[0] - - data = [] - - if file.endswith(".jsonl"): - data_type = "raw" - with open(file, "r", encoding='utf-8') as f: - data.extend(json.loads(line) for line in f) - elif file.endswith(".json"): - data_type = "chunked" - with open(file, "r", encoding='utf-8') as f: - data.extend(json.load(f)) - elif file.endswith(".txt"): - # 读取文件后根据chunk_size转成raw格式的数据 - data_type = "raw" - content = "" - with open(file, "r", encoding='utf-8') as f: - lines = f.readlines() - for line in lines: - content += line.strip() + " " - size = int(config.get("chunk_size", 512)) - chunks = [ - content[i:i + size] for i in range(0, len(content), size) - ] - data.extend([{"content": chunk} for chunk in chunks]) - else: - raise ValueError(f"Unsupported file type: {file}") - - # Process the data - graph_gen.insert(data, data_type) - - if config['if_trainee_model']: - # Generate quiz - 
graph_gen.quiz(max_samples=config['quiz_samples']) - - # Judge statements - graph_gen.judge() + # 4. Initialize and Run Engine + # Initialize Ray if not already running (Engine handles this mostly, but good for safety) + if not ray.is_initialized(): + ray.init(ignore_reinit_error=True, log_to_driver=True) + + engine = Engine(config, operators) + + # Start with an empty dataset to kick off the pipeline + ds = ray.data.from_items([]) + + # Execute pipeline + results = engine.execute(ds) + + # 5. Process Output + # Extract the result from the 'generate' node + if "generate" in results: + result_ds = results["generate"] + + # Create a temporary file to save the output + with tempfile.NamedTemporaryFile( + mode="w", suffix=".jsonl", delete=False, encoding="utf-8" + ) as tmpfile: + # Iterate over rows and write to file + for row in result_ds.iter_rows(): + json.dump(row, tmpfile, ensure_ascii=False) + tmpfile.write("\n") + output_file = tmpfile.name else: - graph_gen.traverse_strategy.edge_sampling = "random" - # Skip judge statements - graph_gen.judge(skip=True) - - # Traverse graph - graph_gen.traverse() - - # Save output - output_data = graph_gen.qa_storage.data - with tempfile.NamedTemporaryFile( - mode="w", - suffix=".jsonl", - delete=False, - encoding="utf-8") as tmpfile: - json.dump(output_data, tmpfile, ensure_ascii=False) - output_file = tmpfile.name - - synthesizer_tokens = sum_tokens(graph_gen.synthesizer_llm_client) - trainee_tokens = sum_tokens(graph_gen.trainee_llm_client) if config['if_trainee_model'] else 0 - total_tokens = synthesizer_tokens + trainee_tokens - - data_frame = params.token_counter - try: - _update_data = [ - [ - data_frame.iloc[0, 0], - data_frame.iloc[0, 1], - str(total_tokens) - ] - ] - new_df = pd.DataFrame( - _update_data, - columns=data_frame.columns - ) - data_frame = new_df + raise gr.Error("Generation step failed to produce output.") - except Exception as e: - raise gr.Error(f"DataFrame operation error: {str(e)}") + # Note: Dynamic token counting from distributed actors is not directly available + # via client properties in the new architecture. We return the estimated stats from input. 
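If a concrete "Token Used" figure is still wanted here, one pragmatic workaround is to re-tokenize the generated JSONL once the pipeline finishes. This is only a sketch: `estimate_output_tokens` is a hypothetical helper, and tiktoken is assumed to be available (the webui's `cl100k_base` default suggests it is):

```python
import json

import tiktoken  # assumed dependency; "cl100k_base" is the webui's default tokenizer


def estimate_output_tokens(jsonl_path: str, encoding_name: str = "cl100k_base") -> int:
    """Rough lower bound: counts tokens in the generated rows, not in the prompts that produced them."""
    enc = tiktoken.get_encoding(encoding_name)
    total = 0
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            total += len(enc.encode(json.dumps(json.loads(line), ensure_ascii=False)))
    return total
```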
- return output_file, gr.DataFrame(label='Token Stats', - headers=["Source Text Token Count", "Expected Token Usage", "Token Used"], - datatype="str", - interactive=False, - value=data_frame, - visible=True, - wrap=True) + return output_file, params.token_counter except Exception as e: # pylint: disable=broad-except raise gr.Error(f"Error occurred: {str(e)}") finally: # Clean up workspace - cleanup_workspace(graph_gen.working_dir) + cleanup_workspace(working_dir) # Optional: keep for debugging or enable -with (gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), - css=css) as demo): + +with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo: # Header - gr.Image(value=os.path.join(root_dir, 'resources', 'images', 'logo.png'), - label="GraphGen Banner", - elem_id="banner", - interactive=False, - container=False, - show_download_button=False, - show_fullscreen_button=False) + gr.Image( + value=os.path.join(root_dir, "resources", "images", "logo.png"), + label="GraphGen Banner", + elem_id="banner", + interactive=False, + container=False, + show_download_button=False, + show_fullscreen_button=False, + ) lang_btn = gr.Radio( choices=[ ("English", "en"), ("简体中文", "zh"), ], value="en", - # label=_("Language"), render=False, container=False, elem_classes=["center-row"], ) - gr.HTML(""" + gr.HTML( + """ - """) + """ + ) with Translate( - os.path.join(root_dir, 'webui', 'translation.json'), - lang_btn, - placeholder_langs=["en", "zh"], - persistant= - False, # True to save the language setting in the browser. Requires gradio >= 5.6.0 + os.path.join(root_dir, "webui", "translation.json"), + lang_btn, + placeholder_langs=["en", "zh"], + persistant=False, ): lang_btn.render() - gr.Markdown( - value = "# " + _("Title") + "\n\n" + \ - "### [GraphGen](https://github.com/open-sciencelab/GraphGen) " + _("Intro") - ) + gr.Markdown(value=_("Title") + _("Intro")) - if_trainee_model = gr.Checkbox(label=_("Use Trainee Model"), - value=False, - interactive=True) + if_trainee_model = gr.Checkbox( + label=_("Use Trainee Model"), value=False, interactive=True + ) with gr.Accordion(label=_("Model Config"), open=False): - synthesizer_url = gr.Textbox(label="Synthesizer URL", - value="https://api.siliconflow.cn/v1", - info=_("Synthesizer URL Info"), - interactive=True) - synthesizer_model = gr.Textbox(label="Synthesizer Model", - value="Qwen/Qwen2.5-7B-Instruct", - info=_("Synthesizer Model Info"), - interactive=True) - trainee_url = gr.Textbox(label="Trainee URL", - value="https://api.siliconflow.cn/v1", - info=_("Trainee URL Info"), - interactive=True, - visible=if_trainee_model.value is True) + tokenizer = gr.Textbox( + label="Tokenizer", value="cl100k_base", interactive=True + ) + synthesizer_url = gr.Textbox( + label="Synthesizer URL", + value="https://api.siliconflow.cn/v1", + info=_("Synthesizer URL Info"), + interactive=True, + ) + synthesizer_model = gr.Textbox( + label="Synthesizer Model", + value="Qwen/Qwen2.5-7B-Instruct", + info=_("Synthesizer Model Info"), + interactive=True, + ) + trainee_url = gr.Textbox( + label="Trainee URL", + value="https://api.siliconflow.cn/v1", + info=_("Trainee URL Info"), + interactive=True, + visible=if_trainee_model.value is True, + ) trainee_model = gr.Textbox( label="Trainee Model", value="Qwen/Qwen2.5-7B-Instruct", info=_("Trainee Model Info"), interactive=True, - visible=if_trainee_model.value is True) + visible=if_trainee_model.value is True, + ) trainee_api_key = gr.Textbox( - label=_("SiliconCloud Token for Trainee Model"), - 
type="password", - value="", - info="https://cloud.siliconflow.cn/account/ak", - visible=if_trainee_model.value is True) - - - with gr.Accordion(label=_("Generation Config"), open=False): - chunk_size = gr.Slider(label="Chunk Size", - minimum=256, - maximum=4096, - value=512, - step=256, - interactive=True) - tokenizer = gr.Textbox(label="Tokenizer", - value="cl100k_base", - interactive=True) - qa_form = gr.Radio(choices=["atomic", "multi_hop", "aggregated"], - label="QA Form", - value="aggregated", - interactive=True) - quiz_samples = gr.Number(label="Quiz Samples", - value=2, - minimum=1, - interactive=True, - visible=if_trainee_model.value is True) - bidirectional = gr.Checkbox(label="Bidirectional", - value=True, - interactive=True) - - expand_method = gr.Radio(choices=["max_width", "max_tokens"], - label="Expand Method", - value="max_tokens", - interactive=True) - max_extra_edges = gr.Slider( - minimum=1, - maximum=10, - value=5, - label="Max Extra Edges", - step=1, - interactive=True, - visible=expand_method.value == "max_width") - max_tokens = gr.Slider(minimum=64, - maximum=1024, - value=256, - label="Max Tokens", - step=64, - interactive=True, - visible=(expand_method.value - != "max_width")) - - max_depth = gr.Slider(minimum=1, - maximum=5, - value=2, - label="Max Depth", - step=1, - interactive=True) - edge_sampling = gr.Radio( - choices=["max_loss", "min_loss", "random"], - label="Edge Sampling", - value="max_loss", - interactive=True, - visible=if_trainee_model.value is True) - isolated_node_strategy = gr.Radio(choices=["add", "ignore"], - label="Isolated Node Strategy", - value="ignore", - interactive=True) - loss_strategy = gr.Radio(choices=["only_edge", "both"], - label="Loss Strategy", - value="only_edge", - interactive=True) + label=_("SiliconFlow Token for Trainee Model"), + type="password", + value="", + info="https://cloud.siliconflow.cn/account/ak", + visible=if_trainee_model.value is True, + ) with gr.Row(equal_height=True): with gr.Column(scale=3): api_key = gr.Textbox( - label=_("SiliconCloud Token"), + label=_("SiliconFlow Token"), type="password", value="", - info="https://cloud.siliconflow.cn/account/ak") + info=_("SiliconFlow Token Info"), + ) with gr.Column(scale=1): test_connection_btn = gr.Button(_("Test Connection")) + with gr.Row(equal_height=True): + with gr.Column(scale=1): + with gr.Blocks(): + with gr.Row(equal_height=True): + with gr.Column(scale=1): + upload_file = gr.File( + label=_("Upload File"), + file_count="single", + file_types=[".txt", ".json", ".jsonl", ".csv"], + interactive=True, + ) + examples_dir = os.path.join(root_dir, "webui", "examples") + gr.Examples( + examples=[ + [os.path.join(examples_dir, "txt_demo.txt")], + [os.path.join(examples_dir, "jsonl_demo.jsonl")], + [os.path.join(examples_dir, "json_demo.json")], + [os.path.join(examples_dir, "csv_demo.csv")], + ], + inputs=upload_file, + label=_("Example Files"), + examples_per_page=4, + ) + with gr.Column(scale=1): + with gr.Blocks(): + preview_code = gr.Code( + label=_("File Preview"), + interactive=False, + visible=True, + elem_id="preview_code", + ) + preview_df = gr.DataFrame( + label=_("File Preview"), + interactive=False, + visible=False, + elem_id="preview_df", + ) + + with gr.Accordion(label=_("Split Config"), open=False): + gr.Markdown(value=_("Split Config Info")) + with gr.Row(equal_height=True): + with gr.Column(scale=1): + chunk_size = gr.Slider( + label=_("Chunk Size"), + minimum=256, + maximum=4096, + value=1024, + step=256, + interactive=True, + info=_("Chunk Size 
Info"), + ) + with gr.Column(scale=1): + chunk_overlap = gr.Slider( + label=_("Chunk Overlap"), + minimum=0, + maximum=500, + value=100, + step=100, + interactive=True, + info=_("Chunk Overlap Info"), + ) + + with gr.Accordion( + label=_("Quiz & Judge Config"), open=False, visible=False + ) as quiz_accordion: + gr.Markdown(value=_("Quiz & Judge Config Info")) + quiz_samples = gr.Number( + label=_("Quiz Samples"), + value=2, + minimum=1, + interactive=True, + info=_("Quiz Samples Info"), + ) + + with gr.Accordion(label=_("Partition Config"), open=False): + gr.Markdown(value=_("Partition Config Info")) + + partition_method = gr.Dropdown( + label=_("Partition Method"), + choices=["dfs", "bfs", "ece", "leiden"], + value="ece", + interactive=True, + info=_("Which algorithm to use for graph partitioning."), + ) + + # DFS method parameters + with gr.Group(visible=False) as dfs_group: + gr.Markdown(_("DFS intro")) + dfs_max_units = gr.Slider( + label=_("Max Units Per Community"), + minimum=1, + maximum=100, + value=5, + step=1, + interactive=True, + info=_("Max Units Per Community Info"), + ) + # BFS method parameters + with gr.Group(visible=False) as bfs_group: + gr.Markdown(_("BFS intro")) + bfs_max_units = gr.Slider( + label=_("Max Units Per Community"), + minimum=1, + maximum=100, + value=5, + step=1, + interactive=True, + info=_("Max Units Per Community Info"), + ) + + # Leiden method parameters + with gr.Group(visible=False) as leiden_group: + gr.Markdown(_("Leiden intro")) + leiden_max_size = gr.Slider( + label=_("Maximum Size of Communities"), + minimum=1, + maximum=100, + value=20, + step=1, + interactive=True, + info=_("Maximum Size of Communities Info"), + ) + leiden_use_lcc = gr.Checkbox( + label=_("Use Largest Connected Component"), + value=False, + interactive=True, + info=_("Use Largest Connected Component Info"), + ) + leiden_random_seed = gr.Number( + label=_("Random Seed"), + value=42, + precision=0, + interactive=True, + info=_("Random Seed Info"), + ) + + # ECE method parameters + with gr.Group(visible=True) as ece_group: + gr.Markdown(_("ECE intro")) + ece_max_units = gr.Slider( + label=_("Max Units Per Community"), + minimum=1, + maximum=100, + value=20, + step=1, + interactive=True, + info=_("Max Units Per Community Info"), + ) + ece_min_units = gr.Slider( + label=_("Min Units Per Community"), + minimum=1, + maximum=100, + value=3, + step=1, + interactive=True, + info=_("Min Units Per Community Info"), + ) + ece_max_tokens = gr.Slider( + label=_("Max Tokens Per Community"), + minimum=512, + maximum=20_480, + value=10_240, + step=512, + interactive=True, + info=_("Max Tokens Per Community Info"), + ) + ece_unit_sampling = gr.Radio( + label=_("Unit Sampling Strategy"), + choices=["random"], + value="random", + interactive=True, + info=_("Unit Sampling Strategy Info"), + ) + + def toggle_partition_params(method): + dfs = method == "dfs" + bfs = method == "bfs" + leiden = method == "leiden" + ece = method == "ece" + return ( + gr.update(visible=dfs), # dfs_group + gr.update(visible=bfs), # bfs_group + gr.update(visible=leiden), # leiden_group + gr.update(visible=ece), # ece_group + ) + + partition_method.change( + fn=toggle_partition_params, + inputs=partition_method, + outputs=[dfs_group, bfs_group, leiden_group, ece_group], + ) + + with gr.Accordion(label=_("Generation Config"), open=False): + gr.Markdown(value=_("Generation Config Info")) + mode = gr.Radio( + choices=["atomic", "multi_hop", "aggregated", "CoT"], + label=_("Mode"), + value="aggregated", + interactive=True, + 
info=_("Mode Info"), + ) + data_format = gr.Radio( + choices=["Alpaca", "Sharegpt", "ChatML"], + label=_("Output Data Format"), + value="Alpaca", + interactive=True, + info=_("Output Data Format Info"), + ) + + with gr.Blocks(): + token_counter = gr.DataFrame( + label="Token Stats", + headers=[ + "Source Text Token Count", + "Estimated Token Usage", + "Token Used", + ], + datatype="str", + interactive=False, + visible=False, + wrap=True, + ) + with gr.Blocks(): with gr.Row(equal_height=True): with gr.Column(): @@ -392,7 +550,8 @@ def sum_tokens(client): value=1000, step=100, interactive=True, - visible=True) + visible=True, + ) with gr.Column(): tpm = gr.Slider( label="TPM", @@ -401,41 +560,16 @@ def sum_tokens(client): value=50000, step=1000, interactive=True, - visible=True) - - - with gr.Blocks(): - with gr.Row(equal_height=True): - with gr.Column(scale=1): - upload_file = gr.File( - label=_("Upload File"), - file_count="single", - file_types=[".txt", ".json", ".jsonl"], - interactive=True, - ) - examples_dir = os.path.join(root_dir, 'webui', 'examples') - gr.Examples(examples=[ - [os.path.join(examples_dir, "txt_demo.txt")], - [os.path.join(examples_dir, "raw_demo.jsonl")], - [os.path.join(examples_dir, "chunked_demo.json")], - ], - inputs=upload_file, - label=_("Example Files"), - examples_per_page=3) - with gr.Column(scale=1): - output = gr.File( - label="Output(See Github FAQ)", - file_count="single", - interactive=False, + visible=True, ) with gr.Blocks(): - token_counter = gr.DataFrame(label='Token Stats', - headers=["Source Text Token Count", "Estimated Token Usage", "Token Used"], - datatype="str", - interactive=False, - visible=False, - wrap=True) + with gr.Column(scale=1): + output = gr.File( + label=_("Output File"), + file_count="single", + interactive=False, + ) submit_btn = gr.Button(_("Run GraphGen")) @@ -443,32 +577,48 @@ def sum_tokens(client): test_connection_btn.click( test_api_connection, inputs=[synthesizer_url, api_key, synthesizer_model], - outputs=[]) + outputs=[], + ) if if_trainee_model.value: - test_connection_btn.click(test_api_connection, - inputs=[trainee_url, api_key, trainee_model], - outputs=[]) + test_connection_btn.click( + test_api_connection, + inputs=[trainee_url, api_key, trainee_model], + outputs=[], + ) - expand_method.change(lambda method: - (gr.update(visible=method == "max_width"), - gr.update(visible=method != "max_width")), - inputs=expand_method, - outputs=[max_extra_edges, max_tokens]) + if_trainee_model.change( + lambda use_trainee: [gr.update(visible=use_trainee)] * 4, + inputs=if_trainee_model, + outputs=[ + trainee_url, + trainee_model, + trainee_api_key, + quiz_accordion, + ], + ) if_trainee_model.change( - lambda use_trainee: [gr.update(visible=use_trainee)] * 5, + lambda on: ( + gr.update( + choices=["random"] + if not on + else ["random", "max_loss", "min_loss"], + value="random", + ) + ), inputs=if_trainee_model, - outputs=[trainee_url, trainee_model, quiz_samples, edge_sampling, trainee_api_key]) + outputs=ece_unit_sampling, + ) upload_file.change( - lambda x: (gr.update(visible=True)), - inputs=[upload_file], - outputs=[token_counter], + preview_file, inputs=upload_file, outputs=[preview_code, preview_df] + ).then( + lambda x: gr.update(visible=True), inputs=upload_file, outputs=token_counter ).then( count_tokens, inputs=[upload_file, tokenizer, token_counter], - outputs=[token_counter], + outputs=token_counter, ) # run GraphGen @@ -479,41 +629,41 @@ def sum_tokens(client): ) submit_btn.click( - lambda *args: 
run_graphgen(GraphGenParams( - if_trainee_model=args[0], - input_file=args[1], - tokenizer=args[2], - qa_form=args[3], - bidirectional=args[4], - expand_method=args[5], - max_extra_edges=args[6], - max_tokens=args[7], - max_depth=args[8], - edge_sampling=args[9], - isolated_node_strategy=args[10], - loss_strategy=args[11], - synthesizer_url=args[12], - synthesizer_model=args[13], - trainee_model=args[14], - api_key=args[15], - chunk_size=args[16], - rpm=args[17], - tpm=args[18], - quiz_samples=args[19], - trainee_url=args[20], - trainee_api_key=args[21], - token_counter=args[22], - )), + lambda *args: run_graphgen( + WebuiParams(**dict(zip(WebuiParams.__annotations__, args))) + ), inputs=[ - if_trainee_model, upload_file, tokenizer, qa_form, - bidirectional, expand_method, max_extra_edges, max_tokens, - max_depth, edge_sampling, isolated_node_strategy, - loss_strategy, synthesizer_url, synthesizer_model, trainee_model, - api_key, chunk_size, rpm, tpm, quiz_samples, trainee_url, trainee_api_key, token_counter + if_trainee_model, + upload_file, + tokenizer, + synthesizer_model, + synthesizer_url, + trainee_model, + trainee_url, + api_key, + trainee_api_key, + chunk_size, + chunk_overlap, + quiz_samples, + partition_method, + dfs_max_units, + bfs_max_units, + leiden_max_size, + leiden_use_lcc, + leiden_random_seed, + ece_max_units, + ece_min_units, + ece_max_tokens, + ece_unit_sampling, + mode, + data_format, + rpm, + tpm, + token_counter, ], outputs=[output, token_counter], ) if __name__ == "__main__": demo.queue(api_open=False, default_concurrency_limit=2) - demo.launch(server_name='0.0.0.0') + demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False) diff --git a/webui/base.py b/webui/base.py index 32f3ed10..1d20f181 100644 --- a/webui/base.py +++ b/webui/base.py @@ -1,31 +1,37 @@ -from dataclasses import dataclass -from typing import Any - -@dataclass -class GraphGenParams: - """ - GraphGen parameters - """ - if_trainee_model: bool - input_file: str - tokenizer: str - qa_form: str - bidirectional: bool - expand_method: str - max_extra_edges: int - max_tokens: int - max_depth: int - edge_sampling: str - isolated_node_strategy: str - loss_strategy: str - synthesizer_url: str - synthesizer_model: str - trainee_model: str - api_key: str - chunk_size: int - rpm: int - tpm: int - quiz_samples: int - trainee_url: str - trainee_api_key: str - token_counter: Any +from dataclasses import dataclass +from typing import Any + + +@dataclass +class WebuiParams: + """ + GraphGen parameters + """ + + if_trainee_model: bool + upload_file: Any # gr.File + tokenizer: str + synthesizer_model: str + synthesizer_url: str + trainee_model: str + trainee_url: str + api_key: str + trainee_api_key: str + chunk_size: int + chunk_overlap: int + quiz_samples: int + partition_method: str + dfs_max_units: int + bfs_max_units: int + leiden_max_size: int + leiden_use_lcc: bool + leiden_random_seed: int + ece_max_units: int + ece_min_units: int + ece_max_tokens: int + ece_unit_sampling: str + mode: str + data_format: str + rpm: int + tpm: int + token_counter: Any diff --git a/webui/examples/chunked_demo.json b/webui/examples/chunked_demo.json deleted file mode 100644 index ad7219a3..00000000 --- a/webui/examples/chunked_demo.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - [ - {"content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"} - ], - [ - {"content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 
hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"} - ], - [ - {"content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."} - ], - [ - {"content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} - ] -] diff --git a/webui/examples/csv_demo.csv b/webui/examples/csv_demo.csv new file mode 100644 index 00000000..e6c1b521 --- /dev/null +++ b/webui/examples/csv_demo.csv @@ -0,0 +1,5 @@ +type,content +text,云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。 +text,隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。 +text,"Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. 
Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity." +text,"Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture." diff --git a/webui/examples/json_demo.json b/webui/examples/json_demo.json new file mode 100644 index 00000000..a53101da --- /dev/null +++ b/webui/examples/json_demo.json @@ -0,0 +1,6 @@ +[ + {"type": "text", "content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"}, + {"type": "text", "content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"}, + {"type": "text", "content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. 
In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."}, + {"type": "text", "content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} +] diff --git a/webui/examples/jsonl_demo.jsonl b/webui/examples/jsonl_demo.jsonl new file mode 100644 index 00000000..2b743578 --- /dev/null +++ b/webui/examples/jsonl_demo.jsonl @@ -0,0 +1,4 @@ +{"type": "text", "content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"} +{"type": "text", "content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"} +{"type": "text", "content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. 
Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."} +{"type": "text", "content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} diff --git a/webui/examples/raw_demo.jsonl b/webui/examples/raw_demo.jsonl deleted file mode 100644 index 024559a6..00000000 --- a/webui/examples/raw_demo.jsonl +++ /dev/null @@ -1,4 +0,0 @@ -{"content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"} -{"content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"} -{"content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. 
Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."} -{"content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} diff --git a/webui/examples/vqa_demo.json b/webui/examples/vqa_demo.json new file mode 100644 index 00000000..a53101da --- /dev/null +++ b/webui/examples/vqa_demo.json @@ -0,0 +1,6 @@ +[ + {"type": "text", "content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"}, + {"type": "text", "content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"}, + {"type": "text", "content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."}, + {"type": "text", "content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. 
TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} +] diff --git a/webui/i18n.py b/webui/i18n.py new file mode 100644 index 00000000..ce6bb40e --- /dev/null +++ b/webui/i18n.py @@ -0,0 +1,395 @@ +# pylint: skip-file +import functools +import inspect +import json +import os +from contextlib import contextmanager + +import gradio as gr +import langcodes +import yaml +from gradio.blocks import Block, BlockContext, Context, LocalContext + + +# Monkey patch to escape I18nString type being stripped in gradio.Markdown +def escape_caller(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if args and isinstance(args[0], I18nString): + add_values = args[0].add_values + radd_values = args[0].radd_values + result = I18nString(func(*args, **kwargs)) + result.add_values = add_values + result.radd_values = radd_values + + return result + return func(*args, **kwargs) + + return wrapper + + +inspect.cleandoc = escape_caller(inspect.cleandoc) + + +class TranslateContext: + available_languages = ["en"] + dictionary: dict = {} + lang_per_session = {} + + def get_available_languages(): + return TranslateContext.available_languages + + def set_available_languages(langs: list): + if not langs or not isinstance(langs, list): + raise ValueError("langs must be a list of languages") + TranslateContext.available_languages = langs + + def get_default_language(): + return TranslateContext.get_available_languages()[0] + + def add_translation(translation: dict): + for k, v in translation.items(): + if k not in TranslateContext.available_languages: + continue + if k not in TranslateContext.dictionary: + TranslateContext.dictionary[k] = {} + TranslateContext.dictionary[k].update(v) + + def get_current_language(request: gr.Request): + return TranslateContext.lang_per_session.get( + request.session_hash, TranslateContext.get_default_language() + ) + + def set_current_language(request: gr.Request, lang: str): + TranslateContext.lang_per_session[request.session_hash] = lang + + def get_lang_from_request(request: gr.Request): + if "Accept-Language" not in request.headers: + return TranslateContext.get_default_language() + + # Get the first language from the Accept-Language header + lang = request.headers["Accept-Language"].split(",")[0] + lang, _ = langcodes.closest_match( + lang, TranslateContext.get_available_languages() + ) + + if not lang or lang == "und": + return TranslateContext.get_default_language() + return lang + + +class I18nString(str): + __slots__ = ("_key", "add_values", "radd_values") + + def __new__(cls, value): + obj = super().__new__(cls, value) + obj._key = value + obj.add_values = [] + obj.radd_values = [] + return obj + + def __str__(self): + try: + request = LocalContext.request.get() + except LookupError: + request = None + if request is None: + return self._key + + lang = TranslateContext.get_current_language(request) + result = TranslateContext.dictionary.get(lang, {}).get(self._key, self._key) + + for v in self.radd_values: + result = str(v) + 
result + for v in self.add_values: + result = result + str(v) + + while len(result) >= 2 and result.startswith("'") and result.endswith("'"): + result = result[1:-1] + return result + + def __add__(self, other): + self.add_values.append(other) + return self + + def __radd__(self, other): + self.radd_values.append(other) + return self + + def __hash__(self) -> int: + return super().__hash__() + + def format(self, *args, **kwargs) -> str: + v = str(self) + if isinstance(v, I18nString): + return super().format(*args, **kwargs) + return v.format(*args, **kwargs) + + def unwrap(self): + return super().__str__() + + @staticmethod + def unwrap_strings(obj): + """Unwrap all keys in I18nStrings in the object""" + if isinstance(obj, I18nString): + yield obj.unwrap() + for v in obj.add_values: + yield from I18nString.unwrap_strings(v) + for v in obj.radd_values: + yield from I18nString.unwrap_strings(v) + return + yield obj + + +def gettext(key: str): + """Wrapper text string to return I18nString + :param key: The key of the I18nString + """ + return I18nString(key) + + +def iter_i18n_choices(choices): + """Iterate all I18nStrings in the choice, returns the indices of the I18nStrings""" + if not isinstance(choices, list) or len(choices) == 0: + return + + if isinstance(choices[0], tuple): + for i, (k, v) in enumerate(choices): + if isinstance(k, I18nString): + yield i + + else: + for i, v in enumerate(choices): + if isinstance(v, I18nString): + yield i + + +def iter_i18n_fields(component: gr.components.Component): + """Iterate all I18nStrings in the component""" + for name, value in inspect.getmembers(component): + if name == "value" and hasattr(component, "choices"): + # for those components with choices, the value will be kept as is + continue + if isinstance(value, I18nString): + yield name + elif name == "choices" and any(iter_i18n_choices(value)): + yield name + + +def iter_i18n_components(block: Block): + """Iterate all I18nStrings in the block""" + if isinstance(block, BlockContext): + for component in block.children: + for c in iter_i18n_components(component): + yield c + + if any(iter_i18n_fields(block)): + yield block + + +def has_new_i18n_fields(block: Block, existing_translation={}): + """Check if there are new I18nStrings in the block + :param block: The block to check + :param existing_translation: The existing translation dictionary + :return: True if there are new I18nStrings, False otherwise + """ + components = list(iter_i18n_components(block)) + for lang in TranslateContext.get_available_languages(): + for component in components: + for field in iter_i18n_fields(component): + if field == "choices": + for idx in iter_i18n_choices(component.choices): + if isinstance(component.choices[idx], tuple): + value = component.choices[idx][0] + else: + value = component.choices[idx] + if value not in existing_translation.get(lang, {}): + return True + else: + value = getattr(component, field) + if value not in existing_translation.get(lang, {}): + return True + + return False + + +def dump_blocks(block: Block, include_translations={}): + """Dump all I18nStrings in the block to a dictionary + :param block: The block to dump + :param include_translations: The existing translation dictionary + :return: The dumped dictionary + """ + components = list(iter_i18n_components(block)) + + def translate(lang, key): + return include_translations.get(lang, {}).get(key, key) + + ret = {} + + for lang in TranslateContext.get_available_languages(): + ret[lang] = {} + for component in components: + for field 
in iter_i18n_fields(component): + if field == "choices": + for idx in iter_i18n_choices(component.choices): + if isinstance(component.choices[idx], tuple): + value = component.choices[idx][0] + else: + value = component.choices[idx] + for key in I18nString.unwrap_strings(value): + ret[lang][key] = translate(lang, key) + else: + value = getattr(component, field) + for key in I18nString.unwrap_strings(value): + ret[lang][key] = translate(lang, key) + + return ret + + +def translate_blocks( + block: gr.Blocks = None, + translation={}, + lang: gr.components.Component = None, + persistant=False, +): + """Translate all I18nStrings in the block + :param block: The block to translate, default is the root block + :param translation: The translation dictionary + :param lang: The language component to change the language + :param persistant: Whether to persist the language + """ + if block is None: + block = Context.root_block + + """Translate all I18nStrings in the block""" + if not isinstance(block, gr.Blocks): + raise ValueError("block must be an instance of gradio.Blocks") + + components = list(iter_i18n_components(block)) + TranslateContext.add_translation(translation) + + hidden = gr.HTML( + value="""""" + ) + + if persistant: + try: + from gradio import BrowserState + except ImportError: + raise ValueError("gradio>=5.6.0 is required for persistant language") + + def on_lang_change(request: gr.Request, lang: str, saved_lang: str): + if not lang: + if saved_lang: + lang = saved_lang + else: + lang = TranslateContext.get_lang_from_request(request) + + outputs = [lang, lang, ""] + TranslateContext.set_current_language(request, lang) + + for component in components: + fields = list(iter_i18n_fields(component)) + if component == lang and "value" in fields: + raise ValueError("'lang' component can't has I18nStrings as value") + + modified = {} + + for field in fields: + if field == "choices": + choices = component.choices.copy() + for idx in iter_i18n_choices(choices): + if isinstance(choices[idx], tuple): + k, v = choices[idx] + # We don't need to translate the value + choices[idx] = (str(k), next(I18nString.unwrap_strings(v))) + else: + v = choices[idx] + choices[idx] = (str(v), next(I18nString.unwrap_strings(v))) + modified[field] = choices + else: + modified[field] = str(getattr(component, field)) + + new_comp = gr.update(**modified) + outputs.append(new_comp) + + if len(outputs) == 1: + return outputs[0] + + return outputs + + if lang is None: + lang = gr.State() + + if persistant: + saved_lang = gr.BrowserState(storage_key="lang") + else: + saved_lang = gr.State() + + gr.on( + [block.load, lang.change], + on_lang_change, + inputs=[lang, saved_lang], + outputs=[lang, saved_lang, hidden] + components, + ) + + +@contextmanager +def Translate( + translation, + lang: gr.components.Component = None, + placeholder_langs=[], + persistant=False, +): + """Translate all I18nStrings in the block + :param translation: The translation dictionary or file path + :param lang: The language component to change the language + :param placeholder_langs: The placeholder languages to create a new translation file if translation is a file path + :param persistant: Whether to persist the language + :return: The language component + """ + if lang is None: + lang = gr.State() + yield lang + + if isinstance(translation, dict): + # Static translation + translation_dict = translation + pass + elif isinstance(translation, str): + if os.path.exists(translation): + # Regard as a file path + with open(translation, "r", 
encoding="utf-8") as f: # Force utf-8 encoding + if translation.endswith(".json"): + translation_dict = json.load(f) + elif translation.endswith(".yaml"): + translation_dict = yaml.safe_load(f) + else: + raise ValueError("Unsupported file format") + else: + translation_dict = {} + else: + raise ValueError("Unsupported translation type") + + if placeholder_langs: + TranslateContext.set_available_languages(placeholder_langs) + + block = Context.block + translate_blocks( + block=block, translation=translation_dict, lang=lang, persistant=persistant + ) + + if isinstance(translation, str) and has_new_i18n_fields( + block, existing_translation=translation_dict + ): + merged = dump_blocks(block, include_translations=translation_dict) + + with open(translation, "w") as f: + if translation.endswith(".json"): + json.dump(merged, f, indent=2, ensure_ascii=False) + elif translation.endswith(".yaml"): + yaml.dump(merged, f, allow_unicode=True, sort_keys=False) diff --git a/webui/translation.json b/webui/translation.json index fef5d579..320482ad 100644 --- a/webui/translation.json +++ b/webui/translation.json @@ -1,36 +1,112 @@ { "en": { - "Title": "✨Easy-to-use LLM Training Data Generation Framework✨", - "Intro": "is a framework for synthetic data generation guided by knowledge graphs, designed to tackle challenges for knowledge-intensive QA generation. \n\nBy uploading your text chunks (such as knowledge in agriculture, healthcare, or marine science) and filling in the LLM API key, you can generate the training data required by **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)** and **[xtuner](https://github.com/InternLM/xtuner)** online. We will automatically delete user information after completion.", - "Use Trainee Model": "Use Trainee Model to identify knowledge blind spots, please keep disable for SiliconCloud", + "Title": "# ✨Easy-to-use LLM Training Data Generation Framework✨\n\n", + "Intro": "### [GraphGen](https://github.com/open-sciencelab/GraphGen) is a framework for synthetic data generation guided by knowledge graphs, designed to tackle challenges for knowledge-intensive QA generation. \n\nBy uploading your text chunks (such as knowledge in agriculture, healthcare, or marine science) and filling in the LLM API key, you can generate the training data required by **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)** and **[xtuner](https://github.com/InternLM/xtuner)** online. 
We will automatically delete user information after completion.", + "Use Trainee Model": "Use Trainee Model to identify knowledge blind spots, please keep disabled for SiliconCloud", "Synthesizer URL Info": "Base URL for the Synthesizer Model API, use SiliconFlow as default", - "Trainee URL Info": "Base URL for the Trainee Model API, use SiliconFlow as default", "Synthesizer Model Info": "Model for constructing KGs and generating QAs", + "Trainee URL Info": "Base URL for the Trainee Model API, use SiliconFlow as default", "Trainee Model Info": "Model for training", + "SiliconFlow Token for Trainee Model": "SiliconFlow API Key for Trainee Model", "Model Config": "Model Configuration", - "Generation Config": "Generation Config", - "SiliconCloud Token": "SiliconCloud API Key", - "SiliconCloud Token for Trainee Model": "SiliconCloud API Key for Trainee Model", - "Test Connection": "Test Connection", - "Run GraphGen": "Run GraphGen", + "SiliconFlow Token Info": "Get SiliconFlow API Key at \"https://cloud.siliconflow.cn/account/ak\", efficiently and stably use LLM interfaces", + "SiliconFlow Token": "SiliconFlow API Key", "Upload File": "Upload File", - "Example Files": "Example Files" + "Example Files": "Example Files", + "File Preview": "File Preview", + "Split Config Info": "If the input text is a long text without chunks, the system will split the text into appropriate paragraphs based on the following parameters.", + "Chunk Size Info": "Split the long text according to this value. Too short will lead to incomplete knowledge, and too long will lead to LLM input being too long", + "Chunk Size": "chunk_size(Chunk Size)", + "Chunk Overlap Info": "The overlapping part between two adjacent chunks, which can help maintain context continuity", + "Chunk Overlap": "chunk_overlap(Chunk Overlap)", + "Split Config": "Split Config", + "Quiz & Judge Config Info": "Synthesizer Model generates quiz questions based on each knowledge unit in the knowledge graph to assess the Trainee Model's understanding of the knowledge and obtain comprehension loss.", + "Quiz Samples Info": "Configure how many quiz questions to generate for each knowledge unit", + "Quiz Samples": "quiz_samples(Quiz Samples)", + "Quiz & Judge Config": "Quiz & Judge Config", + "Partition Config Info": "Partition the knowledge graph into multiple communities (subgraphs), each community is the smallest unit for generating QAs. Appropriate partitioning methods can improve relevance and diversity.", + "Which algorithm to use for graph partitioning.": "Which algorithm to use for graph partitioning.", + "Partition Method": "method(Partition Method)", + "DFS intro": "The DFS partitioning method uses a depth-first search algorithm to traverse the knowledge graph, starting from one unit and exploring as deeply as possible along connected units until a preset community size is reached or there are no more unvisited units. It then starts a new community from another unvisited unit, repeating this process until all units are assigned to communities.", + "Max Units Per Community Info": "The maximum number of knowledge units (nodes) allowed in each community. If a community exceeds this limit, it will be further partitioned. 
A unit refers to a node in the knowledge graph, which can be an entity or a relation.", + "Max Units Per Community": "max_units_per_community(Max Units Per Community)", + "BFS intro": "The BFS partitioning method uses a breadth-first search algorithm to traverse the knowledge graph, starting from one unit and exploring all its neighboring units before moving on to the neighbors' neighbors. This process continues until a preset community size is reached or there are no more unvisited units. It then starts a new community from another unvisited unit, repeating this process until all units are assigned to communities.", + "Leiden intro": "The Leiden partitioning method is a community detection algorithm based on modularity optimization, designed to identify tightly connected subgraphs within a graph. The algorithm iteratively optimizes the assignment of nodes to communities, maximizing the density of connections within communities while minimizing connections between communities. The Leiden algorithm can effectively handle large-scale graph data and typically produces higher-quality community partitions compared to other community detection algorithms, such as the Louvain algorithm.", + "Maximum Size of Communities Info": "The maximum number of nodes allowed in a community. If a community exceeds this limit, it will be further partitioned.", + "Maximum Size of Communities": "max_size(Maximum Size of Communities)", + "Use Largest Connected Component Info": "The largest connected component refers to the largest subset of nodes in a graph where there is a path connecting any two nodes. When this option is enabled, the partitioning algorithm will only consider the largest connected component of the knowledge graph for community partitioning, ignoring other smaller connected components. This helps ensure that the generated communities have higher connectivity and relevance.", + "Use Largest Connected Component": "use_lcc(Use Largest Connected Component)", + "Random Seed Info": "The random seed changes the initial state of the graph partitioning, thereby affecting the partitioning results. By setting different random seeds, different community partitioning schemes can be generated, which helps improve the diversity of generated QAs.", + "Random Seed": "random_seed(Random Seed)", + "ECE intro": "ECE is an original graph partitioning method based on the principle of model calibration. It evaluates the performance of each unit under the current model by computing its calibration error (referred to as the comprehension loss) and partitions the graph according to this comprehension error.", + "Min Units Per Community Info": "Limit the minimum number of nodes allowed in each community. If a community has fewer nodes than this limit, it will be discarded.", + "Min Units Per Community": "min_units_per_community(Min Units Per Community)", + "Max Tokens Per Community Info": "The maximum number of tokens allowed in each community. If a community exceeds this limit, it will be further partitioned.", + "Max Tokens Per Community": "max_tokens_per_community(Max Tokens Per Community)", + "Unit Sampling Strategy Info": "Unit sampling strategy determines how to select units from candidate units when constructing communities. Unit sampling strategies include random, max_loss, and min_loss. 
random means selecting units randomly, max_loss means prioritizing units with higher comprehension loss, and min_loss means prioritizing units with lower comprehension loss.\n\n(Note: Only when the Trainee Model is activated and evaluated will there be comprehension loss, allowing the use of max_loss and min_loss strategies; otherwise, only the random strategy can be used.)", + "Unit Sampling Strategy": "unit_sampling(Unit Sampling Strategy)", + "Partition Config": "Knowledge Graph Partition Config", + "Generation Config Info": "Generation configuration includes generation mode and output data format.", + "Mode Info": "Includes various generation modes such as atomic, aggregated, multi-hop, and chain-of-thought, suitable for tasks of different complexity.", + "Mode": "mode(Mode)", + "Output Data Format Info": "Includes various output formats such as Alpaca, Sharegpt, and ChatML.", + "Output Data Format": "data_format(Output Data Format)", + "Generation Config": "Generation Config", + "Output File": "Output File" }, "zh": { - "Title": "✨开箱即用的LLM训练数据生成框架✨", - "Intro": "是一个基于知识图谱的数据合成框架,旨在知识密集型任务中生成问答。\n\n 上传你的文本块(如农业、医疗、海洋知识),填写 LLM api key,即可在线生成 **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)**、**[xtuner](https://github.com/InternLM/xtuner)** 所需训练数据。结束后我们将自动删除用户信息。", + "Title": "# ✨开箱即用的LLM训练数据生成框架✨\n\n", + "Intro": "### [GraphGen](https://github.com/open-sciencelab/GraphGen) 是一个基于知识图谱的数据合成框架,旨在知识密集型任务中生成问答。\n\n 上传你的文本块(如农业、医疗、海洋知识),填写 LLM api key,即可在线生成 **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)**、**[xtuner](https://github.com/InternLM/xtuner)** 所需训练数据。结束后我们将自动删除用户信息。", "Use Trainee Model": "使用Trainee Model来识别知识盲区,使用硅基流动时请保持禁用", "Synthesizer URL Info": "调用合成模型API的URL,默认使用硅基流动", - "Trainee URL Info": "调用学生模型API的URL,默认使用硅基流动", "Synthesizer Model Info": "用于构建知识图谱和生成问答的模型", + "Trainee URL Info": "调用学生模型API的URL,默认使用硅基流动", "Trainee Model Info": "用于训练的模型", + "SiliconFlow Token for Trainee Model": "SiliconFlow Token for Trainee Model", "Model Config": "模型配置", - "Generation Config": "生成配置", - "SiliconCloud Token": "硅基流动 API Key", - "SiliconCloud Token for Trainee Model": "硅基流动 API Key (学生模型)", - "Test Connection": "测试接口", - "Run GraphGen": "运行GraphGen", + "SiliconFlow Token Info": "在 \"https://cloud.siliconflow.cn/account/ak\" 获取硅基流动 API 秘钥, 使用高效稳定的 LLM 接口", + "SiliconFlow Token": "硅基流动 API 秘钥", "Upload File": "上传文件", - "Example Files": "示例文件" + "Example Files": "示例文件", + "File Preview": "文件预览", + "Split Config Info": "如果输入文本是未分块的长文本,系统会根据以下参数将文本分成合适的段落。", + "Chunk Size Info": "按照该值将分割长文本,太短会导致知识不完整,太长会导致 LLM 输入过长", + "Chunk Size": "chunk_size(分割大小)", + "Chunk Overlap Info": "两个相邻块之间的重叠部分,有助于保持上下文的连续性", + "Chunk Overlap": "chunk_overlap(分割重叠大小)", + "Split Config": "文本分割配置", + "Quiz & Judge Config Info": "合成模型根据知识图谱中的每个知识单元,生成判断题,用于评估学生模型对知识的理解程度,得到理解误差。", + "Quiz Samples Info": "配置每个知识单元生成多少判断题", + "Quiz Samples": "quiz_samples(Quiz Samples)", + "Quiz & Judge Config": "测试与评判配置", + "Partition Config Info": "将知识图谱划分为多个社区(子图),每个社区是生成问答的最小单位。合适的分区方法可以提高关联性和多样性。", + "Which algorithm to use for graph partitioning.": "选择用于图划分的算法。", + "Partition Method": "method(划分方法)", + "DFS intro": "DFS划分方法使用深度优先搜索算法遍历知识图谱,从一个单元开始,沿着与之连接的单元深入探索,直到达到预设的社区大小或没有更多未访问的单元为止。然后,它会从另一个未访问的单元开始新的社区,重复这一过程,直到所有单元都被分配到社区中。", + "Max Units Per Community Info": "每个社区允许的知识单元(节点)的最大数量。如果一个社区超过这个限制,它将被进一步划分。一个单元指的是知识图谱中的一个节点,可以是实体或关系。", + "Max Units Per Community": "max_units_per_community(每个社区的最大单元数)", + "BFS intro": 
"BFS划分方法使用广度优先搜索算法遍历知识图谱,从一个单元开始,探索所有与之直接连接的单元,然后再从这些单元出发,继续探索它们的直接连接单元。这个过程会持续直到达到预设的社区大小或没有更多未访问的单元为止。然后,它会从另一个未访问的单元开始新的社区,重复这一过程,直到所有单元都被分配到社区中。", + "Leiden intro": "Leiden划分方法是一种基于模块度优化的社区检测算法,旨在识别图中的紧密连接子图。该算法通过迭代地优化节点的社区分配,最大化社区内的连接密度,同时最小化社区间的连接。Leiden算法能够有效处理大规模图数据,并且通常比其他社区检测算法(如Louvain算法)产生更高质量的社区划分结果。", + "Maximum Size of Communities Info": "一个社区中允许的最大节点数量。如果一个社区的节点数超过这个限制,它将被进一步划分。", + "Maximum Size of Communities": "max_size(社区的最大尺寸)", + "Use Largest Connected Component Info": "最大连通分量是指在图中节点之间存在路径连接的最大子集。启用此选项后,划分算法将仅考虑知识图谱中的最大连通分量进行社区划分,忽略其他较小的连通分量。这有助于确保生成的社区具有更高的连通性和相关性。", + "Use Largest Connected Component": "use_lcc(使用最大连通分量)", + "Random Seed Info": "随机种子改变图划分的初始状态,从而影响划分结果。通过设置不同的随机种子,可以生成不同的社区划分方案,有助于提高生成问答的多样性。", + "Random Seed": "random_seed(随机种子)", + "ECE intro": "ECE是一种基于模型校准原理的原创图划分方法。ECE通过计算单元的校准误差来评估其在当前模型下的表现(记为理解误差),并根据理解误差对图进行划分。", + "Min Units Per Community Info": "限制每个社区中允许的最小节点数量。如果一个社区的节点数少于这个限制,它将被舍弃。", + "Min Units Per Community": "min_units_per_community(每个社区的最小单元数)", + "Max Tokens Per Community Info": "每个社区允许的最大Token数量。如果一个社区的Token数超过这个限制,它将被进一步划分。", + "Max Tokens Per Community": "max_tokens_per_community(每个社区的最大Token数)", + "Unit Sampling Strategy Info": "单元采样策略决定在构建社区的时候如何从候选单元中选择单元。单元采样策略包括 random, max_loss, min_loss。 random表示随机选择单元,max_loss表示优先选择理解误差较大的单元,min_loss表示优先选择理解误差较小的单元。\n\n(注意:只有当学生模型启动时,经过评测后,才会有理解误差,才能使用 max_loss 和 min_loss 策略,否则只能使用 random 策略)", + "Unit Sampling Strategy": "unit_sampling(单元采样策略)", + "Partition Config": "知识图谱分区配置", + "Generation Config Info": "生成配置包括生成模式和输出数据格式。", + "Mode Info": "包括原子、聚合、多跳、思维链等多种生成模式,适用于不同复杂度的任务。", + "Mode": "mode(生成模式)", + "Output Data Format Info": "包括 Alpaca, Sharegpt, ChatML等多种输出格式。", + "Output Data Format": "data_format(输出数据格式)", + "Generation Config": "生成配置", + "Output File": "输出文件" } } \ No newline at end of file diff --git a/webui/utils/__init__.py b/webui/utils/__init__.py new file mode 100644 index 00000000..07e5d386 --- /dev/null +++ b/webui/utils/__init__.py @@ -0,0 +1,3 @@ +from .cache import cleanup_workspace, setup_workspace +from .count_tokens import count_tokens +from .preview_file import preview_file diff --git a/webui/cache_utils.py b/webui/utils/cache.py similarity index 99% rename from webui/cache_utils.py rename to webui/utils/cache.py index 96c7d4d9..0c9412f2 100644 --- a/webui/cache_utils.py +++ b/webui/utils/cache.py @@ -1,6 +1,7 @@ import os -import uuid import shutil +import uuid + def setup_workspace(folder): request_id = str(uuid.uuid4()) diff --git a/webui/count_tokens.py b/webui/utils/count_tokens.py similarity index 61% rename from webui/count_tokens.py rename to webui/utils/count_tokens.py index 53bed59a..82b5522c 100644 --- a/webui/count_tokens.py +++ b/webui/utils/count_tokens.py @@ -1,6 +1,7 @@ +import json import os import sys -import json + import pandas as pd # pylint: disable=wrong-import-position @@ -8,24 +9,29 @@ sys.path.append(root_dir) from graphgen.models import Tokenizer + def count_tokens(file, tokenizer_name, data_frame): if not file or not os.path.exists(file): return data_frame if file.endswith(".jsonl"): - with open(file, "r", encoding='utf-8') as f: + with open(file, "r", encoding="utf-8") as f: data = [json.loads(line) for line in f] elif file.endswith(".json"): - with open(file, "r", encoding='utf-8') as f: + with open(file, "r", encoding="utf-8") as f: data = json.load(f) data = [item for sublist in data for item in sublist] elif file.endswith(".txt"): - with open(file, "r", encoding='utf-8') as f: + with open(file, "r", encoding="utf-8") as f: 
data = f.read() - chunks = [ - data[i:i + 512] for i in range(0, len(data), 512) - ] + chunks = [data[i : i + 512] for i in range(0, len(data), 512)] data = [{"content": chunk} for chunk in chunks] + elif file.endswith(".csv"): + df = pd.read_csv(file) + if "content" in df.columns: + data = df["content"].tolist() + else: + data = df.iloc[:, 0].tolist() else: raise ValueError(f"Unsupported file type: {file}") @@ -39,22 +45,15 @@ def count_tokens(file, tokenizer_name, data_frame): content = item.get("content", "") else: content = item - token_count += len(tokenizer.encode_string(content)) + token_count += len(tokenizer.encode(content)) - _update_data = [[ - str(token_count), - str(token_count * 50), - "N/A" - ]] + _update_data = [[str(token_count), str(token_count * 50), "N/A"]] try: - new_df = pd.DataFrame( - _update_data, - columns=data_frame.columns - ) + new_df = pd.DataFrame(_update_data, columns=data_frame.columns) data_frame = new_df - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print("[ERROR] DataFrame操作异常:", str(e)) return data_frame diff --git a/webui/utils/preview_file.py b/webui/utils/preview_file.py new file mode 100644 index 00000000..aba531c2 --- /dev/null +++ b/webui/utils/preview_file.py @@ -0,0 +1,29 @@ +import codecs +import os + +import gradio as gr +import pandas as pd + + +def preview_file(file): + if file is None: + return gr.update(visible=False), gr.update(visible=False) + + path = file.name + ext = os.path.splitext(path)[1].lower() + + try: + if ext == ".csv": + df = pd.read_csv(path, nrows=10) + return gr.update(visible=False), gr.update(value=df, visible=True) + with codecs.open(path, "r", encoding="utf-8") as f: + text = f.read(5000) + if len(text) == 5000: + text += "\n\n... (truncated at 5000 chars)" + return gr.update( + value=text, visible=True, language="json" if ext != ".txt" else None + ), gr.update(visible=False) + except Exception as e: # pylint: disable=broad-except + return gr.update( + value=f"Preview failed: {e}", visible=True, language=None + ), gr.update(visible=False)
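
Note on webui/i18n.py: the module added above is consumed by app.py elsewhere in this change set. The sketch below is a minimal, hypothetical wiring of `Translate` and `gettext` into a Gradio Blocks app, assuming the en/zh `webui/translation.json` from this diff, an illustrative `gr.Dropdown` as the language selector, and a repo-root import path; none of these choices are taken from the diff itself.

```python
# Minimal usage sketch for webui/i18n.py (hypothetical; the real app.py may differ).
# Assumes it runs from the repo root so "webui" is importable and that
# webui/translation.json exists with "en"/"zh" sections as added in this diff.
import gradio as gr

from webui.i18n import Translate, gettext as _

with gr.Blocks() as demo:
    # The language selector is illustrative; Translate() falls back to a
    # gr.State() plus the browser's Accept-Language header when none is given.
    lang = gr.Dropdown(choices=["en", "zh"], value="en", label="Language")

    # Translate() must be entered directly inside the Blocks context: on exit it
    # walks the component tree, collects every I18nString, and registers a
    # block.load / lang.change handler that swaps the strings per session.
    with Translate("webui/translation.json", lang=lang, placeholder_langs=["en", "zh"]):
        gr.Markdown(_("Title"))
        gr.File(label=_("Upload File"))

if __name__ == "__main__":
    demo.launch()
```

Because `gettext` returns an `I18nString` keyed by the English label, any component property built from it (Markdown value, File label, dropdown choices) is re-rendered with the dictionary entry for the session's current language whenever the page loads or the selector changes.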
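
Note on the new partition help strings in webui/translation.json: the "BFS intro" entry describes the community-growing behaviour in prose only. For reference, here is a self-contained sketch of the BFS-style partitioning that text describes; it is purely illustrative and is not GraphGen's actual partitioner, and the function name, the networkx dependency, and the `max_units_per_community` argument are assumptions made for the example.

```python
# Illustrative sketch of the BFS partitioning described by the "BFS intro"
# help string: grow one community breadth-first from a seed node until
# max_units_per_community is reached, then seed the next community from a
# still-unassigned node. Not GraphGen's implementation.
from collections import deque

import networkx as nx


def bfs_partition(graph: nx.Graph, max_units_per_community: int) -> list[list]:
    unassigned = set(graph.nodes)
    communities = []
    while unassigned:
        seed = next(iter(unassigned))
        community, queue = [], deque([seed])
        while queue and len(community) < max_units_per_community:
            node = queue.popleft()
            if node not in unassigned:
                continue  # already placed via another path
            unassigned.remove(node)
            community.append(node)
            # Neighbors are queued level by level; anything left in the queue
            # when the size cap is hit stays unassigned and will seed or join
            # a later community.
            queue.extend(n for n in graph.neighbors(node) if n in unassigned)
        communities.append(community)
    return communities


if __name__ == "__main__":
    g = nx.karate_club_graph()
    parts = bfs_partition(g, max_units_per_community=10)
    print([len(p) for p in parts])
```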